|
import streamlit as st |
|
from streamlit.logger import get_logger |
|
import gematriapy |
|
|
|
from timeit import default_timer as timer |
|
import sqlite3 |
|
import pandas as pd |
|
import ast |
|
import pymongo |
|
|
|
|
|
|
|
# Module-level logger obtained from Streamlit's logging helper.
LOGGER = get_logger(__name__)
|
|
|
@st.cache_resource
def get_dfs() -> tuple:
    """Load the SQLite tables and build the frames used for reference lookup.

    Reads the ``books``, ``texts`` and ``titles`` tables from ``test42.db``,
    inner-joins ``texts`` to ``books`` (``texts.bid`` == ``books._id``) and
    adds a ``ref_text_long`` column: the book's ``heTitle`` followed by up to
    three "<section name> <number>" parts built from ``heSectionNames`` and
    the ``level4`` / ``level3`` / ``level2`` columns.

    The function is wrapped in ``st.cache_resource``, so Streamlit reuses the
    returned objects instead of re-reading the database on every rerun.

    Returns:
        ``(titles_df, texts_df)`` where ``titles_df`` is the raw ``titles``
        table and ``texts_df`` is the merged texts/books frame.
    """
    # NOTE(review): the non-ASCII literals in this function look mis-encoded
    # in this copy of the file; they are kept exactly as found. Verify them
    # against the original source before relying on the rendered text.

    def to_daf_long(i):
        """Turn a page index into a "<folio> <side>" label.

        For ``0 < i < 999`` the folio number is ``(i + 1) // 2`` (rendered by
        ``gematriapy.to_hebrew``) and the parity of ``i + 1`` selects one of
        two side suffixes. Any other value is returned unchanged.
        """
        if 0 < i < 999:
            i += 1
            side = ' ืขืืื ื ' if i % 2 == 0 else ' ืขืืื ื'
            return gematriapy.to_hebrew(i // 2) + side
        return i

    def gematria(i):
        """Render a section number for concatenation into a reference string.

        Plain ints in ``(0, 999)`` become ``gematriapy.to_hebrew(i) + ' '``;
        strings (e.g. labels already produced by ``to_daf_long``) pass
        through; everything else (NaN, floats, out-of-range) becomes ``''``.
        """
        # Exact-type check on purpose: bools and floats must not be rendered.
        if type(i) is int and 0 < i < 999:
            return gematriapy.to_hebrew(i) + ' '
        return i if type(i) is str else ''

    def read_table(conn, query):
        """Run ``query`` and return all rows as a DataFrame with named columns."""
        cursor = conn.execute(query)
        rows = cursor.fetchall()
        columns = [col[0] for col in cursor.description]
        # Passing ``columns`` to the constructor also works for an empty
        # result set, where assigning ``.columns`` afterwards would raise.
        return pd.DataFrame(rows, columns=columns)

    def section_label(depth):
        """Return a mapper giving the section name ``depth`` from the end plus a space."""
        def label(names):
            return names[-depth] + ' ' if len(names) >= depth else ""
        return label

    print('hello from get_dfs..')

    conn = sqlite3.connect('test42.db')
    try:
        books = read_table(conn, "SELECT * FROM books")
        texts = read_table(conn, "SELECT * FROM texts")
        titles = read_table(conn, "SELECT * FROM titles")
    finally:
        # Everything is materialised in memory, so release the file handle.
        conn.close()

    # heSectionNames is stored as the repr of a Python list; parse it back.
    books['heSectionNames'] = books['heSectionNames'].apply(
        lambda x: ast.literal_eval(x) if x is not None else ['']
    )

    merged = pd.merge(texts, books, how='inner', left_on='bid', right_on='_id')

    # Rows whose second-to-last section name equals the marker literal get
    # their level2 index rewritten as a folio/side label.
    has_daf = merged['heSectionNames'].apply(
        lambda names: len(names) > 1 and names[-2] == 'ืืฃ'
    ).astype(bool)
    if has_daf.any():
        # level2 is about to hold strings next to numbers; cast explicitly
        # rather than relying on pandas' implicit (deprecated) upcast.
        merged['level2'] = merged['level2'].astype(object)
        merged.loc[has_daf, 'level2'] = merged.loc[has_daf, 'level2'].map(to_daf_long)

    section_names = merged['heSectionNames']
    merged['ref_text_long'] = (
        merged['heTitle'] + ' '
        + section_names.map(section_label(4)) + merged['level4'].map(gematria)
        + section_names.map(section_label(3)) + merged['level3'].map(gematria)
        + section_names.map(section_label(2)) + merged['level2'].map(gematria)
    )

    titles_df = titles
    texts_df = merged
    return titles_df, texts_df
|
|
|
|
|
def find_ref(titles_df, texts_df, input_text, top_k, num_of_results):
    """Fuzzy-match free text against the known reference strings.

    Args:
        titles_df: Frame with ``he_titles`` and ``title`` columns.
        texts_df: Frame with ``title`` and ``ref_text_long`` columns.
        input_text: The user's query. ``':'`` and ``'.'`` are replaced by
            fixed literals before matching.
        top_k: Number of best-matching book titles to search within. ``0``
            skips the book step and matches against every reference.
        num_of_results: Maximum number of matches to return.

    Returns:
        ``None`` when ``input_text`` is empty; otherwise a list of at most
        ``num_of_results`` dicts sorted by descending ``ref_score``. Each dict
        has ``ref`` and ``ref_score``, plus ``book`` and ``book_score`` when
        ``top_k`` is non-zero.
    """
    print('hello from find_ref..')
    if not input_text:
        return None

    # Imported lazily so the dependency is only needed when a search runs.
    from rapidfuzz import process as rapidfuzz_process

    # NOTE(review): these replacement literals look mis-encoded in this copy
    # of the file (both render identically); kept exactly as found.
    input_text = input_text.replace(':','ืขืืื ื').replace('.','ืขืืื ื')

    results = []

    if top_k == 0:
        refs = texts_df['ref_text_long'].unique()
        for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs, limit=num_of_results):
            results.append({'ref': ref, 'ref_score': ref_score})
    else:
        # The per-book candidate count used to be a fixed 10, which starved
        # requests for more than 10 results. Keep 10 as the floor so smaller
        # requests behave exactly as before.
        per_book_limit = max(10, num_of_results)
        he_titles = titles_df['he_titles']

        for book, book_score, _ in rapidfuzz_process.extract(input_text, he_titles, limit=top_k):
            # Map the matched title back to the book's canonical title.
            book_title = titles_df.loc[titles_df['he_titles'] == book, 'title'].iloc[0]
            refs = texts_df.loc[texts_df['title'] == book_title, 'ref_text_long'].unique()

            for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs, limit=per_book_limit):
                results.append({
                    'ref': ref,
                    'ref_score': ref_score,
                    'book': book,
                    'book_score': book_score,
                })

    # Stable sort: equal scores keep the order in which they were found.
    results.sort(key=lambda x: x['ref_score'], reverse=True)

    return results[:num_of_results]
|
|
|
|
|
def run():
    """Render the Streamlit page: header, query box, sidebar sliders and results.

    Configures the page, loads the cached data frames, reads the query text
    and the two sidebar slider values, and, when the query is non-empty,
    writes the elapsed search time followed by each match from ``find_ref``.
    """
    page_settings = {
        "page_title": " ืืืคืืฉ ืืงืืจืืช",
        "page_icon": "๐",
        "layout": "wide",
        "initial_sidebar_state": "expanded",
    }
    st.set_page_config(**page_settings)

    # Result discarded here; presumably this warms the cache before the
    # header is drawn — TODO confirm whether the extra call is needed.
    get_dfs()
    st.write("# ืืืคืืฉ ืืงืืจืืช ืืืืฆืขืืช ืืจืืง ืืืื ืฉืืืื")

    titles_df, texts_df = get_dfs()

    user_input = st.text_input(
        'ืืชืื ืืช ืืืงืืจ ืืืืืงืฉ',
        placeholder='ืืื ืงืื ืืฃ ื ืขืืื ื',
    )
    sidebar = st.sidebar
    top_k = sidebar.slider('ืืื ืกืคืจืื ืืกืจืืง top_k:', 0, 20, 10)
    num_of_results = sidebar.slider('ืืกืคืจ ืืชืืฆืืืช ืฉืืจืฆืื ื ืืืฆืื:', 1, 25, 5)

    # Nothing to search for until the user has typed something.
    if user_input == "":
        return

    started = timer()
    matches = find_ref(titles_df, texts_df, user_input, top_k, num_of_results)
    elapsed_ms = 1e3 * (timer() - started)
    st.write(f"finished in {elapsed_ms:.1f} ms")

    for match in matches:
        st.write(match)
|
|
# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run()
|
|