Spaces:

sivan22
/

sefaria-ref-finder

Sleeping

App Files Files Community

sivan22 committed on Jan 12

Commit

e710286

•

1 Parent(s): 2a6c96b

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -11

app.py CHANGED Viewed

@@ -1,18 +1,18 @@
 import streamlit as st
 from streamlit.logger import get_logger
 import gematriapy
 from timeit import default_timer as timer
 import sqlite3
-import pandas as pd
 import ast
 LOGGER = get_logger(__name__)
 @st.cache_resource
 def get_dfs()->object:
-    import pandas as pd
     def to_daf_long(i:int)->str:
         if i>0 and i<999:
@@ -27,9 +27,11 @@ def get_dfs()->object:
         if type(i) == int and i>0 and i<999:
             return gematriapy.to_hebrew(i) + ' '
         else: return i if type(i)==str else ''
-    # //get the books table//
     print('hello from get_dfs..')
     # Connect to the database
     conn = sqlite3.connect('test42.db')
@@ -54,7 +56,7 @@ def get_dfs()->object:
     texts = pd.DataFrame(results)
     texts.columns=list(map(lambda x: x[0], cursor.description))
-    # get the table that includes the titles, from the MongoDB database - b/c the sqlite just don't have it
   # Query the database and retrieve the results
     cursor = conn.execute("SELECT * FROM titles")
     results = cursor.fetchall()
@@ -82,7 +84,7 @@ def get_dfs()->object:
 def find_ref(titles_df,texts_df,input_text,top_k,num_of_results,algorithm):
     from rapidfuzz import fuzz, process as rapidfuzz_process
-    from rapidfuzz.fuzz import token_ratio,ratio,partial_ratio,token_set_ratio,partial_token_set_ratio,token_sort_ratio,WRatio,QRatio
     print('hello from find_ref..')
@@ -95,17 +97,17 @@ def find_ref(titles_df,texts_df,input_text,top_k,num_of_results,algorithm):
     # search only the references database in case the user set the top_k to 0
     if top_k == 0:
         refs = texts_df['ref_text_long'].unique()
-        for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs,scorer=scorer, limit=num_of_results):
            results += [{'ref':ref,'ref_score':ref_score}]
     else:
         # search first only in the books database (for top_k books)
-        for book, book_score, _ in rapidfuzz_process.extract(input_text, books, scorer=scorer, limit=top_k):
             # get all the references of that book
             book_title = list(titles_df.loc[titles_df['he_titles']==book]['title'])[0]
             refs = texts_df.loc[texts_df['title']==book_title]['ref_text_long'].unique()
             # then search these references and add them all to the results
-            for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs, limit=10,scorer=scorer):
                 results += [{'ref':ref,'ref_score':ref_score,'book':book,'book_score':book_score}]
         # finaly, sort all the references by their own score (and not the book score)
         results.sort(key=lambda x: x['ref_score'],reverse=True)

 import streamlit as st
 from streamlit.logger import get_logger
 import gematriapy
 from timeit import default_timer as timer
 import sqlite3
 import ast
+import pandas as pd
 LOGGER = get_logger(__name__)
+def preprocess(s:str)->str:
+    return s.replace('"','').replace('על','').replace('פרק','').replace('פסוק','').replace('דף','').replace('עמוד','').replace('סימן','').replace('סעיף','').replace('חידושי','').replace("'",'')
 @st.cache_resource
 def get_dfs()->object:
     def to_daf_long(i:int)->str:
         if i>0 and i<999:
         if type(i) == int and i>0 and i<999:
             return gematriapy.to_hebrew(i) + ' '
         else: return i if type(i)==str else ''
     print('hello from get_dfs..')
+    # //get the books table//
     # Connect to the database
     conn = sqlite3.connect('test42.db')
     texts = pd.DataFrame(results)
     texts.columns=list(map(lambda x: x[0], cursor.description))
+    # // get the table that includes the titles//
   # Query the database and retrieve the results
     cursor = conn.execute("SELECT * FROM titles")
     results = cursor.fetchall()
 def find_ref(titles_df,texts_df,input_text,top_k,num_of_results,algorithm):
     from rapidfuzz import fuzz, process as rapidfuzz_process
+    from rapidfuzz.fuzz import token_ratio,ratio,partial_ratio,token_set_ratio,partial_token_set_ratio,token_sort_ratio, WRatio
     print('hello from find_ref..')
     # search only the references database in case the user set the top_k to 0
     if top_k == 0:
         refs = texts_df['ref_text_long'].unique()
+        for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs,scorer=scorer, limit=num_of_results,processor=preprocess):
            results += [{'ref':ref,'ref_score':ref_score}]
     else:
         # search first only in the books database (for top_k books)
+        for book, book_score, _ in rapidfuzz_process.extract(input_text, books, scorer=scorer, limit=top_k,processor=preprocess):
             # get all the references of that book
             book_title = list(titles_df.loc[titles_df['he_titles']==book]['title'])[0]
             refs = texts_df.loc[texts_df['title']==book_title]['ref_text_long'].unique()
             # then search these references and add them all to the results
+            for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs, limit=10,scorer=scorer,processor=preprocess):
                 results += [{'ref':ref,'ref_score':ref_score,'book':book,'book_score':book_score}]
         # finaly, sort all the references by their own score (and not the book score)
         results.sort(key=lambda x: x['ref_score'],reverse=True)