Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,18 +1,18 @@
|
|
1 |
import streamlit as st
|
2 |
from streamlit.logger import get_logger
|
3 |
import gematriapy
|
4 |
-
|
5 |
from timeit import default_timer as timer
|
6 |
import sqlite3
|
7 |
-
import pandas as pd
|
8 |
import ast
|
9 |
-
|
10 |
|
11 |
LOGGER = get_logger(__name__)
|
12 |
|
|
|
|
|
|
|
13 |
@st.cache_resource
|
14 |
def get_dfs()->object:
|
15 |
-
import pandas as pd
|
16 |
|
17 |
def to_daf_long(i:int)->str:
|
18 |
if i>0 and i<999:
|
@@ -27,9 +27,11 @@ def get_dfs()->object:
|
|
27 |
if type(i) == int and i>0 and i<999:
|
28 |
return gematriapy.to_hebrew(i) + ' '
|
29 |
else: return i if type(i)==str else ''
|
30 |
-
|
31 |
-
# //get the books table//
|
32 |
print('hello from get_dfs..')
|
|
|
|
|
|
|
33 |
# Connect to the database
|
34 |
conn = sqlite3.connect('test42.db')
|
35 |
|
@@ -54,7 +56,7 @@ def get_dfs()->object:
|
|
54 |
texts = pd.DataFrame(results)
|
55 |
texts.columns=list(map(lambda x: x[0], cursor.description))
|
56 |
|
57 |
-
# get the table that includes the titles
|
58 |
# Query the database and retrieve the results
|
59 |
cursor = conn.execute("SELECT * FROM titles")
|
60 |
results = cursor.fetchall()
|
@@ -82,7 +84,7 @@ def get_dfs()->object:
|
|
82 |
|
83 |
def find_ref(titles_df,texts_df,input_text,top_k,num_of_results,algorithm):
|
84 |
from rapidfuzz import fuzz, process as rapidfuzz_process
|
85 |
-
from rapidfuzz.fuzz import token_ratio,ratio,partial_ratio,token_set_ratio,partial_token_set_ratio,token_sort_ratio,WRatio
|
86 |
|
87 |
print('hello from find_ref..')
|
88 |
|
@@ -95,17 +97,17 @@ def find_ref(titles_df,texts_df,input_text,top_k,num_of_results,algorithm):
|
|
95 |
# search only the references database in case the user set the top_k to 0
|
96 |
if top_k == 0:
|
97 |
refs = texts_df['ref_text_long'].unique()
|
98 |
-
for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs,scorer=scorer, limit=num_of_results):
|
99 |
results += [{'ref':ref,'ref_score':ref_score}]
|
100 |
|
101 |
else:
|
102 |
# search first only in the books database (for top_k books)
|
103 |
-
for book, book_score, _ in rapidfuzz_process.extract(input_text, books, scorer=scorer, limit=top_k):
|
104 |
# get all the references of that book
|
105 |
book_title = list(titles_df.loc[titles_df['he_titles']==book]['title'])[0]
|
106 |
refs = texts_df.loc[texts_df['title']==book_title]['ref_text_long'].unique()
|
107 |
# then search these references and add them all to the results
|
108 |
-
for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs, limit=10,scorer=scorer):
|
109 |
results += [{'ref':ref,'ref_score':ref_score,'book':book,'book_score':book_score}]
|
110 |
# finally, sort all the references by their own score (and not the book score)
|
111 |
results.sort(key=lambda x: x['ref_score'],reverse=True)
|
|
|
1 |
import streamlit as st
|
2 |
from streamlit.logger import get_logger
|
3 |
import gematriapy
|
|
|
4 |
from timeit import default_timer as timer
|
5 |
import sqlite3
|
|
|
6 |
import ast
|
7 |
+
import pandas as pd
|
8 |
|
9 |
LOGGER = get_logger(__name__)
|
10 |
|
11 |
+
def preprocess(s:str)->str:
    """Return ``s`` with quote characters and a fixed set of Hebrew fragments removed.

    Every occurrence of each fragment is deleted, including occurrences
    inside longer words; all other characters (whitespace included) are
    left as they are.
    """
    # Removal is sequential, so the order is significant: deleting the quote
    # characters first can expose a fragment that a later step then removes.
    fragments = (
        '"',
        'על',
        'פרק',
        'פסוק',
        'דף',
        'עמוד',
        'סימן',
        'סעיף',
        'חידושי',
        "'",
    )
    cleaned = s
    for fragment in fragments:
        cleaned = cleaned.replace(fragment, '')
    return cleaned
|
13 |
+
|
14 |
@st.cache_resource
|
15 |
def get_dfs()->object:
|
|
|
16 |
|
17 |
def to_daf_long(i:int)->str:
|
18 |
if i>0 and i<999:
|
|
|
27 |
if type(i) == int and i>0 and i<999:
|
28 |
return gematriapy.to_hebrew(i) + ' '
|
29 |
else: return i if type(i)==str else ''
|
30 |
+
|
|
|
31 |
print('hello from get_dfs..')
|
32 |
+
|
33 |
+
# //get the books table//
|
34 |
+
|
35 |
# Connect to the database
|
36 |
conn = sqlite3.connect('test42.db')
|
37 |
|
|
|
56 |
texts = pd.DataFrame(results)
|
57 |
texts.columns=list(map(lambda x: x[0], cursor.description))
|
58 |
|
59 |
+
# // get the table that includes the titles//
|
60 |
# Query the database and retrieve the results
|
61 |
cursor = conn.execute("SELECT * FROM titles")
|
62 |
results = cursor.fetchall()
|
|
|
84 |
|
85 |
def find_ref(titles_df,texts_df,input_text,top_k,num_of_results,algorithm):
|
86 |
from rapidfuzz import fuzz, process as rapidfuzz_process
|
87 |
+
from rapidfuzz.fuzz import token_ratio,ratio,partial_ratio,token_set_ratio,partial_token_set_ratio,token_sort_ratio, WRatio
|
88 |
|
89 |
print('hello from find_ref..')
|
90 |
|
|
|
97 |
# search only the references database in case the user set the top_k to 0
|
98 |
if top_k == 0:
|
99 |
refs = texts_df['ref_text_long'].unique()
|
100 |
+
for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs,scorer=scorer, limit=num_of_results,processor=preprocess):
|
101 |
results += [{'ref':ref,'ref_score':ref_score}]
|
102 |
|
103 |
else:
|
104 |
# search first only in the books database (for top_k books)
|
105 |
+
for book, book_score, _ in rapidfuzz_process.extract(input_text, books, scorer=scorer, limit=top_k,processor=preprocess):
|
106 |
# get all the references of that book
|
107 |
book_title = list(titles_df.loc[titles_df['he_titles']==book]['title'])[0]
|
108 |
refs = texts_df.loc[texts_df['title']==book_title]['ref_text_long'].unique()
|
109 |
# then search these references and add them all to the results
|
110 |
+
for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs, limit=10,scorer=scorer,processor=preprocess):
|
111 |
results += [{'ref':ref,'ref_score':ref_score,'book':book,'book_score':book_score}]
|
112 |
# finally, sort all the references by their own score (and not the book score)
|
113 |
results.sort(key=lambda x: x['ref_score'],reverse=True)
|