sivan22 committed on
Commit
e710286
1 Parent(s): 2a6c96b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -11
app.py CHANGED
@@ -1,18 +1,18 @@
1
  import streamlit as st
2
  from streamlit.logger import get_logger
3
  import gematriapy
4
-
5
  from timeit import default_timer as timer
6
  import sqlite3
7
- import pandas as pd
8
  import ast
9
-
10
 
11
  LOGGER = get_logger(__name__)
12
 
 
 
 
13
  @st.cache_resource
14
  def get_dfs()->object:
15
- import pandas as pd
16
 
17
  def to_daf_long(i:int)->str:
18
  if i>0 and i<999:
@@ -27,9 +27,11 @@ def get_dfs()->object:
27
  if type(i) == int and i>0 and i<999:
28
  return gematriapy.to_hebrew(i) + ' '
29
  else: return i if type(i)==str else ''
30
-
31
- # //get the books table//
32
  print('hello from get_dfs..')
 
 
 
33
  # Connect to the database
34
  conn = sqlite3.connect('test42.db')
35
 
@@ -54,7 +56,7 @@ def get_dfs()->object:
54
  texts = pd.DataFrame(results)
55
  texts.columns=list(map(lambda x: x[0], cursor.description))
56
 
57
- # get the table that includes the titles, from the MongoDB database - b/c the sqlite just don't have it
58
  # Query the database and retrieve the results
59
  cursor = conn.execute("SELECT * FROM titles")
60
  results = cursor.fetchall()
@@ -82,7 +84,7 @@ def get_dfs()->object:
82
 
83
  def find_ref(titles_df,texts_df,input_text,top_k,num_of_results,algorithm):
84
  from rapidfuzz import fuzz, process as rapidfuzz_process
85
- from rapidfuzz.fuzz import token_ratio,ratio,partial_ratio,token_set_ratio,partial_token_set_ratio,token_sort_ratio,WRatio,QRatio
86
 
87
  print('hello from find_ref..')
88
 
@@ -95,17 +97,17 @@ def find_ref(titles_df,texts_df,input_text,top_k,num_of_results,algorithm):
95
  # search only the references database in case the user set the top_k to 0
96
  if top_k == 0:
97
  refs = texts_df['ref_text_long'].unique()
98
- for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs,scorer=scorer, limit=num_of_results):
99
  results += [{'ref':ref,'ref_score':ref_score}]
100
 
101
  else:
102
  # search first only in the books database (for top_k books)
103
- for book, book_score, _ in rapidfuzz_process.extract(input_text, books, scorer=scorer, limit=top_k):
104
  # get all the references of that book
105
  book_title = list(titles_df.loc[titles_df['he_titles']==book]['title'])[0]
106
  refs = texts_df.loc[texts_df['title']==book_title]['ref_text_long'].unique()
107
  # then search these references and add them all to the results
108
- for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs, limit=10,scorer=scorer):
109
  results += [{'ref':ref,'ref_score':ref_score,'book':book,'book_score':book_score}]
110
  # finally, sort all the references by their own score (and not the book score)
111
  results.sort(key=lambda x: x['ref_score'],reverse=True)
 
1
  import streamlit as st
2
  from streamlit.logger import get_logger
3
  import gematriapy
 
4
  from timeit import default_timer as timer
5
  import sqlite3
 
6
  import ast
7
+ import pandas as pd
8
 
9
  LOGGER = get_logger(__name__)
10
 
11
+ def preprocess(s:str)->str:
12
+ return s.replace('"','').replace('על','').replace('פרק','').replace('פסוק','').replace('דף','').replace('עמוד','').replace('סימן','').replace('סעיף','').replace('חידושי','').replace("'",'')
13
+
14
  @st.cache_resource
15
  def get_dfs()->object:
 
16
 
17
  def to_daf_long(i:int)->str:
18
  if i>0 and i<999:
 
27
  if type(i) == int and i>0 and i<999:
28
  return gematriapy.to_hebrew(i) + ' '
29
  else: return i if type(i)==str else ''
30
+
 
31
  print('hello from get_dfs..')
32
+
33
+ # //get the books table//
34
+
35
  # Connect to the database
36
  conn = sqlite3.connect('test42.db')
37
 
 
56
  texts = pd.DataFrame(results)
57
  texts.columns=list(map(lambda x: x[0], cursor.description))
58
 
59
+ # // get the table that includes the titles//
60
  # Query the database and retrieve the results
61
  cursor = conn.execute("SELECT * FROM titles")
62
  results = cursor.fetchall()
 
84
 
85
  def find_ref(titles_df,texts_df,input_text,top_k,num_of_results,algorithm):
86
  from rapidfuzz import fuzz, process as rapidfuzz_process
87
+ from rapidfuzz.fuzz import token_ratio,ratio,partial_ratio,token_set_ratio,partial_token_set_ratio,token_sort_ratio, WRatio
88
 
89
  print('hello from find_ref..')
90
 
 
97
  # search only the references database in case the user set the top_k to 0
98
  if top_k == 0:
99
  refs = texts_df['ref_text_long'].unique()
100
+ for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs,scorer=scorer, limit=num_of_results,processor=preprocess):
101
  results += [{'ref':ref,'ref_score':ref_score}]
102
 
103
  else:
104
  # search first only in the books database (for top_k books)
105
+ for book, book_score, _ in rapidfuzz_process.extract(input_text, books, scorer=scorer, limit=top_k,processor=preprocess):
106
  # get all the references of that book
107
  book_title = list(titles_df.loc[titles_df['he_titles']==book]['title'])[0]
108
  refs = texts_df.loc[texts_df['title']==book_title]['ref_text_long'].unique()
109
  # then search these references and add them all to the results
110
+ for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs, limit=10,scorer=scorer,processor=preprocess):
111
  results += [{'ref':ref,'ref_score':ref_score,'book':book,'book_score':book_score}]
112
  # finally, sort all the references by their own score (and not the book score)
113
  results.sort(key=lambda x: x['ref_score'],reverse=True)