sivan22 committed on
Commit
618357a
•
1 Parent(s): bce8d3d

Upload folder using huggingface_hub

Browse files
Files changed (5) hide show
  1. .gitattributes +1 -0
  2. README.md +12 -0
  3. app.py +140 -0
  4. requirements.txt +2 -0
  5. test42.db +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ test42.db filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Sefaria Ref Finder
3
+ emoji: ๐Ÿจ
4
+ colorFrom: gray
5
+ colorTo: gray
6
+ sdk: streamlit
7
+ sdk_version: 1.29.0
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from streamlit.logger import get_logger
3
+ import gematriapy
4
+
5
+ from timeit import default_timer as timer
6
+ import sqlite3
7
+ import pandas as pd
8
+ import ast
9
+ import pymongo
10
+
11
+
12
+
13
+ LOGGER = get_logger(__name__)
14
+
15
@st.cache_resource
def get_dfs()->object:
    """Load the lookup tables from ``test42.db`` and build reference strings.

    Reads the ``books``, ``texts`` and ``titles`` tables from the sqlite
    file, inner-joins ``texts`` with ``books`` (``texts.bid == books._id``)
    and adds a ``ref_text_long`` column holding a human-readable Hebrew
    reference for every text row.  The result is cached by Streamlit via
    ``st.cache_resource``.

    Returns:
        A ``(titles_df, texts_df)`` tuple of DataFrames: the raw ``titles``
        table, and the merged texts/books table including ``ref_text_long``.
    """

    def to_daf_long(i:int)->str:
        """Turn a numeric page index into '<daf> עמוד א/ב' wording.

        Values outside the open range (0, 999) are returned unchanged.
        """
        if 0 < i < 999:
            i += 1
            if i % 2 == 0:
                # NOTE: the trailing space is part of the original literal.
                return gematriapy.to_hebrew(i//2)+' עמוד א '
            return gematriapy.to_hebrew(i//2)+' עמוד ב'
        return i

    def gematria(i)->str:
        """Render an int in (0, 999) as Hebrew numerals followed by a space.

        Strings pass through unchanged; anything else becomes ''.
        """
        if isinstance(i, int) and 0 < i < 999:
            return gematriapy.to_hebrew(i) + ' '
        return i if isinstance(i, str) else ''

    def read_table(conn, query):
        """Run *query* and return the rows as a DataFrame with named columns."""
        cursor = conn.execute(query)
        rows = cursor.fetchall()
        # Passing ``columns=`` keeps this working when the table is empty,
        # where assigning ``.columns`` afterwards raises a length mismatch.
        return pd.DataFrame(rows, columns=[col[0] for col in cursor.description])

    print('hello from get_dfs..')

    conn = sqlite3.connect('test42.db')
    try:
        books = read_table(conn, "SELECT * FROM books")
        texts = read_table(conn, "SELECT * FROM texts")
        # The titles table is read from the same sqlite file as the others.
        titles = read_table(conn, "SELECT * FROM titles")
    finally:
        # The DataFrames are fully materialised, so release the connection.
        conn.close()

    # heSectionNames is stored as a list literal in string form, e.g.
    # '["Section","Section"]'; parse it into a real list ([''] when NULL).
    books['heSectionNames'] = books['heSectionNames'].apply(
        lambda x: ast.literal_eval(x) if x is not None else ['']
    )

    # Merge the texts with the books table (without the extra Hebrew titles).
    merged = pd.merge(texts,books,how='inner',left_on='bid',right_on='_id')

    # Rows whose second-to-last section name is 'דף' get their numeric
    # level2 replaced by daf wording (א עמוד א, ...).
    is_daf = merged['heSectionNames'].map(
        lambda names: len(names) > 1 and names[-2] == 'דף'
    ).astype(bool)
    # Cast first: writing strings into an integer column relies on implicit
    # upcasting, which recent pandas versions deprecate.
    merged['level2'] = merged['level2'].astype(object)
    merged.loc[is_daf, 'level2'] = merged.loc[is_daf, 'level2'].map(to_daf_long)

    def section_label(depth):
        """Series of '<section name> ' taken *depth* from the end, or ''."""
        return merged['heSectionNames'].map(
            lambda names: names[-depth] + ' ' if len(names) >= depth else ""
        )

    # Build the reference text, for example: רש"י על בראשית פרק א פסוק א
    merged['ref_text_long'] = (
        merged['heTitle'] + ' '
        + section_label(4) + merged['level4'].map(gematria)
        + section_label(3) + merged['level3'].map(gematria)
        + section_label(2) + merged['level2'].map(gematria)
    )

    return titles, merged
83
+
84
+
85
def find_ref(titles_df,texts_df,input_text,top_k,num_of_results):
    """Fuzzy-match *input_text* against the known reference strings.

    Args:
        titles_df: DataFrame with ``he_titles`` and ``title`` columns.
        texts_df: DataFrame with ``title`` and ``ref_text_long`` columns.
        input_text: The reference typed by the user.
        top_k: Number of best-matching books to scan.  ``0`` skips the book
            stage and searches every reference directly.
        num_of_results: Maximum number of results to return.

    Returns:
        ``None`` when *input_text* is empty; otherwise a list of dicts sorted
        by descending ``ref_score``.  Each dict has ``ref`` and ``ref_score``,
        plus ``book`` and ``book_score`` when ``top_k`` is non-zero.
    """
    print('hello from find_ref..')
    if not input_text:
        return None

    # Imported lazily, after the guard, so empty input never pays for it.
    from rapidfuzz import process as rapidfuzz_process

    # Expand ':' and '.' into the 'עמוד ב' / 'עמוד א' wording before matching.
    input_text = input_text.replace(':','עמוד ב').replace('.','עמוד א')

    # Search only the references when the user sets top_k to 0.
    if top_k == 0:
        refs = texts_df['ref_text_long'].unique()
        matches = rapidfuzz_process.extract(input_text, refs, limit=num_of_results)
        return [{'ref': ref, 'ref_score': ref_score} for ref, ref_score, _ in matches]

    results = []
    books = titles_df['he_titles']
    # First find the top_k best-matching books ...
    for book, book_score, _ in rapidfuzz_process.extract(input_text, books, limit=top_k):
        book_title = titles_df.loc[titles_df['he_titles'] == book, 'title'].iloc[0]
        refs = texts_df.loc[texts_df['title'] == book_title, 'ref_text_long'].unique()
        # ... then match inside each book.  Taking num_of_results per book
        # (instead of a fixed 10) lets the final list reach the requested
        # length even when only a few books are scanned.
        for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs, limit=num_of_results):
            results.append(
                {'ref': ref, 'ref_score': ref_score, 'book': book, 'book_score': book_score}
            )

    # Finally, rank by the reference score (not the book score).
    results.sort(key=lambda x: x['ref_score'], reverse=True)
    return results[:num_of_results]
113
+
114
+
115
def run():
    """Render the Streamlit page: a search box, two sidebar sliders, results.

    Loads the cached DataFrames once, reads the query and the ``top_k`` /
    ``num_of_results`` sliders, and for a non-empty query writes the elapsed
    search time followed by each result dict.
    """
    st.set_page_config(
        page_title=" חיפוש מקורות",
        page_icon="📚",
        layout="wide",
        initial_sidebar_state="expanded"
    )
    # Single cached load, done before the title is written.
    titles_df, texts_df = get_dfs()
    st.write("# חיפוש מקורות באמצעות מרחק לוינשטיין")

    user_input = st.text_input('כתוב את המקור המבוקש', placeholder='בבא קמא דף ב עמוד ב')
    top_k = st.sidebar.slider('כמה ספרים לסרוק top_k:',0,20,10)
    num_of_results = st.sidebar.slider('מספר התוצאות שברצונך להציג:',1,25,5)

    if user_input != "":
        started = timer()
        results = find_ref(titles_df,texts_df,user_input,top_k,num_of_results)
        elapsed_msg = f"finished in {1e3*(timer()-started):.1f} ms"
        st.write(elapsed_msg)
        # ``or []`` covers a None return from find_ref.
        for result in results or []:
            st.write(result)


if __name__ == "__main__":
    run()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ gematriapy
2
+ pandas
test42.db ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76e5c2fa4efd1ec73ec3babf569b831182849d1ce1e46fdadbd2a6e54aa538c4
3
+ size 2063155200