sivan22
/

sefaria-ref-finder

Model card Files Files and versions Community

sivan22 committed on Jan 11

Commit

618357a

•

1 Parent(s): bce8d3d

Upload folder using huggingface_hub

Browse files

Files changed (5) hide show

.gitattributes +1 -0
README.md +12 -0
app.py +140 -0
requirements.txt +2 -0
test42.db +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+test42.db filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,12 @@

+---
+title: Sefaria Ref Finder
+emoji: 🐨
+colorFrom: gray
+colorTo: gray
+sdk: streamlit
+sdk_version: 1.29.0
+app_file: app.py
+pinned: false
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,140 @@

+import streamlit as st
+from streamlit.logger import get_logger
+import gematriapy
+from timeit import default_timer as timer
+import sqlite3
+import pandas as pd
+import ast
+import pymongo
+LOGGER = get_logger(__name__)
+@st.cache_resource
+def get_dfs()->object:
+    import pandas as pd
+    def to_daf_long(i:int)->str:
+        if i>0 and i<999:
+            i+=1
+            if  i%2 ==0:
+                return gematriapy.to_hebrew(i//2)+' עמוד א '
+            else:
+                return gematriapy.to_hebrew(i//2)+' עמוד ב'
+        return i
+    def gematria(i)->str:
+        if type(i) == int and i>0 and i<999:
+            return gematriapy.to_hebrew(i) + ' '
+        else: return i if type(i)==str else ''
+    # //get the books table//
+    print('hello from get_dfs..')
+    # Connect to the database
+    conn = sqlite3.connect('test42.db')
+    # Query the database and retrieve the results
+    cursor = conn.execute("SELECT * FROM books")
+    results = cursor.fetchall()
+    # Convert the query results into a Pandas DataFrame
+    books = pd.DataFrame(list(results))
+    books.columns=list(map(lambda x: x[0], cursor.description))
+    # convert the array format string "["Section","Section"]"  that came from the database into a real array [Section,Section]
+    books['heSectionNames']=books['heSectionNames'].apply(lambda x: ast.literal_eval(x) if x is not None else [''] )
+    # //get the texts table//
+    # Query the database and retrieve the results
+    cursor = conn.execute("SELECT * FROM texts")
+    results = cursor.fetchall()
+    # Convert the query results into a Pandas DataFrame
+    texts = pd.DataFrame(results)
+    texts.columns=list(map(lambda x: x[0], cursor.description))
+    # get the table that includes the titles, from the MongoDB database - b/c the sqlite just don't have it
+  # Query the database and retrieve the results
+    cursor = conn.execute("SELECT * FROM titles")
+    results = cursor.fetchall()
+    # Convert the query results into a Pandas DataFrame
+    titles = pd.DataFrame(results)
+    titles.columns=list(map(lambda x: x[0], cursor.description))
+    # merge the texts with the original books table (without the extra hebrew titles)
+    merged = pd.merge(texts,books,how='inner',left_on='bid',right_on='_id')
+    #convert the Talmud marks (1,2,3...) into dafs (א עמוד א..)
+    has_dafs = merged.loc[merged['heSectionNames'].apply(lambda x: True if len(x)>1 and x[-2] == 'דף' else False)==True]
+    merged.loc[has_dafs.index,'level2'] = has_dafs['level2'].map(to_daf_long)
+    # create a reference text, for exapmle: רש"י על בראשית פרק א פסוק א
+    merged['ref_text_long']= merged['heTitle'] + ' ' + \
+        merged['heSectionNames'].map(lambda x:x[-4] + ' ' if len(x)>3 else "")  + merged['level4'].map(gematria) + \
+        merged['heSectionNames'].map(lambda x:x[-3] + ' ' if len(x)>2 else "")  + merged['level3'].map(gematria) + \
+        merged['heSectionNames'].map(lambda x:x[-2] + ' ' if len(x)>1 else "")  +  merged['level2'].map(gematria)
+    titles_df = titles
+    texts_df = merged
+    return titles_df, texts_df
+def find_ref(titles_df,texts_df,input_text,top_k,num_of_results):
+    from rapidfuzz import process as rapidfuzz_process
+    print('hello from find_ref..')
+    if not input_text: return
+    results = []
+    books = titles_df['he_titles']
+    input_text = input_text.replace(':','עמוד ב').replace('.','עמוד א')
+    # search only the references database in case the user set the top_k to 0
+    if top_k == 0:
+        refs = texts_df['ref_text_long'].unique()
+        for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs, limit=num_of_results):
+           results += [{'ref':ref,'ref_score':ref_score}]
+    else:
+        # search first only in the books database (for top_k books)
+        for book, book_score, _ in rapidfuzz_process.extract(input_text, books, limit=top_k):
+            # get all the references of that book
+            book_title = list(titles_df.loc[titles_df['he_titles']==book]['title'])[0]
+            refs = texts_df.loc[texts_df['title']==book_title]['ref_text_long'].unique()
+            # then search these references and add them all to the results
+            for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs, limit=10):
+                results += [{'ref':ref,'ref_score':ref_score,'book':book,'book_score':book_score}]
+        # finaly, sort all the references by their own score (and not the book score)
+        results.sort(key=lambda x: x['ref_score'],reverse=True)
+    return results[:num_of_results]
+def run():
+    st.set_page_config(
+        page_title=" חיפוש מקורות",
+        page_icon="📚",
+        layout="wide",
+        initial_sidebar_state="expanded"
+    )
+    get_dfs()
+    st.write("# חיפוש מקורות באמצעות מרחק לוינשטיין")
+    titles_df,texts_df = get_dfs()
+    user_input = st.text_input('כתוב את המקור המבוקש', placeholder='בבא קמא דף ב עמוד ב')
+    top_k =  st.sidebar.slider('כמה ספרים לסרוק top_k:',0,20,10)
+    num_of_results = st.sidebar.slider('מספר התוצאות שברצונך להציג:',1,25,5)
+    if user_input!="":
+        time0 = timer()
+        results = find_ref(titles_df,texts_df,user_input,top_k,num_of_results)
+        time = f"finished in {1e3*(timer()-time0):.1f} ms"
+        st.write(time)
+        for result in results:
+            st.write(result)
+# Start the app when this file is executed as a script.
+if __name__ == "__main__":
+    run()

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ gematriapy
2	+ pandas

test42.db ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:76e5c2fa4efd1ec73ec3babf569b831182849d1ce1e46fdadbd2a6e54aa538c4
+size 2063155200