Mohamed-Maher committed on
Commit
18feff1
1 Parent(s): f2a320a

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -0
app.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import nltk
3
+ import pickle
4
+ import numpy as np
5
+ import pandas as pd
6
+ import streamlit as st
7
+ from datasets import load_dataset
8
+ from sklearn.metrics.pairwise import cosine_similarity
9
+
10
# Tokenizer data needed by nltk.word_tokenize. NLTK 3.9+ looks up the
# 'punkt_tab' resource instead of the older pickled 'punkt' package, so both
# are fetched to keep tokenization working across NLTK versions.
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource, quiet=True)

# Preprocessed hadith corpus, read relative to the working directory.
dataset = pd.read_csv("Preprocess_LK_Hadith_dataset.csv")

# Grade label of every corpus row; predict_label indexes it by row position.
labels = dataset['Arabic_Grade']
15
+
16
+ # Helper functions
17
# Code points stripped as tashkeel: U+0617-U+061A and U+064B-U+0652.
# Compiled once at import time rather than on every call.
_TASHKEEL_PATTERN = re.compile(r'[\u0617-\u061A\u064B-\u0652]')


def remove_tashkeel(text):
    """Return *text* with Arabic diacritics (tashkeel) removed.

    Only characters in U+0617-U+061A and U+064B-U+0652 are deleted; every
    other character is returned unchanged.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without the matched diacritic characters.
    """
    return _TASHKEEL_PATTERN.sub('', text)
20
+
21
def preprocess_arabic_text(text):
    """Normalize *text* into a space-separated string of cleaned tokens.

    Diacritics are stripped with ``remove_tashkeel``, the result is split
    with ``nltk.word_tokenize``, tokens that are not purely alphanumeric are
    dropped, and the remaining tokens are lowercased and joined by spaces.
    """
    stripped = remove_tashkeel(text)
    words = (
        word.lower()
        for word in nltk.word_tokenize(stripped)
        if word.isalnum()
    )
    return " ".join(words)
27
+
28
+ # Function to predict label
29
@st.cache_resource
def _load_similarity_artifacts():
    """Load the pickled vectorizer and corpus matrix from disk once.

    Streamlit re-executes the script on every interaction; caching the
    loaded objects avoids unpickling both files for each prediction.

    Returns:
        A ``(vectorizer, corpus_matrix)`` tuple. Judging by the file names
        these are presumably a fitted TF-IDF vectorizer and the vectorized
        corpus -- confirm against the training code.
    """
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # these artifacts must only ever come from a trusted source.
    with open("tfidf_vectorizer.pkl", "rb") as f:
        vectorizer = pickle.load(f)
    with open("cosine_similarity_model.pkl", "rb") as f:
        corpus_matrix = pickle.load(f)
    return vectorizer, corpus_matrix


def predict_label(input_text, threshold=0.5):
    """Return the grade label of the corpus entry most similar to the input.

    Args:
        input_text: Raw text to classify; it is normalized with
            ``preprocess_arabic_text`` before vectorization.
        threshold: Minimum cosine similarity the best match must reach.

    Returns:
        ``labels.iloc[i]`` for the row ``i`` with the highest cosine
        similarity when that similarity is at least ``threshold``;
        otherwise the string "No similar text found in dataset".
    """
    query = preprocess_arabic_text(input_text)
    if not query:
        # Nothing survived preprocessing, so there is nothing to compare.
        return "No similar text found in dataset"

    vectorizer, corpus_matrix = _load_similarity_artifacts()
    query_vector = vectorizer.transform([query])
    similarities = cosine_similarity(query_vector, corpus_matrix).flatten()

    best_index = np.argmax(similarities)
    if similarities[best_index] >= threshold:
        # assumes rows of the pickled matrix line up with the rows of the
        # CSV behind `labels` -- TODO confirm
        return labels.iloc[best_index]
    return "No similar text found in dataset"
46
+
47
# Free-text input for the hadith to classify. A slider only yields numbers,
# which cannot be tokenized as text.
x = st.text_area('Enter Hadith')
if x.strip():
    # Call predict_label on the entered text; passing the bare function to
    # st.write would render the function object instead of a prediction.
    st.write(x, 'Hadith Classification', predict_label(x))