File size: 2,200 Bytes
39ccf9b
18feff1
 
 
 
39ccf9b
 
18feff1
 
39ccf9b
 
 
 
 
 
 
 
 
 
 
 
 
 
18feff1
39ccf9b
 
 
 
18feff1
39ccf9b
 
 
 
 
 
18feff1
39ccf9b
 
 
 
18feff1
39ccf9b
 
18feff1
39ccf9b
 
 
 
18feff1
39ccf9b
 
18feff1
39ccf9b
 
 
18feff1
39ccf9b
 
 
 
 
 
 
 
18feff1
39ccf9b
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import os
import re
import pickle
import numpy as np
import pandas as pd
import nltk
import gradio as gr
from sklearn.metrics.pairwise import cosine_similarity

class HadithClassificationApp:
    """Look up the grade of a Hadith text by nearest-neighbour similarity.

    On construction the app reads a CSV dataset, a pickled vectorizer and a
    pickled reference matrix from the current working directory.  A query is
    normalised, vectorised and compared by cosine similarity against the
    reference matrix; the ``Arabic_Grade`` value at the best-matching
    position is returned when the similarity reaches a threshold.
    """

    # Arabic diacritics (tashkeel): U+0617-U+061A and U+064B-U+0652.
    # Compiled once here instead of on every call.
    _TASHKEEL_RE = re.compile(r'[\u0617-\u061A\u064B-\u0652]')

    # Returned by ``predict_label`` when no reference text is close enough.
    _NO_MATCH_MESSAGE = "No similar text found in dataset"

    def __init__(self):
        """Download tokenizer data and load the dataset and pickled models."""
        # ``punkt`` serves older NLTK releases; newer releases load the
        # tokenizer data used by ``word_tokenize`` from ``punkt_tab``, so
        # fetch both to keep tokenization working across versions.
        for resource in ("punkt", "punkt_tab"):
            nltk.download(resource)

        # Load the dataset and labels
        self.dataset = pd.read_csv("Preprocess_LK_Hadith_dataset.csv")
        self.labels = self.dataset['Arabic_Grade']

        # Load the models.
        # NOTE(security): unpickling executes arbitrary code; these files
        # must come from a trusted source.
        with open("tfidf_vectorizer.pkl", "rb") as f:
            self.vectorizer = pickle.load(f)
        # Presumably one row per dataset row, in the same order as
        # ``self.labels`` -- TODO confirm against the training script.
        with open("cosine_similarity_model.pkl", "rb") as f:
            self.X = pickle.load(f)

    @staticmethod
    def remove_tashkeel(text):
        """Return ``text`` with Arabic diacritic marks (tashkeel) removed."""
        return HadithClassificationApp._TASHKEEL_RE.sub('', text)

    def preprocess_arabic_text(self, text):
        """Normalise ``text`` into a space-separated string of tokens.

        Diacritics are stripped, the text is tokenized with NLTK, tokens
        that are not purely alphanumeric are dropped, and the remainder is
        lower-cased.

        Args:
            text: Raw input text.  ``None`` is tolerated and treated as
                empty input.

        Returns:
            The cleaned text, or ``""`` when nothing usable remains.
        """
        if text is None:
            return ""
        text = self.remove_tashkeel(text)
        # Blank input yields no tokens anyway; skip the tokenizer call.
        if not text.strip():
            return ""
        tokens = nltk.word_tokenize(text)
        return " ".join(token.lower() for token in tokens if token.isalnum())

    def predict_label(self, input_text, threshold=0.5):
        """Return the label of the most similar reference text.

        Args:
            input_text: Raw query text.
            threshold: Minimum cosine similarity required to accept the
                best match.

        Returns:
            The ``Arabic_Grade`` entry at the best-matching position, or
            the message ``"No similar text found in dataset"`` when the
            best similarity is below ``threshold``.
        """
        query = self.preprocess_arabic_text(input_text)
        query_vector = self.vectorizer.transform([query])
        similarities = cosine_similarity(query_vector, self.X).flatten()

        best_index = int(np.argmax(similarities))
        if similarities[best_index] >= threshold:
            return self.labels.iloc[best_index]
        return self._NO_MATCH_MESSAGE

    def classify_hadith(self, input_text):
        """Classify ``input_text`` using the default similarity threshold."""
        return self.predict_label(input_text)

if __name__ == "__main__":
    # Build the classifier once; this loads the dataset and pickled models.
    app = HadithClassificationApp()

    # Plain text in, plain text out: the grade label or the no-match message.
    demo = gr.Interface(
        title="Hadith Classification App",
        description="Classify Hadith text based on pre-trained model.",
        inputs="text",
        outputs="text",
        fn=app.classify_hadith,
    )

    # Start the Gradio web server.
    demo.launch()