# Source: app.py (Hugging Face file view; raw / history / blame links omitted)
# Last commit: "Update app.py" by Mohamed-Maher, 39ccf9b (verified)
# File size: 2.2 kB
import os
import re
import pickle
import numpy as np
import pandas as pd
import nltk
import gradio as gr
from sklearn.metrics.pairwise import cosine_similarity
class HadithClassificationApp:
    """Look up a Hadith grade by nearest-neighbour cosine similarity.

    Input text is normalised (diacritics stripped, tokenised, lower-cased),
    vectorised with a pre-fitted vectorizer loaded from disk, and compared
    against a pre-computed matrix loaded from disk. The ``Arabic_Grade`` value
    of the best-matching dataset row is returned when its similarity reaches
    the requested threshold.

    NOTE(review): the pickled matrix is presumably the vectorised dataset with
    one row per CSV row, in the same order -- the code indexes ``labels`` with
    the argmax over that matrix, so confirm the two stay aligned.
    """

    # Arabic diacritic (tashkeel) code points removed before tokenisation.
    # Compiled once here instead of on every remove_tashkeel() call.
    _TASHKEEL_PATTERN = re.compile(r'[\u0617-\u061A\u064B-\u0652]')

    # Returned whenever no dataset row is similar enough to the input.
    _NOT_FOUND_MESSAGE = "No similar text found in dataset"

    # word_tokenize() needs Punkt data: older NLTK releases load "punkt",
    # newer ones load "punkt_tab". Requesting both keeps either working.
    _NLTK_RESOURCES = ("punkt", "punkt_tab")

    def __init__(
        self,
        dataset_path="Preprocess_LK_Hadith_dataset.csv",
        vectorizer_path="tfidf_vectorizer.pkl",
        matrix_path="cosine_similarity_model.pkl",
    ):
        """Load the tokenizer data, the labelled dataset and the pickled artifacts.

        Args:
            dataset_path: CSV file that must contain an ``Arabic_Grade`` column.
            vectorizer_path: Pickle of the fitted vectorizer (an object with a
                ``transform`` method).
            matrix_path: Pickle of the matrix the input vector is compared to.

        Raises:
            FileNotFoundError: If any of the three files is missing.
            KeyError: If the CSV has no ``Arabic_Grade`` column.
        """
        # nltk.download() reports failures by printing and returning False,
        # so an unavailable resource does not abort start-up here.
        for resource in self._NLTK_RESOURCES:
            nltk.download(resource)

        # Load the dataset and labels
        self.dataset = pd.read_csv(dataset_path)
        self.labels = self.dataset['Arabic_Grade']

        # NOTE(security): pickle.load() can execute arbitrary code embedded in
        # the file. Only point these paths at artifacts you produced yourself.
        with open(vectorizer_path, "rb") as f:
            self.vectorizer = pickle.load(f)
        with open(matrix_path, "rb") as f:
            self.X = pickle.load(f)

    @staticmethod
    def remove_tashkeel(text):
        """Return *text* with Arabic diacritic marks removed.

        Args:
            text: The string to clean.

        Returns:
            The string without code points U+0617-U+061A and U+064B-U+0652.
        """
        return HadithClassificationApp._TASHKEEL_PATTERN.sub('', text)

    def preprocess_arabic_text(self, text):
        """Normalise *text* into a space-separated string of clean tokens.

        Diacritics are stripped, the text is tokenised with NLTK, tokens that
        are not purely alphanumeric (punctuation, symbols) are dropped, and
        the remaining tokens are lower-cased.

        Args:
            text: Raw input string.

        Returns:
            The cleaned tokens joined by single spaces; may be empty.
        """
        text = self.remove_tashkeel(text)
        tokens = nltk.word_tokenize(text)
        return " ".join(token.lower() for token in tokens if token.isalnum())

    def predict_label(self, input_text, threshold=0.5):
        """Return the grade of the dataset row most similar to *input_text*.

        Args:
            input_text: Text to classify. ``None``, empty and whitespace-only
                values are treated as "nothing to match".
            threshold: Minimum cosine similarity the best match must reach.

        Returns:
            The ``Arabic_Grade`` value of the best-matching row, or the
            "No similar text found in dataset" message when there is no usable
            input or the best similarity is below *threshold*.
        """
        # Guard first: None would crash the regex, and blank text can never
        # match anything, so skip tokenising and vectorising entirely.
        if input_text is None or not input_text.strip():
            return self._NOT_FOUND_MESSAGE

        processed = self.preprocess_arabic_text(input_text)
        if not processed:
            # Input consisted only of punctuation / diacritics.
            return self._NOT_FOUND_MESSAGE

        input_vector = self.vectorizer.transform([processed])
        similarities = cosine_similarity(input_vector, self.X).flatten()
        if similarities.size == 0:
            # Nothing to compare against; argmax would raise on an empty array.
            return self._NOT_FOUND_MESSAGE

        max_index = np.argmax(similarities)
        if similarities[max_index] >= threshold:
            return self.labels.iloc[max_index]
        return self._NOT_FOUND_MESSAGE

    def classify_hadith(self, input_text):
        """Classify *input_text* using the default similarity threshold.

        Thin single-argument wrapper around :meth:`predict_label`, suitable as
        a UI callback.
        """
        return self.predict_label(input_text)
if __name__ == "__main__":
    # Build the classifier once up front; its constructor reads the dataset
    # and the pickled artifacts from disk.
    hadith_classification_app = HadithClassificationApp()

    # Plain text-in / text-out UI wired to the classifier's entry point.
    interface_options = {
        "fn": hadith_classification_app.classify_hadith,
        "inputs": "text",
        "outputs": "text",
        "title": "Hadith Classification App",
        "description": "Classify Hadith text based on pre-trained model.",
    }
    iface = gr.Interface(**interface_options)

    # Start the Gradio server.
    iface.launch()