Obai33 committed on
Commit
dcbadce
1 Parent(s): 2c79d96

initial commit

Browse files
Files changed (1) hide show
  1. app.py +134 -0
app.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """ArabicPoetryGeneration.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1HDyT5F8qnrbR_PW_HYpiM3O-7i6htGG2
8
+ """
9
+
10
# Runtime dependencies.
# The Colab shell magics (`!pip install ...`) are not Python syntax and make
# this module fail with a SyntaxError when it is executed as a plain script,
# so they are recorded here as a comment only. Install the packages before
# running the app (for example through a requirements.txt):
#   pip install transformers tashaphyne gradio translate
14
+
15
+ import pandas as pd
16
+ import nltk
17
+ from nltk.tokenize import word_tokenize
18
+ from transformers import BertTokenizer
19
+ from transformers import AutoTokenizer
20
+ import random
21
+ from tashaphyne import normalize
22
+ import re
23
+ import numpy as np
24
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
25
+ from tensorflow.keras.models import Sequential
26
+ from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, GRU
27
+ import tensorflow as tf
28
+ from transformers import AutoTokenizer
29
+
30
# Fetch the NLTK resources requested by the notebook.
for _resource in ('punkt', 'wordnet'):
    nltk.download(_resource)

# Read the Arabic poems CSV directly from its raw GitHub URL.
aurl = 'https://raw.githubusercontent.com/Obai33/NLP_PoemGenerationDatasets/main/arabicpoems.csv'
adf = pd.read_csv(aurl)
35
+
36
+ # Function to normalize text
37
def normalize_text(text):
    """Return ``text`` after tashaphyne's orthographic normalization steps.

    Applies, in order, ``strip_tashkeel``, ``strip_tatweel``,
    ``normalize_hamza`` and ``normalize_lamalef`` from ``tashaphyne.normalize``.

    Python strings are immutable, so each helper can only hand back a new
    string; the result of every step is therefore assigned and fed into the
    next one. (Discarding the return values, as the code did before, returned
    the input unchanged.)

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    text = normalize.strip_tashkeel(text)
    text = normalize.strip_tatweel(text)
    text = normalize.normalize_hamza(text)
    text = normalize.normalize_lamalef(text)
    return text
43
+
44
# Normalize the filter word with the same function as the corpus so the
# substring match below compares like with like.
allah = normalize_text('الله')

# Keep only the poem text column, draw 100 poems using a randomly chosen
# seed, and normalize each drawn poem.
adf = adf['poem_text']
i = random.randint(0, len(adf))
adf = adf.sample(n=100, random_state=i).apply(normalize_text)

# Drop every poem that contains the normalized filter word.
adf = adf[~adf.str.contains(allah)]
51
+
52
+ # Function to clean text
53
# Matches runs of characters from the Unicode range U+0600-U+06FF or
# whitespace. Compiled once because the function is applied to every poem.
_ARABIC_OR_SPACE_RUN = re.compile(r'[\u0600-\u06FF\s]+')


def remove_non_arabic_symbols(text):
    """Return ``text`` keeping only U+0600-U+06FF characters and whitespace.

    Every maximal run of kept characters is extracted and the runs are
    concatenated with nothing in between, so removed characters leave no
    placeholder behind.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; ``""`` when nothing in ``text`` qualifies.
    """
    return ''.join(_ARABIC_OR_SPACE_RUN.findall(text))
58
+
59
# Strip everything except Arabic-range characters and whitespace from each poem.
adf = adf.apply(remove_non_arabic_symbols)

# Load the pretrained tokenizer and tokenize the cleaned poems, passing the
# list of poems as pre-split input.
# NOTE(review): `tokens` is not referenced again in the visible part of this
# file - confirm whether this call is still needed.
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
tokens = tokenizer.tokenize(adf.tolist(), is_split_into_words=True)
65
+
66
# For each encoded poem, collect every prefix of length >= 2. The last token
# of a prefix later becomes the label and the tokens before it the input.
input_sequences = []
for line in adf:
    token_list = tokenizer.encode(line, add_special_tokens=True)
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

# Bring all prefixes to a fixed width, padding on the left.
max_sequence_len = 100
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)

total_words = tokenizer.vocab_size

# Last column is the label; everything before it is the model input.
# NOTE(review): `ys` is a dense one-hot array with one row per prefix and
# `total_words` columns, and neither `xs` nor `ys` is referenced again in the
# visible part of this file - confirm this is still wanted in the app.
xs = input_sequences[:, :-1]
labels = input_sequences[:, -1]
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)
80
+
81
+
82
+ ##############
83
+
84
+ import requests
85
+
86
# URL of the model
url = 'https://github.com/Obai33/NLP_PoemGenerationDatasets/raw/main/modelarab1.h5'
# Local file path to save the model
local_filename = 'modelarab1.h5'

# Download the model file.
# A timeout keeps startup from hanging forever on a stalled connection, and
# raise_for_status() fails here with the HTTP error instead of writing an
# error page to disk and failing later inside load_model().
response = requests.get(url, timeout=60)
response.raise_for_status()
with open(local_filename, 'wb') as f:
    f.write(response.content)

# Load the pre-trained model
model = tf.keras.models.load_model(local_filename)
98
+
99
+ ##############
100
+
101
+ # Import the necessary library for translation
102
+ import translate
103
+
104
+ # Function to translate text to English
105
def translate_to_english(text):
    """Translate ``text`` from Arabic ("ar") to English ("en").

    A fresh ``translate.Translator`` is created for every call.

    Args:
        text: The Arabic text to translate.

    Returns:
        Whatever ``Translator.translate`` returns for ``text``.
    """
    arabic_to_english = translate.Translator(from_lang="ar", to_lang="en")
    return arabic_to_english.translate(text)
109
+
110
def generate_arabic_text(seed_text, next_words=50):
    """Extend ``seed_text`` with model-predicted tokens and translate the result.

    Each step feeds the left-padded running context to ``model``, takes the
    highest-scoring token id, and appends it to the context.

    Args:
        seed_text: Text used as the start of the generated poem.
        next_words: Number of tokens to generate.

    Returns:
        A ``(arabic_text, english_text)`` tuple: the seed followed by the
        generated pieces (with `` ##`` continuation markers joined back onto
        the preceding piece), and its translation from
        ``translate_to_english``.
    """
    # Keep the running context as token ids. Appending each decoded piece to
    # the text and re-encoding the whole string every step makes a WordPiece
    # continuation such as "##xx" get re-tokenized as literal "#" characters,
    # so the model would be fed tokens it never produced.
    # NOTE(review): the training prefixes in this file are encoded with
    # add_special_tokens=True while the seed is encoded without them -
    # confirm which of the two the saved model expects.
    context_ids = list(tokenizer.encode(seed_text, add_special_tokens=False))
    pieces = [seed_text]
    for _ in range(next_words):
        padded = pad_sequences(
            [context_ids], maxlen=max_sequence_len - 1, padding='pre'
        )
        # verbose=0: avoid printing a progress bar for every generated token.
        scores = model.predict(padded, verbose=0)
        next_id = int(np.argmax(scores, axis=-1)[0])
        context_ids.append(next_id)
        pieces.append(tokenizer.decode(next_id))

    generated_text = " ".join(pieces)
    # Glue word-piece continuations back onto the preceding piece.
    reconnected_text = generated_text.replace(" ##", "")
    t_text = translate_to_english(reconnected_text)
    return reconnected_text, t_text
121
+
122
+ import gradio as gr
123
+
124
# Gradio UI: one text input (the seed text) and two text outputs, matching
# the (Arabic, English) pair returned by generate_arabic_text.
iface = gr.Interface(
    title="Arabic Poetry Generation",
    description="Enter Arabic text to generate a small poem.",
    fn=generate_arabic_text,
    inputs="text",
    outputs=["text", "text"],
    theme="compact",
)

# Start serving the interface.
iface.launch()