File size: 4,035 Bytes
8899bf4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# -*- coding: utf-8 -*-
"""ArabicPoetryGeneration.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1HDyT5F8qnrbR_PW_HYpiM3O-7i6htGG2
"""

# Notebook shell magics ("!pip install ...") are not valid Python once the
# notebook is exported to a .py file, so install the same packages through
# pip's module entry point, using the interpreter that is running this script.
import subprocess
import sys

for _package in ("transformers", "tashaphyne", "gradio", "translate"):
    subprocess.check_call([sys.executable, "-m", "pip", "install", _package])

import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer
from transformers import AutoTokenizer
import random
from tashaphyne import normalize
import re
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, GRU
import tensorflow as tf
from transformers import AutoTokenizer

# Fetch the NLTK resources this script asks for.
for _resource in ('punkt', 'wordnet'):
    nltk.download(_resource)

# Read the Arabic poems CSV directly from its raw GitHub URL.
aurl = 'https://raw.githubusercontent.com/Obai33/NLP_PoemGenerationDatasets/main/arabicpoems.csv'
adf = pd.read_csv(aurl)

# Function to normalize text
def normalize_text(text):
    """Return *text* after Arabic orthographic normalization.

    Applies, in order: diacritics (tashkeel) removal, tatweel removal,
    hamza normalization and lam-alef normalization from
    ``tashaphyne.normalize``.

    Python strings are immutable, so each helper hands back a new string;
    the result of every step has to be fed into the next one, otherwise the
    input comes back unchanged.
    """
    text = normalize.strip_tashkeel(text)
    text = normalize.strip_tatweel(text)
    text = normalize.normalize_hamza(text)
    text = normalize.normalize_lamalef(text)
    return text

# Normalize the text
# The filter word goes through the same normalization as the poems so the
# substring match below compares like with like.
allah = normalize_text('الله')
# Drop missing poems up front: a NaN row would otherwise reach the text
# helpers and turn the boolean mask below into NaN, which `~` cannot negate.
adf = adf['poem_text'].dropna()
# A freshly drawn seed makes every run pick a different sample.
i = random.randint(0, len(adf))
# Never request more rows than exist; sample() raises when n > len(adf).
adf = adf.sample(n=min(100, len(adf)), random_state=i)
adf = adf.apply(normalize_text)
# Literal substring match (regex=False): the needle is plain text, not a pattern.
adf = adf[~adf.str.contains(allah, regex=False)]

# Function to clean text
def remove_non_arabic_symbols(text):
    """Return *text* reduced to Arabic-block characters and whitespace.

    Every maximal run of characters that are either in the range
    U+0600-U+06FF or whitespace is kept, in its original order, and the runs
    are concatenated. All other characters are dropped.
    """
    kept_runs = (
        match.group()
        for match in re.finditer(r'[\u0600-\u06FF\s]+', text)
    )
    return ''.join(kept_runs)

# Strip everything except Arabic-block characters and whitespace from each poem
adf = adf.apply(remove_non_arabic_symbols)

# Load the AraBERT v2 tokenizer and tokenize the cleaned poems
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
tokens = tokenizer.tokenize(adf.tolist(), is_split_into_words=True)

# Fixed width of every padded sequence, and the number of output classes
max_sequence_len = 100
total_words = tokenizer.vocab_size

# For each poem, collect every prefix of its token ids that holds at least
# two ids (special tokens included).
input_sequences = []
for line in adf:
    token_list = tokenizer.encode(line, add_special_tokens=True)
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

# Left-pad every prefix to the fixed width
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)

# The last column is the label; everything before it is the model input
xs = input_sequences[:, :-1]
labels = input_sequences[:, -1]
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)


##############

import requests

# URL of the model
url = 'https://github.com/Obai33/NLP_PoemGenerationDatasets/raw/main/modelarab1.h5'
# Local file path to save the model
local_filename = 'modelarab1.h5'

# Download the model file.
# The timeout keeps a stalled connection from hanging the script forever, and
# raise_for_status() stops an HTTP error page from being written to disk as
# if it were the .h5 model (which would only fail later, inside load_model).
response = requests.get(url, timeout=60)
response.raise_for_status()
with open(local_filename, 'wb') as f:
    f.write(response.content)

# Load the pre-trained model
model = tf.keras.models.load_model(local_filename)

##############

# Import the necessary library for translation
import translate

# Function to translate text to English
def translate_to_english(text):
    """Translate *text* from Arabic ("ar") to English ("en").

    A new ``translate.Translator`` is created on every call and the value
    returned by its ``translate`` method is passed back unchanged.
    """
    return translate.Translator(from_lang="ar", to_lang="en").translate(text)

def generate_arabic_text(seed_text, next_words=50):
    """Extend *seed_text* with model-predicted tokens and translate the result.

    Args:
        seed_text: Arabic text used as the start of the generated poem.
        next_words: Number of tokens to predict and append (default 50).

    Returns:
        A ``(arabic_text, english_text)`` tuple: the seed followed by the
        generated tokens with word-piece continuations re-attached, and the
        output of ``translate_to_english`` for that text.
    """
    # Encode the seed once and grow the id sequence directly. Re-encoding the
    # accumulated text on every step would feed decoded "##" word-piece
    # markers back through the tokenizer, so the ids given to the model would
    # drift away from the ids it actually predicted.
    token_ids = list(tokenizer.encode(seed_text, add_special_tokens=False))
    generated_text = seed_text
    for _ in range(next_words):
        # Left-pad to the model's input width; pad_sequences drops the oldest
        # ids from the front once the context grows past that width.
        model_input = pad_sequences(
            [token_ids], maxlen=max_sequence_len - 1, padding='pre'
        )
        # verbose=0 silences the per-call progress output of predict().
        scores = model.predict(model_input, verbose=0)
        predicted_id = int(np.argmax(scores, axis=-1)[0])
        token_ids.append(predicted_id)
        generated_text += " " + tokenizer.decode([predicted_id])
    # Glue word-piece continuations ("##xyz") back onto the preceding piece.
    reconnected_text = generated_text.replace(" ##", "")
    t_text = translate_to_english(reconnected_text)
    return reconnected_text, t_text

import gradio as gr

# Build the Gradio UI: one text input (the seed) mapped to two text outputs,
# matching the (arabic_text, english_text) tuple that generate_arabic_text
# returns. next_words is not exposed, so the function's default is used.
iface = gr.Interface(
    fn=generate_arabic_text,
    inputs="text",
    outputs=["text", "text"],
    title="Arabic Poetry Generation",
    description="Enter Arabic text to generate a small poem.",
    # NOTE(review): string theme names may not be recognised by newer Gradio
    # releases - confirm against the installed version.
    theme="compact"
)
# Run the interface (executes at import time; there is no __main__ guard)
iface.launch()