# File size: 4,035 Bytes
# 8899bf4
# -*- coding: utf-8 -*-
"""ArabicPoetryGeneration.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1HDyT5F8qnrbR_PW_HYpiM3O-7i6htGG2
"""
# NOTE: the notebook's `!pip install ...` lines are IPython shell magics, not
# Python; left in place they make this module fail to parse when it is run as
# a plain script. Install the dependencies from the shell (or list them in
# requirements.txt) before running:
#   pip install transformers
#   pip install tashaphyne
#   pip install gradio
#   pip install translate
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer
from transformers import AutoTokenizer
import random
from tashaphyne import normalize
import re
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, GRU
import tensorflow as tf
from transformers import AutoTokenizer
# Fetch the NLTK resources the notebook requests.
for nltk_resource in ('punkt', 'wordnet'):
    nltk.download(nltk_resource)

# Read the Arabic poems dataset directly from its GitHub-hosted CSV.
aurl = 'https://raw.githubusercontent.com/Obai33/NLP_PoemGenerationDatasets/main/arabicpoems.csv'
adf = pd.read_csv(aurl)
# Function to normalize text
def normalize_text(text):
    """Apply tashaphyne's normalization steps to ``text`` and return the result.

    The text is passed, in order, through ``strip_tashkeel``,
    ``strip_tatweel``, ``normalize_hamza`` and ``normalize_lamalef``.

    Python strings are immutable, so each step hands back a new string. The
    output of one step must be fed into the next; calling the helpers and
    ignoring their return values leaves the input untouched.
    """
    steps = (
        normalize.strip_tashkeel,
        normalize.strip_tatweel,
        normalize.normalize_hamza,
        normalize.normalize_lamalef,
    )
    for step in steps:
        text = step(text)
    return text
# Normalize the text
# The filter word goes through the same normalization as the poems so the
# substring comparison below is like-for-like.
allah = normalize_text('الله')

# Keep only the poem column. Missing entries are not strings and would break
# the string operations below, so they are dropped up front.
adf = adf['poem_text'].dropna()

# Draw up to 100 poems using a freshly randomized seed, so each run works on a
# different sample. The sample size is capped at the number of available rows
# because Series.sample raises when asked for more rows than exist.
sample_seed = random.randint(0, len(adf))
adf = adf.sample(n=min(100, len(adf)), random_state=sample_seed)
adf = adf.apply(normalize_text)

# Drop every poem containing the filter word. The word is matched literally
# (regex=False) and any non-string leftovers count as "no match" (na=False).
adf = adf[~adf.str.contains(allah, regex=False, na=False)]
# Function to clean text
def remove_non_arabic_symbols(text):
    """Return ``text`` reduced to characters in U+0600-U+06FF plus whitespace.

    Every maximal run of such characters is collected in order of appearance
    and the runs are concatenated; whatever sits between them is dropped.
    """
    kept_runs = (
        found.group(0)
        for found in re.finditer(r'[\u0600-\u06FF\s]+', text)
    )
    return ''.join(kept_runs)
# Clean the text: keep only Arabic-block characters and whitespace.
adf = adf.apply(remove_non_arabic_symbols)

# Tokenize the text with the AraBERT v2 tokenizer.
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
tokens = tokenizer.tokenize(adf.tolist(), is_split_into_words=True)

# Expand each poem into all of its prefixes of length >= 2, so every prefix
# ends in the token that should be predicted from the tokens before it.
input_sequences = []
for poem in adf:
    poem_ids = tokenizer.encode(poem, add_special_tokens=True)
    input_sequences.extend(
        poem_ids[:end] for end in range(2, len(poem_ids) + 1)
    )

# Bring every prefix to a fixed length, padding at the front.
max_sequence_len = 100
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)
total_words = tokenizer.vocab_size

# The last column is the target token; everything before it is the context.
# NOTE(review): ys is a dense one-hot matrix of shape
# (number of prefixes, vocab size), which can be very large — confirm it is
# still needed now that a pre-trained model is loaded below.
labels = input_sequences[:, -1]
xs = input_sequences[:, :-1]
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)
##############
import requests
# URL of the model
url = 'https://github.com/Obai33/NLP_PoemGenerationDatasets/raw/main/modelarab1.h5'
# Local file path to save the model
local_filename = 'modelarab1.h5'
# Download the model file.
# - timeout: do not hang forever if the server stops responding.
# - raise_for_status: fail here with a clear HTTP error instead of writing an
#   error page to disk and getting an obscure failure from load_model.
# - stream + iter_content: write the file in chunks rather than holding the
#   whole model in memory.
with requests.get(url, stream=True, timeout=60) as response:
    response.raise_for_status()
    with open(local_filename, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1 << 20):
            f.write(chunk)
# Load the pre-trained model
model = tf.keras.models.load_model(local_filename)
##############
# Import the necessary library for translation
import translate
# Function to translate text to English
def translate_to_english(text):
    """Translate Arabic ``text`` to English and return the translated string.

    A new ``translate.Translator`` configured for ar -> en is built per call.
    """
    return translate.Translator(from_lang="ar", to_lang="en").translate(text)
def generate_arabic_text(seed_text, next_words=50):
    """Extend ``seed_text`` with model predictions and translate the result.

    Args:
        seed_text: Arabic text used as the starting context.
        next_words: Number of tokens to predict and append.

    Returns:
        A ``(arabic_text, english_text)`` tuple: the seed followed by the
        generated tokens (with " ##" word-piece continuations re-joined), and
        the output of ``translate_to_english`` for that text.
    """
    # Tokenize the seed once and carry the context forward as token ids.
    # Re-encoding the growing string on every step would tokenize the "##"
    # continuation markers of earlier predictions as literal text, so the
    # model would be fed a context that differs from what it predicted.
    context_ids = list(tokenizer.encode(seed_text, add_special_tokens=False))
    pieces = [seed_text]

    for _ in range(next_words):
        # Only the most recent max_sequence_len - 1 ids fit the model input;
        # shorter contexts are padded at the front.
        model_input = pad_sequences(
            [context_ids], maxlen=max_sequence_len - 1, padding='pre'
        )
        # verbose=0 avoids printing a progress bar for every predicted token.
        predicted = np.argmax(model.predict(model_input, verbose=0), axis=-1)
        next_id = int(predicted[0])
        context_ids.append(next_id)
        pieces.append(tokenizer.decode([next_id]))

    # Same text layout as appending " " + token on each step, built in one go.
    generated_text = " ".join(pieces)
    # Glue word-piece continuations back onto the preceding piece.
    reconnected_text = generated_text.replace(" ##", "")
    t_text = translate_to_english(reconnected_text)
    return reconnected_text, t_text
import gradio as gr
# Gradio interface: one text input (the seed) and two text outputs, matching
# the (Arabic text, English translation) tuple that generate_arabic_text
# returns.
iface = gr.Interface(
    fn=generate_arabic_text,
    inputs="text",
    outputs=["text", "text"],
    title="Arabic Poetry Generation",
    description="Enter Arabic text to generate a small poem.",
    theme="compact",
)
# Run the interface
iface.launch()