Bajiyo committed on
Commit
4755ab1
1 Parent(s): 96273c1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -41
app.py CHANGED
@@ -1,53 +1,50 @@
1
  import gradio as gr
2
- import json
3
  from keras.preprocessing.sequence import pad_sequences
4
- from huggingface_hub import cached_download, from_pretrained_keras
5
-
6
- # Load the model from Hugging Face Hub (assuming the model identifier is "Bajiyo/ml-en-transliteration")
7
- model = from_pretrained_keras("Bajiyo/ml-en-transliteration")
8
-
9
- # Define URLs for tokenizer files on Hugging Face Hub (replace with actual model identifier if different)
10
- source_tokenizer_url = f"https://huggingface.co/Bajiyo/ml-en-transliteration/resolve/main/source_tokenizer.json"
11
- target_tokenizer_url = f"https://huggingface.co/Bajiyo/ml-en-transliteration/resolve/main/target_tokenizer.json"
12
-
13
- # Download tokenizer files using cached_download (avoids redundant downloads)
14
- source_tokenizer_path = cached_download(source_tokenizer_url)
15
- target_tokenizer_path = cached_download(target_tokenizer_url)
16
 
17
- # Load tokenizers from downloaded files
18
- from keras.preprocessing.text import tokenizer_from_json
 
 
 
19
 
20
- with open(source_tokenizer_path, "r") as f:
21
- source_tokenizer = tokenizer_from_json(json.load(f))
22
 
23
- with open(target_tokenizer_path, "r") as f:
24
- target_tokenizer = tokenizer_from_json(json.load(f))
 
 
25
 
26
- # Reconstruct tokenizers
27
- from keras.preprocessing.text import tokenizer_from_json
28
- source_tokenizer = tokenizer_from_json(source_tokenizer_config)
29
- target_tokenizer = tokenizer_from_json(target_tokenizer_config)
30
 
 
 
 
31
 
32
- # Define the maximum sequence length
33
- max_seq_length = 50
34
 
35
- # Function to predict transliteration
36
- def predict_transliteration(input_text):
37
- # Preprocess the input text
38
  input_sequence = source_tokenizer.texts_to_sequences([input_text])
39
- input_sequence_padded = pad_sequences(input_sequence, maxlen=max_seq_length, padding='post')
40
-
41
- # Generate predictions
42
- predicted_sequence = model.predict(input_sequence_padded)
43
-
44
- # Decode the predicted sequence
45
- predicted_text = "".join(target_tokenizer.index_word[i] for i in np.argmax(predicted_sequence, axis=-1)[0] if i != 0)
46
-
47
  return predicted_text
48
 
49
- # Create a Gradio interface
50
- input_textbox = gr.inputs.Textbox(lines=2, label="Enter Malayalam text")
51
- output_textbox = gr.outputs.Textbox(label="Predicted Transliteration")
52
-
53
- gr.Interface(fn=predict_transliteration, inputs=input_textbox, outputs=output_textbox, title="Malayalam Transliteration", description="Enter Malayalam text to get its transliteration in English.").launch()
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import tensorflow as tf
3
  from keras.preprocessing.sequence import pad_sequences
4
+ import numpy as np
5
+ import json
6
+ from huggingface_hub import from_pretrained_keras, hf_hub_download
 
 
 
 
 
 
 
 
 
7
 
8
# Convert a sequence of token indices back into a string using the
# tokenizer's vocabulary; indices with no vocabulary entry (e.g. the
# padding index 0) contribute the empty string.
def sequence_to_text(sequence, tokenizer):
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    pieces = []
    for token_id in sequence:
        pieces.append(index_to_word.get(token_id, ''))
    return ''.join(pieces)
13
 
14
# Hugging Face Hub repo that hosts both the Keras model and the
# serialized tokenizer JSON files.
repo_id = "Bajiyo/Malayalam_transliteration"

# Load the trained transliteration model from the Hugging Face Hub.
model = from_pretrained_keras(repo_id)

# Download the tokenizer files (hf_hub_download caches them locally,
# so repeated launches do not re-download).
source_tokenizer_path = hf_hub_download(repo_id=repo_id, filename="source_tokenizer.json")
target_tokenizer_path = hf_hub_download(repo_id=repo_id, filename="target_tokenizer.json")

# NOTE: tokenizer_from_json expects the raw JSON *string* (it calls
# json.loads internally). Passing the dict produced by json.load(f),
# as the previous code did, raises a TypeError — so read the file text
# and hand it over unparsed.
with open(source_tokenizer_path, encoding="utf-8") as f:
    source_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(f.read())

with open(target_tokenizer_path, encoding="utf-8") as f:
    target_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(f.read())

# Maximum padded input length fed to the model — presumably the length
# used at training time; TODO confirm against the training config.
max_seq_length = 100
 
31
 
32
def transliterate(input_text):
    """Transliterate a Malayalam string into English characters.

    Encodes *input_text* with the source tokenizer, pads it to the
    model's expected length, runs one forward pass, and decodes the
    argmax token at every output position via the target tokenizer.
    """
    encoded = source_tokenizer.texts_to_sequences([input_text])
    padded = pad_sequences(encoded, maxlen=max_seq_length, padding='post')
    scores = model.predict(padded)
    # Batch size is 1, so take the first row of argmax indices.
    best_tokens = np.argmax(scores, axis=-1)[0]
    return sequence_to_text(best_tokens, target_tokenizer)
39
 
40
# Set up the Gradio web UI.
# FIX: the gr.inputs.* component namespace was deprecated in Gradio 3.x
# and removed in 4.x; the top-level gr.Textbox component is the
# supported equivalent and accepts the same lines/placeholder kwargs.
iface = gr.Interface(
    fn=transliterate,
    inputs=gr.Textbox(lines=2, placeholder="Enter Malayalam text here..."),
    outputs="text",
    title="Malayalam to English Transliteration",
    description="Enter Malayalam names to get their English transliterations.",
)

if __name__ == "__main__":
    iface.launch()