import torch
import spaces
import gradio as gr
from transformers import pipeline

# ---------------------------------------------------------------
# Speech-to-text: OpenAI Whisper (small) via the transformers ASR pipeline
# ---------------------------------------------------------------
MODEL_NAME = "openai/whisper-small"

device = 0 if torch.cuda.is_available() else "cpu"
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

# Force the decoder into plain transcription mode.
pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(task="transcribe")


@spaces.GPU(duration=240)
def transcribe(mic, file_upload=None):
    """Transcribe a recorded or uploaded audio file to text."""
    file = mic if mic is not None else file_upload
    text = pipe(file)["text"]
    return text


# ---------------------------------------------------------------
# Translation: MADLAD-400 3B running on CTranslate2
# ---------------------------------------------------------------
import ctranslate2
from huggingface_hub import snapshot_download
from sentencepiece import SentencePieceProcessor

model_name = "santhosh/madlad400-3b-ct2"
model_path = snapshot_download(model_name)

tokenizer = SentencePieceProcessor()
tokenizer.load(f"{model_path}/sentencepiece.model")
translator = ctranslate2.Translator(model_path)

# Scan the first 460 vocabulary entries for the "<2xx>" target-language tokens
# and keep their language codes to populate the dropdown.
tokens = [tokenizer.decode(i) for i in range(460)]
lang_codes = [token[2:-1] for token in tokens if token.startswith("<2")]


@spaces.GPU(duration=240)
def translate(input_text, target_language):
    """Translate text by prefixing it with the MADLAD "<2xx>" language token."""
    input_tokens = tokenizer.encode(f"<2{target_language}> {input_text}", out_type=str)
    results = translator.translate_batch(
        [input_tokens],
        batch_type="tokens",
        beam_size=1,
        no_repeat_ngram_size=1,
    )
    translated_sentence = tokenizer.decode(results[0].hypotheses[0])
    return translated_sentence


@spaces.GPU(duration=240)
def translate_interface(input_text, target_language):
    translated_text = translate(input_text, target_language)
    return translated_text
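

# Rough sketch of how the two stages compose outside the UI, using one of the
# audio clips listed under gr.Examples below. Kept as a comment so nothing runs
# at import time; "de" is only an illustrative target code and assumes German
# is present in lang_codes.
#
#   text = transcribe("Speech_samples/samples_audio-files_12-jfk-speech-12sec.wav")
#   print(translate(text, "de"))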


with gr.Blocks() as demo:
    with gr.Column():
        gr.Markdown("Follow me on HF | GitHub Pages")
        gr.Markdown("# 🎤 Speech to Text & Translation 🗣️")
        gr.HTML("🐤 OpenAI Whisper | 🧑‍💻 Google Madlad")
        gr.Markdown(
            "Upload an audio file or use your microphone to transcribe speech "
            "and then translate it to different languages."
        )
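
        # Layout: the first row produces the transcription; the second row feeds
        # that transcription, together with the chosen target language, into the
        # MADLAD translator.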

        with gr.Row():
            # First interface: transcription
            gr.Markdown("## 🎙️ Transcribe Audio")
            gr.Markdown("---")
            audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath")
            transcribe_button = gr.Button("Transcribe")
            transcribed_output = gr.Textbox(label="Transcribed Text")
            transcribe_button.click(transcribe, inputs=audio_input, outputs=transcribed_output)

        with gr.Row():
            # Second interface: translation
            gr.Markdown("## 🌐 Translate Text 🌐")
            gr.Markdown("---")
            lang_dropdown = gr.Dropdown(lang_codes, value="en", label="Target Language")
            translate_button = gr.Button("Translate")
            translated_output = gr.Textbox(label="Translated Text")
            translate_button.click(
                translate_interface,
                inputs=[transcribed_output, lang_dropdown],
                outputs=translated_output,
            )

        gr.Examples(
            examples=[
                "Speech_samples/consumer4.wav",
                "Speech_samples/samples_audio-files_05-gettysburg-address-2min.wav",
                "Speech_samples/samples_audio-files_12-jfk-speech-12sec.wav",
                "Speech_samples/harvard_3mins.wav",
            ],
            inputs=audio_input,
            label="Try these examples",
        )

        gr.Markdown("---")
        with gr.Accordion("See Details", open=False):
            gr.Markdown("---")
            gr.Markdown('''
## Description 📝
> Uses the OpenAI Whisper (small) model to transcribe audio into text and the Google MADLAD model to translate the transcription into multiple languages.
> Enables users to convert spoken words into written text.
> Supports various use cases, including transcription of audio files, detection of phrases, speech-to-text generation, and translation of text.

## How it Works 🫶
- Upload an audio file or record a new one directly in the app.
- Transcribe the audio into text; the result can be copied and pasted for further use.
- Optionally, translate the transcribed text into multiple languages.

## Usage 🤗
1. Transcribe audio files for note-taking, research, or content creation
2. Detect phrases or keywords in audio recordings for data analysis or market research
3. Generate text from speech for speech-to-text applications, such as subtitles, closed captions, or voice assistants
4. Use the app for language learning by transcribing audio files in a foreign language and practicing pronunciation
5. Translate the transcribed text into multiple languages for global communication

## Disclaimer 🙅‍♂️
> This app is for personal use only and should not be used for commercial purposes. The OpenAI Whisper and Google MADLAD models are pre-trained models and may not always produce accurate results.
''')

# queue(max_size=20) caps how many requests can wait in the queue at once.
demo.queue(max_size=20)
demo.launch()