yellowcandle committed
Commit 0bbcfe0
Parent: 634e161

Added audio transcription and proofreading functionality using Gradio and Hugging Face Transformers


- Implemented a `transcribe_audio` function to transcribe audio files using a user-selected model (a hedged sketch of both functions follows below)
- Implemented a `proofread` function to proofread the transcribed text using a specified model
- Created a Gradio interface to upload audio files, select models, and display the transcribed and proofread text
- Integrated GPU support for faster processing
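For orientation, here is a minimal sketch of what the two functions do. It is not the code in this commit: the model IDs, defaults, and `do_sample=True` are illustrative assumptions.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

def transcribe_audio(audio_path: str, model_id: str) -> str:
    """Transcribe an audio file with a speech-to-text pipeline (sketch)."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    asr = pipeline("automatic-speech-recognition", model=model_id, device=device)
    return asr(audio_path)["text"]

def proofread(prompt: str, text: str, model_id: str = "gpt2") -> str:
    """Proofread text by prepending an instruction prompt to it (sketch)."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
    input_ids = tokenizer.encode(prompt + text, return_tensors="pt").to(device)
    # do_sample=True makes temperature take effect; without it generation is greedy.
    output = model.generate(input_ids, max_new_tokens=50,
                            num_return_sequences=1, temperature=0.7, do_sample=True)
    return tokenizer.decode(output[0], skip_special_tokens=True)
```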

Files changed (1): app.py (+11 -6)
app.py CHANGED
@@ -2,7 +2,7 @@ import spaces
 import gradio as gr
 # Use a pipeline as a high-level helper
 import torch
-from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, AutoModelForCausalLM
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, AutoModelForCausalLM, AutoTokenizer
 
 @spaces.GPU(duration=120)
 def transcribe_audio(audio, model_id):
@@ -51,7 +51,8 @@ def proofread(prompt, text):
     model.to(device)
 
     # Perform proofreading using the model
-    input_ids = tokenizer.encode(text, return_tensors="pt").to(device)
+    input_text = prompt + text
+    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
     output = model.generate(input_ids, max_length=len(input_ids[0])+50, num_return_sequences=1, temperature=0.7)
     proofread_text = tokenizer.decode(output[0], skip_special_tokens=True)
 
@@ -59,8 +60,12 @@ def proofread(prompt, text):
 
 
 with gr.Blocks() as demo:
-    gr.Markdown("# Audio Transcription and Proofreading")
-    gr.Markdown("Upload an audio file, select a model for transcription, and then proofread the transcribed text.")
+    gr.Markdown("""
+    # Audio Transcription and Proofreading
+    1. Upload an audio file (Wait for the file to be fully loaded first)
+    2. Select a model for transcription
+    3. Proofread the transcribed text
+    """)
 
     with gr.Row():
         audio = gr.Audio(sources="upload", type="filepath")
@@ -73,7 +78,7 @@ with gr.Blocks() as demo:
     proofread_output = gr.Textbox(label="Proofread Text")
 
     transcribe_button.click(transcribe_audio, inputs=[audio, model_dropdown], outputs=transcribed_text)
-    proofread_button.click(proofread, inputs=transcribed_text, outputs=proofread_output)
-    transcribed_text.change(proofread, inputs=transcribed_text, outputs=proofread_output)
+    proofread_button.click(proofread, inputs=[transcribed_text], outputs=proofread_output)
+    transcribed_text.change(proofread, inputs=["", transcribed_text], outputs=proofread_output)
 
 demo.launch()
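One note on the new `input_text = prompt + text` step: for causal LMs, `model.generate` returns the input tokens followed by the continuation, so decoding `output[0]` in full reproduces the prompt and the original text as well. A common pattern (a sketch under assumed names, not part of this commit) is to slice off the input before decoding:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# "gpt2" stands in for whatever proofreading model the app loads.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

prompt = "Proofread the following text:\n"  # illustrative prompt
text = "this is a transcirbed sentence with a typo"
input_ids = tokenizer.encode(prompt + text, return_tensors="pt")

output = model.generate(input_ids, max_new_tokens=50, do_sample=True, temperature=0.7)

# generate() echoes the input ids for decoder-only models; drop them
# so the decoded string contains only the model's continuation.
new_tokens = output[0][input_ids.shape[1]:]
print(tokenizer.decode(new_tokens, skip_special_tokens=True))
```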
 
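On the last hunk: Gradio event listeners expect the `inputs` list to contain components, so a literal `""` is not how a fixed prompt value normally reaches a two-argument callback like `proofread(prompt, text)`. A hedged sketch of one conventional way to bind a constant prompt, using `functools.partial` (component names and the placeholder body are illustrative):

```python
from functools import partial

import gradio as gr

def proofread(prompt: str, text: str) -> str:
    # Stand-in for the real model call; concatenates prompt and text as the commit does.
    return f"{prompt}{text} [proofread]"

with gr.Blocks() as demo:
    transcribed_text = gr.Textbox(label="Transcribed Text")
    proofread_output = gr.Textbox(label="Proofread Text")
    proofread_button = gr.Button("Proofread")

    # Bind the constant prompt with partial; only component values flow
    # through the event, so `inputs` stays a list of components.
    fixed = partial(proofread, "Proofread the following text:\n")
    proofread_button.click(fixed, inputs=[transcribed_text], outputs=proofread_output)
    transcribed_text.change(fixed, inputs=[transcribed_text], outputs=proofread_output)

demo.launch()
```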