import spaces
import gradio as gr
import torch
# Use a pipeline as a high-level helper around the model, tokenizer, and feature extractor
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, AutoModelForCausalLM, AutoTokenizer
@spaces.GPU(duration=120)
def transcribe_audio(audio, model_id):
    if audio is None:
        return "Please upload an audio file."
    if model_id is None:
        return "Please select a model."

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    # Load the selected speech-recognition model, in half precision when a GPU is available.
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)

    processor = AutoProcessor.from_pretrained(model_id)
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=25,
        batch_size=16,
        torch_dtype=torch_dtype,
        device=device,
    )

    result = pipe(audio)
    return result["text"]
@spaces.GPU(duration=180)
def proofread(text):
    if not text:
        return "Please provide the transcribed text for proofreading."

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    # Prompt (Traditional Chinese): "Tidy up this text in Traditional Chinese,
    # split it into paragraphs, correct typos, and append the key points at the end."
    prompt = "用繁體中文整理這段文字,分段及改正錯別字,最後加上整段文字的重點。"

    model = AutoModelForCausalLM.from_pretrained(
        "hfl/llama-3-chinese-8b-instruct-v3", torch_dtype=torch_dtype
    )
    tokenizer = AutoTokenizer.from_pretrained("hfl/llama-3-chinese-8b-instruct-v3")
    model.to(device)

    # Perform proofreading using the model; max_new_tokens bounds the generated
    # continuation, and do_sample=True is required for temperature to take effect.
    input_text = prompt + text
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    output = model.generate(
        input_ids,
        max_new_tokens=1024,
        num_return_sequences=1,
        do_sample=True,
        temperature=0.7,
    )
    # Decode only the newly generated tokens so the prompt is not echoed back.
    proofread_text = tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True)
    return proofread_text
with gr.Blocks() as demo:
    gr.Markdown("""
    # Audio Transcription and Proofreading
    1. Upload an audio file (wait for the upload to finish before transcribing)
    2. Select a model for transcription
    3. Proofread the transcribed text
    """)
    with gr.Row():
        with gr.Column():
            audio = gr.Audio(sources="upload", type="filepath")
            # Video upload is displayed but not yet wired to a transcription handler.
            video = gr.Video(sources="upload")
            model_dropdown = gr.Dropdown(
                label="Model",
                choices=["openai/whisper-large-v3", "alvanlii/whisper-small-cantonese"],
                value="openai/whisper-large-v3",
            )
            transcribe_button = gr.Button("Transcribe")
            transcribed_text = gr.Textbox(label="Transcribed Text")
            proofread_button = gr.Button("Proofread")
            proofread_output = gr.Textbox(label="Proofread Text")

    transcribe_button.click(transcribe_audio, inputs=[audio, model_dropdown], outputs=transcribed_text)
    proofread_button.click(proofread, inputs=[transcribed_text], outputs=proofread_output)
    # Also re-run proofreading automatically whenever the transcription changes.
    transcribed_text.change(proofread, inputs=[transcribed_text], outputs=proofread_output)

demo.launch()
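
# A minimal, commented-out local sanity check (a sketch only: "sample.wav" is a
# hypothetical path, and outside a ZeroGPU Space the functions simply fall back
# to CPU). It is left commented because demo.launch() above blocks the process.
#
# if __name__ == "__main__":
#     text = transcribe_audio("sample.wav", "alvanlii/whisper-small-cantonese")
#     print(text)
#     print(proofread(text))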