import spaces
import gradio as gr
import os
import orjson
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, AutoModelForCausalLM, AutoTokenizer
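
# This Space first transcribes an uploaded audio file with a user-selected Whisper
# checkpoint, then "proofreads" the transcript in Traditional Chinese with
# hfl/llama-3-chinese-8b-instruct-v3, returning a tidied version plus its key points.
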
@spaces.GPU(duration=60)
def transcribe_audio(audio, model_id):
    """Transcribe an uploaded audio file with the selected Whisper model."""
    if audio is None:
        return "Please upload an audio file."
    if model_id is None:
        return "Please select a model."

    # Use the GPU in float16 when available, otherwise fall back to CPU in float32.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)

    # Chunked long-form transcription pipeline.
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=25,
        batch_size=16,
        torch_dtype=torch_dtype,
        device=device,
    )
    result = pipe(audio)
    return result["text"]


@spaces.GPU(duration=60)
def proofread(text):
    """Tidy up the transcribed text with a Chinese instruction-tuned LLM."""
    if not text:
        return "Please provide the transcribed text for proofreading."

    # Use the GPU in float16 when available, otherwise fall back to CPU in float32.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    # System prompt (Traditional Chinese): "Tidy up this text in Traditional Chinese
    # and append the key points of the whole passage at the end."
    messages = [
        {"role": "system", "content": "用繁體中文整理這段文字,在最後加上整段文字的重點。"},
        {"role": "user", "content": text},
    ]
    pipe = pipeline(
        "text-generation",
        model="hfl/llama-3-chinese-8b-instruct-v3",
        torch_dtype=torch_dtype,
        device=device,
    )
    llm_output = pipe(messages)

    # The pipeline returns the full chat history; keep only the assistant's reply.
    generated_text = llm_output[0]["generated_text"]
    assistant_content = next(item["content"] for item in generated_text if item["role"] == "assistant")
    return assistant_content


with gr.Blocks() as demo:
    gr.Markdown("""
# Audio Transcription and Proofreading
1. Upload an audio file (wait for the upload to finish first)
2. Select a model and transcribe the audio
3. Proofread the transcribed text
""")
    with gr.Row():
        audio = gr.Audio(sources="upload", type="filepath")
        model_dropdown = gr.Dropdown(
            choices=["openai/whisper-large-v3", "alvanlii/whisper-small-cantonese"],
            value="openai/whisper-large-v3",
        )
    transcribe_button = gr.Button("Transcribe")
    transcribed_text = gr.Textbox(label="Transcribed Text")
    proofread_button = gr.Button("Proofread")
    proofread_output = gr.Textbox(label="Proofread Text")

    transcribe_button.click(transcribe_audio, inputs=[audio, model_dropdown], outputs=transcribed_text)
    proofread_button.click(proofread, inputs=[transcribed_text], outputs=proofread_output)
    # Also re-run proofreading whenever the transcript is edited.
    transcribed_text.change(proofread, inputs=[transcribed_text], outputs=proofread_output)

demo.launch()