yellowcandle committed
Commit 5576fae
Parent: 27fbf39

Changed how transcription is done

Files changed (1):
  1. app.py +7 -38
app.py CHANGED
@@ -1,33 +1,9 @@
 import spaces
 import gradio as gr
 import os
-import logging
-from pytube import YouTube
 import torch
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, AutoModelForCausalLM, AutoTokenizer
 
-def get_text(url):
-    if url != '':
-        output_text_transcribe = ''
-
-    yt = YouTube(url)
-    video = yt.streams.filter(only_audio=True).first()
-    out_file = video.download(output_path=".")
-
-    file_stats = os.stat(out_file)
-    logging.info(f'Size of audio file in Bytes: {file_stats.st_size}')
-
-    if file_stats.st_size <= 30000000:
-        base, ext = os.path.splitext(out_file)
-        new_file = base + '.mp3'
-        os.rename(out_file, new_file)
-        a = new_file
-
-        result = model.transcribe(a)
-        return result['text'].strip()
-    else:
-        logging.error('Videos for transcription on this space are limited to about 1.5 hours. Sorry about this limit but some joker thought they could stop this tool from working by transcribing many extremely long videos. Please visit https://steve.digital to contact me about this space.')
-
 @spaces.GPU(duration=60)
 def transcribe_audio(audio, model_id):
     if audio is None:
@@ -67,17 +43,12 @@ def proofread(text):
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
     torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 
-    prompt = "用繁體中文整理這段文字,在最後加上整段文字的重點。"
-
-    model = AutoModelForCausalLM.from_pretrained("hfl/llama-3-chinese-8b-instruct-v3")
-    tokenizer = AutoTokenizer.from_pretrained("hfl/llama-3-chinese-8b-instruct-v3")
-    model.to(device)
-
-    input_text = prompt + text
-    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
-    output = model.generate(input_ids, max_length=len(input_ids[0]) + 50, num_return_sequences=1, temperature=0.7)
-    proofread_text = tokenizer.decode(output[0], skip_special_tokens=True)
-
+    messages = [
+        {"role": "system", "content": "用繁體中文整理這段文字,在最後加上整段文字的重點。"},
+        {"role": "user", "content": text},
+    ]
+    pipe = pipeline("text-generation", model="hfl/llama-3-chinese-8b-instruct-v3")
+    proofread_text = pipe(messages)
     return proofread_text
 
 with gr.Blocks() as demo:
@@ -89,9 +60,7 @@ with gr.Blocks() as demo:
     """)
 
     with gr.Row():
-        with gr.Column():
-            audio = gr.Audio(sources="upload", type="filepath")
-            input_text_url = gr.Textbox(label="Video URL")
+        audio = gr.Audio(sources="upload", type="filepath")
         model_dropdown = gr.Dropdown(choices=["openai/whisper-large-v3", "alvanlii/whisper-small-cantonese"], value="openai/whisper-large-v3")
 
     transcribe_button = gr.Button("Transcribe")
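
The hunks show only the first lines of transcribe_audio; its body falls outside the diff context. Given the AutoModelForSpeechSeq2Seq/AutoProcessor imports and the Whisper checkpoints offered in the dropdown, a minimal sketch of how such a function is typically assembled with the transformers ASR pipeline (an assumption, not the committed body):

import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

def transcribe_audio(audio, model_id):
    # Sketch only: the committed body is not visible in this diff.
    if audio is None:
        return "Please upload an audio file."

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    # Load the Whisper checkpoint selected in the dropdown.
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
    ).to(device)
    processor = AutoProcessor.from_pretrained(model_id)

    asr = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        device=device,
        chunk_length_s=30,  # chunk long audio into 30 s windows for Whisper
    )
    return asr(audio)["text"]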
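
As committed, proofread returns pipe(messages) directly. In recent transformers releases the text-generation pipeline accepts chat-format message lists, but it returns a list of dicts rather than a string, so the assistant turn usually has to be unpacked. A sketch of that unpacking, assuming a transformers version with chat support in pipelines (the max_new_tokens value is illustrative):

from transformers import pipeline

pipe = pipeline("text-generation", model="hfl/llama-3-chinese-8b-instruct-v3")

messages = [
    # System prompt from the diff, roughly: "Organize this text in Traditional
    # Chinese and append the key points of the whole passage at the end."
    {"role": "system", "content": "用繁體中文整理這段文字,在最後加上整段文字的重點。"},
    {"role": "user", "content": "<transcribed text goes here>"},
]

outputs = pipe(messages, max_new_tokens=512)
# For chat input, "generated_text" holds the full message list, ending with
# the assistant turn; take its content to get a plain string.
proofread_text = outputs[0]["generated_text"][-1]["content"]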
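
The UI hunk flattens the layout: the nested gr.Column and the Video URL textbox go away (consistent with dropping the pytube path), leaving the audio upload and the model dropdown in one row. The diff does not show how transcribe_button is wired; a hypothetical wiring, reusing the transcribe_audio sketch above (the output textbox here is invented for illustration):

import gradio as gr

with gr.Blocks() as demo:
    with gr.Row():
        audio = gr.Audio(sources="upload", type="filepath")
        model_dropdown = gr.Dropdown(
            choices=["openai/whisper-large-v3", "alvanlii/whisper-small-cantonese"],
            value="openai/whisper-large-v3",
        )

    transcribe_button = gr.Button("Transcribe")
    transcribed_text = gr.Textbox(label="Transcription")  # hypothetical output

    # Hypothetical wiring; the click handler is outside this diff's context.
    transcribe_button.click(
        transcribe_audio,
        inputs=[audio, model_dropdown],
        outputs=transcribed_text,
    )

demo.launch()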