yellowcandle committed
Commit 5576fae
Parent: 27fbf39

Changed how transcription is done

Files changed (1):
  1. app.py +7 -38
app.py CHANGED
@@ -1,33 +1,9 @@
 import spaces
 import gradio as gr
 import os
-import logging
-from pytube import YouTube
 import torch
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, AutoModelForCausalLM, AutoTokenizer
 
-def get_text(url):
-    if url != '':
-        output_text_transcribe = ''
-
-    yt = YouTube(url)
-    video = yt.streams.filter(only_audio=True).first()
-    out_file = video.download(output_path=".")
-
-    file_stats = os.stat(out_file)
-    logging.info(f'Size of audio file in Bytes: {file_stats.st_size}')
-
-    if file_stats.st_size <= 30000000:
-        base, ext = os.path.splitext(out_file)
-        new_file = base + '.mp3'
-        os.rename(out_file, new_file)
-        a = new_file
-
-        result = model.transcribe(a)
-        return result['text'].strip()
-    else:
-        logging.error('Videos for transcription on this space are limited to about 1.5 hours. Sorry about this limit but some joker thought they could stop this tool from working by transcribing many extremely long videos. Please visit https://steve.digital to contact me about this space.')
-
 @spaces.GPU(duration=60)
 def transcribe_audio(audio, model_id):
     if audio is None:
@@ -67,17 +43,12 @@ def proofread(text):
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
     torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 
-    prompt = "用繁體中文整理這段文字,在最後加上整段文字的重點。"
-
-    model = AutoModelForCausalLM.from_pretrained("hfl/llama-3-chinese-8b-instruct-v3")
-    tokenizer = AutoTokenizer.from_pretrained("hfl/llama-3-chinese-8b-instruct-v3")
-    model.to(device)
-
-    input_text = prompt + text
-    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
-    output = model.generate(input_ids, max_length=len(input_ids[0]) + 50, num_return_sequences=1, temperature=0.7)
-    proofread_text = tokenizer.decode(output[0], skip_special_tokens=True)
-
+    messages = [
+        {"role": "system", "content": "用繁體中文整理這段文字,在最後加上整段文字的重點。"},
+        {"role": "user", "content": text},
+    ]
+    pipe = pipeline("text-generation", model="hfl/llama-3-chinese-8b-instruct-v3")
+    proofread_text = pipe(messages)
     return proofread_text
 
 with gr.Blocks() as demo:
@@ -89,9 +60,7 @@ with gr.Blocks() as demo:
     """)
 
     with gr.Row():
-        with gr.Column():
-            audio = gr.Audio(sources="upload", type="filepath")
-            input_text_url = gr.Textbox(label="Video URL")
+        audio = gr.Audio(sources="upload", type="filepath")
         model_dropdown = gr.Dropdown(choices=["openai/whisper-large-v3", "alvanlii/whisper-small-cantonese"], value="openai/whisper-large-v3")
 
     transcribe_button = gr.Button("Transcribe")
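
The hunks show only the first lines of transcribe_audio; its body falls outside the diff context. Given the AutoModelForSpeechSeq2Seq/AutoProcessor imports and the Whisper checkpoints offered in the dropdown, a minimal sketch of how such a function is typically assembled with the transformers ASR pipeline (an assumption, not the committed body):

import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

def transcribe_audio(audio, model_id):
    # Sketch only: the committed body is not visible in this diff.
    if audio is None:
        return "Please upload an audio file."

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    # Load the Whisper checkpoint selected in the dropdown.
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
    ).to(device)
    processor = AutoProcessor.from_pretrained(model_id)

    asr = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        device=device,
        chunk_length_s=30,  # chunk long audio into 30 s windows for Whisper
    )
    return asr(audio)["text"]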
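
As committed, proofread returns pipe(messages) directly. In recent transformers releases the text-generation pipeline accepts chat-format message lists, but it returns a list of dicts rather than a string, so the assistant turn usually has to be unpacked. A sketch of that unpacking, assuming a transformers version with chat support in pipelines (the max_new_tokens value is illustrative):

from transformers import pipeline

pipe = pipeline("text-generation", model="hfl/llama-3-chinese-8b-instruct-v3")

messages = [
    # System prompt from the diff, roughly: "Organize this text in Traditional
    # Chinese and append the key points of the whole passage at the end."
    {"role": "system", "content": "用繁體中文整理這段文字,在最後加上整段文字的重點。"},
    {"role": "user", "content": "<transcribed text goes here>"},
]

outputs = pipe(messages, max_new_tokens=512)
# For chat input, "generated_text" holds the full message list, ending with
# the assistant turn; take its content to get a plain string.
proofread_text = outputs[0]["generated_text"][-1]["content"]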
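
The UI hunk flattens the layout: the nested gr.Column and the Video URL textbox go away (consistent with dropping the pytube path), leaving the audio upload and the model dropdown in one row. The diff does not show how transcribe_button is wired; a hypothetical wiring, reusing the transcribe_audio sketch above (the output textbox here is invented for illustration):

import gradio as gr

with gr.Blocks() as demo:
    with gr.Row():
        audio = gr.Audio(sources="upload", type="filepath")
        model_dropdown = gr.Dropdown(
            choices=["openai/whisper-large-v3", "alvanlii/whisper-small-cantonese"],
            value="openai/whisper-large-v3",
        )

    transcribe_button = gr.Button("Transcribe")
    transcribed_text = gr.Textbox(label="Transcription")  # hypothetical output

    # Hypothetical wiring; the click handler is outside this diff's context.
    transcribe_button.click(
        transcribe_audio,
        inputs=[audio, model_dropdown],
        outputs=transcribed_text,
    )

demo.launch()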