Keyven committed on
Commit
2449b43
•
1 Parent(s): 2d256fb

whisper integration

Browse files
Files changed (1) hide show
  1. app.py +22 -0
app.py CHANGED
@@ -5,6 +5,12 @@ import re
5
  import copy
6
  import secrets
7
  from pathlib import Path
 
 
 
 
 
 
8
 
9
  # Constants
10
  BOX_TAG_PATTERN = r"<box>([\s\S]*?)</box>"
@@ -46,6 +52,15 @@ def format_text(text):
46
  text = "".join(lines)
47
  return text
48
 
 
 
 
 
 
 
 
 
 
49
 
50
  def get_chat_response(chatbot, task_history):
51
  global model, tokenizer
@@ -133,6 +148,12 @@ def handle_regeneration(chatbot, task_history):
133
 
134
 
135
  with gr.Blocks(theme='gradio/soft') as demo:
 
 
 
 
 
 
136
  gr.Markdown("# Qwen-VL Multimodal-Vision-Insight")
137
  gr.Markdown(
138
  "## Developed by Keyvan Hardani (Keyvven on [Twitter](https://twitter.com/Keyvven))\n"
@@ -162,6 +183,7 @@ with gr.Blocks(theme='gradio/soft') as demo:
162
  clear_btn.click(clear_history, [task_history], [chatbot], show_progress=True)
163
  regen_btn.click(handle_regeneration, [chatbot, task_history], [chatbot], show_progress=True)
164
  upload_btn.upload(handle_file_upload, [chatbot, task_history, upload_btn], [chatbot, task_history], show_progress=True)
 
165
 
166
 
167
  demo.launch()
 
5
  import copy
6
  import secrets
7
  from pathlib import Path
8
import importlib
import os
import subprocess
import sys

try:
    import whisper
except ImportError:
    # Whisper is not installed yet: install it with the interpreter that is
    # running this app (not whatever `pip` happens to be first on PATH), pass
    # the command as an argument list instead of a shell string, and fail
    # loudly if the installation does not succeed.
    subprocess.run(
        [
            sys.executable,
            "-m",
            "pip",
            "install",
            "git+https://github.com/openai/whisper.git",
        ],
        check=True,
    )
    # Make the freshly installed package visible to the import system.
    importlib.invalidate_caches()
    import whisper


# Speech-to-text model, loaded once at import time and shared by every
# transcription request.
model_whisper = whisper.load_model("small")
14
 
15
  # Constants
16
  BOX_TAG_PATTERN = r"<box>([\s\S]*?)</box>"
 
52
  text = "".join(lines)
53
  return text
54
 
55
def transcribe_audio(audio):
    """Transcribe an audio file to text with the module-level Whisper model.

    Args:
        audio: Path of the audio file to transcribe. ``None`` or an empty
            string means there is nothing to transcribe.

    Returns:
        The decoded text, or an empty string when no audio was supplied.
    """
    # Guard: with no recording there is no file to load, and
    # whisper.load_audio would raise on a missing path.
    if not audio:
        return ""

    waveform = whisper.load_audio(audio)
    # Whisper decodes a fixed-length window, so the clip is padded or cut
    # to that length; anything beyond the window is not transcribed.
    waveform = whisper.pad_or_trim(waveform)
    mel = whisper.log_mel_spectrogram(waveform).to(model_whisper.device)

    # No explicit model.detect_language() pass: its result was never used,
    # and whisper.decode detects the language on its own when
    # DecodingOptions.language is left unset.
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model_whisper, mel, options)
    return result.text
63
+
64
 
65
  def get_chat_response(chatbot, task_history):
66
  global model, tokenizer
 
148
 
149
 
150
  with gr.Blocks(theme='gradio/soft') as demo:
151
+ audio = gr.Audio(
152
+ label="Input Audio",
153
+ show_label=False,
154
+ source="microphone",
155
+ type="filepath"
156
+ )
157
  gr.Markdown("# Qwen-VL Multimodal-Vision-Insight")
158
  gr.Markdown(
159
  "## Developed by Keyvan Hardani (Keyvven on [Twitter](https://twitter.com/Keyvven))\n"
 
183
  clear_btn.click(clear_history, [task_history], [chatbot], show_progress=True)
184
  regen_btn.click(handle_regeneration, [chatbot, task_history], [chatbot], show_progress=True)
185
  upload_btn.upload(handle_file_upload, [chatbot, task_history, upload_btn], [chatbot, task_history], show_progress=True)
186
# Gradio components register value-change listeners through `.change`;
# there is no `.on_change` attribute, so the original call raised
# AttributeError while the UI was being built. The transcription result is
# written into the `query` component.
audio.change(transcribe_audio, inputs=[audio], outputs=[query])
187
 
188
 
189
  demo.launch()