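"""Simple Whisper demo: transcribe or translate audio with faster-whisper on a local CPU."""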
from faster_whisper import WhisperModel
import pandas as pd
import gradio as gr
import time
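
# Whisper large-v2 on CPU. float32 is the most accurate compute type;
# "int8" would trade a little accuracy for speed on CPU.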
model = WhisperModel('large-v2', device="cpu", compute_type="float32")

def speech_to_text(mic=None, file=None, lang=None, task='transcribe'):
    # Prefer the microphone recording; fall back to an uploaded file.
    if mic is not None:
        audio = mic
    elif file is not None:
        audio = file
    else:
        raise gr.Error("You must either provide a mic recording or a file")
    print(lang, task)
    time_start = time.time()
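    # Passing language=None lets faster-whisper auto-detect the spoken language.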
    segments, info = model.transcribe(audio, task=task, language=lang, beam_size=5)
    # segments is a lazy generator of Segment namedtuples; converting each one
    # to a dict below runs the actual decoding.
    objects = [s._asdict() for s in segments]
    print(objects)
    time_end = time.time()
    time_diff = time_end - time_start
    system_info = f"""
    *Processing time: {time_diff:.2f} seconds.*
    """
    df_results = pd.DataFrame(objects)
    # Drop internal decoding columns that are not useful in the UI.
    df_results = df_results.drop(columns=['seek', 'tokens', 'avg_logprob'])
    return df_results, system_info
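
# Telekom-branded theme: magenta (#e20074) accents on top of the default Gradio theme.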
theme = gr.themes.Default().set(
    color_accent="#e20074",
    # Buttons
    button_primary_text_color='white',
    button_primary_text_color_hover='black',
    button_primary_background_fill="#e20074",
    button_primary_background_fill_hover='#c00063',  # --telekom-color-primary-hovered
    button_primary_border_color="#e20074",
    button_primary_border_color_hover="#c00063",
    stat_background_fill="#e20074",
    # Dark mode
    button_primary_background_fill_dark="#e20074",
    button_primary_background_fill_hover_dark='#c00063',  # --telekom-color-primary-hovered
    button_primary_border_color_dark="#e20074",
    button_primary_border_color_hover_dark="#c00063",
    stat_background_fill_dark="#e20074",
)
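
# Note: this file targets the Gradio 3.x API (Audio(source=...), DataFrame(max_rows=...));
# Gradio 4 renamed these parameters (e.g. Audio(sources=["microphone"])).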
with gr.Blocks(title='Whisper Demo', theme=theme) as demo:
    gr.Markdown('''
    <div>
    <h1 style='text-align: center'>Simple Whisper Demo</h1>
    A simple Whisper demo using local CPU inference of the large-v2 model.
    </div>
    ''')
    audio_in = gr.Audio(label="Record", source='microphone', type="filepath")
    file_in = gr.Audio(label="Upload", source='upload', type="filepath")
    transcribe_btn = gr.Button("Transcribe audio", variant="primary")
    translate_btn = gr.Button("Translate to English")
    trans_df = gr.DataFrame(label="Transcription dataframe", row_count=(0, "dynamic"),
                            max_rows=10, wrap=True, overflow_row_behaviour='paginate')
    sys_info = gr.Markdown("")
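    # Both buttons share the same handler; only the Whisper task differs.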
    transcribe_btn.click(lambda x, y: speech_to_text(x, y, task='transcribe'),
                         [audio_in, file_in],
                         [trans_df, sys_info])
    translate_btn.click(lambda x, y: speech_to_text(x, y, task='translate'),
                        [audio_in, file_in],
                        [trans_df, sys_info])
demo.launch()