# Hugging Face Spaces page residue (build status at scrape time: "Runtime error").
"""Module setup: configuration, constants, and the speaker-diarization pipeline."""
import os

import gradio as gr
from dotenv import load_dotenv
from gradio_client import Client
from pydub import AudioSegment
from pyannote.audio import Pipeline

from lang_list import TEXT_SOURCE_LANGUAGE_NAMES

# Load .env before reading any environment variables below.
load_dotenv()

HF_API = os.getenv("HF_API")  # HF token that has accepted the pyannote model conditions
API_URL = os.getenv("API_URL")  # path to Seamlessm4t API endpoint

DEFAULT_TARGET_LANGUAGE = "Western Persian"

DESCRIPTION = """
# Seamlessm4t + Speaker Diarization + Voice Activity Detection
Here we use seamlessm4t to generate captions for full audios. Audio can be of arbitrary length.
"""

DUPLICATE = """
To duplicate this repo, you have to give permission from three repositories and accept all user conditions:
1- https://huggingface.co/pyannote/voice-activity-detection
2- https://hf.co/pyannote/segmentation
3- https://hf.co/pyannote/speaker-diarization
"""

# Gated model: from_pretrained fails unless HF_API grants access to the three
# pyannote repositories listed in DUPLICATE.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization", use_auth_token=HF_API
)
def predict(
    target_language, number_of_speakers, audio_source, input_audio_mic, input_audio_file
):
    """Diarize the selected audio and transcribe each speaker turn via Seamlessm4t.

    Args:
        target_language: Output language name passed to the ASR endpoint.
        number_of_speakers: Fixed speaker count; 0 lets pyannote estimate it.
        audio_source: Either "microphone" or "file" — selects which input to use.
        input_audio_mic: Filepath of the recorded audio (or None).
        input_audio_file: Filepath of the uploaded audio (or None).

    Yields:
        The accumulated transcript string after each processed segment, so the
        Gradio textbox updates incrementally.
    """
    input_data = input_audio_mic if audio_source == "microphone" else input_audio_file
    print(input_data)

    if number_of_speakers == 0:
        # Let the pipeline estimate how many speakers are present.
        diarization = pipeline(input_data)
    else:
        diarization = pipeline(input_data, num_speakers=number_of_speakers)

    for turn, _, speaker in diarization.itertracks(yield_label=True):
        print(f"start={turn.start}s stop={turn.end}s speaker_{speaker}")

    # from_file autodetects the container, so microphone/webm/mp3 uploads work
    # too (from_wav would only accept WAV files).
    song = AudioSegment.from_file(input_data)
    client = Client(API_URL)
    output_text = ""
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        print(turn)
        try:
            # pydub slices by milliseconds; turn boundaries are in seconds.
            clipped = song[turn.start * 1000 : turn.end * 1000]
            # NOTE: pydub ignores a bitrate argument for plain WAV export, so
            # none is passed here.
            clipped.export("my.wav", format="wav")
            _, result = client.predict(
                "ASR (Automatic Speech Recognition)",
                "file",  # str in 'Audio source' Radio component
                "my.wav",
                "my.wav",
                "text",
                target_language,
                target_language,
                api_name="/run",
            )
            current_text = f"start: {turn.start:.1f} end: {turn.end:.1f} text: {result} speaker: {speaker}"
            output_text = output_text + "\n" + current_text
            yield output_text
        except Exception as e:
            # Best-effort: log and skip segments the ASR endpoint rejects so
            # one bad clip doesn't abort the whole transcription.
            print(e)
def update_audio_ui(audio_source: str) -> tuple[dict, dict]:
    """Toggle visibility of the mic/file audio widgets for the chosen source.

    Args:
        audio_source: "microphone" or "file" (value of the audio_source Radio).

    Returns:
        Two gr.update dicts — (input_audio_mic, input_audio_file) — showing
        the selected widget, hiding the other, and clearing both values.
    """
    mic = audio_source == "microphone"
    return (
        gr.update(visible=mic, value=None),  # input_audio_mic
        gr.update(visible=not mic, value=None),  # input_audio_file
    )
# Build and launch the Gradio UI.
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Group():
        with gr.Row():
            target_language = gr.Dropdown(
                choices=TEXT_SOURCE_LANGUAGE_NAMES,
                label="Output Language",
                value=DEFAULT_TARGET_LANGUAGE,
                interactive=True,
                info="Select your target language",
            )
            number_of_speakers = gr.Number(
                label="Number of Speakers",
                info="Keep it zero, if you want the model to automatically detect the number of speakers",
            )
        with gr.Row() as audio_box:
            audio_source = gr.Radio(
                choices=["file", "microphone"], value="file", interactive=True
            )
            input_audio_mic = gr.Audio(
                label="Input speech",
                type="filepath",
                source="microphone",
                visible=False,
            )
            input_audio_file = gr.Audio(
                label="Input speech",
                type="filepath",
                source="upload",
                visible=True,
            )
        # Hidden mirror of the chosen input, kept in sync by the .change hooks.
        final_audio = gr.Audio(label="Output", visible=False)
        audio_source.change(
            fn=update_audio_ui,
            inputs=audio_source,
            outputs=[input_audio_mic, input_audio_file],
            queue=False,
            api_name=False,
        )
        input_audio_mic.change(lambda x: x, input_audio_mic, final_audio)
        input_audio_file.change(lambda x: x, input_audio_file, final_audio)
        submit = gr.Button("Submit")
        text_output = gr.Textbox(
            label="Transcribed Text",
            value="",
            interactive=False,
            lines=10,
            scale=10,
            max_lines=10,
        )
        # predict is a generator, so the textbox streams partial transcripts.
        submit.click(
            fn=predict,
            inputs=[
                target_language,
                number_of_speakers,
                audio_source,
                input_audio_mic,
                input_audio_file,
            ],
            outputs=[text_output],
            api_name="predict",
        )
    gr.Markdown(DUPLICATE)

demo.queue(max_size=50).launch()