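# Gradio demo: SeamlessM4T ASR combined with pyannote speaker diarization,
# so captions can be generated for audio of arbitrary length.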
import gradio as gr
import os
from dotenv import load_dotenv
from pydub import AudioSegment
load_dotenv()
from lang_list import TEXT_SOURCE_LANGUAGE_NAMES
from gradio_client import Client
HF_API = os.getenv("HF_API")  # Hugging Face access token (the pyannote models are gated)
API_URL = os.getenv("API_URL")  # path to the SeamlessM4T API endpoint
DEFAULT_TARGET_LANGUAGE = "Western Persian"
DESCRIPTION = """
# Seamlessm4t + Speaker Diarization + Voice Activity Detection
Here we use seamlessm4t to generate captions for full audios. Audio can be of arbitrary length.
"""
DUPLICATE = """
To duplicate this repo, you have to request access to three repositories and accept all their user conditions:
1- https://huggingface.co/pyannote/voice-activity-detection
2- https://hf.co/pyannote/segmentation
3- https://hf.co/pyannote/speaker-diarization
"""
from pyannote.audio import Pipeline
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization", use_auth_token=HF_API
)
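# Diarize the input audio, then send each speaker turn to the SeamlessM4T endpoint for ASR,
# yielding the accumulated transcript so the textbox updates as segments are processed.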
def predict(
    target_language, number_of_speakers, audio_source, input_audio_mic, input_audio_file
):
    # Pick the audio path from whichever input widget is active.
    if audio_source == "microphone":
        input_data = input_audio_mic
    else:
        input_data = input_audio_file
    print(input_data)
    # number_of_speakers == 0 means "let pyannote estimate the number of speakers".
    if number_of_speakers == 0:
        diarization = pipeline(input_data)
    else:
        diarization = pipeline(input_data, num_speakers=number_of_speakers)
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        print(f"start={turn.start}s stop={turn.end}s speaker_{speaker}")
    # Gradio passes the recording/upload as a file path; it is assumed to be WAV here.
    song = AudioSegment.from_wav(input_data)
    client = Client(API_URL)
    output_text = ""
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        print(turn)
        try:
            # pydub slices audio in milliseconds.
            clipped = song[turn.start * 1000 : turn.end * 1000]
            clipped.export("my.wav", format="wav", bitrate=16000)
            _, result = client.predict(
                "ASR (Automatic Speech Recognition)",
                "file",  # str in 'Audio source' Radio component
                "my.wav",
                "my.wav",
                "text",
                target_language,
                target_language,
                api_name="/run",
            )
            current_text = f"start: {turn.start:.1f} end: {turn.end:.1f} text: {result} speaker: {speaker}"
            output_text = output_text + "\n" + current_text
            # Yield the running transcript so the textbox updates incrementally.
            yield output_text
        except Exception as e:
            print(e)
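# Show the microphone or the file-upload widget depending on the selected audio source.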
def update_audio_ui(audio_source: str) -> tuple[dict, dict]:
    mic = audio_source == "microphone"
    return (
        gr.update(visible=mic, value=None),  # input_audio_mic
        gr.update(visible=not mic, value=None),  # input_audio_file
    )
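# UI: language and speaker-count controls, an audio source selector, and a streaming transcript box.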
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Group():
        with gr.Row():
            target_language = gr.Dropdown(
                choices=TEXT_SOURCE_LANGUAGE_NAMES,
                label="Output Language",
                value=DEFAULT_TARGET_LANGUAGE,
                interactive=True,
                info="Select your target language",
            )
            number_of_speakers = gr.Number(
                value=0,  # default to automatic speaker-count detection
                label="Number of Speakers",
                info="Keep it at 0 if you want the model to detect the number of speakers automatically",
            )
        with gr.Row() as audio_box:
            audio_source = gr.Radio(
                choices=["file", "microphone"], value="file", interactive=True
            )
            input_audio_mic = gr.Audio(
                label="Input speech",
                type="filepath",
                source="microphone",
                visible=False,
            )
            input_audio_file = gr.Audio(
                label="Input speech",
                type="filepath",
                source="upload",
                visible=True,
            )
        final_audio = gr.Audio(label="Output", visible=False)
        audio_source.change(
            fn=update_audio_ui,
            inputs=audio_source,
            outputs=[input_audio_mic, input_audio_file],
            queue=False,
            api_name=False,
        )
        input_audio_mic.change(lambda x: x, input_audio_mic, final_audio)
        input_audio_file.change(lambda x: x, input_audio_file, final_audio)
    submit = gr.Button("Submit")
    text_output = gr.Textbox(
        label="Transcribed Text",
        value="",
        interactive=False,
        lines=10,
        scale=10,
        max_lines=10,
    )
    submit.click(
        fn=predict,
        inputs=[
            target_language,
            number_of_speakers,
            audio_source,
            input_audio_mic,
            input_audio_file,
        ],
        outputs=[text_output],
        api_name="predict",
    )
    gr.Markdown(DUPLICATE)

demo.queue(max_size=50).launch()
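# A minimal way to run this locally (assuming a .env file that defines HF_API and API_URL,
# and that this script is saved as app.py -- the file name is an assumption):
#   pip install gradio gradio_client pyannote.audio pydub python-dotenv
#   python app.py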