import os

import gradio as gr
from dotenv import load_dotenv

load_dotenv()

from gradio_client import Client
from pyannote.audio import Pipeline  # was missing: provides the diarization pipeline
from pydub import AudioSegment  # was missing: used to slice audio per speaker turn

from lang_list import TEXT_SOURCE_LANGUAGE_NAMES

HF_API = os.getenv("HF_API")  # Hugging Face token with access to the gated pyannote models
API_URL = os.getenv("API_URL")  # URL of the SeamlessM4T API endpoint

DEFAULT_TARGET_LANGUAGE = "Western Persian"

DESCRIPTION = """
# Seamlessm4t + Speaker Diarization + Voice Activity Detection
This demo combines pyannote speaker diarization with SeamlessM4T to generate
speaker-attributed captions for full audios of arbitrary length.
"""

DUPLICATE = """
To duplicate this repo, you must request access to three gated repositories and accept their user conditions:
1- https://huggingface.co/pyannote/voice-activity-detection
2- https://hf.co/pyannote/segmentation
3- https://hf.co/pyannote/speaker-diarization
"""

pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=HF_API)


def predict(target_language, number_of_speakers, final_audio):
    # Run diarization; let pyannote infer the speaker count when it is 0.
    if number_of_speakers == 0:
        diarization = pipeline(final_audio)
    else:
        diarization = pipeline(final_audio, num_speakers=int(number_of_speakers))

    # Debug log of the raw diarization output.
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        print(f"start={turn.start}s stop={turn.end}s speaker_{speaker}")

    song = AudioSegment.from_wav(final_audio)  # was `sample_file`, which is undefined
    client = Client(API_URL)
    output_text = ""
    # Transcribe each speaker turn separately via the SeamlessM4T endpoint.
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        print(turn)
        try:
            # pydub slices in milliseconds; resample the clip to 16 kHz, which
            # SeamlessM4T expects (the original `bitrate=16000` is ignored for WAV export).
            clipped = song[turn.start * 1000 : turn.end * 1000]
            clipped.set_frame_rate(16000).export("my.wav", format="wav")
            _, result = client.predict(
                "ASR (Automatic Speech Recognition)",
                "file",  # str in 'Audio source' Radio component
                "my.wav",
                "my.wav",
                "text",
                target_language,
                target_language,
                api_name="/run",
            )
            output_text += f"\nstart: {turn.start:.1f} end: {turn.end:.1f} text: {result} speaker: {speaker}"
        except Exception as e:
            print(e)
    return output_text  # was missing, so the output textbox never updated


def update_audio_ui(audio_source: str) -> tuple[dict, dict]:
    mic = audio_source == "microphone"
    return (
        gr.update(visible=mic, value=None),  # input_audio_mic
        gr.update(visible=not mic, value=None),  # input_audio_file
    )


with gr.Blocks(css="style.css") as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Group():
        with gr.Row():
            target_language = gr.Dropdown(
                choices=TEXT_SOURCE_LANGUAGE_NAMES,
                label="Output Language",
                value=DEFAULT_TARGET_LANGUAGE,
                interactive=True,
            )
            number_of_speakers = gr.Number(
                label="Number of Speakers",
                info="Keep it zero if you want the model to detect the number of speakers automatically",
            )
        with gr.Row() as audio_box:
            audio_source = gr.Radio(
                choices=["file", "microphone"], value="file", interactive=True
            )
            input_audio_mic = gr.Audio(
                label="Input speech",
                type="filepath",
                source="microphone",
                visible=False,
            )
            input_audio_file = gr.Audio(
                label="Input speech",
                type="filepath",
                source="upload",
                visible=True,
            )
        # Hidden component that always holds the currently selected audio,
        # whichever source it came from.
        final_audio = gr.Audio(label="Output", visible=False)
        audio_source.change(
            fn=update_audio_ui,
            inputs=audio_source,
            outputs=[input_audio_mic, input_audio_file],
            queue=False,
            api_name=False,
        )
        input_audio_mic.change(lambda x: x, input_audio_mic, final_audio)
        input_audio_file.change(lambda x: x, input_audio_file, final_audio)
    submit = gr.Button("Submit")
    text_output = gr.Textbox(label="Transcribed Text", value="", interactive=False)
    submit.click(
        fn=predict,
        inputs=[target_language, number_of_speakers, final_audio],
        outputs=[text_output],
        api_name="predict",
    )
    gr.Markdown(DUPLICATE)

demo.queue(max_size=50).launch()
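
# Usage sketch (hypothetical URL and file): once the app is running, the
# "predict" endpoint exposed above can be called remotely with gradio_client,
# mirroring how this script itself calls the SeamlessM4T endpoint:
#
#   from gradio_client import Client
#   client = Client("http://127.0.0.1:7860/")  # local URL printed by launch()
#   text = client.predict("Western Persian", 0, "sample.wav", api_name="/predict")
#   print(text)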