import gradio as gr
import os
from dotenv import load_dotenv
from pydub import AudioSegment

load_dotenv()  # read HF_API and API_URL from a local .env file before the lookups below

from lang_list import TEXT_SOURCE_LANGUAGE_NAMES
from gradio_client import Client

HF_API = os.getenv("HF_API")
API_URL = os.getenv("API_URL")  # path to Seamlessm4t API endpoint

DEFAULT_TARGET_LANGUAGE = "Western Persian"

DESCRIPTION = """
# Seamlessm4t + Speaker Diarization + Voice Activity Detection
Here we use seamlessm4t to generate captions for full audios. Audio can be of arbitrary length. 
"""

DUPLICATE = """
To duplicate this repo, you have to give permission from three reopsitories and accept all user conditions: 

1- https://huggingface.co/pyannote/voice-activity-detection

2- https://hf.co/pyannote/segmentation

3- https://hf.co/pyannote/speaker-diarization

"""
from pyannote.audio import Pipeline

pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization", use_auth_token=HF_API
)

def predict(target_language, number_of_speakers, audio_source, input_audio_mic, input_audio_file):
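    """Diarize the input audio, then transcribe each speaker turn.

    The audio is first segmented by speaker with pyannote, each turn is
    clipped out with pydub, and every clip is sent to the remote
    SeamlessM4T endpoint for ASR. Caption lines are streamed back to the
    UI one turn at a time via ``yield``.
    """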
    if audio_source == "microphone":
        input_data = input_audio_mic
    else:
        input_data = input_audio_file

    print(input_data)  # log the resolved input path for debugging

    # A value of 0 means "unknown": let the pipeline infer the speaker count.
    if number_of_speakers == 0:
        diarization = pipeline(input_data)
    else:
        diarization = pipeline(input_data, num_speakers=int(number_of_speakers))

    # Log the raw diarization result before transcription begins.
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        print(f"start={turn.start}s stop={turn.end}s speaker_{speaker}")

    # Load the full recording so individual turns can be sliced out below
    # (pydub slices are indexed in milliseconds).
    song = AudioSegment.from_wav(input_data)

    client = Client(API_URL)
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        print(turn)
        try:
            # Clip this speaker turn out of the full recording and write it to
            # a temporary WAV file for the ASR request. pydub ignores the
            # bitrate argument for plain WAV export, so it is omitted here.
            clipped = song[turn.start * 1000: turn.end * 1000]
            clipped.export("my.wav", format="wav")

            # The positional arguments mirror the input components of the
            # remote SeamlessM4T Space's /run endpoint.
            _, result = client.predict(
                "ASR (Automatic Speech Recognition)",
                "file",  # str in 'Audio source' Radio component
                "my.wav",
                "my.wav",
                "text",
                target_language,
                target_language,
                api_name="/run",
            )
            current_text = f"start: {turn.start:.1f} end: {turn.end:.1f} text: {result} speaker: {speaker}"
            # Stream each caption line to the UI as soon as it is ready.
            yield current_text
        except Exception as e:
            print(e)

def update_audio_ui(audio_source: str) -> tuple[dict, dict]:
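    """Toggle between the microphone and file-upload widgets, clearing both."""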
    mic = audio_source == "microphone"
    return (
        gr.update(visible=mic, value=None),  # input_audio_mic
        gr.update(visible=not mic, value=None),  # input_audio_file
    )


with gr.Blocks(css="style.css") as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Group():
        with gr.Row():
            target_language = gr.Dropdown(
                choices=TEXT_SOURCE_LANGUAGE_NAMES,
                label="Output Language",
                value=DEFAULT_TARGET_LANGUAGE,
                interactive=True,
                info="Select your target language"
            )
            number_of_speakers = gr.Number(
                label="Number of Speakers",
                value=0,
                precision=0,
                info="Keep this at 0 to let the model detect the number of speakers automatically",
            )
        with gr.Row() as audio_box:
            audio_source = gr.Radio(
                choices=["file", "microphone"], value="file", interactive=True
            )
            input_audio_mic = gr.Audio(
                label="Input speech",
                type="filepath",
                source="microphone",
                visible=False,
            )
            input_audio_file = gr.Audio(
                label="Input speech",
                type="filepath",
                source="upload",
                visible=True,
            )
            final_audio = gr.Audio(label="Output", visible=False)
        audio_source.change(
            fn=update_audio_ui,
            inputs=audio_source,
            outputs=[input_audio_mic, input_audio_file],
            queue=False,
            api_name=False,
        )
        # Mirror the selected input into the (hidden) output audio component.
        input_audio_mic.change(lambda x: x, input_audio_mic, final_audio)
        input_audio_file.change(lambda x: x, input_audio_file, final_audio)
        submit = gr.Button("Submit")
        text_output = gr.Textbox(
            label="Transcribed Text",
            value="",
            interactive=False,
            lines=2,
            max_lines=2,
            scale=3,
        )

        submit.click(
            fn=predict,
            inputs=[target_language, number_of_speakers, audio_source, input_audio_mic, input_audio_file],
            outputs=[text_output],
            api_name="predict",
        )

    gr.Markdown(DUPLICATE)

demo.queue(max_size=50).launch()