a.pourmand committed on
Commit
6a51822
1 Parent(s): d147382
Files changed (1) hide show
  1. app.py +50 -16
app.py CHANGED
@@ -1,10 +1,7 @@
1
- from cProfile import label
2
- from email.policy import default
3
-
4
- from altair import value
5
  import gradio as gr
6
  import os
7
  from lang_list import TEXT_SOURCE_LANGUAGE_NAMES
 
8
 
9
  HF_API = os.getenv("HF_API")
10
  API_URL = os.getenv("API_URL") # path to Seamlessm4t API endpoint
@@ -14,7 +11,6 @@ DEFAULT_TARGET_LANGUAGE = "Western Persian"
14
  DESCRIPTION = """
15
  # Seamlessm4t + Speaker Diarization + Voice Activity Detection
16
  Here we use seamlessm4t to generate captions for full audios. Audio can be of arbitrary length.
17
-
18
  """
19
 
20
  DUPLICATE = """
@@ -28,6 +24,42 @@ To duplicate this repo, you have to give permission from three reopsitories and
28
 
29
  """
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  def update_audio_ui(audio_source: str) -> tuple[dict, dict]:
33
  mic = audio_source == "microphone"
@@ -40,13 +72,14 @@ def update_audio_ui(audio_source: str) -> tuple[dict, dict]:
40
  with gr.Blocks(css="style.css") as demo:
41
  gr.Markdown(DESCRIPTION)
42
  with gr.Group():
43
- target_language = gr.Dropdown(
44
- choices=TEXT_SOURCE_LANGUAGE_NAMES,
45
- label="Output Language",
46
- value=DEFAULT_TARGET_LANGUAGE,
47
- interactive=True,
48
- )
49
- target_language.update(value=DEFAULT_TARGET_LANGUAGE)
 
50
  with gr.Row() as audio_box:
51
  audio_source = gr.Radio(
52
  choices=["file", "microphone"], value="file", interactive=True
@@ -63,7 +96,7 @@ with gr.Blocks(css="style.css") as demo:
63
  source="upload",
64
  visible=True,
65
  )
66
- output = gr.Audio(label="Output", visible=False)
67
  audio_source.change(
68
  fn=update_audio_ui,
69
  inputs=audio_source,
@@ -71,12 +104,13 @@ with gr.Blocks(css="style.css") as demo:
71
  queue=False,
72
  api_name=False,
73
  )
74
- input_audio_mic.change(lambda x: x, input_audio_mic, output)
75
- input_audio_file.change(lambda x: x, input_audio_file, output)
76
  submit = gr.Button("Submit")
77
  text_output = gr.Textbox(label="Transcribed Text", value="", interactive=False)
78
 
79
- gr.Markdown(DUPLICATE)
80
 
 
81
 
82
  demo.queue(max_size=50).launch()
 
 
 
 
 
1
  import gradio as gr
2
  import os
3
  from lang_list import TEXT_SOURCE_LANGUAGE_NAMES
4
+ from gradio_client import Client
5
 
6
  HF_API = os.getenv("HF_API")
7
  API_URL = os.getenv("API_URL") # path to Seamlessm4t API endpoint
 
11
  DESCRIPTION = """
12
  # Seamlessm4t + Speaker Diarization + Voice Activity Detection
13
  Here we use seamlessm4t to generate captions for full audios. Audio can be of arbitrary length.
 
14
  """
15
 
16
  DUPLICATE = """
 
24
 
25
  """
26
 
27
+ pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization",use_auth_token=HF_API)
28
def predict(target_language, number_of_speakers, final_audio):
    """Diarize an audio file and transcribe every speaker turn.

    Runs the module-level diarization ``pipeline`` on ``final_audio``, cuts
    the audio into one clip per speaker turn, sends each clip to the remote
    endpoint at ``API_URL`` and collects one line of text per turn.

    Args:
        target_language: Language name forwarded to the remote ``/run`` API
            (used for both of its language arguments).
        number_of_speakers: Expected number of speakers. ``0`` or ``None``
            lets the diarization pipeline decide by itself.
        final_audio: Audio to process. NOTE(review): it is handed to both the
            pipeline and ``AudioSegment.from_wav``, so it is assumed to be a
            path to a WAV file -- confirm against how the ``gr.Audio``
            component feeding this function is configured.

    Returns:
        A newline-separated string with one
        ``start: .. end: .. text: .. speaker: ..`` line per turn that was
        transcribed successfully. Turns whose clipping/transcription raised
        are logged and left out.
    """
    # NOTE(review): no import for `Pipeline` / `AudioSegment` is visible in
    # the part of the file shown here -- confirm they are imported.
    if number_of_speakers:
        # The UI number field may deliver a float; pass a whole number on.
        diarization = pipeline(final_audio, num_speakers=int(number_of_speakers))
    else:
        diarization = pipeline(final_audio)

    # Materialise the turns once so logging and transcription agree.
    turns = [
        (turn, speaker)
        for turn, _, speaker in diarization.itertracks(yield_label=True)
    ]
    for turn, speaker in turns:
        print(f"start={turn.start}s stop={turn.end}s speaker_{speaker}")

    # Load the same audio that was diarized (the original read an undefined
    # `sample_file` here).
    song = AudioSegment.from_wav(final_audio)

    client = Client(API_URL)
    lines = []
    for turn, speaker in turns:
        print(turn)
        try:
            # Turn boundaries are in seconds; AudioSegment slices in ms.
            clipped = song[turn.start * 1000: turn.end * 1000]
            clipped.export("my.wav", format="wav", bitrate=16000)

            _, result = client.predict(
                "ASR (Automatic Speech Recognition)",
                "file",  # str in 'Audio source' Radio component
                "my.wav",
                "my.wav",
                "text",
                target_language,
                target_language,
                api_name="/run",
            )
        except Exception as e:
            # Best effort: one failing turn must not abort the whole run.
            print(e)
            continue

        lines.append(
            f"start: {turn.start:.1f} end: {turn.end:.1f} text: {result} speaker: {speaker}"
        )

    # The click handler writes this value into the "Transcribed Text" box.
    return "\n".join(lines)
63
 
64
  def update_audio_ui(audio_source: str) -> tuple[dict, dict]:
65
  mic = audio_source == "microphone"
 
72
  with gr.Blocks(css="style.css") as demo:
73
  gr.Markdown(DESCRIPTION)
74
  with gr.Group():
75
+ with gr.Row():
76
+ target_language = gr.Dropdown(
77
+ choices=TEXT_SOURCE_LANGUAGE_NAMES,
78
+ label="Output Language",
79
+ value=DEFAULT_TARGET_LANGUAGE,
80
+ interactive=True,
81
+ )
82
+ number_of_speakers=gr.Number(label="Number of Speakers",info="Keep it zero, if you want the model to automatically detect the number of speakers")
83
  with gr.Row() as audio_box:
84
  audio_source = gr.Radio(
85
  choices=["file", "microphone"], value="file", interactive=True
 
96
  source="upload",
97
  visible=True,
98
  )
99
+ final_audio = gr.Audio(label="Output", visible=False)
100
  audio_source.change(
101
  fn=update_audio_ui,
102
  inputs=audio_source,
 
104
  queue=False,
105
  api_name=False,
106
  )
107
+ input_audio_mic.change(lambda x: x, input_audio_mic, final_audio)
108
+ input_audio_file.change(lambda x: x, input_audio_file, final_audio)
109
  submit = gr.Button("Submit")
110
  text_output = gr.Textbox(label="Transcribed Text", value="", interactive=False)
111
 
112
+ submit.click(fn=predict, inputs=[target_language,number_of_speakers, final_audio], outputs=[text_output],api_name="predict")
113
 
114
+ gr.Markdown(DUPLICATE)
115
 
116
  demo.queue(max_size=50).launch()