a.pourmand committed
Commit 0d83c55
1 Parent(s): 3e79246

add seamlessm4t

.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
.idea/Seamlessm4t_diarization_VAD.iml ADDED
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/Seamlessm4t_diarization_VAD.iml" filepath="$PROJECT_DIR$/.idea/Seamlessm4t_diarization_VAD.iml" />
+    </modules>
+  </component>
+</project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="" vcs="Git" />
+  </component>
+</project>
app.py CHANGED
@@ -29,8 +29,15 @@ To duplicate this repo, you have to give permission from three reopsitories and
 
 """
 from pyannote.audio import Pipeline
-pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization",use_auth_token=HF_API)
-def predict(target_language, number_of_speakers, audio_source, input_audio_mic, input_audio_file):
+
+pipeline = Pipeline.from_pretrained(
+    "pyannote/speaker-diarization", use_auth_token=HF_API
+)
+
+
+def predict(
+    target_language, number_of_speakers, audio_source, input_audio_mic, input_audio_file
+):
     if audio_source == "microphone":
         input_data = input_audio_mic
     else:
@@ -53,8 +60,8 @@ def predict(target_language, number_of_speakers, audio_source, input_audio_mic,
     for turn, value, speaker in diarization.itertracks(yield_label=True):
         print(turn)
         try:
-            clipped = song[turn.start * 1000: turn.end * 1000]
-            clipped.export(f'my.wav', format='wav', bitrate=16000)
+            clipped = song[turn.start * 1000 : turn.end * 1000]
+            clipped.export(f"my.wav", format="wav", bitrate=16000)
 
             _, result = client.predict(
                 "ASR (Automatic Speech Recognition)",
@@ -64,19 +71,19 @@ def predict(target_language, number_of_speakers, audio_source, input_audio_mic,
                 "text",
                 target_language,
                 target_language,
-                api_name="/run"
+                api_name="/run",
             )
-            current_text = f'start: {turn.start:.1f} end: {turn.end:.1f} text: {result} speaker: {speaker}'
+            current_text = f"start: {turn.start:.1f} end: {turn.end:.1f} text: {result} speaker: {speaker}"
 
             if current_text is not None:
                 output_text = output_text + "\n" + current_text
                 yield output_text
 
-
         except Exception as e:
             print(e)
 
-    #return output_text
+    # return output_text
+
 
 def update_audio_ui(audio_source: str) -> tuple[dict, dict]:
     mic = audio_source == "microphone"
@@ -95,9 +102,12 @@ with gr.Blocks(css="style.css") as demo:
             label="Output Language",
             value=DEFAULT_TARGET_LANGUAGE,
             interactive=True,
-            info="Select your target language"
+            info="Select your target language",
+        )
+        number_of_speakers = gr.Number(
+            label="Number of Speakers",
+            info="Keep it zero, if you want the model to automatically detect the number of speakers",
         )
-        number_of_speakers=gr.Number(label="Number of Speakers",info="Keep it zero, if you want the model to automatically detect the number of speakers")
     with gr.Row() as audio_box:
         audio_source = gr.Radio(
             choices=["file", "microphone"], value="file", interactive=True
@@ -125,9 +135,27 @@ with gr.Blocks(css="style.css") as demo:
     input_audio_mic.change(lambda x: x, input_audio_mic, final_audio)
     input_audio_file.change(lambda x: x, input_audio_file, final_audio)
     submit = gr.Button("Submit")
-    text_output = gr.Textbox(label="Transcribed Text", value="", interactive=False,lines=2,scale=3,max_lines=2)
+    text_output = gr.Textbox(
+        label="Transcribed Text",
+        value="",
+        interactive=False,
+        lines=10,
+        scale=10,
+        max_lines=10,
+    )
 
-    submit.click(fn=predict, inputs=[target_language,number_of_speakers, audio_source,input_audio_mic, input_audio_file], outputs=[text_output],api_name="predict")
+    submit.click(
+        fn=predict,
+        inputs=[
+            target_language,
+            number_of_speakers,
+            audio_source,
+            input_audio_mic,
+            input_audio_file,
+        ],
+        outputs=[text_output],
+        api_name="predict",
+    )
 
     gr.Markdown(DUPLICATE)
 
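For context, the core of this change is a diarize-then-transcribe loop: pyannote segments the input by speaker, pydub clips each speaker turn (its slice indices are milliseconds, hence the * 1000 on the turn boundaries), and each clip is sent to a SeamlessM4T Space through gradio_client, with results streamed back via yield. The sketch below shows that loop in isolation; it is not the committed file. HF_API's value, the Space id passed to Client, the use of num_speakers, and the middle positional arguments of client.predict are all assumptions, since the diff shows neither where client is created nor the full argument list of the /run endpoint.

# Minimal sketch of the diarize -> clip -> transcribe loop this commit adds.
from gradio_client import Client
from pyannote.audio import Pipeline
from pydub import AudioSegment

HF_API = "hf_..."  # assumption: in the app this likely comes from a Space secret
client = Client("facebook/seamless_m4t")  # assumption: the actual Space id is not in the diff

pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization", use_auth_token=HF_API
)


def transcribe_turns(wav_path, target_language, number_of_speakers):
    # Assumption about the hidden part of predict(): num_speakers=None lets
    # pyannote estimate the speaker count, matching the "keep it zero for
    # auto-detect" hint on the Number input.
    diarization = pipeline(wav_path, num_speakers=number_of_speakers or None)
    song = AudioSegment.from_wav(wav_path)
    output_text = ""
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        # pydub slices in milliseconds; turn.start/turn.end are seconds.
        clipped = song[int(turn.start * 1000) : int(turn.end * 1000)]
        clipped.export("my.wav", format="wav")
        _, result = client.predict(
            "ASR (Automatic Speech Recognition)",
            # ...the middle positional inputs are elided in the diff above;
            # only the visible tail of the committed call is reproduced here:
            "text",
            target_language,
            target_language,
            api_name="/run",
        )
        output_text += f"\nstart: {turn.start:.1f} end: {turn.end:.1f} text: {result} speaker: {speaker}"
        # Yielding per turn is what makes the Gradio Textbox fill incrementally.
        yield output_text

Because predict is a generator, wiring it to submit.click (as in the diff) makes Gradio stream each partial transcript to text_output instead of waiting for the whole file to finish.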