Anna Sun committed • Commit fd69a21
1 Parent(s): c1e0588
more fixes

Files changed:
- app.py: +27 -10
- simuleval_transcoder.py: +1 -0
app.py CHANGED

@@ -35,6 +35,7 @@ def build_agent(model_path, config_name=None):
 
 agent = build_agent("models", "vad_s2st_sc_24khz_main.yaml")
 transcoder = SimulevalTranscoder(
+    agent,
     sample_rate=48_000,
     debug=False,
     buffer_limit=1,
@@ -43,8 +44,8 @@ transcoder = SimulevalTranscoder(
 def start_recording():
     logger.debug(f"start_recording: starting transcoder")
     transcoder.reset_states()
-    transcoder.start()
     transcoder.close = False
+    transcoder.start()
 
 def stop_recording():
     transcoder.close = True
@@ -87,11 +88,13 @@ def get_buffered_output():
 
     return speech, text, speech_and_text_output.final
 
+from scipy.io.wavfile import write as scipy_write
 def streaming_input_callback():
     final = False
     max_wait_s = 15
     wait_s = 0
     translated_text_state = ""
+    sample_rate = 24000
     while not transcoder.close:
         translated_wav_segment, translated_text, final = get_buffered_output()
 
@@ -107,7 +110,7 @@ def streaming_input_callback():
             print("output sample rate", sample_rate)
             translated_wav_segment = sample_rate, np.array(audio_bytes)
         else:
-            translated_wav_segment =
+            translated_wav_segment = sample_rate, np.empty(0, dtype=np.int16)
 
         if translated_text is not None:
             translated_text_state += " | " + str(translated_text)
@@ -123,16 +126,23 @@ def streaming_input_callback():
 
 
 def streaming_callback_dummy():
+    i = 0
+    out_text = ""
     while not transcoder.close:
         if s.queue.empty():
-
-
+            yield (
+                (48000, np.empty(0, dtype=np.int16)), out_text, out_text
+            )
             time.sleep(0.3)
         else:
-
+            i += 1
+            out_text += " | " + str(i)
+            print(out_text)
             audio = s.queue.get_nowait()
+            if i == 0:
+                print(audio[0], type(audio[1]))
             s.queue.task_done()
-            yield audio
+            yield audio, out_text, out_text
 
 def clear():
     logger.debug(f"Clearing State")
@@ -175,21 +185,28 @@ def blocks():
     ).then(
         start_recording
     ).then(
-        #
-        #
-        #
+        # TODO: streaming speech autoplay works fine with streaming_callback_dummy,
+        # but speech output from streaming_input_callback has a huge delay
+        # when comparing print/debugging logs vs. output speech
+        # TODO: text output works fine with one output, but is not
+        # updating when output is both text + speech
+        # streaming_callback_dummy,
         streaming_input_callback,
         None,
        [
             output_translation_segment,
             stream_output_text,
             translated_text_state,
-        ]
+        ]
     )
     input_audio.stop_recording(
         stop_recording
     )
     input_audio.stream(
+        # TODO: *only when streaming speech output* about half the time
+        # there is some race condition in gradio where process_incoming_bytes
+        # stops getting called once the first speech chunk is yield-ed
+        # in streaming_input_callback (or streaming_callback_dummy)
         process_incoming_bytes, [input_audio], None
     )
 
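Note on the streaming callbacks in the diff above: both streaming_input_callback and streaming_callback_dummy are generators that yield a (sample_rate, numpy array) chunk for the audio output plus two text values. That producer/consumer shape can be exercised outside Gradio; the sketch below is illustrative only (the harness, function names, and the stand-in producer are assumptions, not code from this Space).

# Minimal, self-contained sketch of the producer/consumer shape used by
# streaming_callback_dummy. All names and the harness are illustrative;
# none of this is code from this Space.
import queue
import threading
import time

import numpy as np


def audio_text_stream(chunk_queue, stop_event):
    """Yield ((sample_rate, samples), text, text) until stop_event is set."""
    out_text = ""
    i = 0
    while not stop_event.is_set():
        if chunk_queue.empty():
            # Empty chunk keeps the stream alive without emitting audio.
            yield (48_000, np.empty(0, dtype=np.int16)), out_text, out_text
            time.sleep(0.3)
        else:
            i += 1
            out_text += " | " + str(i)
            audio = chunk_queue.get_nowait()
            chunk_queue.task_done()
            yield audio, out_text, out_text


def producer(chunk_queue, n_chunks=3):
    # Stand-in for the transcoder: push a few 0.1 s chunks of silence.
    for _ in range(n_chunks):
        chunk_queue.put((48_000, np.zeros(4_800, dtype=np.int16)))
        time.sleep(0.2)


if __name__ == "__main__":
    q = queue.Queue()
    stop = threading.Event()
    threading.Thread(target=producer, args=(q,), daemon=True).start()

    stream = audio_text_stream(q, stop)
    for _ in range(10):
        (sr, samples), text, _ = next(stream)
        print(sr, samples.shape, text)
    stop.set()

Run standalone, this prints one line per yield, which helps separate queue-side delay from anything added by the Gradio event wiring (the delay the first TODO above describes).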
simuleval_transcoder.py CHANGED

@@ -325,6 +325,7 @@ class SimulevalTranscoder:
 
     def process_pipeline_loop(self):
         if self.close:
+            print("transcoder closed")
             return # closes the thread
 
         print("processing_pipeline")
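Note on the ordering change in start_recording(): the app.py hunk moves transcoder.close = False ahead of transcoder.start(), and the simuleval_transcoder.py hunk adds a print when process_pipeline_loop finds close already set. If start() runs the pipeline loop on a background thread (the rest of the class is not part of this diff, so the toy class below is an assumption), the old ordering lets a freshly started thread observe the close flag still set from the previous stop_recording and exit immediately; the added print would surface exactly that.

# Sketch of the close-flag / background-thread pattern these two hunks touch.
# SimulevalTranscoder's internals beyond process_pipeline_loop are not in this
# diff, so the class below is an assumption used only to illustrate the ordering.
import threading
import time


class ToyTranscoder:
    def __init__(self):
        self.close = True  # as if stop_recording() had already run

    def start(self):
        threading.Thread(target=self.process_pipeline_loop, daemon=True).start()

    def process_pipeline_loop(self):
        if self.close:
            print("transcoder closed")
            return  # closes the thread
        while not self.close:
            print("processing_pipeline")
            time.sleep(0.5)


t = ToyTranscoder()
# New ordering from the app.py hunk: clear the flag, then start the thread.
# Starting first (the old ordering) lets the thread see close == True and
# return before the flag is cleared.
t.close = False
t.start()
time.sleep(1.2)
t.close = True

With the flag cleared before start(), the sketch keeps printing processing_pipeline until close is set again; starting first makes the early "transcoder closed" exit possible.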