andito HF staff commited on
Commit
b5b0b9a
β€’
1 Parent(s): 8b97d54

update streaming

Browse files
Files changed (3) hide show
  1. README.md +1 -1
  2. app.py +18 -15
  3. audio_streaming_client.py +128 -0
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: πŸ“š
4
  colorFrom: yellow
5
  colorTo: purple
6
  sdk: gradio
7
- sdk_version: 4.44.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
 
4
  colorFrom: yellow
5
  colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 5.0.0b3
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
app.py CHANGED
@@ -1,20 +1,23 @@
1
  import gradio as gr
2
 
3
- def pass_audio(audio, test):
4
- if test is not None:
5
- print(len(test))
6
- print(test[0])
7
- print(len(test[1]))
8
- print(test[1])
9
- else:
10
- print("test is None")
11
- return audio, test
12
 
13
- demo = gr.Interface(
14
- pass_audio,
15
- ["state", gr.Audio(sources=["microphone"], streaming=True)],
16
- ["state", gr.Audio(streaming=True, autoplay=True)],
17
- live=True,
18
- )
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  demo.launch()
 
1
  import gradio as gr
2
 
3
+ from audio_streaming_client import AudioStreamingClient
 
 
 
 
 
 
 
 
4
 
5
+ audio_streaming_client = AudioStreamingClient()
6
+ audio_streaming_client.start()
7
+
8
+
9
+ def stream_audio(audio):
10
+ sample_rate = audio[0]
11
+ audio_streaming_client.put_audio(audio[1], sample_rate)
12
+ output_size = len(audio[1])
13
+ output_audio = audio_streaming_client.get_audio(sample_rate, output_size)
14
+ return (sample_rate, output_audio)
15
+
16
+
17
+ with gr.Blocks() as demo:
18
+ gr.Markdown("# Speech to speech in an inference endpoint 🎀")
19
+ inp = gr.Audio(sources=["microphone"], type="numpy")
20
+ out = gr.Audio(streaming=True, autoplay=True)
21
+ inp.stream(stream_audio, inp, out, time_limit=600, stream_every=1)
22
 
23
  demo.launch()
audio_streaming_client.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import threading
2
+ from queue import Queue, Empty
3
+ import numpy as np
4
+ import requests
5
+ import base64
6
+ import time
7
+ from dataclasses import dataclass, field
8
+ import websocket
9
+ import threading
10
+ import ssl
11
+ import librosa
12
+ import os
13
+
14
+ class AudioStreamingClient:
15
+ def __init__(self):
16
+ self.auth_token = os.environ.get("HF_AUTH_TOKEN", None)
17
+ self.api_url = os.environ.get("HF_API_URL", None)
18
+ self.stop_event = threading.Event()
19
+ self.send_queue = Queue()
20
+ self.recv_queue = Queue()
21
+ self.session_id = None
22
+ self.headers = {
23
+ "Accept": "application/json",
24
+ "Authorization": f"Bearer {self.auth_token}",
25
+ "Content-Type": "application/json"
26
+ }
27
+ self.session_state = "idle" # Possible states: idle, sending, processing, waiting
28
+ self.ws_ready = threading.Event()
29
+
30
+ def start(self):
31
+ print("Starting audio streaming...")
32
+
33
+ ws_url = self.api_url.replace("http", "ws") + "/ws"
34
+
35
+ self.ws = websocket.WebSocketApp(
36
+ ws_url,
37
+ header=[f"{key}: {value}" for key, value in self.headers.items()],
38
+ on_open=self.on_open,
39
+ on_message=self.on_message,
40
+ on_error=self.on_error,
41
+ on_close=self.on_close
42
+ )
43
+
44
+ self.ws_thread = threading.Thread(target=self.ws.run_forever, kwargs={'sslopt': {"cert_reqs": ssl.CERT_NONE}})
45
+ self.ws_thread.start()
46
+
47
+ # Wait for the WebSocket to be ready
48
+ self.ws_ready.wait()
49
+
50
+ self.send_thread = threading.Thread(target=self.send_audio)
51
+
52
+ self.send_thread.start()
53
+
54
+
55
+ def on_close(self):
56
+ self.stop_event.set()
57
+ self.send_thread.join()
58
+ self.ws.close()
59
+ self.ws_thread.join()
60
+ print("Audio streaming stopped.")
61
+
62
+ def on_open(self, ws):
63
+ print("WebSocket connection opened.")
64
+ self.ws_ready.set() # Signal that the WebSocket is ready
65
+
66
+ def on_message(self, ws, message):
67
+ # message is bytes
68
+ if message == b'DONE':
69
+ print("listen")
70
+ self.session_state = "listen"
71
+ else:
72
+ print("processing")
73
+ self.session_state = "processing"
74
+ audio_np = np.frombuffer(message, dtype=np.int16)
75
+ self.recv_queue.put(audio_np)
76
+
77
+ def on_error(self, ws, error):
78
+ print(f"WebSocket error: {error}")
79
+
80
+ def on_close(self, ws, close_status_code, close_msg):
81
+ print("WebSocket connection closed.")
82
+
83
+ def send_audio(self):
84
+ while not self.stop_event.is_set():
85
+ if not self.send_queue.empty():
86
+ chunk = self.send_queue.get()
87
+ if self.session_state != "processing":
88
+ self.ws.send(chunk.tobytes(), opcode=websocket.ABNF.OPCODE_BINARY)
89
+ else:
90
+ self.ws.send([], opcode=websocket.ABNF.OPCODE_BINARY) # handshake
91
+ time.sleep(0.01)
92
+
93
+ def put_audio(self, chunk, sample_rate):
94
+ chunk = np.clip(chunk, -32768, 32767).astype(np.int16)
95
+ chunk = chunk.astype(np.float32) / 32768.0
96
+ chunk = librosa.resample(chunk, orig_sr=48000, target_sr=16000)
97
+ chunk = (chunk * 32768.0).astype(np.int16)
98
+ self.send_queue.put(chunk)
99
+
100
+ def get_audio(self, sample_rate, output_size):
101
+ output_chunk = np.array([], dtype=np.int16)
102
+ output_sample_rate = 16000
103
+ output_chunk_size = int(output_size*output_sample_rate/sample_rate)
104
+ while output_chunk.size < output_chunk_size:
105
+ try:
106
+ self.ws.send([], opcode=websocket.ABNF.OPCODE_BINARY) # handshake
107
+ chunk = self.recv_queue.get(timeout=0.1)
108
+ except Empty:
109
+ chunk = None
110
+ if chunk is not None:
111
+ # Ensure chunk is int16 and clip to valid range
112
+ chunk_int16 = np.clip(chunk, -32768, 32767).astype(np.int16)
113
+ output_chunk = np.concatenate([output_chunk, chunk_int16])
114
+ else:
115
+ print("padding chunk of size ", len(output_chunk))
116
+ output_chunk = np.pad(output_chunk, (0, output_chunk_size - len(output_chunk)))
117
+ output_chunk = output_chunk.astype(np.float32) / 32768.0
118
+ output_chunk = librosa.resample(output_chunk, orig_sr=output_sample_rate, target_sr=sample_rate)
119
+ output_chunk = (output_chunk * 32768.0).astype(np.int16)
120
+ print("output_chunk size: ", len(output_chunk))
121
+ output_chunk = output_chunk[:output_size]
122
+ return np.pad(output_chunk, (0, output_size - len(output_chunk)))
123
+
124
+
125
+
126
+ if __name__ == "__main__":
127
+ client = AudioStreamingClient()
128
+ client.start()