Update app.py
app.py CHANGED
@@ -3,8 +3,6 @@ import numpy as np
 import io
 from pydub import AudioSegment
 import tempfile
-import os
-import base64
 import openai
 import time
 from dataclasses import dataclass, field
@@ -14,11 +12,11 @@ from threading import Lock
 class AppState:
     stream: np.ndarray | None = None
     sampling_rate: int = 0
-
-    last_speech: float = 0
+    pause_detected: bool = False
     conversation: list = field(default_factory=list)
     client: openai.OpenAI = None
     output_format: str = "mp3"
+    stopped: bool = False
 
 # Global lock for thread safety
 state_lock = Lock()
@@ -29,27 +27,36 @@ def create_client(api_key):
         api_key=api_key
     )
 
+def determine_pause(audio, sampling_rate, state):
+    # Take the last 1 second of audio
+    pause_length = int(sampling_rate * 1)  # 1 second
+    if len(audio) < pause_length:
+        return False
+    last_audio = audio[-pause_length:]
+    amplitude = np.abs(last_audio)
+
+    # Calculate the average amplitude in the last 1 second
+    avg_amplitude = np.mean(amplitude)
+    silence_threshold = 0.01  # Adjust this threshold as needed
+    if avg_amplitude < silence_threshold:
+        return True
+    else:
+        return False
+
 def process_audio(audio: tuple, state: AppState):
     if state.stream is None:
         state.stream = audio[1]
         state.sampling_rate = audio[0]
-        state.last_speech = time.time()
     else:
         state.stream = np.concatenate((state.stream, audio[1]))
 
-
-
-    if np.max(np.abs(audio[1])) > 0.1:  # Adjust this threshold as needed
-        state.last_speech = current_time
-        state.pause_start = None
-    elif state.pause_start is None:
-        state.pause_start = current_time
+    pause_detected = determine_pause(state.stream, state.sampling_rate, state)
+    state.pause_detected = pause_detected
 
-
-    if state.pause_start and (current_time - state.pause_start > 2.0):  # 2 seconds of silence
+    if state.pause_detected:
         return gr.Audio(recording=False), state
-
-
+    else:
+        return None, state
 
 def generate_response_and_audio(audio_bytes: bytes, state: AppState):
     if state.client is None:
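The new determine_pause ends the user's turn when the mean absolute amplitude over the last second of audio falls below 0.01. That threshold only makes sense for samples roughly in the [-1, 1] range; if the microphone stream arrives as int16 PCM, the raw values would need rescaling before the comparison. A minimal standalone sketch of the same check, with that normalization added (the is_pause name and the int16 handling are illustrative assumptions, not part of this commit):

```python
import numpy as np

def is_pause(audio: np.ndarray, sampling_rate: int,
             silence_threshold: float = 0.01) -> bool:
    # Assumption: samples may arrive as int16 PCM, so scale them to [-1, 1]
    # before applying the float threshold used in determine_pause.
    if np.issubdtype(audio.dtype, np.integer):
        audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max
    window = int(sampling_rate * 1)  # last 1 second of samples
    if len(audio) < window:
        return False  # not enough audio yet to call it a pause
    return float(np.mean(np.abs(audio[-window:]))) < silence_threshold

# Quick synthetic check: a 440 Hz tone is not a pause, near-silence is.
sr = 16000
tone = (0.2 * np.sin(2 * np.pi * 440 * np.arange(sr) / sr)).astype(np.float32)
hiss = (0.001 * np.random.randn(sr)).astype(np.float32)
print(is_pause(tone, sr))                          # False
print(is_pause(np.concatenate([tone, hiss]), sr))  # True
```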
@@ -58,7 +65,7 @@ def generate_response_and_audio(audio_bytes: bytes, state: AppState):
     format_ = state.output_format
     bitrate = 128 if format_ == "mp3" else 32  # Higher bitrate for MP3, lower for OPUS
     audio_data = base64.b64encode(audio_bytes).decode()
-
+
     try:
         stream = state.client.chat.completions.create(
             extra_body={
@@ -90,9 +97,6 @@ def generate_response_and_audio(audio_bytes: bytes, state: AppState):
 
         final_audio = b''.join([base64.b64decode(a) for a in audios])
 
-        state.conversation.append({"role": "user", "content": "Audio input"})
-        state.conversation.append({"role": "assistant", "content": full_response})
-
         yield full_response, final_audio, state
 
     except Exception as e:
@@ -101,7 +105,7 @@ def generate_response_and_audio(audio_bytes: bytes, state: AppState):
 def response(state: AppState):
     if state.stream is None or len(state.stream) == 0:
         return None, None, state
-
+
     audio_buffer = io.BytesIO()
     segment = AudioSegment(
         state.stream.tobytes(),
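Inside response(), the buffered samples are wrapped in a pydub AudioSegment and exported as WAV bytes before being handed to the model. The hunk cuts off the constructor's keyword arguments, so here is a self-contained sketch of the same round trip; the frame rate, sample width, and channel count are assumptions about the microphone format, not values taken from this commit:

```python
import io
import numpy as np
from pydub import AudioSegment

# One second of int16 mono audio standing in for state.stream.
sr = 16000
samples = (0.2 * 32767 * np.sin(2 * np.pi * 440 * np.arange(sr) / sr)).astype(np.int16)

segment = AudioSegment(
    samples.tobytes(),
    frame_rate=sr,     # assumed: state.sampling_rate
    sample_width=2,    # assumed: int16 microphone samples
    channels=1,        # assumed: mono capture
)

buffer = io.BytesIO()
segment.export(buffer, format="wav")  # WAV bytes passed to generate_response_and_audio
print(len(buffer.getvalue()))
```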
@@ -112,7 +116,7 @@ def response(state: AppState):
     segment.export(audio_buffer, format="wav")
 
     generator = generate_response_and_audio(audio_buffer.getvalue(), state)
-
+
     # Process the generator to get the final results
     final_text = ""
     final_audio = None
@@ -122,15 +126,23 @@ def response(state: AppState):
         state = updated_state
 
     # Update the chatbot with the final conversation
-
-
+    state.conversation.append({"role": "user", "content": "Audio input"})
+    state.conversation.append({"role": "assistant", "content": final_text})
+
     # Reset the audio stream for the next interaction
     state.stream = None
-    state.
-
-
+    state.pause_detected = False
+
+    chatbot_output = state.conversation[-2:]  # Get the last two messages
+
     return chatbot_output, final_audio, state
 
+def start_recording_user(state: AppState):
+    if not state.stopped:
+        return gr.Audio(recording=True)
+    else:
+        return gr.Audio(recording=False)
+
 def set_api_key(api_key, state):
     if not api_key:
         raise gr.Error("Please enter a valid API key.")
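response() now records each exchange in the openai-style dict format that gr.Chatbot(type="messages") expects and hands back only the last two entries. Because a value returned to an output component replaces its current value, that immediate update shows just the latest exchange, which is presumably why the respond.then(...) hook added further down re-sends the full state.conversation to the chatbot afterwards. For reference, the shape of that history:

```python
# History format consumed by gr.Chatbot(type="messages"): a list of
# role/content dicts, exactly what response() appends to state.conversation.
conversation = [
    {"role": "user", "content": "Audio input"},
    {"role": "assistant", "content": "...model reply accumulated in final_text..."},
]
chatbot_output = conversation[-2:]  # what response() returns for the immediate update
```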
@@ -145,19 +157,19 @@ with gr.Blocks() as demo:
     with gr.Row():
         api_key_input = gr.Textbox(type="password", label="Enter your Lepton API Key")
         set_key_button = gr.Button("Set API Key")
-
+
     api_key_status = gr.Textbox(label="API Key Status", interactive=False)
-
+
     with gr.Row():
         format_dropdown = gr.Dropdown(choices=["mp3", "opus"], value="mp3", label="Output Audio Format")
-
+
     with gr.Row():
         with gr.Column():
             input_audio = gr.Audio(label="Input Audio", sources="microphone", type="numpy")
         with gr.Column():
             chatbot = gr.Chatbot(label="Conversation", type="messages")
             output_audio = gr.Audio(label="Output Audio", autoplay=True)
-
+
     state = gr.State(AppState())
 
     set_key_button.click(set_api_key, inputs=[api_key_input, state], outputs=[api_key_status, state])
@@ -170,11 +182,25 @@
         stream_every=0.25,  # Reduced to make it more responsive
         time_limit=60,  # Increased to allow for longer messages
     )
-
+
     respond = input_audio.stop_recording(
         response,
         [state],
         [chatbot, output_audio, state]
     )
+    # Update the chatbot with the final conversation
+    respond.then(lambda s: s.conversation, [state], [chatbot])
+
+    # Automatically restart recording after the assistant's response
+    restart = output_audio.stop(
+        start_recording_user,
+        [state],
+        [input_audio]
+    )
+
+    # Add a "Stop Conversation" button
+    cancel = gr.Button("Stop Conversation", variant="stop")
+    cancel.click(lambda: (AppState(stopped=True), gr.Audio(recording=False)), None,
+                 [state, input_audio], cancels=[respond, restart])
 
-    demo.launch()
+demo.launch()
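The new "Stop Conversation" button leans on Gradio's event cancellation: passing earlier event handles via cancels= aborts them when the button fires, in addition to resetting the state. A toy, self-contained illustration of that mechanism; the names here are illustrative and do not come from app.py:

```python
import time
import gradio as gr

# A slow generator event stands in for the respond/restart chain; clicking
# Stop both overwrites the output and cancels the in-flight event.
def slow_count():
    for i in range(30):
        time.sleep(0.5)
        yield str(i)

with gr.Blocks() as demo:
    out = gr.Textbox(label="Progress")
    start = gr.Button("Start")
    stop = gr.Button("Stop", variant="stop")
    run = start.click(slow_count, None, out)
    stop.click(lambda: "stopped", None, out, cancels=[run])

demo.launch()
```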