sagar007 committed
Commit 7137466
1 Parent(s): 6b1a045

Update app.py

Files changed (1)
  1. app.py +33 -24
app.py CHANGED
@@ -79,6 +79,8 @@ async def generate_speech(text, tts_model, tts_tokenizer):
 
     return audio_generation.cpu().numpy().squeeze()
 
+from gradio import Error as GradioError
+
 @spaces.GPU(timeout=300)
 def stream_text_chat(message, history, system_prompt, temperature=0.8, max_new_tokens=1024, top_p=1.0, top_k=20, use_tts=True):
     try:
@@ -102,7 +104,8 @@ def stream_text_chat(message, history, system_prompt, temperature=0.8, max_new_t
             top_p=top_p,
             top_k=top_k,
             temperature=temperature,
-            eos_token_id=[128001, 128008, 128009],
+            eos_token_id=text_tokenizer.eos_token_id,
+            pad_token_id=text_tokenizer.pad_token_id,
             streamer=streamer,
         )
 
@@ -110,35 +113,41 @@ def stream_text_chat(message, history, system_prompt, temperature=0.8, max_new_t
         thread.start()
 
         buffer = ""
-        audio_buffer = np.array([0.0])  # Initialize with a single zero
-
         for new_text in streamer:
             buffer += new_text
-            yield history + [[message, buffer]], (tts_model.config.sampling_rate, audio_buffer)
-
-        # Generate speech after text generation is complete
-        if use_tts and buffer:  # Only generate speech if there's text
-            audio_buffer = generate_speech_sync(buffer, tts_model, tts_tokenizer)
-            if audio_buffer.size == 0:  # If audio_buffer is empty
-                audio_buffer = np.array([0.0])  # Use a single zero instead
-
-        # Final yield with complete text and audio
-        yield history + [[message, buffer]], (tts_model.config.sampling_rate, audio_buffer)
-
+            yield history + [[message, buffer]], None  # Yield None for audio initially
+
+        # Only attempt TTS if it's enabled and we have a response
+        if use_tts and buffer:
+            try:
+                audio = generate_speech_sync(buffer, tts_model, tts_tokenizer)
+                yield history + [[message, buffer]], (tts_model.config.sampling_rate, audio)
+            except Exception as e:
+                print(f"TTS failed: {str(e)}")
+                yield history + [[message, buffer]], None
+        else:
+            yield history + [[message, buffer]], None
+
+    except GradioError:
+        yield history + [[message, "GPU task aborted. Please try again."]], None
     except Exception as e:
         print(f"An error occurred: {str(e)}")
-        yield history + [[message, f"An error occurred: {str(e)}"]], (tts_model.config.sampling_rate, np.array([0.0]))
+        yield history + [[message, f"An error occurred: {str(e)}"]], None
 
 def generate_speech_sync(text, tts_model, tts_tokenizer):
-    tts_input_ids = tts_tokenizer(text, return_tensors="pt").input_ids.to(device)
-    tts_description = "A clear and natural voice reads the text with moderate speed and expression."
-    tts_description_ids = tts_tokenizer(tts_description, return_tensors="pt").input_ids.to(device)
-
-    with torch.no_grad():
-        audio_generation = tts_model.generate(input_ids=tts_description_ids, prompt_input_ids=tts_input_ids)
-
-    audio_buffer = audio_generation.cpu().numpy().squeeze()
-    return audio_buffer if audio_buffer.size > 0 else np.array([0.0])
+    try:
+        tts_input_ids = tts_tokenizer(text, return_tensors="pt").input_ids.to(device)
+        tts_description = "A clear and natural voice reads the text with moderate speed and expression."
+        tts_description_ids = tts_tokenizer(tts_description, return_tensors="pt").input_ids.to(device)
+
+        with torch.no_grad():
+            audio_generation = tts_model.generate(input_ids=tts_description_ids, prompt_input_ids=tts_input_ids)
+
+        audio_buffer = audio_generation.cpu().numpy().squeeze()
+        return audio_buffer if audio_buffer.size > 0 else np.array([0.0])
+    except Exception as e:
+        print(f"Speech generation failed: {str(e)}")
+        return np.array([0.0])
 
 @spaces.GPU(timeout=300)  # Increase timeout to 5 minutes
 def process_vision_query(image, text_input):
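
The main behavioural change above is that stream_text_chat now yields (chat_history, audio) pairs, with None for audio until TTS has produced a waveform. For context, below is a minimal, self-contained sketch of how a generator with that yield signature can be wired to Gradio components. The layout, the fake_stream stand-in, and all component names are illustrative assumptions, not part of this commit.

import numpy as np
import gradio as gr

def fake_stream(message, history):
    # Stand-in for stream_text_chat: stream text first with no audio,
    # then yield the finished text together with (sampling_rate, waveform).
    buffer = ""
    for word in ["Hello", ", ", "world", "!"]:
        buffer += word
        yield history + [[message, buffer]], None        # audio not ready yet
    sr = 16000
    silence = np.zeros(sr, dtype=np.float32)             # 1 second of silence
    yield history + [[message, buffer]], (sr, silence)   # final text + audio

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()       # assumes the [user, bot] pair format used in app.py
    audio_out = gr.Audio()       # accepts a (sampling_rate, numpy_array) tuple or None
    msg = gr.Textbox()
    msg.submit(fake_stream, inputs=[msg, chatbot], outputs=[chatbot, audio_out])

if __name__ == "__main__":
    demo.launch()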