ai-tube-model-parler-tts-mini

Paused

jbilcke-hf HF staff committed on Apr 29

Commit

5ca5d91

•

1 Parent(s): ef0447a

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -4,10 +4,9 @@ import os
 from io import BytesIO
 import base64
 import numpy as np
 from parler_tts import ParlerTTSForConditionalGeneration
 from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
-from scipy.io.wavfile import write
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
@@ -36,14 +35,23 @@ def gen_tts(secret_token, text, description):
     )
     audio_arr = generation.cpu().numpy().squeeze()
-    # Write the numpy array as a WAV file
-    buffer = BytesIO()
-    write(buffer, SAMPLE_RATE, audio_arr.astype(np.int16))
-    buffer.seek(0)
-    # Encode the WAV file in base64
-    audio_base64 = base64.b64encode(buffer.read()).decode('utf-8')
-    data_uri = 'data:audio/wav;base64,' + audio_base64
     return data_uri

 from io import BytesIO
 import base64
 import numpy as np
+from pydub import AudioSegment
 from parler_tts import ParlerTTSForConditionalGeneration
 from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
     )
     audio_arr = generation.cpu().numpy().squeeze()
+    # Create an AudioSegment directly from numpy array data
+    samples = np.array(audio_arr * (2**15 - 1), dtype=np.int16)
+    sound = AudioSegment(
+        samples.tobytes(),
+        frame_rate=SAMPLE_RATE,
+        sample_width=samples.dtype.itemsize,
+        channels=1
+    )
+    # Export to MP3
+    buff_mp3 = BytesIO()
+    sound.export(buff_mp3, format="mp3")
+    buff_mp3.seek(0)
+    # Encode the MP3 file in base64
+    audio_base64 = base64.b64encode(buff_mp3.read()).decode('utf-8')
+    data_uri = 'data:audio/mp3;base64,' + audio_base64
     return data_uri