add audio cleaning + record helpers
app.py
CHANGED
@@ -6,6 +6,7 @@ import shutil
 #from huggingface_hub import snapshot_download
 import numpy as np
 from scipy.io import wavfile
+from scipy.io.wavfile import write, read
 from pydub import AudioSegment
 
 file_upload_available = os.environ.get("ALLOW_FILE_UPLOAD")
@@ -22,23 +23,6 @@ with open("characters.json", "r") as file:
     for item in data
 ]
 
-"""
-model_ids = [
-    'suno/bark',
-]
-
-for model_id in model_ids:
-    model_name = model_id.split('/')[-1]
-    snapshot_download(model_id, local_dir=f'checkpoints/{model_name}')
-
-from TTS.tts.configs.bark_config import BarkConfig
-from TTS.tts.models.bark import Bark
-
-#os.environ['CUDA_VISIBLE_DEVICES'] = '1'
-config = BarkConfig()
-model = Bark.init_from_config(config)
-model.load_checkpoint(config, checkpoint_dir="checkpoints/bark", eval=True)
-"""
 from TTS.api import TTS
 tts = TTS("tts_models/multilingual/multi-dataset/bark", gpu=True)
 
@@ -66,6 +50,66 @@ def cut_wav(input_path, max_duration):
 
     return output_path
 
+def load_hidden(audio_in):
+    return audio_in
+
+def load_hidden_mic(audio_in):
+    print("MICRO IN HAS CHANGED")
+
+    library_path = 'bark_voices'
+    folder_name = 'audio-0-100'
+    second_folder_name = 'audio-0-100_cleaned'
+
+    folder_path = os.path.join(library_path, folder_name)
+    second_folder_path = os.path.join(library_path, second_folder_name)
+
+    if os.path.exists(folder_path):
+        try:
+            shutil.rmtree(folder_path)
+            print(f"Successfully deleted the folder: {folder_path}")
+        except OSError as e:
+            print(f"Error: {folder_path} - {e.strerror}")
+    else:
+        print(f"The folder does not exist: {folder_path}")
+
+    if os.path.exists(second_folder_path):
+        try:
+            shutil.rmtree(second_folder_path)
+            print(f"Successfully deleted the folder: {second_folder_path}")
+        except OSError as e:
+            print(f"Error: {second_folder_path} - {e.strerror}")
+    else:
+        print(f"The folder does not exist: {second_folder_path}")
+
+    return audio_in
+
+def clear_clean_ckeck():
+    return False
+
+def wipe_npz_file(folder_path):
+    if os.path.exists(folder_path):
+        #shutil.rmtree(folder_path)
+        print(folder_path)
+    else :
+        print("path does not exists yet")
+    print("YO")
+
+def split_process(audio, chosen_out_track):
+    os.makedirs("out", exist_ok=True)
+    write('test.wav', audio[0], audio[1])
+    os.system("python3 -m demucs.separate -n mdx_extra_q -j 4 test.wav -o out")
+    #return "./out/mdx_extra_q/test/vocals.wav","./out/mdx_extra_q/test/bass.wav","./out/mdx_extra_q/test/drums.wav","./out/mdx_extra_q/test/other.wav"
+    if chosen_out_track == "vocals":
+        return "./out/mdx_extra_q/test/vocals.wav"
+    elif chosen_out_track == "bass":
+        return "./out/mdx_extra_q/test/bass.wav"
+    elif chosen_out_track == "drums":
+        return "./out/mdx_extra_q/test/drums.wav"
+    elif chosen_out_track == "other":
+        return "./out/mdx_extra_q/test/other.wav"
+    elif chosen_out_track == "all-in":
+        return "test.wav"
+
 def update_selection(selected_state: gr.SelectData):
     c_image = characters[selected_state.index]["image"]
     c_title = characters[selected_state.index]["title"]
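A quick way to sanity-check the new split_process helper outside the app is to hand it a (sample_rate, samples) tuple read with scipy, the same shape the hidden type="numpy" audio component supplies. This is only a sketch: it assumes demucs is installed in the environment and that sample.wav is a hypothetical short voice recording sitting next to app.py.

from scipy.io.wavfile import read

# Hypothetical test file; any short WAV works here.
rate, samples = read("sample.wav")

# split_process writes the tuple to test.wav, shells out to demucs (mdx_extra_q),
# and returns the path of the requested stem.
vocals_path = split_process((rate, samples), "vocals")
print(vocals_path)  # ./out/mdx_extra_q/test/vocals.wav when separation succeeds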
@@ -74,10 +118,24 @@ def update_selection(selected_state: gr.SelectData):
     return c_title, selected_state
 
 
-def infer(prompt, input_wav_file):
-
-    # Path to your WAV file
-    source_path = input_wav_file
+def infer(prompt, input_wav_file, clean_audio, hidden_numpy_audio):
+
+    if clean_audio is True :
+        # Extract the file name without the extension
+        new_name = os.path.splitext(os.path.basename(input_wav_file))[0]
+        check_name = os.path.join("bark_voices", f"{new_name}_cleaned")
+        if os.path.exists(check_name):
+            source_path = os.path.join(check_name, f"{new_name}_cleaned.wav")
+        else:
+            source_path = split_process(hidden_numpy_audio, "vocals")
+
+            # Rename the file
+            new_path = os.path.join(os.path.dirname(source_path), f"{new_name}_cleaned.wav")
+            os.rename(source_path, new_path)
+            source_path = new_path
+    else :
+        # Path to your WAV file
+        source_path = input_wav_file
 
     # Destination directory
     destination_directory = "bark_voices"
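The cleaning branch of the new infer() boils down to: reuse an already-cleaned take if one exists under bark_voices/<name>_cleaned, otherwise isolate the vocals with demucs and rename the result. Factored out below for illustration only; cleaned_source_for is not a function in this commit.

import os

def cleaned_source_for(input_wav_file, hidden_numpy_audio):
    # Derive the cache folder name from the uploaded file name.
    new_name = os.path.splitext(os.path.basename(input_wav_file))[0]
    check_name = os.path.join("bark_voices", f"{new_name}_cleaned")
    if os.path.exists(check_name):
        # A previous run already cleaned this sample; reuse it.
        return os.path.join(check_name, f"{new_name}_cleaned.wav")
    # Otherwise run demucs via split_process and rename the vocal stem.
    source_path = split_process(hidden_numpy_audio, "vocals")
    new_path = os.path.join(os.path.dirname(source_path), f"{new_name}_cleaned.wav")
    os.rename(source_path, new_path)
    return new_path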
@@ -108,7 +166,7 @@ def infer(prompt, input_wav_file):
 
     tts_video = gr.make_waveform(audio="output.wav")
 
-    return "output.wav", tts_video, gr.update(value=f"bark_voices/{file_name}/{contents[1]}", visible=True), gr.Group.update(visible=True)
+    return "output.wav", tts_video, gr.update(value=f"bark_voices/{file_name}/{contents[1]}", visible=True), gr.Group.update(visible=True), destination_path
 
 def infer_from_c(prompt, c_name):
 
@@ -265,18 +323,38 @@ with gr.Blocks(css=css) as demo:
                     source="upload",
                     interactive = False
                 )
-
+                clean_sample = gr.Checkbox(label="Clean sample ?", value=False)
+                hidden_audio_numpy = gr.Audio(type="numpy", visible=False)
                 submit_btn = gr.Button("Submit")
-
+
             with gr.Tab("Microphone"):
+                texts_samples = gr.Textbox(label = "Helpers",
+                    info = "You can read out loud one of these sentences if you do not know what to record :)",
+                    value = """"Jazz, a quirky mix of groovy saxophones and wailing trumpets, echoes through the vibrant city streets."
+"A majestic orchestra plays enchanting melodies, filling the air with harmony."
+"The exquisite aroma of freshly baked bread wafts from a cozy bakery, enticing passersby."
+"A thunderous roar shakes the ground as a massive jet takes off into the sky, leaving trails of white behind."
+"Laughter erupts from a park where children play, their innocent voices rising like tinkling bells."
+"Waves crash on the beach, and seagulls caw as they soar overhead, a symphony of nature's sounds."
+"In the distance, a blacksmith hammers red-hot metal, the rhythmic clang punctuating the day."
+"As evening falls, a soft hush blankets the world, crickets chirping in a soothing rhythm."
+                    """,
+                    interactive = False,
+                    lines = 4
+                )
                 micro_in = gr.Audio(
                     label="Record voice to clone",
                     type="filepath",
                     source="microphone",
                     interactive = True
                 )
+                clean_micro = gr.Checkbox(label="Clean sample ?", value=False)
                 micro_submit_btn = gr.Button("Submit")
-
+
+                audio_in.upload(fn=load_hidden, inputs=[audio_in], outputs=[hidden_audio_numpy])
+                micro_in.stop_recording(fn=load_hidden_mic, inputs=[micro_in], outputs=[hidden_audio_numpy])
+
+
             with gr.Tab("Voices Characters"):
                 selected_state = gr.State()
                 gallery_in = gr.Gallery(
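The hidden-audio wiring can be tried in isolation with a stripped-down Blocks demo. This sketch reuses load_hidden for both events to stay self-contained (the commit routes the microphone through load_hidden_mic so it can also wipe the default audio-0-100 folders); component names mirror the ones above and the Gradio 3.x audio "source" and event APIs used in this commit.

import gradio as gr

def load_hidden(audio_in):
    return audio_in

with gr.Blocks() as sketch:
    audio_in = gr.Audio(label="Voice to clone", type="filepath", source="upload")
    micro_in = gr.Audio(label="Record voice to clone", type="filepath", source="microphone")
    hidden_audio_numpy = gr.Audio(type="numpy", visible=False)

    # Mirror the uploaded file or finished recording into the hidden numpy component,
    # so a later cleaning step has raw samples to hand to demucs.
    audio_in.upload(fn=load_hidden, inputs=[audio_in], outputs=[hidden_audio_numpy])
    micro_in.stop_recording(fn=load_hidden, inputs=[micro_in], outputs=[hidden_audio_numpy])

sketch.launch()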
@@ -308,6 +386,10 @@ with gr.Blocks(css=css) as demo:
                     visible = False
                 )
 
+                folder_path = gr.Textbox(visible=False)
+
+
+
                 character_name = gr.Textbox(
                     label="Character Name",
                     placeholder="Name that voice character",
@@ -334,29 +416,37 @@ with gr.Blocks(css=css) as demo:
             show_progress=False,
         )
 
-
+        audio_in.change(fn=wipe_npz_file, inputs=[folder_path])
+        micro_in.clear(fn=wipe_npz_file, inputs=[folder_path])
 
         gr.Examples(
             examples = [
                 [
                     "Once upon a time, in a cozy little shell, lived a friendly crab named Crabby. Crabby loved his cozy home, but he always felt like something was missing.",
                     "./examples/en_speaker_6.wav",
+                    False,
+                    None
                 ],
                 [
                     "It was a typical afternoon in the bustling city, the sun shining brightly through the windows of the packed courtroom. Three people sat at the bar, their faces etched with worry and anxiety. ",
                     "./examples/en_speaker_9.wav",
+                    False,
+                    None
                 ],
             ],
             fn = infer,
             inputs = [
                 prompt,
-                audio_in
+                audio_in,
+                clean_sample,
+                hidden_audio_numpy
             ],
             outputs = [
                 cloned_out,
                 video_out,
                 npz_file,
-                share_group
+                share_group,
+                folder_path
             ],
             cache_examples = False
         )
@@ -381,13 +471,16 @@ with gr.Blocks(css=css) as demo:
         fn = infer,
         inputs = [
             prompt,
-            audio_in
+            audio_in,
+            clean_sample,
+            hidden_audio_numpy
         ],
         outputs = [
             cloned_out,
             video_out,
             npz_file,
-            share_group
+            share_group,
+            folder_path
         ]
     )
 
@@ -395,13 +488,16 @@ with gr.Blocks(css=css) as demo:
         fn = infer,
         inputs = [
             prompt,
-            micro_in
+            micro_in,
+            clean_micro,
+            hidden_audio_numpy
         ],
        outputs = [
             cloned_out,
             video_out,
             npz_file,
-            share_group
+            share_group,
+            folder_path
         ]
     )
 