# Copyright (c) Microsoft
# 2022 Chengdong Liang (liangchengdong@mail.nwpu.edu.cn)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Gradio demo: extract one enrolled speaker's speech from a mixture."""

import os

import gradio as gr
import wesep
import soundfile
import torchaudio
from scipy.signal import resample

# Working directories: uploaded audio is staged in ``input_path`` and the
# extracted speech is written to ``output_path``.
input_path = "./audios"
output_path = "./extracted"

# ``exist_ok=True`` avoids the check-then-create race of a manual
# ``os.path.exists`` test followed by ``os.mkdir``.
os.makedirs(input_path, exist_ok=True)
os.makedirs(output_path, exist_ok=True)

# The extraction model is loaded once at import time and reused per request.
en_model = wesep.load_model("english")


def save_to_file(audio, filename, target_sr=16000):
    """Write *audio* to ``input_path/filename`` resampled to *target_sr*.

    Args:
        audio: Indexable pair where ``audio[0]`` is the sample rate and
            ``audio[1]`` is the sample data (presumably the tuple that
            ``gr.Audio`` delivers in its default mode -- confirm against
            the Gradio version in use).
        filename: File name (not a full path) created inside ``input_path``.
        target_sr: Sample rate of the file left on disk.

    Returns:
        The path of the written, resampled audio file.
    """
    orig_sr = audio[0]
    samples = audio[1]
    audio_path = os.path.join(input_path, filename)

    # Round-trip through disk so torchaudio hands back a tensor that the
    # Resample transform can consume.
    soundfile.write(audio_path, samples, orig_sr)
    pcm, _ = torchaudio.load(audio_path)

    resampler = torchaudio.transforms.Resample(
        orig_freq=orig_sr, new_freq=target_sr)
    pcm = resampler(pcm)

    # Overwrite the staged file with the resampled signal.
    torchaudio.save(audio_path, pcm, target_sr)
    return audio_path


def speaker_extraction(audio1, audio2, mixture, select_speaker='#1'):
    """Extract the selected enrolled speaker's speech from *mixture*.

    Args:
        audio1: Enrollment audio for speaker #1, or ``None`` if missing.
        audio2: Enrollment audio for speaker #2, or ``None`` if missing.
        mixture: Mixed audio to extract from, or ``None`` if missing.
        select_speaker: ``'#1'`` or ``'#2'``. ``None`` (no radio option
            chosen) falls back to ``'#1'``. Any other value is passed to
            the model unchanged as the enrollment path.

    Returns:
        Path of the extracted speech file, or ``None`` when any of the three
        audio inputs is missing (a warning is shown to the user instead).
    """
    if audio1 is None or audio2 is None or mixture is None:
        gr.Warning(
            "The audio file cannot be empty, "
            "please upload a valid audio file. \n"
            "音频文件不能为空,请上传有效的音频文件。")
        return None

    # The Radio input may deliver None when nothing is selected; without this
    # fallback None would be handed to the model as the enrollment path.
    if select_speaker is None:
        select_speaker = '#1'

    target_sr = 16000
    audio_path1 = save_to_file(audio1, "enroll_1.wav", target_sr)
    audio_path2 = save_to_file(audio2, "enroll_2.wav", target_sr)
    audio_mixture = save_to_file(mixture, "mixture.wav", target_sr)

    model = en_model
    # Map the radio label to the matching enrollment file.
    if select_speaker == '#1':
        select_speaker = audio_path1
    elif select_speaker == '#2':
        select_speaker = audio_path2

    speech = model.extract_speech(audio_mixture, select_speaker)

    audio_speech = os.path.join(output_path, "speech.wav")
    # Only the first element of the model output is written out.
    soundfile.write(audio_speech, speech[0], target_sr)
    return audio_speech


# Input widgets, in the positional order expected by ``speaker_extraction``.
inputs = [
    gr.Audio(
        show_download_button=True,
        label='Enroll Speaker#1',
    ),
    gr.Audio(
        show_download_button=True,
        label='Enroll Speaker#2',
    ),
    gr.Audio(
        show_download_button=True,
        label='Mixture',
    ),
    gr.Radio(['#1', '#2'], label='Extract Speaker #'),
]

output = gr.Audio(type="filepath", label="Extract Speaker")

# description
description = (
    "\n\n"
    "WeSep Demo ! Try it with your own voice ! "
    "Note: We recommend that the audio length be greater than 5s !"
    "\n\n")

article = (
    "\n\n"
    "Github: Learn more about WeSep"
    "\n\n")

# Each example row follows the order of ``inputs``.
examples = [
    ['examples/enroll_1.wav', 'examples/enroll_2.wav',
     'examples/mixture.wav', '#1'],
    ['examples/enroll1_zh.wav', 'examples/enroll2_zh.wav',
     'examples/mixture_zh.wav', '#2'],
]

interface = gr.Interface(
    fn=speaker_extraction,
    inputs=inputs,
    outputs=output,
    title="Speaker Extraction in WeSep : 基于 WeSep 的说话人提取",
    description=description,
    article=article,
    examples=examples,
)

interface.launch()