Spaces:

wenet-e2e
/

wesep-tse-2speaker-demo

Sleeping

App Files Files Community

王帅 committed on 25 days ago

Commit

dc83cb6

•

1 Parent(s): f9d0d8b

init the repo

Browse files

Files changed (9) hide show

.gitattributes +1 -0
app.py +113 -0
examples/enroll1_zh.wav +0 -0
examples/enroll2_zh.wav +0 -0
examples/enroll_1.wav +0 -0
examples/enroll_2.wav +0 -0
examples/mixture.wav +0 -0
examples/mixture_zh.wav +3 -0
requirements.txt +160 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*/mixture_zh.wav filter=lfs diff=lfs merge=lfs -text

app.py ADDED Viewed

	@@ -0,0 +1,113 @@

+# Copyright (c) Microsoft
+#               2022 Chengdong Liang ([email protected])
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import gradio as gr
+import wesep
+import soundfile
+import torchaudio
+import os
+from scipy.signal import resample
+input_path = "./audios"
+output_path = "./extracted"
+if not os.path.exists(input_path):
+    os.mkdir(input_path)
+if not os.path.exists(output_path):
+    os.mkdir(output_path)
+en_model =  wesep.load_model("english")
+def save_to_file(audio,filename,target_sr=16000):
+    audio_path = os.path.join(input_path,filename)
+    soundfile.write(audio_path,audio[1],audio[0])
+    pcm, sample_rate = torchaudio.load(audio_path)
+    transform = torchaudio.transforms.Resample(
+                    orig_freq=audio[0],
+                    new_freq=target_sr)
+    pcm = transform(pcm)
+    torchaudio.save(audio_path, pcm, target_sr)
+    return audio_path
+def speaker_extraction(audio1, audio2, mixture, select_speaker='#1'):
+    if audio1 == None or audio2 == None or mixture == None:
+        print("??")
+        return gr.Warning("The audio file cannot be empty, please upload a valid audio file. 音频文件不能为空，请上传有效的音频文件。")
+    audio_path1 = save_to_file(audio1,"enroll_1.wav",16000)
+    audio_path2 = save_to_file(audio2,"enroll_2.wav",16000)
+    audio_mixture = save_to_file(mixture,"mixture.wav",16000)
+    model = en_model
+    if select_speaker == '#1':
+        select_speaker = audio_path1
+    elif select_speaker == '#2':
+        select_speaker = audio_path2
+    speech = model.extract_speech(audio_mixture,select_speaker)
+    audio_speech = output_path + "/speech.wav"
+    soundfile.write(audio_speech,speech[0],16000)
+    return audio_speech
+inputs = [
+    gr.Audio(
+            show_download_button = True,
+            label='Enroll Speaker#1',
+            ),
+    gr.Audio(
+            show_download_button = True,
+            label='Enroll Speaker#2'),
+    gr.Audio(
+            show_download_button = True,
+            label='Mixture'),
+    gr.Radio(['#1', '#2'], label='Extract Speaker #'),
+]
+output = gr.Audio(type="filepath",label="Extract Speaker")
+# description
+description = ("<p>WeSep Demo ! Try it with your own voice ! Note: We recommend that the audio length be greater than 5s !</p>")
+article = (
+    "<p style='text-align: center'>"
+    "<a href='https://github.com/wenet-e2e/wesep' target='_blank'>Github: Learn more about WeSep</a>"
+    "</p>")
+examples = [
+    ['examples/enroll_1.wav', 'examples/enroll_2.wav', 'examples/mixture.wav','#1'],
+    ['examples/enroll1_zh.wav', 'examples/enroll2_zh.wav', 'examples/mixture_zh.wav','#2'],
+]
+interface = gr.Interface(
+    fn=speaker_extraction,
+    inputs=inputs,
+    outputs=output,
+    title="Speaker Extraction in WeSep : 基于 WeSep 的说话人提取",
+    description=description,
+    article=article,
+    examples=examples
+)
+interface.launch()

examples/enroll1_zh.wav ADDED Viewed

Binary file (204 kB). View file

examples/enroll2_zh.wav ADDED Viewed

Binary file (502 kB). View file

examples/enroll_1.wav ADDED Viewed

Binary file (169 kB). View file

examples/enroll_2.wav ADDED Viewed

Binary file (112 kB). View file

examples/mixture.wav ADDED Viewed

Binary file (128 kB). View file

examples/mixture_zh.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:545f95dc05f4627d2e8384a5539d9b4a5a0f174fb753f6687caa3a2dc403e2cf
+size 1169576

requirements.txt ADDED Viewed

	@@ -0,0 +1,160 @@

+aiofiles==23.2.1
+aiohappyeyeballs==2.4.3
+aiohttp==3.10.10
+aiosignal==1.3.1
+altair==5.4.1
+annotated-types==0.7.0
+anyio==4.6.2.post1
+async-timeout==4.0.3
+attrs==24.2.0
+audioread==3.0.1
+auraloss==0.4.0
+certifi==2024.8.30
+cffi==1.17.1
+cfgv==3.4.0
+charset-normalizer==3.4.0
+click==8.1.7
+coloredlogs==15.0.1
+cycler==0.12.1
+Cython==0.29.37
+decorator==5.1.1
+distlib==0.3.9
+exceptiongroup==1.2.2
+fast_bss_eval==0.1.4
+fastapi==0.115.4
+ffmpy==0.4.0
+filelock==3.16.1
+fire==0.4.0
+flake8==3.8.2
+flake8-bugbear==23.3.12
+flake8-comprehensions==3.16.0
+flake8-executable==2.1.3
+flake8-pyi==20.5.0
+flatbuffers==24.3.25
+fonttools==4.54.1
+frozenlist==1.5.0
+fsspec==2024.10.0
+future==1.0.0
+gradio==4.44.1
+gradio_client==1.3.0
+h11==0.14.0
+h5py==3.12.1
+hdbscan==0.8.37
+httpcore==1.0.6
+httpx==0.27.2
+huggingface-hub==0.26.2
+humanfriendly==10.0
+identify==2.6.1
+idna==3.10
+importlib_resources==6.4.5
+Jinja2==3.1.4
+joblib==1.1.0
+jsonpatch==1.33
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+kaldiio==2.18.0
+kiwisolver==1.4.7
+lazy_loader==0.4
+librosa==0.10.1
+lightning-utilities==0.11.8
+linkify-it-py==2.0.3
+llvmlite==0.41.1
+lmdb==1.3.0
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+matplotlib==3.5.1
+mccabe==0.6.1
+mdit-py-plugins==0.4.2
+mdurl==0.1.2
+mir_eval==0.7
+mpmath==1.3.0
+msgpack==1.1.0
+multidict==6.1.0
+narwhals==1.13.1
+networkx==3.2.1
+nodeenv==1.9.1
+numba==0.58.0
+numpy==1.22.4
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+onnxruntime==1.19.2
+orjson==3.10.11
+packaging==24.1
+pandas==2.2.3
+pesq==0.0.4
+pillow==10.4.0
+platformdirs==4.3.6
+pooch==1.8.2
+pre-commit==3.5.0
+propcache==0.2.0
+protobuf==5.28.3
+pycodestyle==2.6.0
+pycparser==2.22
+pycryptodome==3.21.0
+pydantic==2.9.2
+pydantic_core==2.23.4
+pydub==0.25.1
+pyflakes==2.2.0
+Pygments==2.18.0
+pynndescent==0.5.13
+pyparsing==3.2.0
+pystoi==0.3.3
+python-dateutil==2.9.0.post0
+python-multipart==0.0.17
+pytz==2024.2
+PyYAML==6.0
+referencing==0.35.1
+requests==2.31.0
+rich==13.9.4
+rpds-py==0.20.1
+ruff==0.7.2
+scikit-learn==1.1.3
+scipy==1.7.3
+semantic-version==2.10.0
+shellingham==1.5.4
+silero-vad==5.1.2
+six==1.16.0
+sniffio==1.3.1
+soundfile==0.12.1
+soxr==0.5.0.post1
+starlette==0.41.2
+sympy==1.13.1
+tableprint==0.9.1
+termcolor==2.5.0
+thop==0.1.1.post2209072238
+threadpoolctl==3.5.0
+tomlkit==0.12.0
+torch==2.5.1
+torchaudio==2.5.1
+torchmetrics==1.2.0
+torchnet==0.0.4
+tornado==6.4.1
+tqdm==4.64.0
+triton==3.1.0
+typer==0.12.5
+typing_extensions==4.12.2
+tzdata==2024.2
+uc-micro-py==1.0.3
+umap-learn==0.5.6
+urllib3==2.2.3
+uvicorn==0.32.0
+virtualenv==20.27.1
+visdom==0.2.4
+wcwidth==0.2.13
+websocket-client==1.8.0
+websockets==12.0
+-e git+https://github.com/wenet-e2e/wesep.git@f0e479e998206a5404feb4f735113ff9bf6c4d55#egg=wesep
+wespeaker @ git+https://github.com/wenet-e2e/wespeaker.git@e9bbf73d0fd13db6cf42a6cb2eafb0d7dd0f8e0e
+yarl==1.17.1
+zipp==3.20.2