王帅 committed on
Commit
dc83cb6
1 Parent(s): f9d0d8b

init the repo

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ */mixture_zh.wav filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft
2
+ # 2022 Chengdong Liang ([email protected])
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import gradio as gr
17
+ import wesep
18
+ import soundfile
19
+ import torchaudio
20
+ import os
21
+ from scipy.signal import resample
22
+
23
# Working directories: uploaded clips are written under `input_path`,
# the extracted speech under `output_path`.
input_path = "./audios"
output_path = "./extracted"

# `exist_ok=True` creates the directory only when it is missing, without the
# check-then-create race of `os.path.exists` followed by `os.mkdir`.
os.makedirs(input_path, exist_ok=True)
os.makedirs(output_path, exist_ok=True)

# Loaded once at import time and shared by every request.
# NOTE(review): "english" is presumably the name of a pretrained WeSep
# checkpoint -- confirm against the wesep documentation.
en_model = wesep.load_model("english")
34
+
35
def save_to_file(audio, filename, target_sr=16000):
    """Store a clip inside ``input_path`` resampled to ``target_sr``.

    Args:
        audio: Indexable pair; element 0 is used as the sample rate and
            element 1 as the sample data (presumably the
            ``(sample_rate, ndarray)`` tuple produced by ``gr.Audio`` --
            confirm against the caller).
        filename: Name of the file to create inside ``input_path``.
        target_sr: Sample rate, in Hz, of the file left on disk.

    Returns:
        The path of the written file.
    """
    destination = os.path.join(input_path, filename)
    samples = audio[1]
    source_sr = audio[0]

    # Write the raw samples first, then read them back through torchaudio so
    # the resampler receives the waveform in torchaudio's own format.
    soundfile.write(destination, samples, source_sr)
    waveform, _ = torchaudio.load(destination)

    resampler = torchaudio.transforms.Resample(
        orig_freq=source_sr,
        new_freq=target_sr,
    )

    # Overwrite the file in place with the resampled waveform.
    torchaudio.save(destination, resampler(waveform), target_sr)
    return destination
46
+
47
+
48
def speaker_extraction(audio1, audio2, mixture, select_speaker='#1'):
    """Extract one enrolled speaker's voice from a mixture recording.

    Args:
        audio1: Enrollment clip for speaker #1, in the form accepted by
            ``save_to_file``; ``None`` when nothing was uploaded.
        audio2: Enrollment clip for speaker #2, same form as ``audio1``.
        mixture: The mixed recording to extract from, same form as ``audio1``.
        select_speaker: ``'#1'`` or ``'#2'`` to pick the enrollment clip.
            ``None`` (no radio option chosen) is treated as ``'#1'``. Any
            other value is forwarded unchanged to the model as the
            enrollment argument.

    Returns:
        Path of the WAV file holding the extracted speech, or ``None`` when
        any of the three audio inputs is missing (a UI warning is issued).
    """
    # Identity check: `== None` is not a reliable test for array-like inputs.
    if audio1 is None or audio2 is None or mixture is None:
        gr.Warning("The audio file cannot be empty, please upload a valid audio file. 音频文件不能为空,请上传有效的音频文件。")
        return None

    # All three clips are stored at 16 kHz, the same rate used for the
    # output file written below.
    audio_path1 = save_to_file(audio1, "enroll_1.wav", 16000)
    audio_path2 = save_to_file(audio2, "enroll_2.wav", 16000)
    audio_mixture = save_to_file(mixture, "mixture.wav", 16000)

    # The radio widget passes None when no option is selected, so the
    # signature default never applies; fall back to speaker #1 explicitly.
    if select_speaker is None:
        select_speaker = '#1'

    if select_speaker == '#1':
        enrollment = audio_path1
    elif select_speaker == '#2':
        enrollment = audio_path2
    else:
        enrollment = select_speaker

    speech = en_model.extract_speech(audio_mixture, enrollment)

    # Only the first element of the model output is written out.
    # NOTE(review): presumably index 0 is the single extracted channel --
    # confirm against wesep's `extract_speech` return shape.
    audio_speech = os.path.join(output_path, "speech.wav")
    soundfile.write(audio_speech, speech[0], 16000)

    return audio_speech
70
+
71
+
72
# Input widgets, in the positional order `speaker_extraction` receives them:
# two enrollment clips, the mixture, then the speaker selector.
inputs = [
    gr.Audio(show_download_button=True, label=audio_label)
    for audio_label in ('Enroll Speaker#1', 'Enroll Speaker#2', 'Mixture')
]
inputs.append(gr.Radio(['#1', '#2'], label='Extract Speaker #'))

# The callback returns a file path, so the output widget is typed accordingly.
output = gr.Audio(label="Extract Speaker", type="filepath")

# Text shown with the demo.
description = "<p>WeSep Demo ! Try it with your own voice ! Note: We recommend that the audio length be greater than 5s !</p>"

# Centered link back to the project repository.
article = "".join([
    "<p style='text-align: center'>",
    "<a href='https://github.com/wenet-e2e/wesep' target='_blank'>Github: Learn more about WeSep</a>",
    "</p>",
])

# Each row supplies one value per input widget, in the same order as `inputs`.
examples = [
    ['examples/enroll_1.wav', 'examples/enroll_2.wav', 'examples/mixture.wav', '#1'],
    ['examples/enroll1_zh.wav', 'examples/enroll2_zh.wav', 'examples/mixture_zh.wav', '#2'],
]

interface = gr.Interface(
    title="Speaker Extraction in WeSep : 基于 WeSep 的说话人提取",
    description=description,
    article=article,
    fn=speaker_extraction,
    inputs=inputs,
    outputs=output,
    examples=examples,
)

# Start serving the demo as soon as the module is executed.
interface.launch()
examples/enroll1_zh.wav ADDED
Binary file (204 kB). View file
 
examples/enroll2_zh.wav ADDED
Binary file (502 kB). View file
 
examples/enroll_1.wav ADDED
Binary file (169 kB). View file
 
examples/enroll_2.wav ADDED
Binary file (112 kB). View file
 
examples/mixture.wav ADDED
Binary file (128 kB). View file
 
examples/mixture_zh.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:545f95dc05f4627d2e8384a5539d9b4a5a0f174fb753f6687caa3a2dc403e2cf
3
+ size 1169576
requirements.txt ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ aiohappyeyeballs==2.4.3
3
+ aiohttp==3.10.10
4
+ aiosignal==1.3.1
5
+ altair==5.4.1
6
+ annotated-types==0.7.0
7
+ anyio==4.6.2.post1
8
+ async-timeout==4.0.3
9
+ attrs==24.2.0
10
+ audioread==3.0.1
11
+ auraloss==0.4.0
12
+ certifi==2024.8.30
13
+ cffi==1.17.1
14
+ cfgv==3.4.0
15
+ charset-normalizer==3.4.0
16
+ click==8.1.7
17
+ coloredlogs==15.0.1
18
+ cycler==0.12.1
19
+ Cython==0.29.37
20
+ decorator==5.1.1
21
+ distlib==0.3.9
22
+ exceptiongroup==1.2.2
23
+ fast_bss_eval==0.1.4
24
+ fastapi==0.115.4
25
+ ffmpy==0.4.0
26
+ filelock==3.16.1
27
+ fire==0.4.0
28
+ flake8==3.8.2
29
+ flake8-bugbear==23.3.12
30
+ flake8-comprehensions==3.16.0
31
+ flake8-executable==2.1.3
32
+ flake8-pyi==20.5.0
33
+ flatbuffers==24.3.25
34
+ fonttools==4.54.1
35
+ frozenlist==1.5.0
36
+ fsspec==2024.10.0
37
+ future==1.0.0
38
+ gradio==4.44.1
39
+ gradio_client==1.3.0
40
+ h11==0.14.0
41
+ h5py==3.12.1
42
+ hdbscan==0.8.37
43
+ httpcore==1.0.6
44
+ httpx==0.27.2
45
+ huggingface-hub==0.26.2
46
+ humanfriendly==10.0
47
+ identify==2.6.1
48
+ idna==3.10
49
+ importlib_resources==6.4.5
50
+ Jinja2==3.1.4
51
+ joblib==1.1.0
52
+ jsonpatch==1.33
53
+ jsonpointer==3.0.0
54
+ jsonschema==4.23.0
55
+ jsonschema-specifications==2024.10.1
56
+ kaldiio==2.18.0
57
+ kiwisolver==1.4.7
58
+ lazy_loader==0.4
59
+ librosa==0.10.1
60
+ lightning-utilities==0.11.8
61
+ linkify-it-py==2.0.3
62
+ llvmlite==0.41.1
63
+ lmdb==1.3.0
64
+ markdown-it-py==3.0.0
65
+ MarkupSafe==2.1.5
66
+ matplotlib==3.5.1
67
+ mccabe==0.6.1
68
+ mdit-py-plugins==0.4.2
69
+ mdurl==0.1.2
70
+ mir_eval==0.7
71
+ mpmath==1.3.0
72
+ msgpack==1.1.0
73
+ multidict==6.1.0
74
+ narwhals==1.13.1
75
+ networkx==3.2.1
76
+ nodeenv==1.9.1
77
+ numba==0.58.0
78
+ numpy==1.22.4
79
+ nvidia-cublas-cu12==12.4.5.8
80
+ nvidia-cuda-cupti-cu12==12.4.127
81
+ nvidia-cuda-nvrtc-cu12==12.4.127
82
+ nvidia-cuda-runtime-cu12==12.4.127
83
+ nvidia-cudnn-cu12==9.1.0.70
84
+ nvidia-cufft-cu12==11.2.1.3
85
+ nvidia-curand-cu12==10.3.5.147
86
+ nvidia-cusolver-cu12==11.6.1.9
87
+ nvidia-cusparse-cu12==12.3.1.170
88
+ nvidia-nccl-cu12==2.21.5
89
+ nvidia-nvjitlink-cu12==12.4.127
90
+ nvidia-nvtx-cu12==12.4.127
91
+ onnxruntime==1.19.2
92
+ orjson==3.10.11
93
+ packaging==24.1
94
+ pandas==2.2.3
95
+ pesq==0.0.4
96
+ pillow==10.4.0
97
+ platformdirs==4.3.6
98
+ pooch==1.8.2
99
+ pre-commit==3.5.0
100
+ propcache==0.2.0
101
+ protobuf==5.28.3
102
+ pycodestyle==2.6.0
103
+ pycparser==2.22
104
+ pycryptodome==3.21.0
105
+ pydantic==2.9.2
106
+ pydantic_core==2.23.4
107
+ pydub==0.25.1
108
+ pyflakes==2.2.0
109
+ Pygments==2.18.0
110
+ pynndescent==0.5.13
111
+ pyparsing==3.2.0
112
+ pystoi==0.3.3
113
+ python-dateutil==2.9.0.post0
114
+ python-multipart==0.0.17
115
+ pytz==2024.2
116
+ PyYAML==6.0
117
+ referencing==0.35.1
118
+ requests==2.31.0
119
+ rich==13.9.4
120
+ rpds-py==0.20.1
121
+ ruff==0.7.2
122
+ scikit-learn==1.1.3
123
+ scipy==1.7.3
124
+ semantic-version==2.10.0
125
+ shellingham==1.5.4
126
+ silero-vad==5.1.2
127
+ six==1.16.0
128
+ sniffio==1.3.1
129
+ soundfile==0.12.1
130
+ soxr==0.5.0.post1
131
+ starlette==0.41.2
132
+ sympy==1.13.1
133
+ tableprint==0.9.1
134
+ termcolor==2.5.0
135
+ thop==0.1.1.post2209072238
136
+ threadpoolctl==3.5.0
137
+ tomlkit==0.12.0
138
+ torch==2.5.1
139
+ torchaudio==2.5.1
140
+ torchmetrics==1.2.0
141
+ torchnet==0.0.4
142
+ tornado==6.4.1
143
+ tqdm==4.64.0
144
+ triton==3.1.0
145
+ typer==0.12.5
146
+ typing_extensions==4.12.2
147
+ tzdata==2024.2
148
+ uc-micro-py==1.0.3
149
+ umap-learn==0.5.6
150
+ urllib3==2.2.3
151
+ uvicorn==0.32.0
152
+ virtualenv==20.27.1
153
+ visdom==0.2.4
154
+ wcwidth==0.2.13
155
+ websocket-client==1.8.0
156
+ websockets==12.0
157
+ -e git+https://github.com/wenet-e2e/wesep.git@f0e479e998206a5404feb4f735113ff9bf6c4d55#egg=wesep
158
+ wespeaker @ git+https://github.com/wenet-e2e/wespeaker.git@e9bbf73d0fd13db6cf42a6cb2eafb0d7dd0f8e0e
159
+ yarl==1.17.1
160
+ zipp==3.20.2