Commit 5854014
Artrajz committed
1 Parent(s): 881cc0a

Upload 14 files

Files changed (12)
  1. README_zh.md +12 -5
  2. app.py +6 -5
  3. config.py +4 -1
  4. request.py +12 -7
  5. requirements.txt +1 -1
  6. text/cantonese.py +9 -0
  7. text/mandarin.py +9 -0
  8. text/shanghainese.py +9 -0
  9. utils/merge.py +3 -3
  10. utils/nlp.py +31 -14
  11. utils/utils.py +0 -23
  12. voice.py +49 -71
README_zh.md CHANGED
@@ -30,6 +30,8 @@
 - [x] SSML语音合成标记语言(完善中...)
 
 <details><summary>Update Logs</summary><pre><code>
+<h2>2023.6.5</h2>
+<p>更换音频编码使用的库,增加flac格式,增加中文对读简单数学公式的支持</p>
 <h2>2023.5.24</h2>
 <p>添加dimensional_emotion api,从文件夹加载多个npy文件,Docker添加了Linux/ARM64和Linux/ARM64/v8平台</p>
 <h2>2023.5.15</h2>
@@ -52,12 +54,17 @@
 </code></pre></details>
 
 
+
 ## demo
 
 [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/Artrajz/vits-simple-api)
 
+注意不同的id支持的语言可能有所不同。[speakers](https://artrajz-vits-simple-api.hf.space/voice/speakers)
+
 
 - `https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164`
+- `https://artrajz-vits-simple-api.hf.space/voice/vits?text=你知道1+1=几吗?我觉得1+1≠3&id=164&lang=zh`
+- `https://artrajz-vits-simple-api.hf.space/voice/vits?text=Difficult the first time, easy the second.&id=4`
 - 激动:`https://artrajz-vits-simple-api.hf.space/voice/w2v2-vits?text=こんにちは&id=3&emotion=111`
 - 小声:`https://artrajz-vits-simple-api.hf.space/voice/w2v2-vits?text=こんにちは&id=3&emotion=2077`
 
@@ -273,15 +280,15 @@ pip install openjtalk==0.3.0.dev2 --index-url https://pypi.artrajz.cn/simple
 
 #### voice vits
 
-- GET http://127.0.0.1/voice?text=text
+- GET http://127.0.0.1:23456/voice/vits?text=text
 
 其他参数不指定时均为默认值
 
-- GET http://127.0.0.1/voice?text=[ZH]text[ZH][JA]text[JA]&lang=mix
+- GET http://127.0.0.1:23456/voice/vits?text=[ZH]text[ZH][JA]text[JA]&lang=mix
 
 lang=mix时文本要标注
 
-- GET http://127.0.0.1/voice?text=text&id=142&format=wav&lang=zh&length=1.4
+- GET http://127.0.0.1:23456/voice/vits?text=text&id=142&format=wav&lang=zh&length=1.4
 
 文本为text,角色id为142,音频格式为wav,文本语言为zh,语音长度为1.4,其余参数默认
 
@@ -490,7 +497,7 @@ def voice_dimensional_emotion(upload_path):
 | ------------- | --------- | ------- | ------- | ----- | ------------------------------------------------------------ |
 | 合成文本 | text | true | | str | |
 | 角色id | id | false | 0 | int | |
-| 音频格式 | format | false | wav | str | wav,ogg,silk |
+| 音频格式 | format | false | wav | str | 支持wav,ogg,silk,mp3,flac |
 | 文本语言 | lang | false | auto | str | auto为自动识别语言模式,也是默认模式。lang=mix时,文本应该用[ZH] 或 [JA] 包裹。方言无法自动识别。 |
 | 语音长度/语速 | length | false | 1.0 | float | 调节语音长度,相当于调节语速,该数值越大语速越慢 |
 | 噪声 | noise | false | 0.667 | float | |
@@ -528,7 +535,7 @@ def voice_dimensional_emotion(upload_path):
 | ------------- | --------- | ------- | ------- | ----- | ------------------------------------------------------------ |
 | 合成文本 | text | true | | str | |
 | 角色id | id | false | 0 | int | |
-| 音频格式 | format | false | wav | str | wav,ogg,silk |
+| 音频格式 | format | false | wav | str | 支持wav,ogg,silk,mp3,flac |
 | 文本语言 | lang | false | auto | str | auto为自动识别语言模式,也是默认模式。lang=mix时,文本应该用[ZH] 或 [JA] 包裹。方言无法自动识别。 |
 | 语音长度/语速 | length | false | 1.0 | float | 调节语音长度,相当于调节语速,该数值越大语速越慢 |
 | 噪声 | noise | false | 0.667 | float | |
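
A minimal sketch of calling the updated /voice/vits endpoint from Python, assuming a locally running server on port 23456 as in the README examples above; the `requests` usage and the output filename are illustrative and not part of this commit.

```python
import requests

# Assumes the API server from this repo is running locally on port 23456.
params = {
    "text": "你知道1+1=几吗?我觉得1+1≠3",  # simple math symbols are now read aloud in Chinese
    "id": 164,
    "format": "flac",   # flac is one of the newly supported formats
    "lang": "zh",
}
resp = requests.get("http://127.0.0.1:23456/voice/vits", params=params)
resp.raise_for_status()

with open("output.flac", "wb") as f:
    f.write(resp.content)
```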
app.py CHANGED
@@ -3,7 +3,7 @@ import logging
 import time
 import logzero
 import uuid
-from flask import Flask, request, send_file, jsonify, make_response, render_template
+from flask import Flask, request, send_file, jsonify, make_response
 from werkzeug.utils import secure_filename
 from flask_apscheduler import APScheduler
 from functools import wraps
@@ -52,10 +52,7 @@ def require_api_key(func):
 
 @app.route('/', methods=["GET", "POST"])
 def index():
-    kwargs = {
-        "speakers": tts.voice_speakers
-    }
-    return render_template("index.html", **kwargs)
+    return "vits-simple-api"
 
 
 @app.route('/voice/speakers', methods=["GET", "POST"])
@@ -105,11 +102,13 @@ def voice_vits_api():
         logger.info(f"[VITS] speaker id {id} does not exist")
         return make_response(jsonify({"status": "error", "message": f"id {id} does not exist"}), 400)
 
+    # 校验模型是否支持输入的语言
     speaker_lang = tts.voice_speakers["VITS"][id].get('lang')
     if lang.upper() != "AUTO" and lang.upper() != "MIX" and len(speaker_lang) != 1 and lang not in speaker_lang:
         logger.info(f"[VITS] lang \"{lang}\" is not in {speaker_lang}")
         return make_response(jsonify({"status": "error", "message": f"lang '{lang}' is not in {speaker_lang}"}), 400)
 
+    # 如果配置文件中设置了LANGUAGE_AUTOMATIC_DETECT则强制将speaker_lang设置为LANGUAGE_AUTOMATIC_DETECT
     if app.config.get("LANGUAGE_AUTOMATIC_DETECT", []) != []:
         speaker_lang = app.config.get("LANGUAGE_AUTOMATIC_DETECT")
 
@@ -219,11 +218,13 @@ def voice_w2v2_api():
         logger.info(f"[w2v2] speaker id {id} does not exist")
         return make_response(jsonify({"status": "error", "message": f"id {id} does not exist"}), 400)
 
+    # 校验模型是否支持输入的语言
     speaker_lang = tts.voice_speakers["W2V2-VITS"][id].get('lang')
     if lang.upper() != "AUTO" and lang.upper() != "MIX" and len(speaker_lang) != 1 and lang not in speaker_lang:
         logger.info(f"[w2v2] lang \"{lang}\" is not in {speaker_lang}")
         return make_response(jsonify({"status": "error", "message": f"lang '{lang}' is not in {speaker_lang}"}), 400)
 
+    # 如果配置文件中设置了LANGUAGE_AUTOMATIC_DETECT则强制将speaker_lang设置为LANGUAGE_AUTOMATIC_DETECT
     if app.config.get("LANGUAGE_AUTOMATIC_DETECT", []) != []:
         speaker_lang = app.config.get("LANGUAGE_AUTOMATIC_DETECT")
 
config.py CHANGED
@@ -32,8 +32,12 @@ API_KEY = "api-key"
 # logging_level:DEBUG/INFO/WARNING/ERROR/CRITICAL
 LOGGING_LEVEL = "DEBUG"
 
+# Language identification library. Optional fastlid, langid
+LANGUAGE_IDENTIFICATION_LIBRARY = "langid"
+
 # To use the english_cleaner, you need to install espeak and provide the path of libespeak-ng.dll as input here.
 # If ESPEAK_LIBRARY is set to empty, it will be read from the environment variable.
+# For windows : "C:/Program Files/eSpeak NG/libespeak-ng.dll"
 ESPEAK_LIBRARY = ""
 
 # Fill in the model path here
@@ -50,7 +54,6 @@ MODEL_LIST = [
     [ABS_PATH + "/Model/louise/360_epochs.pth", ABS_PATH + "/Model/louise/config.json"],
     # W2V2-VITS (Need to configure DIMENSIONAL_EMOTION_NPY)
     [ABS_PATH + "/Model/w2v2-vits/1026_epochs.pth", ABS_PATH + "/Model/w2v2-vits/config.json"],
-
 ]
 
 # hubert-vits: hubert soft model
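
A brief sketch of how the new LANGUAGE_IDENTIFICATION_LIBRARY option is consumed (utils/nlp.py in this commit reads it the same way), assuming a config module laid out like the one above.

```python
import config

# Fall back to fastlid when the option is absent, so older config.py files keep working.
clf = getattr(config, "LANGUAGE_IDENTIFICATION_LIBRARY", "fastlid")

if clf.upper() in ("FASTLID", "FASTTEXT"):
    from fastlid import fastlid          # fastText-based identifier
elif clf.upper() == "LANGID":
    import langid                        # langid.py identifier
else:
    raise ValueError("Unsupported LANGUAGE_IDENTIFICATION_LIBRARY: " + clf)
```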
request.py CHANGED
@@ -251,15 +251,20 @@ ssml = """
 </speak>
 """
 
-text = """猫咪是爱撒娇、爱玩耍的小家伙,通常有着柔软的绒毛和温柔的眼神,是许多人都喜欢的宠物哦~它们特别喜欢舔自己的毛发,用柔顺的小脑袋搓人的脚丫子,还能给人带来很多欢乐和温馨。
-"""
+text = """你知道1+1=几吗?我觉得1+1≠3"""
+
 t1 = time.time()
-# voice_conversion("H:/git/vits-simple-api/25ecb3f6-f968-11ed-b094-e0d4e84af078.wav", 91, 93)
-# voice_hubert_vits("H:/git/vits-simple-api/25ecb3f6-f968-11ed-b094-e0d4e84af078.wav",0)
+# voice_conversion("H:/git/vits-simple-api/47fa127a-03ab-11ee-a4dc-e0d4e84af078.wav", 91, 93)
+# voice_hubert_vits("H:/git/vits-simple-api/47fa127a-03ab-11ee-a4dc-e0d4e84af078.wav",0)
 # voice_vits(text,format="wav",lang="zh")
 # voice_w2v2_vits(text,emotion=111)
 # os.system(voice_ssml(ssml))
-os.system(voice_vits(text,id=0, format="wav", max=0))
-# voice_dimensional_emotion("H:/git/vits-simple-api/25ecb3f6-f968-11ed-b094-e0d4e84af078.wav")
+os.system(voice_vits(text,id=126, format="wav", max=0,noise=0.33,noisew=0.4,lang="zh"))
+# voice_dimensional_emotion("H:/git/vits-simple-api/47fa127a-03ab-11ee-a4dc-e0d4e84af078.wav")
 t2 = time.time()
-print(f"len:{len(text)}耗时:{t2 - t1}")
+# print(f"len:{len(text)}耗时:{t2 - t1}")
+# for i in range(10):
+#     t1 = time.time()
+#     voice_vits(text, format="wav", lang="zh")
+#     t2 = time.time()
+#     print(f"len:{len(text)}耗时:{t2 - t1}")
requirements.txt CHANGED
@@ -20,10 +20,10 @@ num_thai
 opencc
 audonnx
 flask==2.2.3
-av
 soundfile==0.12.1
 graiax-silkcoder[libsndfile]
 flask_apscheduler
 fasttext
 fastlid
+langid
 phonemizer==3.2.1
text/cantonese.py CHANGED
@@ -37,6 +37,15 @@ _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
 
 _symbols_to_chinese = [(re.compile(f'{x[0]}'), x[1]) for x in [
     ('([0-9]+(?:\.?[0-9]+)?)%', r'百分之\1'),
+    ('([0-9]+)/([0-9]+)', r'\2分之\1'),
+    ('\+', r'加'),
+    ('([0-9]+)-([0-9]+)', r'\1减\2'),
+    ('×', r'乘以'),
+    ('([0-9]+)x([0-9]+)', r'\1乘以\2'),
+    ('([0-9]+)\*([0-9]+)', r'\1乘以\2'),
+    ('÷', r'除以'),
+    ('=', r'等于'),
+    ('≠', r'不等于'),
 ]]
 
 
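
The same nine substitution rules are added to text/mandarin.py and text/shanghainese.py below. A small sketch of how such an ordered regex list can spell out a formula in Chinese; the helper name is hypothetical and the rule list is abbreviated.

```python
import re

# Abbreviated copy of the new rules: each pair is (pattern, replacement).
_symbols_to_chinese = [(re.compile(p), r) for p, r in [
    ('([0-9]+)/([0-9]+)', r'\2分之\1'),
    ('\\+', r'加'),
    ('=', r'等于'),
    ('≠', r'不等于'),
]]

def symbols_to_chinese(text):
    # Apply every rule in order; later rules see the output of earlier ones.
    for pattern, repl in _symbols_to_chinese:
        text = pattern.sub(repl, text)
    return text

print(symbols_to_chinese("1+1=2"))   # 1加1等于2
print(symbols_to_chinese("1/2"))     # 2分之1
```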
text/mandarin.py CHANGED
@@ -237,6 +237,15 @@ _bopomofo_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
 
 _symbols_to_chinese = [(re.compile(f'{x[0]}'), x[1]) for x in [
     ('([0-9]+(?:\.?[0-9]+)?)%', r'百分之\1'),
+    ('([0-9]+)/([0-9]+)', r'\2分之\1'),
+    ('\+', r'加'),
+    ('([0-9]+)-([0-9]+)', r'\1减\2'),
+    ('×', r'乘以'),
+    ('([0-9]+)x([0-9]+)', r'\1乘以\2'),
+    ('([0-9]+)\*([0-9]+)', r'\1乘以\2'),
+    ('÷', r'除以'),
+    ('=', r'等于'),
+    ('≠', r'不等于'),
 ]]
 
 
text/shanghainese.py CHANGED
@@ -37,6 +37,15 @@ _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
 
 _symbols_to_chinese = [(re.compile(f'{x[0]}'), x[1]) for x in [
    ('([0-9]+(?:\.?[0-9]+)?)%', r'百分之\1'),
+    ('([0-9]+)/([0-9]+)', r'\2分之\1'),
+    ('\+', r'加'),
+    ('([0-9]+)-([0-9]+)', r'\1减\2'),
+    ('×', r'乘以'),
+    ('([0-9]+)x([0-9]+)', r'\1乘以\2'),
+    ('([0-9]+)\*([0-9]+)', r'\1乘以\2'),
+    ('÷', r'除以'),
+    ('=', r'等于'),
+    ('≠', r'不等于'),
 ]]
 
 
utils/merge.py CHANGED
@@ -109,7 +109,7 @@ def merge_model(merging_model):
             obj = vits(model=i[0], config=i[1], model_type="vits")
             lang = lang_dict.get(obj.get_cleaner(), obj.get_cleaner())
 
-            for id, name in enumerate(obj.return_speakers()):
+            for id, name in enumerate(obj.get_speakers()):
                 vits_obj.append([int(id), obj, obj_id])
                 vits_speakers.append({"id": new_id, "name": name, "lang": lang})
                 new_id += 1
@@ -129,7 +129,7 @@ def merge_model(merging_model):
             obj = vits(model=i[0], config=i[1], model_=hubert, model_type="hubert")
             lang = lang_dict.get(obj.get_cleaner(), obj.get_cleaner())
 
-            for id, name in enumerate(obj.return_speakers()):
+            for id, name in enumerate(obj.get_speakers()):
                 hubert_vits_obj.append([int(id), obj, obj_id])
                 hubert_vits_speakers.append({"id": new_id, "name": name, "lang": lang})
                 new_id += 1
@@ -148,7 +148,7 @@ def merge_model(merging_model):
             obj = vits(model=i[0], config=i[1], model_=emotion_reference, model_type="w2v2")
             lang = lang_dict.get(obj.get_cleaner(), obj.get_cleaner())
 
-            for id, name in enumerate(obj.return_speakers()):
+            for id, name in enumerate(obj.get_speakers()):
                 w2v2_vits_obj.append([int(id), obj, obj_id])
                 w2v2_vits_speakers.append({"id": new_id, "name": name, "lang": lang})
                 new_id += 1
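
A compact sketch of the id bookkeeping done in merge_model(): each model keeps its local speaker index while every speaker also receives a consecutive global id. The names and data here are hypothetical.

```python
# Hypothetical speaker lists from two loaded models.
model_a_speakers = ["Alice", "Bob"]
model_b_speakers = ["Carol"]

speakers, obj_map, new_id = [], [], 0
for obj_id, local_speakers in enumerate([model_a_speakers, model_b_speakers]):
    for local_id, name in enumerate(local_speakers):
        obj_map.append([local_id, obj_id])           # (local speaker id, owning model)
        speakers.append({"id": new_id, "name": name})
        new_id += 1

print(speakers)
# [{'id': 0, 'name': 'Alice'}, {'id': 1, 'name': 'Bob'}, {'id': 2, 'name': 'Carol'}]
```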
utils/nlp.py CHANGED
@@ -1,7 +1,6 @@
 import regex as re
 import logging
 import config
-from fastlid import fastlid
 from .utils import check_is_none
 
 logger = logging.getLogger("vits-simple-api")
@@ -11,7 +10,7 @@ level_dict = {'DEBUG': logging.DEBUG, 'INFO': logging.INFO, 'WARNING': logging.W
 logger.setLevel(level_dict[level])
 
 
-def clasify_lang(text):
+def clasify_lang(text, speaker_lang):
     pattern = r'[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`' \
               r'\!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」' \
               r'『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+'
@@ -22,7 +21,20 @@ def clasify_lang(text):
     for word in words:
 
         if check_is_none(word): continue
-        lang = fastlid(word)[0]
+
+        # 读取配置选择语种识别库
+        clf = getattr(config, "LANGUAGE_IDENTIFICATION_LIBRARY", "fastlid")
+        if clf.upper() == "FASTLID" or clf.upper() == "FASTTEXT":
+            from fastlid import fastlid
+            lang = fastlid(word)[0]
+            if speaker_lang != None: fastlid.set_languages = speaker_lang
+        elif clf.upper() == "LANGID":
+            import langid
+            lang = langid.classify(word)[0]
+            if speaker_lang != None: langid.set_languages(speaker_lang)
+        else:
+            raise ValueError(f"Wrong LANGUAGE_IDENTIFICATION_LIBRARY in config.py")
+
         if pre == "":
             text = text[:p] + text[p:].replace(word, f'[{lang.upper()}]' + word, 1)
             p += len(f'[{lang.upper()}]')
@@ -37,19 +49,24 @@
 
 
 def cut(text, max):
-    pattern = r'[\!\(\)\,\-\.\/\:\;\?\?\。\,\、\;\:]+'
+    pattern = r'[!(),—+\-.:;??。,、;:]+'
     sentences = re.split(pattern, text)
-    sentence_list = []
-    count = 0
-    p = 0
-    for sentence in sentences:
-        count += len(sentence) + 1
+    discarded_chars = re.findall(pattern, text)
+
+    sentence_list, count, p = [], 0, 0
+
+    # 按被分割的符号遍历
+    for i, discarded_chars in enumerate(discarded_chars):
+        count += len(sentences[i]) + len(discarded_chars)
         if count >= max:
-            sentence_list.append(text[p:p + count])
+            sentence_list.append(text[p:p + count].strip())
            p += count
            count = 0
+
+    # 加入最后剩余的文本
     if p < len(text):
         sentence_list.append(text[p:])
+
     return sentence_list
 
 
@@ -60,19 +77,19 @@ def sentence_split(text, max=50, lang="auto", speaker_lang=None):
         logger.debug(
             f"lang \"{lang}\" is not in speaker_lang {speaker_lang},automatically set lang={speaker_lang[0]}")
         lang = speaker_lang[0]
-    else:
-        fastlid.set_languages = speaker_lang
 
     sentence_list = []
     if lang.upper() != "MIX":
         if max <= 0:
             sentence_list.append(
-                clasify_lang(text) if lang.upper() == "AUTO" else f"[{lang.upper()}]{text}[{lang.upper()}]")
+                clasify_lang(text,
+                             speaker_lang) if lang.upper() == "AUTO" else f"[{lang.upper()}]{text}[{lang.upper()}]")
         else:
             for i in cut(text, max):
                 if check_is_none(i): continue
                 sentence_list.append(
-                    clasify_lang(i) if lang.upper() == "AUTO" else f"[{lang.upper()}]{i}[{lang.upper()}]")
+                    clasify_lang(i,
+                                 speaker_lang) if lang.upper() == "AUTO" else f"[{lang.upper()}]{i}[{lang.upper()}]")
     else:
         sentence_list.append(text)
 
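
A short sketch of the idea behind the rewritten cut(): re.split() yields the sentence pieces and re.findall() yields the delimiters that were split away, so the running character count can include both when deciding where to break. This is a simplified illustration under those assumptions, not the project's exact function.

```python
import re

def cut_sketch(text, max_len):
    pattern = r'[!?。,、;:]+'
    sentences = re.split(pattern, text)
    delimiters = re.findall(pattern, text)

    chunks, count, p = [], 0, 0
    for i, delim in enumerate(delimiters):
        # Count the sentence plus the punctuation that followed it.
        count += len(sentences[i]) + len(delim)
        if count >= max_len:
            chunks.append(text[p:p + count].strip())
            p += count
            count = 0
    if p < len(text):            # whatever is left after the last full chunk
        chunks.append(text[p:])
    return chunks

print(cut_sketch("今天天气很好,我们去公园吧。你觉得怎么样?", 10))
# ['今天天气很好,我们去公园吧。', '你觉得怎么样?']
```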
utils/utils.py CHANGED
@@ -1,7 +1,6 @@
 import logging
 import os
 from json import loads
-import av
 from torch import load, FloatTensor
 from numpy import float32
 import librosa
@@ -77,28 +76,6 @@ def load_audio_to_torch(full_path, target_sampling_rate):
     return FloatTensor(audio.astype(float32))
 
 
-def wav2ogg(input, output):
-    with av.open(input, 'rb') as i:
-        with av.open(output, 'wb', format='ogg') as o:
-            out_stream = o.add_stream('libvorbis')
-            for frame in i.decode(audio=0):
-                for p in out_stream.encode(frame):
-                    o.mux(p)
-
-            for p in out_stream.encode(None):
-                o.mux(p)
-
-def wav2mp3(input, output):
-    with av.open(input, 'rb') as i:
-        with av.open(output, 'wb', format='mp3') as o:
-            out_stream = o.add_stream('mp3')
-            for frame in i.decode(audio=0):
-                for p in out_stream.encode(frame):
-                    o.mux(p)
-
-            for p in out_stream.encode(None):
-                o.mux(p)
-
 def clean_folder(folder_path):
     for filename in os.listdir(folder_path):
         file_path = os.path.join(folder_path, filename)
voice.py CHANGED
@@ -8,13 +8,13 @@ import torch
 import xml.etree.ElementTree as ET
 import config
 import logging
+import soundfile as sf
 from torch import no_grad, LongTensor, inference_mode, FloatTensor
 from io import BytesIO
 from graiax import silkcoder
-from utils.nlp import cut, sentence_split
-from scipy.io.wavfile import write
+from utils.nlp import sentence_split
 from mel_processing import spectrogram_torch
-from text import text_to_sequence, _clean_text
+from text import text_to_sequence
 from models import SynthesizerTrn
 from utils import utils
 
@@ -62,36 +62,15 @@ class vits:
         text_norm = LongTensor(text_norm)
         return text_norm
 
-    def get_label_value(self, label, default, warning_name='value', text=""):
-        value = re.search(rf'\[{label}=(.+?)\]', text)
-        if value:
-            try:
-                text = re.sub(rf'\[{label}=(.+?)\]', '', text, 1)
-                value = float(value.group(1))
-            except:
-                print(f'Invalid {warning_name}!')
-                sys.exit(1)
-        else:
-            value = default
-        if text == "":
-            return value
-        else:
-            return value, text
-
-    def get_label(self, text, label):
-        if f'[{label}]' in text:
-            return True, text.replace(f'[{label}]', '')
-        else:
-            return False, text
-
     def get_cleaner(self):
         return getattr(self.hps_ms.data, 'text_cleaners', [None])[0]
 
-    def return_speakers(self, escape=False):
+    def get_speakers(self, escape=False):
         return self.speakers
 
     def infer(self, params):
         emotion = params.get("emotion", None)
+        emotion = emotion.to(device) if emotion != None else None
 
         with no_grad():
             x_tst = params.get("stn_tst").unsqueeze(0)
@@ -101,21 +80,16 @@
                                         noise_scale=params.get("noise_scale"),
                                         noise_scale_w=params.get("noise_scale_w"),
                                         length_scale=params.get("length_scale"),
-                                        emotion_embedding=emotion.to(device) if emotion != None else None)[0][
-                0, 0].data.float().cpu().numpy()
+                                        emotion_embedding=emotion)[0][0, 0].data.float().cpu().numpy()
 
             torch.cuda.empty_cache()
+
             return audio
 
-    def get_infer_param(self, length, noise, noisew, text=None, speaker_id=None, audio_path=None,
-                        emotion=None):
+    def get_infer_param(self, length_scale, noise_scale, noise_scale_w, text=None, speaker_id=None, audio_path=None,
+                        emotion=None, cleaned=False, f0_scale=1):
         emo = None
         if self.model_type != "hubert":
-            length_scale, text = self.get_label_value('LENGTH', length, 'length scale', text)
-            noise_scale, text = self.get_label_value('NOISE', noise, 'noise scale', text)
-            noise_scale_w, text = self.get_label_value('NOISEW', noisew, 'deviation of noise', text)
-            cleaned, text = self.get_label(text, 'CLEANED')
-
             stn_tst = self.get_cleaned_text(text, self.hps_ms, cleaned=cleaned)
             sid = LongTensor([speaker_id])
 
@@ -137,22 +111,14 @@
 
         elif self.model_type == "hubert":
             if self.use_f0:
-                audio, sampling_rate = librosa.load(
-                    audio_path, sr=self.hps_ms.data.sampling_rate, mono=True)
-                audio16000 = librosa.resample(
-                    audio, orig_sr=sampling_rate, target_sr=16000)
+                audio, sampling_rate = librosa.load(audio_path, sr=self.hps_ms.data.sampling_rate, mono=True)
+                audio16000 = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
             else:
-                audio16000, sampling_rate = librosa.load(
-                    audio_path, sr=16000, mono=True)
-
-            length_scale = self.get_label_value('LENGTH', length, 'length scale')
-            noise_scale = self.get_label_value('NOISE', noise, 'noise scale')
-            noise_scale_w = self.get_label_value('NOISEW', noisew, 'deviation of noise')
+                audio16000, sampling_rate = librosa.load(audio_path, sr=16000, mono=True)
 
             with inference_mode():
                 units = self.hubert.units(FloatTensor(audio16000).unsqueeze(0).unsqueeze(0)).squeeze(0).numpy()
                 if self.use_f0:
-                    f0_scale = self.get_label_value('F0', 1, 'f0 scale')
                     f0 = librosa.pyin(audio,
                                       sr=sampling_rate,
                                       fmin=librosa.note_to_hz('C0'),
@@ -168,6 +134,7 @@
         params = {"length_scale": length_scale, "noise_scale": noise_scale,
                   "noise_scale_w": noise_scale_w, "stn_tst": stn_tst,
                   "sid": sid, "emotion": emo}
+
        return params
 
     def get_audio(self, voice, auto_break=False):
@@ -193,10 +160,10 @@
             sentence_list = sentence_split(text, max, lang, speaker_lang)
             for sentence in sentence_list:
                 tasks.append(
-                    self.get_infer_param(text=sentence, speaker_id=speaker_id, length=length, noise=noise,
-                                         noisew=noisew))
-            audios = []
+                    self.get_infer_param(text=sentence, speaker_id=speaker_id, length_scale=length, noise_scale=noise,
+                                         noise_scale_w=noisew))
 
+            audios = []
             for task in tasks:
                 audios.append(self.infer(task))
                 if auto_break:
@@ -205,16 +172,16 @@
             audio = np.concatenate(audios, axis=0)
 
         elif self.model_type == "hubert":
-            params = self.get_infer_param(speaker_id=speaker_id, length=length, noise=noise, noisew=noisew,
-                                          audio_path=audio_path)
+            params = self.get_infer_param(speaker_id=speaker_id, length_scale=length, noise_scale=noise,
+                                          noise_scale_w=noisew, audio_path=audio_path)
             audio = self.infer(params)
 
         elif self.model_type == "w2v2":
             sentence_list = sentence_split(text, max, lang, speaker_lang)
             for sentence in sentence_list:
                 tasks.append(
-                    self.get_infer_param(text=sentence, speaker_id=speaker_id, length=length, noise=noise,
-                                         noisew=noisew, emotion=emotion))
+                    self.get_infer_param(text=sentence, speaker_id=speaker_id, length_scale=length, noise_scale=noise,
+                                         noise_scale_w=noisew, emotion=emotion))
 
             audios = []
             for task in tasks:
@@ -265,6 +232,12 @@ class TTS:
         self._hubert_speakers_count = len(self._voice_speakers["HUBERT-VITS"])
         self._w2v2_speakers_count = len(self._voice_speakers["W2V2-VITS"])
         self.dem = None
+
+        # Initialization information
+        self.logger = logging.getLogger("vits-simple-api")
+        self.logger.info(f"torch:{torch.__version__} cuda_available:{torch.cuda.is_available()}")
+        self.logger.info(f'device:{device} device.type:{device.type}')
+
         if getattr(config, "DIMENSIONAL_EMOTION_MODEL", None) != None:
             try:
                 import audonnx
@@ -274,10 +247,6 @@
             except Exception as e:
                 self.logger.warning(f"Load DIMENSIONAL_EMOTION_MODEL failed {e}")
 
-        # Initialization information
-        self.logger = logging.getLogger("vits-simple-api")
-        self.logger.info(f"torch:{torch.__version__} cuda_available:{torch.cuda.is_available()}")
-        self.logger.info(f'device:{device} device.type:{device.type}')
         if self._vits_speakers_count != 0: self.logger.info(f"[VITS] {self._vits_speakers_count} speakers")
         if self._hubert_speakers_count != 0: self.logger.info(f"[hubert] {self._hubert_speakers_count} speakers")
         if self._w2v2_speakers_count != 0: self.logger.info(f"[w2v2] {self._w2v2_speakers_count} speakers")
@@ -307,19 +276,23 @@
 
     def encode(self, sampling_rate, audio, format):
         with BytesIO() as f:
-            write(f, sampling_rate, audio)
             if format.upper() == 'OGG':
-                with BytesIO() as o:
-                    utils.wav2ogg(f, o)
-                    return BytesIO(o.getvalue())
+                sf.write(f, audio, sampling_rate, format="ogg")
+                return BytesIO(f.getvalue())
             elif format.upper() == 'SILK':
+                sf.write(f, audio, sampling_rate, format="wav")
                 return BytesIO(silkcoder.encode(f))
             elif format.upper() == 'MP3':
-                with BytesIO() as o:
-                    utils.wav2mp3(f, o)
-                    return BytesIO(o.getvalue())
+                sf.write(f, audio, sampling_rate, format="mp3")
+                return BytesIO(f.getvalue())
            elif format.upper() == 'WAV':
+                sf.write(f, audio, sampling_rate, format="wav")
                 return BytesIO(f.getvalue())
+            elif format.upper() == 'FLAC':
+                sf.write(f, audio, sampling_rate, format="flac")
+                return BytesIO(f.getvalue())
+            else:
+                raise ValueError(f"Unsupported format:{format}")
 
     def convert_time_string(self, time_string):
         time_value = float(re.findall(r'\d+\.?\d*', time_string)[0])
@@ -424,36 +397,40 @@
                 raise ValueError(f"Unsupported model: {voice.get('model')}")
             voice_obj = self._voice_obj[model][voice.get("id")][1]
             voice["id"] = self._voice_obj[model][voice.get("id")][0]
-
-            audios.append(voice_obj.get_audio(voice))
+            audio = voice_obj.get_audio(voice)
+            audios.append(audio)
 
         audio = np.concatenate(audios, axis=0)
+        output = self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
 
-        return self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format), format
+        return output, format
 
     def vits_infer(self, voice):
         format = voice.get("format", "wav")
         voice_obj = self._voice_obj["VITS"][voice.get("id")][1]
         voice["id"] = self._voice_obj["VITS"][voice.get("id")][0]
         audio = voice_obj.get_audio(voice, auto_break=True)
+        output = self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
 
-        return self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
+        return output
 
     def hubert_vits_infer(self, voice):
         format = voice.get("format", "wav")
         voice_obj = self._voice_obj["HUBERT-VITS"][voice.get("id")][1]
         voice["id"] = self._voice_obj["HUBERT-VITS"][voice.get("id")][0]
         audio = voice_obj.get_audio(voice)
+        output = self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
 
-        return self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
+        return output
 
     def w2v2_vits_infer(self, voice):
         format = voice.get("format", "wav")
         voice_obj = self._voice_obj["W2V2-VITS"][voice.get("id")][1]
         voice["id"] = self._voice_obj["W2V2-VITS"][voice.get("id")][0]
         audio = voice_obj.get_audio(voice, auto_break=True)
+        output = self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
 
-        return self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
+        return output
 
     def vits_voice_conversion(self, voice):
         original_id = voice.get("original_id")
@@ -471,8 +448,9 @@
 
         voice_obj = self._voice_obj["VITS"][original_id][1]
         audio = voice_obj.voice_conversion(voice)
+        output = self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
 
-        return self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
+        return output
 
     def get_dimensional_emotion_npy(self, audio):
         if self.dem is None:
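
A minimal sketch of the in-memory encoding approach that the rewritten encode() relies on: soundfile can write OGG/FLAC/WAV (and, with a recent libsndfile, MP3) directly into a BytesIO when an explicit format is given. The test tone and helper below are illustrative, not the project's exact method.

```python
import numpy as np
import soundfile as sf
from io import BytesIO

sampling_rate = 22050
# One second of a 440 Hz sine as a stand-in for synthesized audio.
t = np.linspace(0, 1, sampling_rate, endpoint=False)
audio = 0.3 * np.sin(2 * np.pi * 440 * t).astype(np.float32)

def encode(audio, sampling_rate, fmt):
    buf = BytesIO()
    # format must be passed explicitly because a BytesIO has no file extension.
    sf.write(buf, audio, sampling_rate, format=fmt)
    buf.seek(0)
    return buf

for fmt in ("WAV", "FLAC", "OGG"):
    data = encode(audio, sampling_rate, fmt).getvalue()
    print(fmt, len(data), "bytes")
```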