feat add onnx model

Files changed:
- README.md  (+72 −4)
- model.onnx (+3 −0)

README.md  CHANGED
@@ -34,12 +34,13 @@ https://huggingface.co/spaces/utrobinmv/tts_ru_free_hf_vits_low_multispeaker
 
 
 
-Usage example:
+Usage example using PyTorch:
 
 ```python
 from transformers import VitsModel, AutoTokenizer, set_seed
 import torch
 import scipy
+from ruaccent import RUAccent
 
 device = 'cuda' # 'cpu' or 'cuda'
 
@@ -54,8 +55,6 @@ model = VitsModel.from_pretrained(model_name).to(device)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model.eval()
 
-from ruaccent import RUAccent
-
 # load accentizer
 accentizer = RUAccent()
 accentizer.load(omograph_model_size='turbo', use_dictionary=True, device=device)
@@ -84,7 +83,8 @@ print(text)
 inputs = tokenizer(text, return_tensors="pt")
 
 with torch.no_grad():
-    output = model(**inputs.to(device), speaker_id=speaker).waveform
+    output = model(**inputs.to(device), speaker_id=speaker).waveform
+    output = output.detach().cpu().numpy()
 
 scipy.io.wavfile.write("tts_audio.wav", rate=model.config.sampling_rate,
                        data=output[0])
@@ -102,6 +102,74 @@ Audio(output, rate=model.config.sampling_rate)
 
 ##
 
+Usage example using ONNX:
+
+First copy the model.onnx file to the folder "tts_ru_free_hf_vits_low_multispeaker".
+
+```python
+import numpy as np
+import scipy
+import onnxruntime
+from ruaccent import RUAccent
+from transformers import AutoTokenizer
+
+speaker = 0 # 0-woman, 1-man
+
+# load model
+model_path = "tts_ru_free_hf_vits_low_multispeaker/model.onnx"
+
+sess_options = onnxruntime.SessionOptions()
+model = onnxruntime.InferenceSession(model_path, sess_options=sess_options)
+tokenizer = AutoTokenizer.from_pretrained("utrobinmv/tts_ru_free_hf_vits_low_multispeaker")
+
+# text
+text = """Ночью двадцать третьего июня начал извергаться самый высокий
+действующий вулкан в Евразии - Кл+ючевской. Об этом сообщила руководитель
+Камчатской группы реагирования на вулканические извержения, ведущий
+научный сотрудник Института вулканологии и сейсмологии ДВО РАН Ольга Гирина.
+«Зафиксированное ночью не просто свечение, а вершинное эксплозивное
+извержение стромболианского типа. Пока такое извержение никому не опасно:
+ни населению, ни авиации» пояснила ТАСС госпожа Гирина."""
+
+# load accentizer
+accentizer = RUAccent()
+accentizer.load(omograph_model_size='turbo', use_dictionary=True)
+
+# place the accents
+text = accentizer.process_all(text)
+
+# inference
+inputs = tokenizer(text, return_tensors="np")
+sid = np.array([speaker])
+sampling_rate = 16000
+
+output = model.run(
+    None,
+    {
+        "input_ids": inputs['input_ids'],
+        "attention_mask": inputs['attention_mask'],
+        "sid": sid,
+    },
+)[0]
+
+scipy.io.wavfile.write("tts_audio.wav", rate=sampling_rate,
+                       data=output[0])
+```
+
+
+
+To play the audio in a Jupyter Notebook / Google Colab:
+
+```python
+from IPython.display import Audio
+
+Audio(output, rate=sampling_rate)
+```
+
+##
+
+
+
 ## Languages covered
 
 Russian (ru_RU)
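The new ONNX example assumes model.onnx has already been copied into a local "tts_ru_free_hf_vits_low_multispeaker" folder. Below is a minimal sketch of one way to fetch it and to double-check the input names the example feeds to model.run; the huggingface_hub download helper is an assumption for illustration and is not part of this commit (a git-lfs clone of the repository works just as well).

```python
# Sketch only: fetch model.onnx into the folder the README example expects,
# then list the ONNX graph inputs to confirm the feed names used above.
from huggingface_hub import hf_hub_download  # assumed helper, not part of this commit
import onnxruntime

model_path = hf_hub_download(
    repo_id="utrobinmv/tts_ru_free_hf_vits_low_multispeaker",
    filename="model.onnx",
    local_dir="tts_ru_free_hf_vits_low_multispeaker",
)

sess = onnxruntime.InferenceSession(model_path)
for inp in sess.get_inputs():
    print(inp.name, inp.shape)  # expected: input_ids, attention_mask, sid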
model.onnx  ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:03ee7618a7c7930dde43489153a6e21f6619eec6a5497aed551ddd29b10eb15c
+size 50849741
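model.onnx is stored through Git LFS, so the repository itself only holds the pointer above; the actual ~50 MB weights are fetched on clone or download. A quick sanity check that a downloaded copy matches this pointer is to compare its SHA-256 digest with the oid, sketched here with the standard library (the local path is an assumption taken from the README example):

```python
# Verify a downloaded model.onnx against the LFS pointer in this commit.
import hashlib

expected_oid = "03ee7618a7c7930dde43489153a6e21f6619eec6a5497aed551ddd29b10eb15c"

with open("tts_ru_free_hf_vits_low_multispeaker/model.onnx", "rb") as f:
    digest = hashlib.sha256(f.read()).hexdigest()

print(digest == expected_oid)  # True for a correct download (size 50849741 bytes)
```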