yuekai commited on
Commit
c8eb611
1 Parent(s): c1b325d

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ See https://github.com/modelscope/FunASR/tree/main/runtime/triton_gpu
model_repo_sense_voice_small/encoder/1/.gitkeep ADDED
File without changes
model_repo_sense_voice_small/encoder/1/model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07fe6dd7a4765c64dd63e01d0bba340d3c1eefa3b591553060fb231e2d7cd874
3
+ size 937424191
model_repo_sense_voice_small/encoder/config.pbtxt ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ name: "encoder"
16
+ backend: "onnxruntime"
17
+ default_model_filename: "model.onnx"
18
+
19
+ max_batch_size: 16
20
+
21
+ input [
22
+ {
23
+ name: "speech"
24
+ data_type: TYPE_FP32
25
+ dims: [-1, 560]
26
+ },
27
+ {
28
+ name: "speech_lengths"
29
+ data_type: TYPE_INT32
30
+ dims: [1]
31
+ reshape: { shape: [ ] }
32
+ },
33
+ {
34
+ name: "language"
35
+ data_type: TYPE_INT32
36
+ dims: [1]
37
+ reshape: { shape: [ ] }
38
+ },
39
+ {
40
+ name: "textnorm"
41
+ data_type: TYPE_INT32
42
+ dims: [1]
43
+ reshape: { shape: [ ] }
44
+ }
45
+ ]
46
+
47
+ output [
48
+ {
49
+ name: "ctc_logits"
50
+ data_type: TYPE_FP32
51
+ dims: [-1, 25055]
52
+ },
53
+ {
54
+ name: "encoder_out_lens"
55
+ data_type: TYPE_INT32
56
+ dims: [1]
57
+ reshape: { shape: [ ] }
58
+ }
59
+ ]
60
+
61
+ dynamic_batching {
62
+ }
63
+ parameters { key: "cudnn_conv_algo_search" value: { string_value: "2" } }
64
+
65
+ instance_group [
66
+ {
67
+ count: 1
68
+ kind: KIND_GPU
69
+ }
70
+ ]
71
+
model_repo_sense_voice_small/feature_extractor/1/__pycache__/model.cpython-310.pyc ADDED
Binary file (9.94 kB). View file
 
model_repo_sense_voice_small/feature_extractor/1/model.py ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ #
3
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ import math
17
+ import triton_python_backend_utils as pb_utils
18
+ from torch.utils.dlpack import to_dlpack
19
+ import torch
20
+ import numpy as np
21
+ import kaldifeat
22
+ import _kaldifeat
23
+ from typing import List
24
+ import json
25
+ import yaml
26
+ from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
27
+
28
+
29
class LFR(torch.nn.Module):
    """Batch LFR: https://github.com/Mddct/devil-asr/blob/main/patch/lfr.py"""

    def __init__(self, m: int = 7, n: int = 6) -> None:
        """
        Set up frame stacking (``m`` frames per output) with stride ``n``.

        m = 1 and n = 1: the original features are returned.
        m = 1 and n > 1: behaves like frame skipping.
        m > 1 and n = 1: behaves like stacking, right frames only.
        m > 1 and n > 1: behaves like LFR.
        """
        super().__init__()

        self.m = m
        self.n = n

        # (m - 1) // 2 is already an int, so the ceil does not change the value.
        self.left_padding_nums = math.ceil((self.m - 1) // 2)

    def forward(
        self, input_tensor: torch.Tensor, input_lens: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Stack ``m`` consecutive frames every ``n`` frames for a padded batch.

        Args:
            input_tensor: 3-D tensor unpacked as (B, T, D).
            input_lens: per-item frame counts; used as a gather index, so it
                must be an integer tensor torch.gather accepts.

        Returns:
            (stacked, new_len): ``stacked`` has shape (B, windows, D * m);
            ``new_len`` is ``T_all // n`` per item (a floating tensor, because
            it derives from ``torch.ceil(input_lens / n)``).
        """
        batch, _, feat_dim = input_tensor.size()
        left = self.left_padding_nums

        n_lfr = torch.ceil(input_lens / self.n)
        prepad_nums = input_lens + left

        # Frames that remain for the final window once left padding is added.
        last_window = prepad_nums - self.n * (n_lfr - 1)
        right_padding_nums = torch.where(
            self.m >= last_window,
            self.m - last_window,
            0,
        )

        T_all = left + input_lens + right_padding_nums
        new_len = T_all // self.n
        total = int(T_all.max())

        # Replicate the first frame on the left and each item's last valid
        # frame on the right.
        last_valid = (input_lens - 1).view(batch, 1, 1).repeat(1, 1, feat_dim)  # [B,1,D]
        tail_frames = torch.gather(input_tensor, 1, last_valid)
        tail_frames = tail_frames.repeat(1, int(right_padding_nums.max()), 1)
        head_frames = input_tensor[:, 0:1, :].repeat(1, left, 1)

        padded = torch.cat([head_frames, input_tensor, tail_frames], dim=1)

        positions = (
            torch.arange(total, device=padded.device, dtype=input_lens.dtype)
            .unsqueeze(0)
            .repeat(batch, 1)
        )  # [B, total]
        in_range = positions < (left + input_lens).unsqueeze(1)  # [B, total]

        # Positions past an item's (left padding + length) read the frame at
        # index total - 1. The original OR-ed in a second mask that was a
        # subset of `in_range`, so `in_range` alone selects the same indices.
        fallback = torch.full_like(positions, total - 1)
        gather_idx = torch.where(in_range, positions, fallback)
        padded = torch.gather(padded, 1, gather_idx.unsqueeze(2).repeat(1, 1, feat_dim))

        windows = padded.unfold(1, self.m, step=self.n).transpose(2, 3)

        return windows.reshape(batch, -1, feat_dim * self.m), new_len
95
+
96
+
97
class WavFrontend:
    """Conventional frontend structure for ASR.

    Stores the feature-extraction settings, owns an ``LFR`` stacker built from
    ``lfr_m``/``lfr_n``, and loads CMVN statistics when ``cmvn_file`` is given.
    """

    def __init__(
        self,
        cmvn_file: str = None,
        fs: int = 16000,
        window: str = "hamming",
        n_mels: int = 80,
        frame_length: int = 25,
        frame_shift: int = 10,
        filter_length_min: int = -1,
        filter_length_max: float = -1,
        lfr_m: int = 7,
        lfr_n: int = 6,
        dither: float = 1.0,
    ) -> None:
        """Record the settings and, if ``cmvn_file`` is truthy, parse it."""
        self.fs = fs
        self.window = window
        self.n_mels = n_mels
        self.frame_length = frame_length
        self.frame_shift = frame_shift
        self.filter_length_min = filter_length_min
        self.filter_length_max = filter_length_max
        self.lfr_m = lfr_m
        self.lfr_n = lfr_n
        self.lfr = LFR(lfr_m, lfr_n)
        self.cmvn_file = cmvn_file
        self.dither = dither

        # Always define the attribute so a missing CMVN file produces a clear
        # error in apply_cmvn_batch instead of an AttributeError.
        self.cmvn = self.load_cmvn() if self.cmvn_file else None

    def apply_cmvn_batch(self, inputs: torch.Tensor) -> torch.Tensor:
        """
        Apply CMVN as ``(inputs + shift) * scale`` over the last dimension.

        Args:
            inputs: feature tensor whose last dimension is the feature size;
                only the first ``dim`` CMVN columns are used.

        Returns:
            The normalized tensor. The statistics are float64, so the result
            is promoted accordingly (same as the previous tiled version).

        Raises:
            ValueError: if no CMVN statistics have been loaded.
        """
        cmvn = getattr(self, "cmvn", None)
        if cmvn is None:
            raise ValueError("CMVN statistics are not loaded; provide `cmvn_file`.")

        dim = inputs.shape[-1]
        # Broadcasting over the leading dims replaces the per-frame np.tile.
        shift = torch.from_numpy(cmvn[0, :dim]).to(inputs.device)
        scale = torch.from_numpy(cmvn[1, :dim]).to(inputs.device)
        return (inputs + shift) * scale

    def load_cmvn(
        self,
    ) -> np.ndarray:
        """
        Parse ``self.cmvn_file`` into a (2, dim) float64 array.

        The line after ``<AddShift>`` supplies row 0 and the line after
        ``<Rescale>`` supplies row 1; in both cases that line must start with
        ``<LearnRateCoef>`` and its values are tokens 3 through the
        second-to-last (the surrounding brackets are dropped).

        Raises:
            ValueError: if either section is missing or empty.
        """
        with open(self.cmvn_file, "r", encoding="utf-8") as f:
            lines = f.readlines()

        sections = {"<AddShift>": [], "<Rescale>": []}
        for i, line in enumerate(lines):
            tokens = line.split()
            # Blank lines used to raise IndexError on tokens[0].
            if not tokens or tokens[0] not in sections:
                continue
            # A section header on the last line has no data line to read.
            if i + 1 >= len(lines):
                continue
            data = lines[i + 1].split()
            if data and data[0] == "<LearnRateCoef>":
                sections[tokens[0]] = data[3:-1]

        means_list = sections["<AddShift>"]
        scales_list = sections["<Rescale>"]
        if not means_list or not scales_list:
            raise ValueError(
                f"Could not find <AddShift>/<Rescale> statistics in {self.cmvn_file}"
            )

        means = np.array(means_list).astype(np.float64)
        scales = np.array(scales_list).astype(np.float64)
        return np.array([means, scales])
172
+
173
+
174
class Fbank(torch.nn.Module):
    """``torch.nn.Module`` wrapper that delegates to a ``kaldifeat.Fbank``."""

    def __init__(self, opts):
        """Build the wrapped ``kaldifeat.Fbank`` from ``opts`` (passed through as-is)."""
        super().__init__()
        self.fbank = kaldifeat.Fbank(opts)

    def forward(self, waves: List[torch.Tensor]):
        """Call the wrapped fbank computer on ``waves`` and return its result."""
        features = self.fbank(waves)
        return features
181
+
182
+
183
class TritonPythonModel:
    """Your Python model must use the same class name. Every Python model
    that is created must have "TritonPythonModel" as the class name.

    This model turns batched waveforms into LFR-stacked, CMVN-normalized
    fbank features ("speech") plus their lengths ("speech_lengths").
    """

    @staticmethod
    def _required_param(params, name):
        """Return the string value of model-config parameter ``name`` or raise ValueError."""
        try:
            return str(params[name]["string_value"])
        except KeyError as err:
            raise ValueError(
                f"Missing required model parameter '{name}' in config.pbtxt"
            ) from err

    def initialize(self, args):
        """`initialize` is called only once when the model is being loaded.
        Implementing `initialize` function is optional. This function allows
        the model to initialize any state associated with this model.

        Parameters
        ----------
        args : dict
          Both keys and values are strings. The dictionary keys and values are:
          * model_config: A JSON string containing the model configuration
          * model_instance_kind: A string containing model instance kind
          * model_instance_device_id: A string containing model instance device ID
          * model_repository: Model repository path
          * model_version: Model version
          * model_name: Model name

        Raises
        ------
        ValueError
          If the "config_path" or "cmvn_path" parameter is missing.
        """
        self.model_config = model_config = json.loads(args["model_config"])
        self.max_batch_size = max(model_config["max_batch_size"], 1)
        # NOTE(review): the device is fixed to the default CUDA device;
        # args["model_instance_device_id"] is not consulted - confirm this is
        # intended for multi-GPU deployments.
        self.device = "cuda"

        # "speech" output: anything other than FP32 is treated as FP16.
        output0_config = pb_utils.get_output_config_by_name(model_config, "speech")
        output0_dtype = pb_utils.triton_string_to_numpy(output0_config["data_type"])
        self.output0_dtype = torch.float32 if output0_dtype == np.float32 else torch.float16

        # "speech_lengths" output: keep the numpy dtype from the config.
        output1_config = pb_utils.get_output_config_by_name(model_config, "speech_lengths")
        self.output1_dtype = pb_utils.triton_string_to_numpy(output1_config["data_type"])

        # Look the two paths up by name so the result does not depend on the
        # iteration order of the parameter map, and a missing entry fails
        # with a clear message.
        params = self.model_config["parameters"]
        config_path = self._required_param(params, "config_path")
        cmvn_path = self._required_param(params, "cmvn_path")

        with open(config_path, "rb") as f:
            # NOTE(review): yaml.Loader can construct arbitrary Python objects;
            # only point config_path at trusted files (or switch to
            # yaml.safe_load once confirmed the config needs no custom tags).
            config = yaml.load(f, Loader=yaml.Loader)
        frontend_conf = config["frontend_conf"]
        frontend_conf["cmvn_file"] = cmvn_path

        opts = kaldifeat.FbankOptions()
        # Honor an optional `dither` entry in frontend_conf; the default of 1.0
        # matches the previously hard-coded value.
        opts.frame_opts.dither = float(frontend_conf.get("dither", 1.0))
        opts.frame_opts.window_type = frontend_conf["window"]
        opts.mel_opts.num_bins = int(frontend_conf["n_mels"])
        opts.frame_opts.frame_shift_ms = float(frontend_conf["frame_shift"])
        opts.frame_opts.frame_length_ms = float(frontend_conf["frame_length"])
        opts.frame_opts.samp_freq = int(frontend_conf["fs"])
        opts.device = torch.device(self.device)
        self.opts = opts
        self.feature_extractor = Fbank(self.opts)
        self.feature_size = opts.mel_opts.num_bins

        self.frontend = WavFrontend(**frontend_conf)

    def extract_feat(self, waveform_list: List[np.ndarray]) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute padded fbank features, then apply LFR stacking and CMVN.

        Parameters
        ----------
        waveform_list : list of np.ndarray
          One array per utterance; each is flattened to 1-D before use.

        Returns
        -------
        (feats, feats_len)
          ``feats`` is cast to ``self.output0_dtype``; ``feats_len`` is int32.
        """
        # reshape(-1) (rather than squeeze()) keeps a one-sample waveform 1-D.
        wavs = [
            torch.from_numpy(waveform).float().reshape(-1).to(self.device)
            for waveform in waveform_list
        ]

        features = self.feature_extractor(wavs)
        features_len = [feature.shape[0] for feature in features]

        # Zero-pad every utterance to the longest one in this batch.
        speech = torch.zeros(
            (len(features), max(features_len), self.opts.mel_opts.num_bins),
            dtype=self.output0_dtype,
            device=self.device,
        )
        for i, feature in enumerate(features):
            speech[i, : int(features_len[i])] = feature
        speech_lens = torch.tensor(features_len, dtype=torch.int64).to(self.device)

        feats, feats_len = self.frontend.lfr(speech, speech_lens)
        feats_len = feats_len.type(torch.int32)

        feats = self.frontend.apply_cmvn_batch(feats)
        feats = feats.type(self.output0_dtype)

        return feats, feats_len

    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference is requested
        for this model.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """
        batch_count = []
        total_waves = []
        responses = []
        for request in requests:
            wav_tensor = pb_utils.get_input_tensor_by_name(request, "wav")

            # Scale by 2**15 - presumably from [-1, 1] floats to the int16
            # amplitude range the fbank computation expects; confirm.
            cur_b_wav = wav_tensor.as_numpy() * (1 << 15)  # b x -1
            # The "wav_lens" input is deliberately not used to strip padding:
            # trimming would give requests different lengths, which may stop
            # the encoder from batching them.
            batch_count.append(cur_b_wav.shape[0])

            # Split the b x -1 array into b arrays of shape 1 x -1.
            total_waves.extend(
                np.expand_dims(cur_b_wav[i], 0) for i in range(cur_b_wav.shape[0])
            )

        if not total_waves:
            return responses

        features, feats_len = self.extract_feat(total_waves)

        # Hand each request back its own rows of the jointly computed batch.
        start = 0
        for batch in batch_count:
            speech = features[start : start + batch]
            speech_lengths = feats_len[start : start + batch].unsqueeze(1)

            speech, speech_lengths = speech.cpu(), speech_lengths.cpu()

            out0 = pb_utils.Tensor.from_dlpack("speech", to_dlpack(speech))
            out1 = pb_utils.Tensor.from_dlpack("speech_lengths", to_dlpack(speech_lengths))
            responses.append(pb_utils.InferenceResponse(output_tensors=[out0, out1]))
            start += batch

        return responses
model_repo_sense_voice_small/feature_extractor/am.mvn ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <Nnet>
2
+ <Splice> 560 560
3
+ [ 0 ]
4
+ <AddShift> 560 560
5
+ <LearnRateCoef> 0 [ -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 
-13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 
-8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 ]
6
+ <Rescale> 560 560
7
+ <LearnRateCoef> 0 [ 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 
0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 
0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 ]
8
+ </Nnet>
model_repo_sense_voice_small/feature_extractor/config.pbtxt ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ name: "feature_extractor"
16
+ backend: "python"
17
+ max_batch_size: 16
18
+
19
+ parameters [
20
+ {
21
+ key: "num_mel_bins",
22
+ value: { string_value: "80"}
23
+ },
24
+ {
25
+ key: "frame_shift_in_ms"
26
+ value: { string_value: "10"}
27
+ },
28
+ {
29
+ key: "frame_length_in_ms"
30
+ value: { string_value: "25"}
31
+ },
32
+ {
33
+ key: "sample_rate"
34
+ value: { string_value: "16000"}
35
+ },
36
+ {
37
+ key: "cmvn_path"
38
+ value: { string_value: "./model_repo_sense_voice_small/feature_extractor/am.mvn"}
39
+ },
40
+ {
41
+ key: "config_path"
42
+ value: { string_value: "./model_repo_sense_voice_small/feature_extractor/config.yaml"}
43
+ }
44
+
45
+ ]
46
+
47
+ input [
48
+ {
49
+ name: "wav"
50
+ data_type: TYPE_FP32
51
+ dims: [-1]
52
+ },
53
+ {
54
+ name: "wav_lens"
55
+ data_type: TYPE_INT32
56
+ dims: [1]
57
+ }
58
+ ]
59
+
60
+ output [
61
+ {
62
+ name: "speech"
63
+ data_type: TYPE_FP32
64
+ dims: [-1, 560] # 560 = 80 mel bins (n_mels) x 7 stacked LFR frames (lfr_m)
65
+ },
66
+ {
67
+ name: "speech_lengths"
68
+ data_type: TYPE_INT32
69
+ dims: [1]
70
+ }
71
+ ]
72
+
73
+ dynamic_batching {
74
+ }
75
+
76
+ instance_group [
77
+ {
78
+ count: 2
79
+ kind: KIND_GPU
80
+ }
81
+ ]
model_repo_sense_voice_small/feature_extractor/config.yaml ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ encoder: SenseVoiceEncoderSmall
2
+ encoder_conf:
3
+ output_size: 512
4
+ attention_heads: 4
5
+ linear_units: 2048
6
+ num_blocks: 50
7
+ tp_blocks: 20
8
+ dropout_rate: 0.1
9
+ positional_dropout_rate: 0.1
10
+ attention_dropout_rate: 0.1
11
+ input_layer: pe
12
+ pos_enc_class: SinusoidalPositionEncoder
13
+ normalize_before: true
14
+ kernel_size: 11
15
+ sanm_shfit: 0
16
+ selfattention_layer_type: sanm
17
+
18
+
19
+ model: SenseVoiceSmall
20
+ model_conf:
21
+ length_normalized_loss: true
22
+ sos: 1
23
+ eos: 2
24
+ ignore_id: -1
25
+
26
+ tokenizer: SentencepiecesTokenizer
27
+ tokenizer_conf:
28
+ bpemodel: null
29
+ unk_symbol: <unk>
30
+ split_with_space: true
31
+
32
+ frontend: WavFrontend
33
+ frontend_conf:
34
+ fs: 16000
35
+ window: hamming
36
+ n_mels: 80
37
+ frame_length: 25
38
+ frame_shift: 10
39
+ lfr_m: 7
40
+ lfr_n: 6
41
+ cmvn_file: null
42
+
43
+
44
+ dataset: SenseVoiceCTCDataset
45
+ dataset_conf:
46
+ index_ds: IndexDSJsonl
47
+ batch_sampler: EspnetStyleBatchSampler
48
+ data_split_num: 32
49
+ batch_type: token
50
+ batch_size: 14000
51
+ max_token_length: 2000
52
+ min_token_length: 60
53
+ max_source_length: 2000
54
+ min_source_length: 60
55
+ max_target_length: 200
56
+ min_target_length: 0
57
+ shuffle: true
58
+ num_workers: 4
59
+ sos: ${model_conf.sos}
60
+ eos: ${model_conf.eos}
61
+ IndexDSJsonl: IndexDSJsonl
62
+ retry: 20
63
+
64
+ train_conf:
65
+ accum_grad: 1
66
+ grad_clip: 5
67
+ max_epoch: 20
68
+ keep_nbest_models: 10
69
+ avg_nbest_model: 10
70
+ log_interval: 100
71
+ resume: true
72
+ validate_interval: 10000
73
+ save_checkpoint_interval: 10000
74
+
75
+ optim: adamw
76
+ optim_conf:
77
+ lr: 0.00002
78
+ scheduler: warmuplr
79
+ scheduler_conf:
80
+ warmup_steps: 25000
81
+
82
+ specaug: SpecAugLFR
83
+ specaug_conf:
84
+ apply_time_warp: false
85
+ time_warp_window: 5
86
+ time_warp_mode: bicubic
87
+ apply_freq_mask: true
88
+ freq_mask_width_range:
89
+ - 0
90
+ - 30
91
+ lfr_rate: 6
92
+ num_freq_mask: 1
93
+ apply_time_mask: true
94
+ time_mask_width_range:
95
+ - 0
96
+ - 12
97
+ num_time_mask: 1
model_repo_sense_voice_small/scoring/1/__pycache__/model.cpython-310.pyc ADDED
Binary file (4.32 kB). View file
 
model_repo_sense_voice_small/scoring/1/model.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ #
3
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import triton_python_backend_utils as pb_utils
18
+ import numpy as np
19
+ import torch
20
+ from torch.utils.dlpack import from_dlpack
21
+
22
+ import json
23
+ import os
24
+ import yaml
25
+
26
+ import sentencepiece as spm
27
+
28
class TritonPythonModel:
    """Triton Python-backend model that turns CTC logits into transcripts.

    Triton requires every Python model to expose a class with exactly this
    name. The model greedily decodes the "ctc_logits" input and returns the
    detokenized text in the "OUTPUT0" output.
    """

    # Token ids dropped from the collapsed best path: 0 (blank) and 2 (EOS).
    _SKIPPED_TOKEN_IDS = (0, 2)

    def initialize(self, args):
        """Load the model configuration and the tokenizer.

        Called once by Triton when the model is loaded.

        Parameters
        ----------
        args : dict
          Both keys and values are strings. This method reads only
          ``args["model_config"]``, a JSON string with the model
          configuration.
        """
        self.model_config = model_config = json.loads(args["model_config"])
        # A max_batch_size of 0 (batching disabled) is treated as 1.
        self.max_batch_size = max(model_config["max_batch_size"], 1)

        # Resolve the numpy dtype used to build the OUTPUT0 tensor.
        output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0")
        self.out0_dtype = pb_utils.triton_string_to_numpy(output0_config["data_type"])

        self.init_tokenizer(self.model_config["parameters"])

    def init_tokenizer(self, parameters):
        """Create ``self.tokenizer`` from the ``tokenizer_path`` parameter.

        Parameters
        ----------
        parameters : dict
          The ``parameters`` section of the model configuration; each value
          is a dict holding the setting under ``"string_value"``.

        Raises
        ------
        ValueError
          If ``tokenizer_path`` is missing or empty. Previously this case
          surfaced as an opaque ``UnboundLocalError``.
        """
        entry = parameters.get("tokenizer_path") or {}
        tokenizer_path = entry.get("string_value")
        if not tokenizer_path:
            raise ValueError(
                "model config parameter 'tokenizer_path' is required to load "
                "the SentencePiece model"
            )
        self.tokenizer = spm.SentencePieceProcessor()
        self.tokenizer.Load(tokenizer_path)

    @staticmethod
    def _greedy_ctc_decode(logits, lengths=None, skip_ids=(0, 2)):
        """Greedy CTC decoding, applied independently to every utterance.

        Parameters
        ----------
        logits : torch.Tensor
          Scores indexed as (utterance, frame, token).
        lengths : sequence of int, optional
          Number of valid leading frames per utterance. Frames beyond that
          count are ignored. When omitted, every frame is decoded.
        skip_ids : tuple of int
          Token ids removed after repeated frames have been merged.

        Returns
        -------
        list of list of int
          One token-id sequence per utterance.
        """
        best_paths = logits.argmax(dim=-1)
        num_frames = best_paths.shape[-1]

        decoded = []
        for row, path in enumerate(best_paths):
            if lengths is not None:
                # Clamp so a malformed length can never index out of range.
                valid = min(max(int(lengths[row]), 0), num_frames)
                path = path[:valid]
            # Collapse repeats on the 1-D path of this utterance only. Calling
            # unique_consecutive with dim=-1 on the 2-D batch would compare
            # whole batch columns and leave per-utterance repeats in place.
            collapsed = torch.unique_consecutive(path).tolist()
            decoded.append([tok for tok in collapsed if tok not in skip_ids])
        return decoded

    def execute(self, requests):
        """Decode every request and return one response per request.

        Each request is decoded on its own, so requests whose logits have
        different frame counts can share one scheduler batch.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse with the same length and
          order as `requests`.
        """
        responses = []
        for request in requests:
            in_logits = pb_utils.get_input_tensor_by_name(request, "ctc_logits")
            logits = from_dlpack(in_logits.to_dlpack())

            # NOTE(review): "encoder_out_lens" is assumed to hold the number
            # of valid logit frames per utterance, so padded frames can be
            # dropped; confirm against the exported encoder. It is ignored
            # when absent or when its size does not match the batch.
            lengths = None
            in_lens = pb_utils.get_input_tensor_by_name(request, "encoder_out_lens")
            if in_lens is not None:
                lens = from_dlpack(in_lens.to_dlpack()).reshape(-1).tolist()
                if len(lens) == logits.shape[0]:
                    lengths = lens

            token_ids = self._greedy_ctc_decode(
                logits, lengths, self._SKIPPED_TOKEN_IDS
            )
            hyps = [self.tokenizer.DecodeIds(ids) for ids in token_ids]

            sents = np.array(hyps)
            out0 = pb_utils.Tensor("OUTPUT0", sents.astype(self.out0_dtype))
            responses.append(pb_utils.InferenceResponse(output_tensors=[out0]))

        return responses

    def finalize(self):
        """Log a message when Triton unloads the model.

        Called once on unload; nothing needs to be released explicitly.
        """
        print("Cleaning up...")
model_repo_sense_voice_small/scoring/chn_jpn_yue_eng_ko_spectok.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa87f86064c3730d799ddf7af3c04659151102cba548bce325cf06ba4da4e6a8
3
+ size 377341
model_repo_sense_voice_small/scoring/config.pbtxt ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ name: "scoring"
16
+ backend: "python"
17
+ max_batch_size: 16
18
+
19
+ parameters [
20
+ {
21
+ key: "tokenizer_path",
22
+ value: { string_value: "./model_repo_sense_voice_small/scoring/chn_jpn_yue_eng_ko_spectok.bpe.model"}
23
+ },
24
+ { key: "FORCE_CPU_ONLY_INPUT_TENSORS"
25
+ value: {string_value:"no"}
26
+ }
27
+ ]
28
+
29
+
30
+ input [
31
+ {
32
+ name: "ctc_logits"
33
+ data_type: TYPE_FP32
34
+ dims: [-1, 25055]
35
+ },
36
+ {
37
+ name: "encoder_out_lens"
38
+ data_type: TYPE_INT32
39
+ dims: [1]
40
+ reshape: { shape: [ ] }
41
+ }
42
+ ]
43
+
44
+ output [
45
+ {
46
+ name: "OUTPUT0"
47
+ data_type: TYPE_STRING
48
+ dims: [1]
49
+ }
50
+ ]
51
+
52
+ dynamic_batching {
53
+ }
54
+ instance_group [
55
+ {
56
+ count: 2
57
+ kind: KIND_CPU
58
+ }
59
+ ]
model_repo_sense_voice_small/sensevoice/1/.gitkeep ADDED
File without changes
model_repo_sense_voice_small/sensevoice/config.pbtxt ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ name: "sensevoice"
16
+ platform: "ensemble"
17
+ max_batch_size: 16
18
+
19
+ input [
20
+ {
21
+ name: "WAV"
22
+ data_type: TYPE_FP32
23
+ dims: [-1]
24
+ },
25
+ {
26
+ name: "WAV_LENS"
27
+ data_type: TYPE_INT32
28
+ dims: [1]
29
+ },
30
+ {
31
+ name: "LANGUAGE"
32
+ data_type: TYPE_INT32
33
+ dims: [1]
34
+ },
35
+ {
36
+ name: "TEXT_NORM"
37
+ data_type: TYPE_INT32
38
+ dims: [1]
39
+ }
40
+ ]
41
+
42
+ output [
43
+ {
44
+ name: "TRANSCRIPTS"
45
+ data_type: TYPE_STRING
46
+ dims: [1]
47
+ }
48
+ ]
49
+
50
+ ensemble_scheduling {
51
+ step [
52
+ {
53
+ model_name: "feature_extractor"
54
+ model_version: -1
55
+ input_map {
56
+ key: "wav"
57
+ value: "WAV"
58
+ }
59
+ input_map {
60
+ key: "wav_lens"
61
+ value: "WAV_LENS"
62
+ }
63
+ output_map {
64
+ key: "speech"
65
+ value: "SPEECH"
66
+ }
67
+ output_map {
68
+ key: "speech_lengths"
69
+ value: "SPEECH_LENGTHS"
70
+ }
71
+ },
72
+ {
73
+ model_name: "encoder"
74
+ model_version: -1
75
+ input_map {
76
+ key: "speech"
77
+ value: "SPEECH"
78
+ }
79
+ input_map {
80
+ key: "speech_lengths"
81
+ value: "SPEECH_LENGTHS"
82
+ }
83
+ input_map {
84
+ key: "language"
85
+ value: "LANGUAGE"
86
+ }
87
+ input_map {
88
+ key: "textnorm"
89
+ value: "TEXT_NORM"
90
+ }
91
+ output_map {
92
+ key: "ctc_logits"
93
+ value: "ctc_logits"
94
+ }
95
+ output_map {
96
+ key: "encoder_out_lens"
97
+ value: "encoder_out_lens"
98
+ }
99
+ },
100
+ {
101
+ model_name: "scoring"
102
+ model_version: -1
103
+ input_map {
104
+ key: "ctc_logits"
105
+ value: "ctc_logits"
106
+ }
107
+ input_map {
108
+ key: "encoder_out_lens"
109
+ value: "encoder_out_lens"
110
+ }
111
+ output_map {
112
+ key: "OUTPUT0"
113
+ value: "TRANSCRIPTS"
114
+ }
115
+ }
116
+ ]
117
+ }
run.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ export CUDA_VISIBLE_DEVICES=0
2
+ tritonserver --model-repository=./model_repo_sense_voice_small \
3
+ --pinned-memory-pool-byte-size=2048000000 --cuda-memory-pool-byte-size=0:4096000000