|
from onnx import numpy_helper |
|
import numpy as np |
|
import onnx |
|
import ffmpeg |
|
import argparse |
|
|
|
|
|
parser = argparse.ArgumentParser(description='Whisper format converter') |
|
parser.add_argument('--ipath', metavar='S', help='path to the input file') |
|
parser.add_argument('--opath', metavar='S', help='path to the output file (.pb extension)') |
|
args = parser.parse_args() |
|
|
|
if __name__ == '__main__': |
|
|
|
out, _ = ( |
|
ffmpeg.input(args.ipath, threads=0) |
|
.output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=16000) |
|
.run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) |
|
) |
|
audio = np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0 |
|
|
|
onnx_tp = numpy_helper.from_array(audio, 'raw_audio') |
|
onnx.save_tensor(onnx_tp, args.opath) |
|
|