from onnx import numpy_helper |
import numpy as np |
import onnx |
import ffmpeg |
import argparse |
# Command-line interface. Both paths are mandatory: without them the script
# has nothing to read or write, so argparse should reject the call up front
# instead of letting None reach ffmpeg / onnx.
parser = argparse.ArgumentParser(description='Whisper format converter')
parser.add_argument(
    '--ipath', metavar='S', required=True,
    help='path to the input file',
)
parser.add_argument(
    '--opath', metavar='S', required=True,
    help='path to the output file (.pb extension)',
)


def _decode_to_pcm16(path):
    """Decode the media file at *path* to raw PCM bytes using the ffmpeg CLI.

    The output is requested as mono (ac=1), 16 kHz (ar=16000), signed 16-bit
    little-endian PCM (s16le) written to stdout.

    Raises:
        RuntimeError: if ffmpeg exits with an error; the message carries
            ffmpeg's captured stderr so the cause is visible to the user.
    """
    try:
        out, _ = (
            ffmpeg.input(path, threads=0)
            .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=16000)
            .run(
                cmd=["ffmpeg", "-nostdin"],
                capture_stdout=True,
                capture_stderr=True,
            )
        )
    except ffmpeg.Error as e:
        # stderr is captured above, so it must be surfaced explicitly or the
        # reason for the failure is lost.
        detail = e.stderr.decode(errors="replace") if e.stderr else str(e)
        raise RuntimeError(f"Failed to load audio: {detail}") from e
    return out


def _pcm16_to_float32(raw):
    """Convert s16le PCM bytes to a 1-D float32 array scaled into [-1.0, 1.0)."""
    # np.frombuffer already yields a 1-D array; astype makes the writable copy.
    return np.frombuffer(raw, np.int16).astype(np.float32) / 32768.0


def main(argv=None):
    """Convert an audio file into a serialized ONNX tensor named 'raw_audio'.

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.

    Raises:
        RuntimeError: if ffmpeg fails to decode the input file.
    """
    # Parsing happens here rather than at import time so that importing this
    # module never consumes sys.argv or exits the interpreter.
    args = parser.parse_args(argv)
    audio = _pcm16_to_float32(_decode_to_pcm16(args.ipath))
    onnx_tp = numpy_helper.from_array(audio, 'raw_audio')
    onnx.save_tensor(onnx_tp, args.opath)


if __name__ == '__main__':
    main()