"""Example script: run a local DiVA checkpoint on two SD-QA audio clips.

The script downloads two WAV files, loads them as 16 kHz waveforms, and
prints the result of three ``DiVAModel.generate`` calls:

1. one clip with no text prompt,
2. the same clip with a text prompt,
3. both clips as a batch, each paired with its own text prompt.

Running it requires network access and a DiVA checkpoint in the current
working directory (``from_pretrained("./")``).
"""

from transformers import AutoModel  # not referenced below; kept from the original script
import librosa
import wget

from modeling_diva import DiVAModel

# Sample rate requested from librosa for every clip.
SAMPLE_RATE = 16_000

# SD-QA (dev / eng / irl) recordings used as example inputs.
CLIP_URL_1 = "https://github.com/ffaisal93/SD-QA/raw/refs/heads/master/dev/eng/irl/wav_eng/-1008642825401516622.wav"
CLIP_URL_2 = "https://github.com/ffaisal93/SD-QA/raw/refs/heads/master/dev/eng/irl/wav_eng/-2426554427049983479.wav"


def _load_clip(url):
    """Download ``url`` and return its audio samples loaded at ``SAMPLE_RATE``.

    ``wget.download`` returns the path of the file it saved; that path is
    handed to ``librosa.load``. The sample rate returned by librosa is
    discarded because it was fixed by the ``sr`` argument.
    """
    path = wget.download(url)
    samples, _ = librosa.load(path, sr=SAMPLE_RATE)
    return samples


# The first clip is fetched before the model is loaded, as in the original
# statement order.
speech_data = _load_clip(CLIP_URL_1)

model = DiVAModel.from_pretrained("./")

# Single clip, no text prompt.
print(model.generate([speech_data]))

# Single clip with one text prompt (second positional argument is a list with
# one prompt per clip).
print(model.generate([speech_data], ["Repeat verbatim what is said to you."]))

speech_data2 = _load_clip(CLIP_URL_2)

# Batch of two clips, each paired with its own prompt.
print(
    model.generate(
        [speech_data, speech_data2],
        ["Reply Briefly Like A Pirate", "Reply Briefly Like A New Yorker"],
    )
)