|
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, AutoModelForCausalLM, AutoTokenizer
|
|
|
class whisperHF:
    """Thin wrapper around a HuggingFace Whisper-style ASR pipeline.

    Loads a speech-seq2seq model (default: distil-whisper/distil-large-v3)
    in float16 and exposes plain-text and timestamped transcription helpers.
    """

    def __init__(self, model_path='distil-whisper/distil-large-v3', device="cuda:0"):
        """Load model + processor and build the ASR pipeline.

        Args:
            model_path: HuggingFace model id or local path of a
                speech-seq2seq checkpoint.
            device: torch device string the model is moved to
                (e.g. "cuda:0" or "cpu").
        """
        self.device = device
        # float16 + low_cpu_mem_usage keeps the load footprint small;
        # use_safetensors avoids the pickle-based checkpoint format.
        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_safetensors=True
        ).eval().to(device)
        self.processor = AutoProcessor.from_pretrained(model_path)
        self.pipe = pipeline(
            "automatic-speech-recognition",
            model=self.model,
            tokenizer=self.processor.tokenizer,
            feature_extractor=self.processor.feature_extractor,
            max_new_tokens=128,
            torch_dtype=torch.float16,
            device=self.device,
        )

    def infer(self, file="sound.mp3"):
        """Transcribe an audio file and return the full text.

        Args:
            file: path to the audio file to transcribe.

        Returns:
            The transcription as a single string.
        """
        result = self.pipe(file)
        return result["text"]

    def infer_timestep(self, file="sound.mp3"):
        """Transcribe an audio file with per-chunk timestamps.

        Args:
            file: path to the audio file to transcribe.

        Returns:
            A list of chunk dicts (text + timestamp spans) as produced by
            the pipeline's ``return_timestamps=True`` mode.
        """
        # Bug fix: original referenced undefined names `pipe` and `sample`;
        # it must call the instance pipeline on the given file.
        result = self.pipe(file, return_timestamps=True)
        return result["chunks"]