added usage part to README.md
README.md
The model is trained on 700 and 704ms (11x64ms) inputs of raw audio. The sample rate is 16kHz.
The model classifies each audio input into two classes: eos (id: 0) and not_eos (id: 1).
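At 16kHz, the 704ms (11x64ms) input window corresponds to 16000 * 0.704 = 11264 samples. A minimal sketch of slicing such a window from a raw float32 buffer (the buffer here is an illustrative stand-in for your audio stream, not part of this repo):

```python
import numpy as np

SAMPLE_RATE = 16000                               # Hz, matches the processor call in Usage
WINDOW_MS = 704                                   # 11 x 64 ms model input
WINDOW_SAMPLES = SAMPLE_RATE * WINDOW_MS // 1000  # 11264 samples

# Illustrative buffer standing in for a live audio stream.
audio_buffer = np.zeros(SAMPLE_RATE * 5, dtype=np.float32)  # e.g. 5 s of audio
window = audio_buffer[-WINDOW_SAMPLES:]                     # most recent 704 ms
assert window.shape == (WINDOW_SAMPLES,)
```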
# Usage
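The example below loads the ONNX-exported model with onnxruntime and returns per-class probabilities for a single PCM or wav segment.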
```python
from transformers import Wav2Vec2Processor, AutoConfig
import onnxruntime as rt
import torch
import torch.nn.functional as F
import numpy as np
import os
import torchaudio


class EndOfSpeechDetection:
    processor: Wav2Vec2Processor
    config: AutoConfig
    session: rt.InferenceSession

    def load_model(self, path, use_gpu=False):
        """Load the processor, config, and ONNX session from a local model directory."""
        processor = Wav2Vec2Processor.from_pretrained(path)
        config = AutoConfig.from_pretrained(path)

        sess_options = rt.SessionOptions()
        sess_options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL

        providers = ["ROCMExecutionProvider"] if use_gpu else ["CPUExecutionProvider"]
        session = rt.InferenceSession(
            os.path.join(path, "model.onnx"), sess_options, providers=providers
        )
        return processor, config, session

    def predict(self, segment, file_type="pcm"):
        """Return {label: probability} for a single audio segment."""
        if file_type == "pcm":
            # headerless float32 PCM files
            speech_array = np.memmap(segment, dtype="float32", mode="r").astype(
                np.float32
            )
        else:
            # wav files: take the first channel
            speech_array, _ = torchaudio.load(segment)
            speech_array = speech_array[0].numpy()

        features = self.processor(
            speech_array, sampling_rate=16000, return_tensors="pt", padding=True
        )
        input_values = features.input_values
        outputs = self.session.run(
            [self.session.get_outputs()[-1].name],
            {self.session.get_inputs()[-1].name: input_values.detach().cpu().numpy()},
        )[0]
        softmax_output = F.softmax(torch.tensor(outputs), dim=1)

        both_classes_with_prob = {
            self.config.id2label[i]: softmax_output[0][i].item()
            for i in range(len(softmax_output[0]))
        }

        return both_classes_with_prob


if __name__ == "__main__":
    eos = EndOfSpeechDetection()
    eos.processor, eos.config, eos.session = eos.load_model("eos-model-onnx")
    # "some.wav" is a wav file, so pass file_type="wav" (the default "pcm"
    # would memmap the wav header as float32 samples).
    print(eos.predict("some.wav", file_type="wav"))
```
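`load_model` expects the directory to contain the usual processor/config files plus a `model.onnx` (see the `os.path.join(path, "model.onnx")` call above). If you start from a PyTorch checkpoint, one way to produce the ONNX file is `torch.onnx.export`; the sketch below assumes the checkpoint is a `Wav2Vec2ForSequenceClassification` model, which is an assumption rather than something this repo states:

```python
import torch
from transformers import Wav2Vec2ForSequenceClassification

# Assumption: the fine-tuned checkpoint is a sequence-classification head on Wav2Vec2.
model = Wav2Vec2ForSequenceClassification.from_pretrained("eos-model-onnx")
model.eval()

# 704 ms of 16 kHz audio -> 11264 samples; batch of 1.
dummy_input = torch.randn(1, 11264)

torch.onnx.export(
    model,
    dummy_input,
    "eos-model-onnx/model.onnx",
    input_names=["input_values"],
    output_names=["logits"],
    dynamic_axes={"input_values": {0: "batch", 1: "samples"}},
    opset_version=14,
)
```

Note also that the `pcm` branch of `predict` reads headerless float32 samples; 16-bit PCM would need converting to float32 first.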
# Latency (& Memory) Optimization
- Knowledge Distillation
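Distillation here means training a smaller student on this model's soft outputs. The repo doesn't ship distillation code, so the following is only a generic sketch of the standard soft-target distillation loss (the temperature `T` and weight `alpha` are illustrative assumptions):

```python
import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, labels, T=2.0, alpha=0.5):
    """Soft-target distillation: KL(student || teacher) at temperature T,
    blended with the usual cross-entropy on the hard labels."""
    soft_targets = F.softmax(teacher_logits / T, dim=-1)
    log_student = F.log_softmax(student_logits / T, dim=-1)
    # T^2 rescales gradients so the soft term stays comparable across temperatures.
    kd = F.kl_div(log_student, soft_targets, reduction="batchmean") * (T * T)
    ce = F.cross_entropy(student_logits, labels)
    return alpha * kd + (1 - alpha) * ce
```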