make model offline

Files changed:
- .gitignore +2 -0
- pyannote/config.yaml +10 -0
- pyannote/pytorch_model.bin +3 -0
- whisper.py +3 -5
.gitignore CHANGED
@@ -1,2 +1,4 @@
 venv
 **/__pycache__
+venv
+.env
pyannote/config.yaml ADDED
@@ -0,0 +1,10 @@
+pipeline:
+  name: pyannote.audio.pipelines.VoiceActivityDetection
+  params:
+    segmentation: ./pyannote/pytorch_model.bin
+
+params:
+  min_duration_off: 0.09791355693027545
+  min_duration_on: 0.05537587440407595
+  offset: 0.4806866463041527
+  onset: 0.8104268538848918
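The whisper.py change below loads this config directly, but the pipeline can also be exercised on its own. A minimal sketch, assuming pyannote.audio's Pipeline.from_pretrained accepts a local config path (as the commit itself relies on) and using a placeholder audio file:

# Minimal sketch: instantiate the VAD pipeline from the local config, so the
# segmentation weights are read from ./pyannote/pytorch_model.bin with no Hub
# download, then list the detected speech regions. "example.wav" is a
# placeholder, not a file from this Space.
from pyannote.audio import Pipeline

pipeline_vad = Pipeline.from_pretrained("./pyannote/config.yaml")
output_vad = pipeline_vad("example.wav")

for speech in output_vad.get_timeline().support():
    print(f"speech {speech.start:.2f}s -> {speech.end:.2f}s")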
pyannote/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0b5b3216d60a2d32fc086b47ea8c67589aaeb26b7e07fcbe620d6d0b83e209ea
+size 17719103
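This blob is a Git LFS pointer: only the oid and size are committed as text, while the ~17 MB segmentation checkpoint itself lives in LFS storage. The commit does not show how the checkpoint was fetched; a plausible one-time step, assuming it comes from the gated pyannote/segmentation repository on the Hub (the repo id and token handling here are assumptions, not part of the commit):

# Hypothetical one-time download of the segmentation checkpoint so the Space
# can run offline afterwards. repo_id and the HF_TOKEN environment variable
# are assumptions for illustration only.
import os
import shutil
from huggingface_hub import hf_hub_download

cached = hf_hub_download(
    repo_id="pyannote/segmentation",      # assumed source repository (gated)
    filename="pytorch_model.bin",
    token=os.environ.get("HF_TOKEN"),     # access token for the gated repo
)
os.makedirs("pyannote", exist_ok=True)
shutil.copy(cached, "pyannote/pytorch_model.bin")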
whisper.py CHANGED
@@ -1,6 +1,6 @@
+import os
 from pyannote.audio import Pipeline
 from pydub import AudioSegment
-import os
 from transformers import WhisperForConditionalGeneration, WhisperProcessor
 import torchaudio
 import torch
@@ -12,7 +12,7 @@ torch_dtype = torch.float32
 MODEL_NAME = "openai/whisper-large-v3"
 model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME, torch_dtype=torch_dtype).to(device)
 processor = WhisperProcessor.from_pretrained(MODEL_NAME)
-pipeline_vad = Pipeline.from_pretrained("pyannote/
+pipeline_vad = Pipeline.from_pretrained("./pyannote/config.yaml")
 threshold = 15000 # adjust max duration threshold
 segments_dir = "."
 
@@ -67,7 +67,6 @@ def generate_1st_chunk(audio):
     #exclude prompt from output
     forced_decoder_tokens = convert_forced_to_tokens(forced_decoder_ids)
     output = processor.decode(pred_ids[0][len(forced_decoder_tokens) + 1:], skip_special_tokens=True)
-    output_tokens = processor.batch_decode(pred_ids, skip_special_tokens=False)
 
     return output[1:]
 
@@ -117,7 +116,6 @@ def generate_from_2nd_chunk(audio, prev_prompt):
     #exclude prompt from output
     forced_decoder_tokens = convert_forced_to_tokens(forced_decoder_ids)
     output = processor.decode(pred_ids[0][len(forced_decoder_tokens) + 1:], skip_special_tokens=True)
-    output_tokens = processor.batch_decode(pred_ids, skip_special_tokens=False)
     return output[1:]
 
 def processing_vad_v3(audio, output_vad, prev_prompt):
@@ -126,8 +124,8 @@ def processing_vad_v3(audio, output_vad, prev_prompt):
     for speech in output_vad.get_timeline().support():
         start, end = speech.start, speech.end
         segment_audio = audio[start * 1000:end * 1000]
-        segment_audio.export(os.path.join(segments_dir, f"temp_segment.wav"), format="wav")
         filename = os.path.join(segments_dir, f"temp_segment.wav")
+        segment_audio.export(filename, format="wav")
         if first_chunk:
             output = generate_1st_chunk(filename)
             first_chunk = False
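Taken together, the whisper.py edits move import os to the top, point the VAD pipeline at the local ./pyannote/config.yaml instead of the Hub, remove the unused output_tokens assignments, and define filename before it is used in segment_audio.export. A hedged end-to-end sketch of the resulting offline flow, using only names visible in the diff (the audio file name and the idea that processing_vad_v3 returns the transcript are assumptions):

# Hedged usage sketch after this commit: load the audio with pydub, run the
# local VAD pipeline (no network access needed), and pass both to
# processing_vad_v3 with an empty previous prompt. The audio file name and
# the assumption that the function returns the transcript are illustrative.
from pydub import AudioSegment

audio = AudioSegment.from_file("example.wav")
output_vad = pipeline_vad("example.wav")
result = processing_vad_v3(audio, output_vad, prev_prompt="")
print(result)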