ankush13r committed
Commit
588b387
1 Parent(s): 1faae08

add whisper with language tags and prompt

__pycache__/app.cpython-310.pyc ADDED
Binary file (1.23 kB)

__pycache__/whisper.cpython-310.pyc ADDED
Binary file (2.45 kB)

__pycache__/whisper2.cpython-310.pyc ADDED
Binary file (2.45 kB)
app.py CHANGED
@@ -1,34 +1,18 @@
-import torch
 
 import gradio as gr
-import yt_dlp as youtube_dl
-from transformers import pipeline
-from transformers.pipelines.audio_utils import ffmpeg_read
+from whisper2 import generate
 
-import tempfile
-import os
 
-MODEL_NAME = "openai/whisper-large-v3"
-BATCH_SIZE = 8
-FILE_LIMIT_MB = 1000
+MODEL_NAME = "/whisper-large-v3"
 
-device = 0 if torch.cuda.is_available() else "cpu"
 
-pipe = pipeline(
-    task="automatic-speech-recognition",
-    model=MODEL_NAME,
-    chunk_length_s=30,
-    device=device,
-)
 
-def transcribe(inputs, task):
+def transcribe(inputs):
     if inputs is None:
         raise gr.Error("Cap fitxer d'àudio introduit! Si us plau pengeu un fitxer "\
                        "o enregistreu un àudio abans d'enviar la vostra sol·licitud")
 
-
-    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
-    return text
+    return generate(audio=inputs)
 
 
 description_string = "Transcripció automàtica de micròfon o de fitxers d'àudio.\n Aquest demostrador s'ha desenvolupat per"\
@@ -38,8 +22,7 @@ description_string = "Transcripció automàtica de micròfon o de fitxers d'àud
 file_transcribe = gr.Interface(
     fn=transcribe,
     inputs=[
-        gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio"),
-        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
+        gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio")
     ],
     outputs="text",
     title="Transcripció automàtica d'àudio",
requirements.txt CHANGED
@@ -2,3 +2,5 @@ git+https://github.com/huggingface/transformers
 torch
 yt-dlp
 gradio==4.20.0
+torchaudio==2.2.1
+librosa==0.10.1
whisper2.py ADDED
@@ -0,0 +1,103 @@
+from transformers import WhisperForConditionalGeneration, WhisperProcessor
+import torchaudio
+import torch
+import librosa
+
+MODEL_NAME = "openai/whisper-large-v3"
+
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+device = "cpu"
+
+print("[ INFO ] Device: ", device)
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+torch_dtype = torch.float32
+
+model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME, torch_dtype=torch_dtype).to(device)
+processor = WhisperProcessor.from_pretrained(MODEL_NAME)
+
+
+def convert_forced_to_tokens(forced_decoder_ids):
+    forced_decoder_tokens = []
+    for i, (idx, token) in enumerate(forced_decoder_ids):
+        if token is not None:
+            forced_decoder_tokens.append([idx, processor.tokenizer.decode(token)])
+        else:
+            forced_decoder_tokens.append([idx, token])
+    return forced_decoder_tokens
+
+
+def generate(audio):
+    input_audio, sample_rate = torchaudio.load(audio)
+
+    #metadata = torchaudio.info(audio)
+    #length1 = math.ceil(metadata.num_frames / metadata.sample_rate)
+    length = librosa.get_duration(path=audio)
+
+    input_speech = input_audio[0]
+
+
+    if length <= 30:
+        input_features = processor(input_speech,
+                                   sampling_rate=16_000,
+                                   return_tensors="pt", torch_dtype=torch_dtype).input_features.to(device)
+
+    else:
+        input_features = processor(input_speech,
+                                   return_tensors="pt",
+                                   truncation=False,
+                                   padding="longest",
+                                   return_attention_mask=True,
+                                   sampling_rate=16_000).input_features.to(device)
+    forced_decoder_ids = []
+    forced_decoder_ids.append([1, 50270])  # [1, '<|ca|>']
+    forced_decoder_ids.append([2, 50262])  # [2, '<|es|>']
+    forced_decoder_ids.append([3, 50360])  # [3, '<|transcribe|>']
+
+    forced_decoder_ids_modified = forced_decoder_ids
+    idx = processor.tokenizer.all_special_tokens.index("<|startofprev|>")
+    forced_bos_token_id = processor.tokenizer.all_special_ids[idx]
+
+    prompt = " transcribe an audio containing code-switching between es and ca"
+    prompt_tokens = processor.tokenizer(prompt, add_special_tokens=False).input_ids
+
+    # we need to force these tokens
+    forced_decoder_ids = []
+    for idx, token in enumerate(prompt_tokens):
+        # indexing starts from 1 for forced tokens (token at position 0 is the SOS token)
+        forced_decoder_ids.append([idx + 1, token])
+
+    # now we add the SOS token at the end
+    offset = len(forced_decoder_ids)
+    forced_decoder_ids.append([offset + 1, model.generation_config.decoder_start_token_id])
+
+    # now we need to append the rest of the prefix tokens (lang, task, timestamps)
+    offset = len(forced_decoder_ids)
+    for idx, token in forced_decoder_ids_modified:
+        forced_decoder_ids.append([idx + offset, token])
+
+    model.config.forced_decoder_ids = forced_decoder_ids
+    model.generation_config.forced_decoder_ids = forced_decoder_ids
+
+
+    if length <= 30:
+        pred_ids = model.generate(input_features,
+                                  return_timestamps=True,
+                                  decoder_start_token_id=forced_bos_token_id,
+                                  max_new_tokens=128)
+        # exclude prompt from output
+        forced_decoder_tokens = convert_forced_to_tokens(forced_decoder_ids)
+        output = processor.decode(pred_ids[0][len(forced_decoder_tokens) + 1:], skip_special_tokens=True)
+
+    else:
+        pred_ids = model.generate(input_features,
+                                  return_timestamps=True,
+                                  decoder_start_token_id=forced_bos_token_id,
+                                  logprob_threshold=-1.0,
+                                  compression_ratio_threshold=1.35,
+                                  temperature=(0.0, 0.2, 0.4),
+                                  no_speech_threshold=0.1,
+                                  )
+        output = processor.batch_decode(pred_ids, skip_special_tokens=True)
+
+    return output[0]
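
The module builds the forced prefix in Whisper's prompting order: the prompt tokens after <|startofprev|>, then the <|startoftranscript|> start token, then the <|ca|> <|es|> <|transcribe|> tags, which biases decoding toward Catalan/Spanish code-switched speech. The token ids it hard-codes (50270, 50262, 50360) differ between Whisper checkpoints, so a small sketch to confirm them against the same tokenizer the module loads; the expected decodings in the comments are taken from the commit's own annotations:

    # Sketch: verify the special-token ids whisper2.py hard-codes against the
    # openai/whisper-large-v3 tokenizer before trusting them.
    from transformers import WhisperProcessor

    processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
    tok = processor.tokenizer

    for token_id in (50270, 50262, 50360):
        # expected per the inline comments: <|ca|>, <|es|>, <|transcribe|>
        print(token_id, "->", tok.decode(token_id))

    # id passed as decoder_start_token_id so the prompt precedes the transcript
    idx = tok.all_special_tokens.index("<|startofprev|>")
    print("<|startofprev|> ->", tok.all_special_ids[idx])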