farzadab committed on
Commit
1d58827
1 Parent(s): 6b22e1f

Update ultravox_processing.py

Browse files
Files changed (1) hide show
  1. ultravox_processing.py +5 -0
ultravox_processing.py CHANGED
@@ -120,6 +120,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
120
  audio_embed_frames = int(np.ceil(nb_encoder_frames / self.stack_factor))
121
  data["audio_token_len"] = [audio_embed_frames]
122
 
 
123
  x = self.audio_processor(
124
  audio,
125
  sampling_rate=sampling_rate,
@@ -149,6 +150,10 @@ class UltravoxProcessor(transformers.ProcessorMixin):
149
  )
150
  )
151
  data["audio_token_start_idx"] = [start_idx]
 
 
 
 
152
  text = text.replace(
153
  self.audio_placeholder,
154
  self.audio_token_replacement * audio_embed_frames,
 
120
  audio_embed_frames = int(np.ceil(nb_encoder_frames / self.stack_factor))
121
  data["audio_token_len"] = [audio_embed_frames]
122
 
123
+ # Main audio processing. The processor is model-specific.
124
  x = self.audio_processor(
125
  audio,
126
  sampling_rate=sampling_rate,
 
150
  )
151
  )
152
  data["audio_token_start_idx"] = [start_idx]
153
+
154
+ # Replace the audio placeholder with the audio token.
155
+ # e.g. "Transcribe <|audio|>" -> "Transcribe </s></s></s></s></s></s></s></s>"
156
+ # where the number of </s> is audio_embed_frames, i.e. ceil(nb_encoder_frames / stack_factor).
157
  text = text.replace(
158
  self.audio_placeholder,
159
  self.audio_token_replacement * audio_embed_frames,