avans06 commited on
Commit
a14fe5a
1 Parent(s): a760511

The translation model is now compatible with the
"Word Timestamps - Highlight Words" feature.

Browse files

Files changed (3) hide show
  1. app.py +32 -0
  2. docs/translateModel.md +14 -1
  3. src/utils.py +25 -9
app.py CHANGED
@@ -716,6 +716,38 @@ class WhisperTranscriber:
716
  segments_progress_listener.on_progress(idx+1, len(segments), desc=f"Process segments: {idx}/{len(segments)}")
717
 
718
  translationModel.release_vram()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
719
  perf_end_time = time.perf_counter()
720
  # Call the finished callback
721
  if segments_progress_listener is not None:
 
716
  segments_progress_listener.on_progress(idx+1, len(segments), desc=f"Process segments: {idx}/{len(segments)}")
717
 
718
  translationModel.release_vram()
719
+
720
+ if highlight_words and segments[0]["words"] is not None:
721
+ for idx, segment in enumerate(segments):
722
+ text = segment["text"]
723
+ words = segment["words"]
724
+ total_duration = words[-1]['end'] - words[0]['start'] #Calculate the total duration of the entire sentence
725
+ total_text_length = len(text)
726
+
727
+ # Allocate lengths to each word
728
+ duration_ratio_lengths = []
729
+ total_allocated = 0
730
+ text_idx = 0 # Track the position in the translated string
731
+ for word in words:
732
+ # Calculate the duration of each word as a proportion of the total time
733
+ word_duration = word['end'] - word['start']
734
+ duration_ratio = word_duration / total_duration
735
+ duration_ratio_length = int(duration_ratio * total_text_length)
736
+ duration_ratio_lengths.append(duration_ratio_length)
737
+ total_allocated += duration_ratio_length
738
+
739
+ # Distribute remaining characters to avoid 0-duration_ratio_length issues
740
+ remaining_chars = total_text_length - total_allocated
741
+ for idx in range(remaining_chars):
742
+ duration_ratio_lengths[idx % len(words)] += 1 # Distribute the remaining chars evenly
743
+
744
+ # Generate translated words based on the calculated lengths
745
+ text_idx = 0
746
+ for idx, word in enumerate(words):
747
+ text_part = text[text_idx:text_idx + duration_ratio_lengths[idx]]
748
+ word["word"], word["word_original"] = text_part, word["word"]
749
+ text_idx += duration_ratio_lengths[idx]
750
+
751
  perf_end_time = time.perf_counter()
752
  # Call the finished callback
753
  if segments_progress_listener is not None:
docs/translateModel.md CHANGED
@@ -5,7 +5,9 @@ The `translate` task in `Whisper` only supports translating other languages `int
5
 
6
  The larger the parameters of the Translation model, the better its translation capability is expected. However, this also requires higher computational resources and slower running speed.
7
 
8
- Currently, when the `Highlight Words timestamps` option is enabled in the Whisper `Word Timestamps options`, it cannot be used simultaneously with the Translation Model. This is because Highlight Words splits the source text, and after translation, it becomes a non-word-level string.
 
 
9
 
10
 
11
  # Translation Model
@@ -153,6 +155,17 @@ Automatic speech recognition (ASR)
153
  | [facebook/seamless-m4t-large](https://huggingface.co/facebook/seamless-m4t-large) | 2.3B | 11.4 GB | float32 | N/A |
154
  | [facebook/seamless-m4t-v2-large](https://huggingface.co/facebook/seamless-m4t-v2-large) | 2.3B | 11.4 GB (safetensors:9.24 GB) | float32 | ≈9.2 GB |
155
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
  # Options
158
 
 
5
 
6
  The larger the parameters of the Translation model, the better its translation capability is expected. However, this also requires higher computational resources and slower running speed.
7
 
8
+ The translation model is now compatible with the `Word Timestamps - Highlight Words` feature.
9
+
10
+ ~~Currently, when the `Highlight Words timestamps` option is enabled in the Whisper `Word Timestamps options`, it cannot be used simultaneously with the Translation Model. This is because Highlight Words splits the source text, and after translation, it becomes a non-word-level string.~~
11
 
12
 
13
  # Translation Model
 
155
  | [facebook/seamless-m4t-large](https://huggingface.co/facebook/seamless-m4t-large) | 2.3B | 11.4 GB | float32 | N/A |
156
  | [facebook/seamless-m4t-v2-large](https://huggingface.co/facebook/seamless-m4t-v2-large) | 2.3B | 11.4 GB (safetensors:9.24 GB) | float32 | ≈9.2 GB |
157
 
158
+ ## Llama
159
+
160
+ Meta developed and released the Meta Llama 3 family of large language models (LLMs). This program adapts them through prompting so that they can function as translation models.
161
+
162
+ | Name | Parameters | Size | type/quantize | Required VRAM |
163
+ |------|------------|------|---------------|---------------|
164
+ | [avans06/Meta-Llama-3.2-8B-Instruct-ct2-int8_float16](https://huggingface.co/avans06/Meta-Llama-3.2-8B-Instruct-ct2-int8_float16) | 8B | 8.04 GB | int8_float16 | ≈ 7.9 GB |
165
+ | [avans06/Meta-Llama-3.1-8B-Instruct-ct2-int8_float16](https://huggingface.co/avans06/Meta-Llama-3.1-8B-Instruct-ct2-int8_float16) | 8B | 8.04 GB | int8_float16 | ≈ 7.9 GB |
166
+ | [avans06/Meta-Llama-3-8B-Instruct-ct2-int8_float16](https://huggingface.co/avans06/Meta-Llama-3-8B-Instruct-ct2-int8_float16) | 8B | 8.04 GB | int8_float16 | ≈ 7.9 GB |
167
+ | [jncraton/Llama-3.2-3B-Instruct-ct2-int8](https://huggingface.co/jncraton/Llama-3.2-3B-Instruct-ct2-int8) | 3B | 3.22 GB | int8 | ≈ 3.3 GB |
168
+
169
 
170
  # Options
171
 
src/utils.py CHANGED
@@ -155,7 +155,7 @@ def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: i
155
  subtitle_start = segment['start']
156
  subtitle_end = segment['end']
157
  text = segment['text'].strip()
158
- original_text = segment['original'].strip() if 'original' in segment else None
159
 
160
  if len(words) == 0:
161
  # Prepend the longest speaker ID if available
@@ -167,8 +167,8 @@ def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: i
167
  'end' : subtitle_end,
168
  'text' : process_text(text, maxLineWidth)
169
  }
170
- if original_text is not None and len(original_text) > 0:
171
- result.update({'original': process_text(original_text, maxLineWidth)})
172
  yield result
173
 
174
  # We are done
@@ -181,12 +181,14 @@ def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: i
181
  'end' : subtitle_start,
182
  'word' : f"({segment_longest_speaker})"
183
  })
 
 
184
 
185
- text_words = [text] if not highlight_words and original_text is not None and len(original_text) > 0 else [ this_word["word"] for this_word in words ]
186
  subtitle_text = __join_words(text_words, maxLineWidth)
187
 
188
  # Iterate over the words in the segment
189
  if highlight_words:
 
190
  last = subtitle_start
191
 
192
  for idx, this_word in enumerate(words):
@@ -195,14 +197,17 @@ def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: i
195
 
196
  if last != start:
197
  # Display the text up to this point
198
- yield {
199
  'start': last,
200
  'end' : start,
201
  'text' : subtitle_text
202
  }
 
 
 
203
 
204
  # Display the text with the current word highlighted
205
- yield {
206
  'start': start,
207
  'end' : end,
208
  'text' : __join_words(
@@ -212,15 +217,26 @@ def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: i
212
  ]
213
  , maxLineWidth)
214
  }
 
 
 
 
 
 
 
 
215
  last = end
216
 
217
  if last != subtitle_end:
218
  # Display the last part of the text
219
- yield {
220
  'start': last,
221
  'end' : subtitle_end,
222
  'text' : subtitle_text
223
  }
 
 
 
224
 
225
  # Just return the subtitle text
226
  else:
@@ -229,8 +245,8 @@ def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: i
229
  'end' : subtitle_end,
230
  'text' : subtitle_text
231
  }
232
- if original_text is not None and len(original_text) > 0:
233
- result.update({'original': process_text(original_text, maxLineWidth)})
234
  yield result
235
 
236
  def __join_words(words: Iterator[str], maxLineWidth: int = None):
 
155
  subtitle_start = segment['start']
156
  subtitle_end = segment['end']
157
  text = segment['text'].strip()
158
+ text_original = segment['original'].strip() if 'original' in segment else None
159
 
160
  if len(words) == 0:
161
  # Prepend the longest speaker ID if available
 
167
  'end' : subtitle_end,
168
  'text' : process_text(text, maxLineWidth)
169
  }
170
+ if text_original is not None and len(text_original) > 0:
171
+ result.update({'original': process_text(text_original, maxLineWidth)})
172
  yield result
173
 
174
  # We are done
 
181
  'end' : subtitle_start,
182
  'word' : f"({segment_longest_speaker})"
183
  })
184
+
185
+ text_words = [text] if not highlight_words and text_original is not None and len(text_original) > 0 else [ this_word["word"] for this_word in words ]
186
 
 
187
  subtitle_text = __join_words(text_words, maxLineWidth)
188
 
189
  # Iterate over the words in the segment
190
  if highlight_words:
191
+ text_words_original = [ this_word["word_original"] for this_word in words if "word_original" in this_word ] if text_original is not None and len(text_original) > 0 else None
192
  last = subtitle_start
193
 
194
  for idx, this_word in enumerate(words):
 
197
 
198
  if last != start:
199
  # Display the text up to this point
200
+ result = {
201
  'start': last,
202
  'end' : start,
203
  'text' : subtitle_text
204
  }
205
+ if text_original is not None and len(text_original) > 0:
206
+ result.update({'original': process_text(text_original, maxLineWidth)})
207
+ yield result
208
 
209
  # Display the text with the current word highlighted
210
+ result = {
211
  'start': start,
212
  'end' : end,
213
  'text' : __join_words(
 
217
  ]
218
  , maxLineWidth)
219
  }
220
+ if text_words_original is not None and len(text_words_original) > 0:
221
+ result.update({'original': __join_words(
222
+ [
223
+ re.sub(r"^(\s*)(.*)$", r"\1<u>\2</u>", word_original) if subidx == idx else word_original
224
+ for subidx, word_original in enumerate(text_words_original)
225
+ ]
226
+ , maxLineWidth)})
227
+ yield result
228
  last = end
229
 
230
  if last != subtitle_end:
231
  # Display the last part of the text
232
+ result = {
233
  'start': last,
234
  'end' : subtitle_end,
235
  'text' : subtitle_text
236
  }
237
+ if text_original is not None and len(text_original) > 0:
238
+ result.update({'original': process_text(text_original, maxLineWidth)})
239
+ yield result
240
 
241
  # Just return the subtitle text
242
  else:
 
245
  'end' : subtitle_end,
246
  'text' : subtitle_text
247
  }
248
+ if text_original is not None and len(text_original) > 0:
249
+ result.update({'original': process_text(text_original, maxLineWidth)})
250
  yield result
251
 
252
  def __join_words(words: Iterator[str], maxLineWidth: int = None):