The translation model is now compatible with the "Word Timestamps - Highlight Words" feature.
- app.py +32 -0
- docs/translateModel.md +14 -1
- src/utils.py +25 -9
app.py
CHANGED
@@ -716,6 +716,38 @@ class WhisperTranscriber:
                     segments_progress_listener.on_progress(idx+1, len(segments), desc=f"Process segments: {idx}/{len(segments)}")

             translationModel.release_vram()
+
+            if highlight_words and segments[0]["words"] is not None:
+                for idx, segment in enumerate(segments):
+                    text = segment["text"]
+                    words = segment["words"]
+                    total_duration = words[-1]['end'] - words[0]['start']  # Calculate the total duration of the entire sentence
+                    total_text_length = len(text)
+
+                    # Allocate lengths to each word
+                    duration_ratio_lengths = []
+                    total_allocated = 0
+                    text_idx = 0  # Track the position in the translated string
+                    for word in words:
+                        # Calculate the duration of each word as a proportion of the total time
+                        word_duration = word['end'] - word['start']
+                        duration_ratio = word_duration / total_duration
+                        duration_ratio_length = int(duration_ratio * total_text_length)
+                        duration_ratio_lengths.append(duration_ratio_length)
+                        total_allocated += duration_ratio_length
+
+                    # Distribute remaining characters to avoid 0-duration_ratio_length issues
+                    remaining_chars = total_text_length - total_allocated
+                    for idx in range(remaining_chars):
+                        duration_ratio_lengths[idx % len(words)] += 1  # Distribute the remaining chars evenly
+
+                    # Generate translated words based on the calculated lengths
+                    text_idx = 0
+                    for idx, word in enumerate(words):
+                        text_part = text[text_idx:text_idx + duration_ratio_lengths[idx]]
+                        word["word"], word["word_original"] = text_part, word["word"]
+                        text_idx += duration_ratio_lengths[idx]
+
         perf_end_time = time.perf_counter()
         # Call the finished callback
         if segments_progress_listener is not None:
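Note: the block added above maps a translated sentence back onto the original word timestamps. Each word receives a slice of the translated text proportional to how long it was spoken, and the source word is preserved under `word_original`. A minimal standalone sketch of that allocation on toy data (the helper name and the example words are illustrative, not part of this commit):

import math  # not required; the allocation below uses only int() truncation

def allocate_translation(words, translated_text):
    # Total spoken time covered by the segment's words
    total_duration = words[-1]['end'] - words[0]['start']
    total_length = len(translated_text)

    # Give each word a character budget proportional to its duration
    lengths = [int(((w['end'] - w['start']) / total_duration) * total_length) for w in words]

    # Hand out the characters lost to int() truncation, round-robin,
    # so no word ends up with an empty slice
    remaining = total_length - sum(lengths)
    for i in range(remaining):
        lengths[i % len(words)] += 1

    # Slice the translated text into per-word parts
    parts, pos = [], 0
    for length in lengths:
        parts.append(translated_text[pos:pos + length])
        pos += length
    return parts

words = [{'word': 'Guten', 'start': 0.0, 'end': 0.4},
         {'word': 'Morgen', 'start': 0.4, 'end': 1.2}]
print(allocate_translation(words, "Good morning"))
# ['Good', ' morning'] — 0.4 s vs 0.8 s of speech maps to 4 vs 8 characters

Rounding down with int() can under-allocate a few characters in total, which is why the leftovers are distributed evenly afterwards, mirroring the "remaining_chars" loop in the diff.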
docs/translateModel.md
CHANGED
@@ -5,7 +5,9 @@ The `translate` task in `Whisper` only supports translating other languages `int
 
 The larger the parameters of the Translation model, the better its translation capability is expected. However, this also requires higher computational resources and slower running speed.
 
-Currently, when the `Highlight Words timestamps` option is enabled in the Whisper `Word Timestamps options`, it cannot be used simultaneously with the Translation Model. This is because Highlight Words splits the source text, and after translation, it becomes a non-word-level string.
+The translation model is now compatible with the `Word Timestamps - Highlight Words` feature.
+
+~~Currently, when the `Highlight Words timestamps` option is enabled in the Whisper `Word Timestamps options`, it cannot be used simultaneously with the Translation Model. This is because Highlight Words splits the source text, and after translation, it becomes a non-word-level string.~~
 
 
 # Translation Model
@@ -153,6 +155,17 @@ Automatic speech recognition (ASR)
 | [facebook/seamless-m4t-large](https://huggingface.co/facebook/seamless-m4t-large) | 2.3B | 11.4 GB | float32 | N/A |
 | [facebook/seamless-m4t-v2-large](https://huggingface.co/facebook/seamless-m4t-v2-large) | 2.3B | 11.4 GB (safetensors:9.24 GB) | float32 | ≈9.2 GB |
 
+## Llama
+
+Meta developed and released the Meta Llama 3 family of large language models (LLMs). This program modifies them through prompts to function as translation models.
+
+| Name | Parameters | Size | type/quantize | Required VRAM |
+|------|------------|------|---------------|---------------|
+| [avans06/Meta-Llama-3.2-8B-Instruct-ct2-int8_float16](https://huggingface.co/avans06/Meta-Llama-3.2-8B-Instruct-ct2-int8_float16) | 8B | 8.04 GB | int8_float16 | ≈ 7.9 GB |
+| [avans06/Meta-Llama-3.1-8B-Instruct-ct2-int8_float16](https://huggingface.co/avans06/Meta-Llama-3.1-8B-Instruct-ct2-int8_float16) | 8B | 8.04 GB | int8_float16 | ≈ 7.9 GB |
+| [avans06/Meta-Llama-3-8B-Instruct-ct2-int8_float16](https://huggingface.co/avans06/Meta-Llama-3-8B-Instruct-ct2-int8_float16) | 8B | 8.04 GB | int8_float16 | ≈ 7.9 GB |
+| [jncraton/Llama-3.2-3B-Instruct-ct2-int8](https://huggingface.co/jncraton/Llama-3.2-3B-Instruct-ct2-int8) | 3B | 3.22 GB | int8 | ≈ 3.3 GB |
+
 
 # Options
 
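Note: the Llama entries in the added table are general-purpose instruct models driven by a translation prompt rather than dedicated MT models. A rough sketch of that idea using CTranslate2 and a Hugging Face tokenizer; the model path, prompt wording, and decoding options below are assumptions for illustration, not the exact ones this program uses:

import ctranslate2
from transformers import AutoTokenizer

# Assumed local directory holding one of the CTranslate2 conversions listed above;
# the tokenizer is loaded from the original Meta repo (or from the converted repo
# if it ships the tokenizer files).
model_dir = "Meta-Llama-3-8B-Instruct-ct2-int8_float16"
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
generator = ctranslate2.Generator(model_dir, device="cuda")

def translate(text: str, target_language: str = "English") -> str:
    # Wrap the request in the chat template so the instruct model treats it as a
    # normal instruction-following turn.
    messages = [
        {"role": "system",
         "content": f"You are a translator. Translate the user's text into {target_language} "
                    "and output only the translation."},
        {"role": "user", "content": text},
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(prompt, add_special_tokens=False))

    results = generator.generate_batch(
        [tokens],
        max_length=256,
        sampling_topk=1,                 # greedy decoding keeps subtitle output stable
        include_prompt_in_result=False,  # return only the generated continuation
    )
    return tokenizer.decode(results[0].sequences_ids[0], skip_special_tokens=True).strip()

print(translate("Guten Morgen, wie geht es dir?"))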
src/utils.py
CHANGED
@@ -155,7 +155,7 @@ def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: i
         subtitle_start = segment['start']
         subtitle_end = segment['end']
         text = segment['text'].strip()
-        original_text = segment['original'].strip() if 'original' in segment else None
+        text_original = segment['original'].strip() if 'original' in segment else None
 
         if len(words) == 0:
             # Prepend the longest speaker ID if available
@@ -167,8 +167,8 @@ def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: i
                 'end' : subtitle_end,
                 'text' : process_text(text, maxLineWidth)
             }
-            if original_text is not None and len(original_text) > 0:
-                result.update({'original': process_text(original_text, maxLineWidth)})
+            if text_original is not None and len(text_original) > 0:
+                result.update({'original': process_text(text_original, maxLineWidth)})
             yield result
 
             # We are done
@@ -181,12 +181,14 @@ def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: i
                 'end' : subtitle_start,
                 'word' : f"({segment_longest_speaker})"
             })
+
+        text_words = [text] if not highlight_words and text_original is not None and len(text_original) > 0 else [ this_word["word"] for this_word in words ]
 
-        text_words = [text] if not highlight_words and original_text is not None and len(original_text) > 0 else [ this_word["word"] for this_word in words ]
         subtitle_text = __join_words(text_words, maxLineWidth)
 
         # Iterate over the words in the segment
         if highlight_words:
+            text_words_original = [ this_word["word_original"] for this_word in words if "word_original" in this_word ] if text_original is not None and len(text_original) > 0 else None
             last = subtitle_start
 
             for idx, this_word in enumerate(words):
@@ -195,14 +197,17 @@ def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: i
 
                 if last != start:
                     # Display the text up to this point
-                    yield {
+                    result = {
                         'start': last,
                         'end' : start,
                         'text' : subtitle_text
                     }
+                    if text_original is not None and len(text_original) > 0:
+                        result.update({'original': process_text(text_original, maxLineWidth)})
+                    yield result
 
                 # Display the text with the current word highlighted
-                yield {
+                result = {
                     'start': start,
                     'end' : end,
                     'text' : __join_words(
@@ -212,15 +217,26 @@ def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: i
                         ]
                     , maxLineWidth)
                 }
+                if text_words_original is not None and len(text_words_original) > 0:
+                    result.update({'original': __join_words(
+                        [
+                            re.sub(r"^(\s*)(.*)$", r"\1<u>\2</u>", word_original) if subidx == idx else word_original
+                            for subidx, word_original in enumerate(text_words_original)
+                        ]
+                    , maxLineWidth)})
+                yield result
                 last = end
 
             if last != subtitle_end:
                 # Display the last part of the text
-                yield {
+                result = {
                     'start': last,
                     'end' : subtitle_end,
                     'text' : subtitle_text
                 }
+                if text_original is not None and len(text_original) > 0:
+                    result.update({'original': process_text(text_original, maxLineWidth)})
+                yield result
 
         # Just return the subtitle text
         else:
@@ -229,8 +245,8 @@ def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: i
                 'end' : subtitle_end,
                 'text' : subtitle_text
            }
-            if original_text is not None and len(original_text) > 0:
-                result.update({'original': process_text(original_text, maxLineWidth)})
+            if text_original is not None and len(text_original) > 0:
+                result.update({'original': process_text(text_original, maxLineWidth)})
             yield result
 
 def __join_words(words: Iterator[str], maxLineWidth: int = None):
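Note: with the changes above, the highlight-words path yields one subtitle entry per word carrying both the translated line (`text`) and the source line (`original`), each with the current word underlined. A small self-contained illustration on hypothetical word data; the `underline` helper below only mimics the `re.sub`/`__join_words` combination from the diff and is not code from this commit:

import re

# Hypothetical word-level data after the app.py step above: 'word' holds the
# translated slice, 'word_original' the source word.
words = [
    {'word': 'Good',     'word_original': 'Guten',   'start': 0.0, 'end': 0.4},
    {'word': ' morning', 'word_original': ' Morgen', 'start': 0.4, 'end': 1.2},
]

def underline(tokens, idx):
    # Underline the idx-th token, keeping its leading whitespace outside the tag
    # (same regex as in the diff), then join the tokens into one line.
    return "".join(
        re.sub(r"^(\s*)(.*)$", r"\1<u>\2</u>", tok) if i == idx else tok
        for i, tok in enumerate(tokens)
    )

for idx, word in enumerate(words):
    entry = {
        'start': word['start'],
        'end': word['end'],
        'text': underline([w['word'] for w in words], idx),
        'original': underline([w['word_original'] for w in words], idx),
    }
    print(entry)

# {'start': 0.0, 'end': 0.4, 'text': '<u>Good</u> morning', 'original': '<u>Guten</u> Morgen'}
# {'start': 0.4, 'end': 1.2, 'text': 'Good <u>morning</u>', 'original': 'Guten <u>Morgen</u>'}

In the generated subtitles the `<u>…</u>` markup underlines the word currently being spoken, now in both the translated and the original line.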