When the language is set to Chinese, simplified-to-traditional conversion of the output now uses the method provided by zhconv.
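For reference, zhconv exposes a single convert(text, locale) call; a minimal sketch of the conversion applied here (locale "zh-tw", matching the app.py diff below):

import zhconv

# Convert Simplified Chinese to Traditional Chinese (Taiwan locale),
# the same call applied to the vtt/srt/text/json outputs in app.py.
simplified = "简体中文字幕"
print(zhconv.convert(simplified, "zh-tw"))  # -> 簡體中文字幕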
When the "--merge_subtitle_with_sources" argument is enabled,
the video file extension will be appended with the subtitle language information, such as .en, .zh, .jp, etc.
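A minimal sketch of the resulting naming scheme; the helper name below is illustrative, app.py inlines this logic using langObj.code (see the diff):

import os

def with_language_suffix(save_path: str, lang_code: str) -> str:
    # video.mp4 + "zh" -> video.zh.mp4 (no suffix when the code is unknown)
    save_without_ext, ext = os.path.splitext(save_path)
    lang_ext = "." + lang_code if lang_code else ""
    return save_without_ext + lang_ext + ext

print(with_language_suffix("output/video.mp4", "zh"))  # output/video.zh.mp4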
Downloaded YouTube videos now use the format filter [vcodec^=avc1], restricting the video stream to H.264.
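The same selector from src/download.py in a standalone yt-dlp sketch (the URL is a placeholder); [vcodec^=avc1] keeps only H.264 (avc1*) video streams, which stream-copy cleanly into .mp4:

import yt_dlp

ydl_opts = {
    # Prefer an mp4 H.264 video stream plus m4a audio; fall back to "best".
    "format": "bestvideo[ext=mp4][vcodec^=avc1]+bestaudio[ext=m4a]/best",
}

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download(["https://www.youtube.com/watch?v=XXXXXXXXXXX"])  # placeholder URL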
- app.py +15 -12
- requirements-fasterWhisper.txt +2 -1
- requirements-whisper.txt +2 -1
- requirements.txt +2 -1
- src/download.py +1 -1
- src/vad.py +1 -1
app.py
CHANGED
@@ -19,6 +19,7 @@ from src.hooks.subTaskProgressListener import SubTaskProgressListener
 from src.hooks.whisperProgressHook import create_progress_listener_handle
 from src.languages import _TO_LANGUAGE_CODE
 from src.languages import get_language_names
+from src.languages import get_language_from_name
 from src.modelCache import ModelCache
 from src.prompts.jsonPromptStrategy import JsonPromptStrategy
 from src.prompts.prependPromptStrategy import PrependPromptStrategy
@@ -38,6 +39,7 @@ from src.whisper.abstractWhisperContainer import AbstractWhisperContainer
 from src.whisper.whisperFactory import create_whisper_container

 import shutil
+import zhconv

 # Configure more application defaults in config.json5

@@ -102,14 +104,11 @@ class WhisperTranscriber:
                               vad, vadMergeWindow, vadMaxMergeSize,
                               word_timestamps: bool = False, highlight_words: bool = False,
                               progress=gr.Progress()):
-        decodeOptions = dict(word_timestamps=word_timestamps)
-        if languageName == "Chinese":
-            decodeOptions.update(initial_prompt="繁體: ")
-            self.app_config.vad_initial_prompt_mode = "prepend_all_segments"

         vadOptions = VadOptions(vad, vadMergeWindow, vadMaxMergeSize, self.app_config.vad_padding, self.app_config.vad_prompt_window, self.app_config.vad_initial_prompt_mode)

-        return self.transcribe_webui(modelName, languageName, urlData, multipleFiles, microphoneData, task, vadOptions,
+        return self.transcribe_webui(modelName, languageName, urlData, multipleFiles, microphoneData, task, vadOptions,
+                                     word_timestamps=word_timestamps, highlight_words=highlight_words, progress=progress)

     # Entry function for the full tab
     def transcribe_webui_full(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
@@ -143,10 +142,6 @@ class WhisperTranscriber:
         else:
             temperature = [temperature]

-        if languageName == "Chinese":
-            initial_prompt = "繁體: " + initial_prompt
-            self.app_config.vad_initial_prompt_mode = "prepend_all_segments"
-
         vadOptions = VadOptions(vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode)

         return self.transcribe_webui(modelName, languageName, urlData, multipleFiles, microphoneData, task, vadOptions,
@@ -163,7 +158,8 @@ class WhisperTranscriber:
         sources = self.__get_source(urlData, multipleFiles, microphoneData)

         try:
-
+            langObj = get_language_from_name(languageName)
+            selectedLanguage = languageName.lower() if languageName is not None and len(languageName) > 0 else None
             selectedModel = modelName if modelName is not None else "base"

             model = create_whisper_container(whisper_implementation=self.app_config.whisper_implementation,
@@ -266,13 +262,14 @@ class WhisperTranscriber:
                 srt_path = source_download[0]
                 save_path = os.path.join(self.app_config.output_dir, source.source_name)
                 save_without_ext, ext = os.path.splitext(save_path)
-
+                lang_ext = "." + langObj.code if langObj is not None else ""
+                output_with_srt = save_without_ext + lang_ext + ext

                 #ffmpeg -i "input.mp4" -i "input.srt" -c copy -c:s mov_text output.mp4
                 input_file = ffmpeg.input(source.source_path)
                 input_srt = ffmpeg.input(srt_path)
                 out = ffmpeg.output(input_file, input_srt, output_with_srt, vcodec='copy', acodec='copy', scodec='mov_text')
-                outRsult = out.run()
+                outRsult = out.run(overwrite_output=True)
             except Exception as e:
                 # Ignore error - it's just a cleanup
                 print("Error merge subtitle with source file: \n" + source.source_path + ", \n" + str(e), outRsult)
@@ -439,6 +436,12 @@ class WhisperTranscriber:
         srt = self.__get_subs(result["segments"], "srt", languageMaxLineWidth, highlight_words=highlight_words)
         json_result = json.dumps(result, indent=4, ensure_ascii=False)

+        if language == "zh":
+            vtt = zhconv.convert(vtt, "zh-tw")
+            srt = zhconv.convert(srt, "zh-tw")
+            text = zhconv.convert(text, "zh-tw")
+            json_result = zhconv.convert(json_result, "zh-tw")
+
         output_files = []
         output_files.append(self.__create_file(srt, output_dir, source_name + "-subs.srt"));
         output_files.append(self.__create_file(vtt, output_dir, source_name + "-subs.vtt"));
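The merge block above is the ffmpeg-python equivalent of "ffmpeg -i input.mp4 -i input.srt -c copy -c:s mov_text output.mp4"; a standalone sketch with placeholder file names:

import ffmpeg

# Mux a subtitle track into the container without re-encoding:
# copy the video/audio streams, encode subtitles as mov_text (MP4 text subs).
input_file = ffmpeg.input("input.mp4")  # placeholder path
input_srt = ffmpeg.input("input.srt")   # placeholder path
out = ffmpeg.output(input_file, input_srt, "output.zh.mp4",
                    vcodec="copy", acodec="copy", scodec="mov_text")
out.run(overwrite_output=True)  # avoids ffmpeg's interactive overwrite prompt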
requirements-fasterWhisper.txt
CHANGED
@@ -6,4 +6,5 @@ yt-dlp
 json5
 torch
 torchaudio
-more_itertools
+more_itertools
+zhconv
requirements-whisper.txt
CHANGED
@@ -6,4 +6,5 @@ gradio==3.36.0
 yt-dlp
 torchaudio
 altair
-json5
+json5
+zhconv
requirements.txt
CHANGED
@@ -6,4 +6,5 @@ yt-dlp
 json5
 torch
 torchaudio
-more_itertools
+more_itertools
+zhconv
src/download.py
CHANGED
@@ -29,7 +29,7 @@ def _perform_download(url: str, maxDuration: int = None, outputTemplate: str = N
     destinationDirectory = mkdtemp()

     ydl_opts = {
-        "format": "bestaudio/best" if onlyAudio else "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best",
+        "format": "bestaudio/best" if onlyAudio else "bestvideo[ext=mp4][vcodec^=avc1]+bestaudio[ext=m4a]/best",
         'paths': {
             'home': destinationDirectory
         }
src/vad.py
CHANGED
@@ -204,7 +204,7 @@ class AbstractTranscription(ABC):
             detected_language = languageCounter.most_common(1)[0][0] if len(languageCounter) > 0 else None

             print("Running whisper from ", format_timestamp(segment_start), " to ", format_timestamp(segment_end), ", duration: ",
-                  segment_duration, "expanded: ", segment_expand_amount, "prompt: ", segment_prompt, "language: ", detected_language)
+                  segment_duration, "expanded: ", segment_expand_amount, ", prompt: ", segment_prompt, ", detected language: ", detected_language)

             perf_start_time = time.perf_counter()
