Initial Commit
Files changed:
- .DS_Store +0 -0
- UI.py +25 -0
- diarization.py +81 -0
- main.py +63 -0
- opus.py +63 -0
- requirements.txt +197 -0
- translated_video.py +77 -0
- tts.py +96 -0
- video_to_text.py +86 -0
- yt_download.py +53 -0
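These files form one dubbing pipeline: yt_download.py fetches the video, video_to_text.py transcribes it with Whisper, opus.py translates the transcript to Spanish, diarization.py separates speakers and aligns them with the translation, tts.py re-voices each speaker with XTTS, and translated_video.py muxes the new audio and subtitles; main.py chains the steps and UI.py exposes them through Gradio. A minimal sketch of driving the pipeline programmatically (placeholder URL; the packages in requirements.txt must be installed, and a CUDA GPU is required by tts.py as written):

from main import main as process_video

# Runs download -> transcription -> translation -> diarization -> TTS -> muxing;
# the subtitled, dubbed video ends up at ./translated/final_video.mp4
process_video("https://www.youtube.com/watch?v=<VIDEO_ID>")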
.DS_Store
ADDED
Binary file (6.15 kB)
UI.py
ADDED
@@ -0,0 +1,25 @@
import gradio as gr
from main import main as process_video

def run_pipeline(youtube_url):
    # Run the main processing function from your script
    # This function should save the final video in the '/translated/' directory
    process_video(youtube_url)

    # Construct the path to the final video
    # Assuming the video is named 'final_video.mp4' and stored in '/translated/'
    final_video_path = './translated/final_video.mp4'

    # Return the path for Gradio to display
    return final_video_path

iface = gr.Interface(
    fn=run_pipeline,
    inputs=gr.Textbox(lines=2, placeholder="Enter YouTube Video URL here..."),
    outputs=gr.Video(),
    title="YouTube Video Processing",
    description="Enter a YouTube URL to process the video through transcription, translation, and more."
)

if __name__ == "__main__":
    iface.launch()
diarization.py
ADDED
@@ -0,0 +1,81 @@
from pyannote.audio import Pipeline
from pydub import AudioSegment
import os
import re
import torch

def perform_diarization(audio_file_path, translated_file_path, output_dir='./audio/diarization'):
    # Initialize diarization pipeline
    pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1")

    # Send pipeline to GPU (when available)
    pipeline.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

    # Load audio file
    audio = AudioSegment.from_wav(audio_file_path)

    # Apply pretrained pipeline
    diarization = pipeline(audio_file_path)

    os.makedirs(output_dir, exist_ok=True)

    # Process and save each speaker's audio segments
    speaker_segments_audio = {}
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        start_ms = int(turn.start * 1000)  # Convert to milliseconds
        end_ms = int(turn.end * 1000)  # Convert to milliseconds
        segment = audio[start_ms:end_ms]

        if speaker in speaker_segments_audio:
            speaker_segments_audio[speaker] += segment
        else:
            speaker_segments_audio[speaker] = segment

    # Save audio segments
    for speaker, segment in speaker_segments_audio.items():
        output_path = os.path.join(output_dir, f"{speaker}.wav")
        segment.export(output_path, format="wav")
        print(f"Combined audio for speaker {speaker} saved in {output_path}")

    # Load translated text
    with open(translated_file_path, "r") as file:
        translated_lines = file.readlines()

    # Process and align translated text with diarization data
    last_speaker = None
    aligned_text = []
    timestamp_pattern = re.compile(r'\[(\d+\.\d+)\-(\d+\.\d+)\]')
    for line in translated_lines:
        match = timestamp_pattern.match(line)

        if match:
            start_time = float(match.group(1))
            end_time = float(match.group(2))
            text = line[match.end():].strip()  # Extract text part

            speaker_found = False
            # Find corresponding speaker
            for turn, _, speaker in diarization.itertracks(yield_label=True):
                speaker_start = turn.start
                speaker_end = turn.end
                # Check for overlap between speaker segment and line timestamp
                if max(speaker_start, start_time) < min(speaker_end, end_time):
                    aligned_text.append(f"[{speaker}] [{start_time}-{end_time}] {text}")
                    speaker_found = True
                    last_speaker = speaker
                    break

            # If no speaker found, use the last speaker
            if not speaker_found:
                if last_speaker is not None:
                    aligned_text.append(f"[{last_speaker}] [{start_time}-{end_time}] {text}")
                else:
                    aligned_text.append(f"[Unknown Speaker] [{start_time}-{end_time}] {text}")

    # Save aligned text to a single file
    aligned_text_output_path = os.path.join(output_dir, "aligned_text.txt")
    with open(aligned_text_output_path, "w") as aligned_text_file:
        aligned_text_file.write('\n'.join(aligned_text))
    print(f"Aligned text saved in {aligned_text_output_path}")
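A minimal usage sketch, mirroring Step 4 of main.py (paths are illustrative; the pyannote model may require a Hugging Face access token to download):

from diarization import perform_diarization

# Writes SPEAKER_XX.wav reference clips and aligned_text.txt into ./audio/diarization;
# each aligned line looks like: [SPEAKER_00] [0.0-4.56] Hola y bienvenidos.  (illustrative values)
perform_diarization('./audio/my_video.wav', './translated/translated_text.txt')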
main.py
ADDED
@@ -0,0 +1,63 @@
import argparse
import os
from yt_download import download_video
from video_to_text import convert_video_to_text
from opus import translate_file
from diarization import perform_diarization
from tts import main as tts_main
from translated_video import create_translated_video

def get_transcription_filename(video_path):
    base_name = os.path.splitext(os.path.basename(video_path))[0]
    return f'./transcribed/{base_name}.txt'

def get_audio_filename(video_path):
    base_name = os.path.splitext(os.path.basename(video_path))[0]
    return f'./audio/{base_name}.wav'

def main(youtube_url):
    # Ensure necessary directories exist
    if not os.path.exists('./downloads'):
        os.makedirs('./downloads')
    if not os.path.exists('./audio'):
        os.makedirs('./audio')
    if not os.path.exists('./transcribed'):
        os.makedirs('./transcribed')
    if not os.path.exists('./translated'):
        os.makedirs('./translated')

    # Step 1: Download the video
    downloaded_video_path = download_video(youtube_url)

    # Step 2: Transcribe the video's audio
    transcribed_text_path = get_transcription_filename(downloaded_video_path)
    model_type = 'base'  # You can specify the Whisper model type
    convert_video_to_text(downloaded_video_path, model_type)

    # Step 3: Translate the transcribed text to Spanish
    translated_text_path = './translated/translated_text.txt'
    translate_file(transcribed_text_path, translated_text_path)

    # Step 4: Perform diarization
    audio_path = get_audio_filename(downloaded_video_path)
    diarized_audio_dir = './audio/diarization'
    perform_diarization(audio_path, translated_text_path)

    # Step 5: Generate speech for translated text
    speaker_directory = './audio/diarization'
    aligned_text_file = './audio/diarization/aligned_text.txt'  # Ensure this is the correct path
    output_audio_file = './translated/final_audio.wav'
    tts_main(speaker_directory, aligned_text_file, output_audio_file)

    # Step 6: Create the final translated video
    final_video_path = create_translated_video(downloaded_video_path, output_audio_file, translated_text_path)

    print(f"Final translated video created at {final_video_path}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process a YouTube video with multiple steps.")
    parser.add_argument("youtube_url", help="YouTube video URL")
    args = parser.parse_args()

    main(args.youtube_url)
opus.py
ADDED
@@ -0,0 +1,63 @@
from transformers import MarianMTModel, MarianTokenizer
from tqdm import tqdm
import os
import re
import argparse

# Load Model and Tokenizer
model_name = "Helsinki-NLP/opus-mt-en-es"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Extract & separate timestamp and text
def extract_timestamp_and_text(line):
    match = re.match(r'\[(\d+\.\d+\-\d+\.\d+)\]\s+(.*)', line)
    if match:
        return match.group(1), match.group(2)
    return '', line

# Translate text
def translate_text(text):
    lines = text.split('\n')
    translated_lines = []

    for line in tqdm(lines, desc="Translating lines", leave=False):
        if not line.strip():
            translated_lines.append('')
            continue

        timestamp, line_text = extract_timestamp_and_text(line)

        if line_text.strip():
            model_inputs = tokenizer(line_text, return_tensors="pt", truncation=True, padding="longest")
            translated = model.generate(**model_inputs)
            translated_text = tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
            translated_line = f'[{timestamp}] {translated_text}'
        else:
            translated_line = f'[{timestamp}]'

        translated_lines.append(translated_line)

    return '\n'.join(translated_lines)

# Main function to translate a file
def translate_file(src_file_path, dst_file_path):
    try:
        with open(src_file_path, 'r') as file:
            english_text = file.read()
        spanish_text = translate_text(english_text)

        with open(dst_file_path, 'w') as file:
            file.write(spanish_text)
        print(f"Translation completed: {dst_file_path}")

    except Exception as e:
        print(f"Error processing file: {e}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Translate English text to Spanish")
    parser.add_argument("src_file_path", help="Path to the source file with English text")
    parser.add_argument("dst_file_path", help="Path to save the translated Spanish text")
    args = parser.parse_args()

    translate_file(args.src_file_path, args.dst_file_path)
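A minimal usage sketch, mirroring Step 3 of main.py (the source path is illustrative); input lines keep the [start-end] prefix written by video_to_text.py, and the output preserves those timestamps with the Spanish text:

from opus import translate_file

translate_file('./transcribed/my_video.txt', './translated/translated_text.txt')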
requirements.txt
ADDED
@@ -0,0 +1,197 @@
absl-py==2.0.0
aiohttp==3.9.0
aiosignal==1.3.1
alembic==1.12.1
annotated-types==0.6.0
antlr4-python3-runtime==4.9.3
anyascii==0.3.2
asteroid-filterbanks==0.4.0
attrs==23.1.0
audioread==3.0.1
Babel==2.13.1
bangla==0.0.2
blinker==1.7.0
blis==0.7.11
bnnumerizer==0.0.2
bnunicodenormalizer==0.1.6
Brotli @ file:///D:/bld/brotli-split_1695989908365/work
cachetools==5.3.2
catalogue==2.0.10
certifi @ file:///home/conda/feedstock_root/build_artifacts/certifi_1700303426725/work/certifi
cffi==1.16.0
charset-normalizer @ file:///home/conda/feedstock_root/build_artifacts/charset-normalizer_1698833585322/work
click==8.1.7
cloudpathlib==0.16.0
colorama==0.4.6
colorlog==6.7.0
confection==0.1.3
contourpy==1.2.0
coqpit==0.0.17
cycler==0.12.1
cymem==2.0.8
Cython==3.0.5
dateparser==1.1.8
decorator==4.4.2
docopt==0.6.2
einops==0.7.0
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
encodec==0.1.1
ffmpeg-python==0.2.0
filelock @ file:///home/conda/feedstock_root/build_artifacts/filelock_1698714947081/work
fire==0.5.0
Flask==3.0.0
fonttools==4.45.0
frozenlist==1.4.0
fsspec==2023.10.0
future==0.18.3
g2pkk==0.1.2
google-auth==2.23.4
google-auth-oauthlib==1.1.0
greenlet==3.0.1
grpcio==1.59.3
gruut==2.2.3
gruut-ipa==0.13.0
gruut-lang-de==2.0.0
gruut-lang-en==2.0.0
gruut-lang-es==2.0.0
gruut-lang-fr==2.0.2
hangul-romanize==0.1.0
huggingface-hub==0.19.4
HyperPyYAML==1.2.2
idna @ file:///home/conda/feedstock_root/build_artifacts/idna_1663625384323/work
imageio==2.33.0
imageio-ffmpeg==0.4.9
inflect==7.0.0
itsdangerous==2.1.2
jamo==0.4.1
jieba==0.42.1
Jinja2 @ file:///home/conda/feedstock_root/build_artifacts/jinja2_1654302431367/work
joblib==1.3.2
jsonlines==1.2.0
julius==0.2.7
kiwisolver==1.4.5
langcodes==3.3.0
lazy_loader==0.3
librosa==0.10.1
lightning==2.1.2
lightning-utilities==0.10.0
llvmlite==0.41.1
Mako==1.3.0
Markdown==3.5.1
markdown-it-py==3.0.0
MarkupSafe @ file:///D:/bld/markupsafe_1695367436673/work
matplotlib==3.8.2
mdurl==0.1.2
more-itertools==10.1.0
moviepy==1.0.3
mpmath @ file:///home/conda/feedstock_root/build_artifacts/mpmath_1678228039184/work
msgpack==1.0.7
multidict==6.0.4
murmurhash==1.0.10
networkx==2.8.8
nltk==3.8.1
num2words==0.5.13
numba==0.58.1
numpy @ file:///D:/bld/numpy_1694920156760/work/dist/numpy-1.26.0-cp311-cp311-win_amd64.whl#sha256=52e1af97f7d84aafe72cc1aaae3e1c9d52dff69c7ffcc96e2f4f7799fdad7a0c
oauthlib==3.2.2
omegaconf==2.3.0
openai-whisper==20231117
opencv-python==4.8.1.78
optuna==3.4.0
packaging==23.2
pandas==1.5.3
Pillow @ file:///D:/bld/pillow_1697423754480/work
platformdirs==4.0.0
pooch==1.8.0
preshed==3.0.9
primePy==1.3
proglog==0.1.10
protobuf==4.23.4
psutil==5.9.6
pyannote.audio==3.1.0
pyannote.core==5.0.0
pyannote.database==5.0.1
pyannote.metrics==3.2.1
pyannote.pipeline==3.0.1
pyasn1==0.5.1
pyasn1-modules==0.3.0
pycparser==2.21
pydantic==2.5.2
pydantic_core==2.14.5
pydub==0.25.1
Pygments==2.17.2
pymp3==0.1.9
pynndescent==0.5.11
pyparsing==3.1.1
pypinyin==0.49.0
pysbd==0.3.4
PySocks @ file:///D:/bld/pysocks_1661604991356/work
PySoundFile==0.9.0.post1
python-crfsuite==0.9.9
python-dateutil==2.8.2
pytorch-lightning==2.1.2
pytorch-metric-learning==2.3.0
pytube==15.0.0
pytz==2023.3.post1
PyYAML @ file:///D:/bld/pyyaml_1695373635661/work
regex==2023.10.3
requests @ file:///home/conda/feedstock_root/build_artifacts/requests_1684774241324/work
requests-oauthlib==1.3.1
rich==13.7.0
rsa==4.9
ruamel.yaml==0.18.5
ruamel.yaml.clib==0.2.8
sacremoses==0.1.1
safetensors==0.4.0
scikit-learn==1.3.2
scipy==1.11.4
semver==3.0.2
sentencepiece==0.1.99
shellingham==1.5.4
six==1.16.0
smart-open==6.4.0
sortedcontainers==2.4.0
soundfile==0.12.1
soxr==0.3.7
spacy==3.7.2
spacy-legacy==3.0.12
spacy-loggers==1.0.5
speechbrain==0.5.16
SQLAlchemy==2.0.23
srsly==2.4.8
srt==3.5.3
SudachiDict-core==20230927
SudachiPy==0.6.7
sympy @ file:///home/conda/feedstock_root/build_artifacts/sympy_1684180539862/work
tabulate==0.9.0
tensorboard==2.15.1
tensorboard-data-server==0.7.2
tensorboardX==2.6.2.2
termcolor==2.4.0
thinc==8.2.1
threadpoolctl==3.2.0
tiktoken==0.5.1
tokenizers==0.15.0
torch==2.1.1
torch-audiomentations==0.11.0
torch-pitch-shift==1.2.4
torch-time-stretch==1.0.3
torchaudio==2.1.1
torchmetrics==1.2.0
torchvision==0.16.1
tqdm==4.66.1
trainer==0.0.32
transformers==4.35.2
TTS==0.21.3
typer==0.9.0
typing_extensions @ file:///home/conda/feedstock_root/build_artifacts/typing_extensions_1695040754690/work
tzdata==2023.3
tzlocal==5.2
umap-learn==0.5.5
Unidecode==1.3.7
urllib3 @ file:///home/conda/feedstock_root/build_artifacts/urllib3_1699933488691/work
wasabi==1.1.2
weasel==0.3.4
Werkzeug==3.0.1
win-inet-pton @ file:///D:/bld/win_inet_pton_1667051142467/work
yarl==1.9.3
translated_video.py
ADDED
@@ -0,0 +1,77 @@
from moviepy.editor import VideoFileClip, AudioFileClip
from pydub import AudioSegment
import srt
import datetime
import ffmpeg
import os
import re

def create_translated_video(original_video_path, translated_audio_path, translated_text_path, output_dir='./translated'):
    # Load original video
    video = VideoFileClip(original_video_path)

    # Load TTS audio
    new_audio = AudioFileClip(translated_audio_path)
    video = video.set_audio(new_audio)
    audio_segment = AudioSegment.from_file(translated_audio_path, format="wav")

    # Check if new audio is shorter to pad with silence
    if new_audio.duration < video.duration:
        silence_duration = (video.duration - new_audio.duration) * 1000  # convert to milliseconds
        silence_segment = AudioSegment.silent(duration=silence_duration)
        audio_segment = audio_segment + silence_segment
        padded_audio_path = os.path.join(output_dir, 'padded_audio.wav')
        audio_segment.export(padded_audio_path, format='wav')
        new_audio = AudioFileClip(padded_audio_path)

    # Generate SRT content
    def parse_translated_text(file_path):
        with open(file_path, 'r') as file:
            content = file.readlines()

        subtitles = []
        timestamp_pattern = re.compile(r'\[(\d+\.\d+)\-(\d+\.\d+)\]')
        for line in content:
            match = timestamp_pattern.match(line)
            if match:
                start_time = datetime.timedelta(seconds=float(match.group(1)))
                end_time = datetime.timedelta(seconds=float(match.group(2)))
                text = line[match.end():].strip()

                subtitle = srt.Subtitle(index=len(subtitles) + 1,
                                        start=start_time,
                                        end=end_time,
                                        content=text)
                subtitles.append(subtitle)

        return srt.compose(subtitles)

    # Generate SRT content
    srt_content = parse_translated_text(translated_text_path)

    # Write to an SRT file
    srt_file = './translated/translated.srt'
    with open(srt_file, 'w', encoding='utf-8') as file:
        file.write(srt_content)

    # Write the final video file
    temp = "./translated/temp.mp4"
    video.write_videofile(temp)

    # Add subtitles
    final_video_file = os.path.join(output_dir, "final_video.mp4")

    # Correct the subtitle filter string for ffmpeg
    subtitle_filter_str = f"subtitles='{srt_file}'"

    try:
        ffmpeg.input(temp).output(final_video_file, vf=subtitle_filter_str).run()
    except ffmpeg.Error as e:
        print(f"Error creating final video: {e}")
        return None

    # Remove temp file
    os.remove(temp)
    return final_video_file
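A minimal usage sketch, matching the call in Step 6 of main.py (file names are illustrative; the ffmpeg binary must be available on PATH for the subtitle burn-in step):

from translated_video import create_translated_video

final_path = create_translated_video('./downloads/my_video.mp4',          # original download
                                     './translated/final_audio.wav',      # dubbed track from tts.py
                                     './translated/translated_text.txt')  # timestamped Spanish text from opus.py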
tts.py
ADDED
@@ -0,0 +1,96 @@
from TTS.api import TTS
from pydub import AudioSegment
import os
import re
import ffmpeg
import shutil
import argparse

def adjust_speed(input_file, speed_factor):
    output_file = input_file.replace(".wav", "_adjusted.wav")
    ffmpeg.input(input_file).filter('atempo', speed_factor).output(output_file, acodec='pcm_s16le').run()
    return output_file

def generate_speech(text, speaker_voice_map, output_file):
    combined_audio = AudioSegment.empty()
    temp_files = []

    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda")

    for line in text.split("\n"):
        if not line.strip():
            continue

        match = re.match(r"\[SPEAKER_(\d+)\] \[(\d+\.\d+)-(\d+\.\d+)\] (.+)", line)
        if not match:
            continue

        speaker_id, start_time, end_time, sentence = match.groups()
        start_time, end_time = float(start_time), float(end_time)
        segment_duration = (end_time - start_time) * 1000  # Duration in milliseconds

        speaker_wav = speaker_voice_map.get(f"SPEAKER_{speaker_id}")
        if not speaker_wav:
            continue

        os.makedirs('./audio/temp', exist_ok=True)
        temp_file_path = f"./audio/temp/temp_output_part_{len(temp_files)}.wav"
        temp_files.append(temp_file_path)

        tts_speed = 1.0
        tts.tts_to_file(text=sentence, file_path=temp_file_path, speaker_wav=speaker_wav, language="es", speed=tts_speed)

        segment_audio = AudioSegment.from_wav(temp_file_path)

        if segment_audio.duration_seconds * 1000 > segment_duration:
            while tts_speed < 2.0 and segment_audio.duration_seconds * 1000 > segment_duration:
                tts_speed += 0.5
                tts.tts_to_file(text=sentence, file_path=temp_file_path, speaker_wav=speaker_wav, language="es", speed=tts_speed)
                segment_audio = AudioSegment.from_wav(temp_file_path)

            if segment_audio.duration_seconds * 1000 > segment_duration:
                required_speed = segment_duration / (segment_audio.duration_seconds * 1000)
                if required_speed < 1.0:
                    required_speed = 1.0 / required_speed
                temp_file_path = adjust_speed(temp_file_path, required_speed)
                segment_audio = AudioSegment.from_wav(temp_file_path)

        if combined_audio.duration_seconds == 0 and start_time > 0:
            combined_audio = AudioSegment.silent(duration=start_time * 1000) + combined_audio

        if segment_audio.duration_seconds * 1000 > segment_duration:
            segment_audio = segment_audio[:segment_duration]
        else:
            segment_audio = segment_audio + AudioSegment.silent(duration=segment_duration - len(segment_audio))

        combined_audio += segment_audio

    combined_audio.export(output_file, format="wav")

    for temp_file in temp_files:
        os.remove(temp_file)

def map_speaker_ids(directory):
    speaker_voice_map = {}
    for file in os.listdir(directory):
        if file.endswith(".wav"):
            speaker_id = file.replace(".wav", "")
            speaker_voice_map[speaker_id] = os.path.join(directory, file)
    return speaker_voice_map

def main(speaker_directory, aligned_text_file, output_audio_file):
    speaker_voice_map = map_speaker_ids(speaker_directory)
    with open(aligned_text_file, 'r') as file:
        translated_text = file.read()
    generate_speech(translated_text, speaker_voice_map, output_audio_file)
    if os.path.exists('./audio/temp'):
        shutil.rmtree('./audio/temp')

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Generate speech from translated text")
    parser.add_argument("speaker_directory", help="Directory containing speaker voice clips")
    parser.add_argument("aligned_text_file", help="Path to the translated and aligned text file")
    parser.add_argument("output_audio_file", help="Path to save the generated speech audio file")
    args = parser.parse_args()

    main(args.speaker_directory, args.aligned_text_file, args.output_audio_file)
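A minimal sketch of calling this module the way main.py does in Step 5; generate_speech assumes a CUDA device for XTTS and expects lines in the [SPEAKER_XX] [start-end] text shape written by diarization.py:

from tts import main as tts_main

tts_main('./audio/diarization',                   # speaker reference WAVs (SPEAKER_00.wav, ...)
         './audio/diarization/aligned_text.txt',  # speaker-aligned translated lines
         './translated/final_audio.wav')          # combined dubbed track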
video_to_text.py
ADDED
@@ -0,0 +1,86 @@
import argparse
from moviepy.editor import VideoFileClip
import whisper
import os
import re

def extract_audio(video_path, audio_dir='./audio'):
    os.makedirs(audio_dir, exist_ok=True)
    base_filename = os.path.splitext(os.path.basename(video_path))[0]
    audio_filename = os.path.join(audio_dir, base_filename + '.wav')
    video_clip = VideoFileClip(video_path)
    video_clip.audio.write_audiofile(audio_filename)
    video_clip.close()
    return audio_filename

def transcribe_audio(audio_path, model_type='base', transcribed_dir='./transcribed'):
    model = whisper.load_model(model_type)
    result = model.transcribe(audio_path)

    os.makedirs(transcribed_dir, exist_ok=True)
    base_filename = os.path.splitext(os.path.basename(audio_path))[0]
    transcribed_filename = os.path.join(transcribed_dir, base_filename + '.txt')

    with open(transcribed_filename, 'w') as file:
        for segment in result['segments']:
            start = segment['start']
            end = segment['end']
            text = segment['text']
            file.write(f"[{start:.2f}-{end:.2f}] {text}\n")

    return transcribed_filename, result['text']

def merge_lines(file_path):
    timestamp_pattern = re.compile(r'\[(\d+\.\d+)-(\d+\.\d+)\]')

    with open(file_path, 'r') as file:
        lines = file.readlines()

    merged_lines = []
    i = 0

    while i < len(lines):
        line = lines[i].strip()
        match = timestamp_pattern.match(line)

        if match:
            start_time = float(match.group(1))
            text = line[match.end():].strip()

            if not (text.endswith('.') or text.endswith('?')):
                if i + 1 < len(lines):
                    next_line = lines[i + 1].strip()
                    next_match = timestamp_pattern.match(next_line)

                    if next_match:
                        end_time = float(next_match.group(2))
                        next_text = next_line[next_match.end():].strip()
                        merged_text = text + ' ' + next_text
                        merged_line = f"[{start_time:.2f}-{end_time:.2f}] {merged_text}\n"
                        merged_lines.append(merged_line)
                        i += 1
                    else:
                        # Next line has no timestamp; keep the current line as-is
                        end_time = float(match.group(2))
                        merged_lines.append(f"[{start_time:.2f}-{end_time:.2f}] {text}\n")
                else:
                    # Last line of the file; keep it instead of dropping it
                    end_time = float(match.group(2))
                    merged_lines.append(f"[{start_time:.2f}-{end_time:.2f}] {text}\n")
            else:
                end_time = float(match.group(2))
                merged_lines.append(f"[{start_time:.2f}-{end_time:.2f}] {text}\n")

        i += 1

    with open(file_path, 'w') as file:
        file.writelines(merged_lines)

    return file_path

def convert_video_to_text(video_file_path, model_type='base'):
    audio_path = extract_audio(video_file_path)
    transcribed_path, _ = transcribe_audio(audio_path, model_type)
    merge_lines(transcribed_path)
    return transcribed_path


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Transcribe audio from video")
    parser.add_argument("video_file", help="Size of the video file path", default=None)
    parser.add_argument("--model", help="Size of the whisper model (e.g., tiny, base, small, medium, large).", default="base")
    args = parser.parse_args()

    convert_video_to_text(args.video_file, args.model)
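A minimal usage sketch (the input path is illustrative); each transcript line comes out as [start-end] text, e.g. [0.00-4.32] Welcome to the channel., which is the format opus.py and diarization.py parse:

from video_to_text import convert_video_to_text

transcript_path = convert_video_to_text('./downloads/my_video.mp4', model_type='base')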
yt_download.py
ADDED
@@ -0,0 +1,53 @@
import argparse
from pytube import YouTube
from tqdm import tqdm
import os

def download_youtube_video(video_url, download_captions=False):
    progress_bar = None

    def progress_function(stream, chunk, bytes_remaining):
        nonlocal progress_bar
        if progress_bar is None:
            progress_bar = tqdm(total=stream.filesize, unit='B', unit_scale=True, desc="Downloading Video")
        current = stream.filesize - bytes_remaining
        progress_bar.n = current
        progress_bar.last_print_n = current
        progress_bar.update()

    if not os.path.exists('./downloads'):
        os.makedirs('./downloads')

    yt = YouTube(
        video_url,
        on_progress_callback=progress_function,
    )

    stream = yt.streams.get_highest_resolution()
    stream.download(output_path='./downloads')
    if progress_bar:
        progress_bar.close()

    if download_captions:
        caption = yt.captions.get('en') or yt.captions.get('a.en')
        if caption:
            caption_convert_to_srt = caption.generate_srt_captions()
            caption_convert_to_srt = caption_convert_to_srt.replace("\n\n", "\n")
            with open(os.path.join('./downloads', f"{yt.title}.srt"), "w", encoding="utf-8") as file:
                file.write(caption_convert_to_srt)
            print(f"Captions saved to 'downloads/{yt.title}.srt'")
        else:
            print("No English captions found for this video.")

def download_video(url, download_captions=False):
    video_path = './downloads/' + YouTube(url).streams.get_highest_resolution().default_filename
    download_youtube_video(url, download_captions)
    return video_path

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Download YouTube video and captions")
    parser.add_argument("video_url", help="YouTube video URL")
    parser.add_argument("--captions", action="store_true", help="Download captions if available")
    args = parser.parse_args()

    download_video(args.video_url, args.captions)
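A minimal usage sketch (placeholder URL); download_video returns the local path that main.py hands to the rest of the pipeline:

from yt_download import download_video

video_path = download_video("https://www.youtube.com/watch?v=<VIDEO_ID>", download_captions=False)
print(video_path)  # e.g. ./downloads/<video title>.mp4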