Initial Commit
Files changed:
- .DS_Store +0 -0
- UI.py +25 -0
- diarization.py +81 -0
- main.py +63 -0
- opus.py +63 -0
- requirements.txt +197 -0
- translated_video.py +77 -0
- tts.py +96 -0
- video_to_text.py +86 -0
- yt_download.py +53 -0
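These files form one dubbing pipeline: yt_download.py fetches the video, video_to_text.py transcribes it with Whisper, opus.py translates the transcript to Spanish, diarization.py separates speakers and aligns them with the translation, tts.py re-voices each speaker with XTTS, and translated_video.py muxes the new audio and subtitles; main.py chains the steps and UI.py exposes them through Gradio. A minimal sketch of driving the pipeline programmatically (placeholder URL; the packages in requirements.txt must be installed, and a CUDA GPU is required by tts.py as written):

from main import main as process_video

# Runs download -> transcription -> translation -> diarization -> TTS -> muxing;
# the subtitled, dubbed video ends up at ./translated/final_video.mp4
process_video("https://www.youtube.com/watch?v=<VIDEO_ID>")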
.DS_Store
ADDED
Binary file (6.15 kB)
UI.py
ADDED
@@ -0,0 +1,25 @@
import gradio as gr
from main import main as process_video

def run_pipeline(youtube_url):
    # Run the main processing function from your script
    # This function should save the final video in the '/translated/' directory
    process_video(youtube_url)

    # Construct the path to the final video
    # Assuming the video is named 'final_video.mp4' and stored in '/translated/'
    final_video_path = './translated/final_video.mp4'

    # Return the path for Gradio to display
    return final_video_path

iface = gr.Interface(
    fn=run_pipeline,
    inputs=gr.Textbox(lines=2, placeholder="Enter YouTube Video URL here..."),
    outputs=gr.Video(),
    title="YouTube Video Processing",
    description="Enter a YouTube URL to process the video through transcription, translation, and more."
)

if __name__ == "__main__":
    iface.launch()
diarization.py
ADDED
@@ -0,0 +1,81 @@
from pyannote.audio import Pipeline
from pydub import AudioSegment
import os
import re
import torch

def perform_diarization(audio_file_path, translated_file_path, output_dir='./audio/diarization'):
    # Initialize diarization pipeline
    pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1")

    # Send pipeline to GPU (when available)
    pipeline.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

    # Load audio file
    audio = AudioSegment.from_wav(audio_file_path)

    # Apply pretrained pipeline
    diarization = pipeline(audio_file_path)

    os.makedirs(output_dir, exist_ok=True)

    # Process and save each speaker's audio segments
    speaker_segments_audio = {}
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        start_ms = int(turn.start * 1000)  # Convert to milliseconds
        end_ms = int(turn.end * 1000)  # Convert to milliseconds
        segment = audio[start_ms:end_ms]

        if speaker in speaker_segments_audio:
            speaker_segments_audio[speaker] += segment
        else:
            speaker_segments_audio[speaker] = segment

    # Save audio segments
    for speaker, segment in speaker_segments_audio.items():
        output_path = os.path.join(output_dir, f"{speaker}.wav")
        segment.export(output_path, format="wav")
        print(f"Combined audio for speaker {speaker} saved in {output_path}")

    # Load translated text
    with open(translated_file_path, "r") as file:
        translated_lines = file.readlines()

    # Process and align translated text with diarization data
    last_speaker = None
    aligned_text = []
    timestamp_pattern = re.compile(r'\[(\d+\.\d+)\-(\d+\.\d+)\]')
    for line in translated_lines:
        match = timestamp_pattern.match(line)

        if match:
            start_time = float(match.group(1))
            end_time = float(match.group(2))
            text = line[match.end():].strip()  # Extract text part

            speaker_found = False
            # Find corresponding speaker
            for turn, _, speaker in diarization.itertracks(yield_label=True):
                speaker_start = turn.start
                speaker_end = turn.end
                # Check for overlap between speaker segment and line timestamp
                if max(speaker_start, start_time) < min(speaker_end, end_time):
                    aligned_text.append(f"[{speaker}] [{start_time}-{end_time}] {text}")
                    speaker_found = True
                    last_speaker = speaker
                    break

            # If no speaker found, use the last speaker
            if not speaker_found:
                if last_speaker is not None:
                    aligned_text.append(f"[{last_speaker}] [{start_time}-{end_time}] {text}")
                else:
                    aligned_text.append(f"[Unknown Speaker] [{start_time}-{end_time}] {text}")

    # Save aligned text to a single file
    aligned_text_output_path = os.path.join(output_dir, "aligned_text.txt")
    with open(aligned_text_output_path, "w") as aligned_text_file:
        aligned_text_file.write('\n'.join(aligned_text))
    print(f"Aligned text saved in {aligned_text_output_path}")
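A minimal usage sketch, mirroring Step 4 of main.py (paths are illustrative; the pyannote model may require a Hugging Face access token to download):

from diarization import perform_diarization

# Writes SPEAKER_XX.wav reference clips and aligned_text.txt into ./audio/diarization;
# each aligned line looks like: [SPEAKER_00] [0.0-4.56] Hola y bienvenidos.  (illustrative values)
perform_diarization('./audio/my_video.wav', './translated/translated_text.txt')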
main.py
ADDED
@@ -0,0 +1,63 @@
import argparse
import os
from yt_download import download_video
from video_to_text import convert_video_to_text
from opus import translate_file
from diarization import perform_diarization
from tts import main as tts_main
from translated_video import create_translated_video

def get_transcription_filename(video_path):
    base_name = os.path.splitext(os.path.basename(video_path))[0]
    return f'./transcribed/{base_name}.txt'

def get_audio_filename(video_path):
    base_name = os.path.splitext(os.path.basename(video_path))[0]
    return f'./audio/{base_name}.wav'

def main(youtube_url):
    # Ensure necessary directories exist
    if not os.path.exists('./downloads'):
        os.makedirs('./downloads')
    if not os.path.exists('./audio'):
        os.makedirs('./audio')
    if not os.path.exists('./transcribed'):
        os.makedirs('./transcribed')
    if not os.path.exists('./translated'):
        os.makedirs('./translated')

    # Step 1: Download the video
    downloaded_video_path = download_video(youtube_url)

    # Step 2: Transcribe the video's audio
    transcribed_text_path = get_transcription_filename(downloaded_video_path)
    model_type = 'base'  # You can specify the Whisper model type
    convert_video_to_text(downloaded_video_path, model_type)

    # Step 3: Translate the transcribed text to Spanish
    translated_text_path = './translated/translated_text.txt'
    translate_file(transcribed_text_path, translated_text_path)

    # Step 4: Perform diarization
    audio_path = get_audio_filename(downloaded_video_path)
    diarized_audio_dir = './audio/diarization'
    perform_diarization(audio_path, translated_text_path)

    # Step 5: Generate speech for translated text
    speaker_directory = './audio/diarization'
    aligned_text_file = './audio/diarization/aligned_text.txt'  # Ensure this is the correct path
    output_audio_file = './translated/final_audio.wav'
    tts_main(speaker_directory, aligned_text_file, output_audio_file)

    # Step 6: Create the final translated video
    final_video_path = create_translated_video(downloaded_video_path, output_audio_file, translated_text_path)

    print(f"Final translated video created at {final_video_path}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process a YouTube video with multiple steps.")
    parser.add_argument("youtube_url", help="YouTube video URL")
    args = parser.parse_args()

    main(args.youtube_url)
opus.py
ADDED
@@ -0,0 +1,63 @@
from transformers import MarianMTModel, MarianTokenizer
from tqdm import tqdm
import os
import re
import argparse

# Load Model and Tokenizer
model_name = "Helsinki-NLP/opus-mt-en-es"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Extract & separate timestamp and text
def extract_timestamp_and_text(line):
    match = re.match(r'\[(\d+\.\d+\-\d+\.\d+)\]\s+(.*)', line)
    if match:
        return match.group(1), match.group(2)
    return '', line

# Translate text
def translate_text(text):
    lines = text.split('\n')
    translated_lines = []

    for line in tqdm(lines, desc="Translating lines", leave=False):
        if not line.strip():
            translated_lines.append('')
            continue

        timestamp, line_text = extract_timestamp_and_text(line)

        if line_text.strip():
            model_inputs = tokenizer(line_text, return_tensors="pt", truncation=True, padding="longest")
            translated = model.generate(**model_inputs)
            translated_text = tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
            translated_line = f'[{timestamp}] {translated_text}'
        else:
            translated_line = f'[{timestamp}]'

        translated_lines.append(translated_line)

    return '\n'.join(translated_lines)

# Main function to translate a file
def translate_file(src_file_path, dst_file_path):
    try:
        with open(src_file_path, 'r') as file:
            english_text = file.read()
        spanish_text = translate_text(english_text)

        with open(dst_file_path, 'w') as file:
            file.write(spanish_text)
        print(f"Translation completed: {dst_file_path}")

    except Exception as e:
        print(f"Error processing file: {e}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Translate English text to Spanish")
    parser.add_argument("src_file_path", help="Path to the source file with English text")
    parser.add_argument("dst_file_path", help="Path to save the translated Spanish text")
    args = parser.parse_args()

    translate_file(args.src_file_path, args.dst_file_path)
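A minimal usage sketch, mirroring Step 3 of main.py (the source path is illustrative); input lines keep the [start-end] prefix written by video_to_text.py, and the output preserves those timestamps with the Spanish text:

from opus import translate_file

translate_file('./transcribed/my_video.txt', './translated/translated_text.txt')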
requirements.txt
ADDED
@@ -0,0 +1,197 @@
absl-py==2.0.0
aiohttp==3.9.0
aiosignal==1.3.1
alembic==1.12.1
annotated-types==0.6.0
antlr4-python3-runtime==4.9.3
anyascii==0.3.2
asteroid-filterbanks==0.4.0
attrs==23.1.0
audioread==3.0.1
Babel==2.13.1
bangla==0.0.2
blinker==1.7.0
blis==0.7.11
bnnumerizer==0.0.2
bnunicodenormalizer==0.1.6
Brotli @ file:///D:/bld/brotli-split_1695989908365/work
cachetools==5.3.2
catalogue==2.0.10
certifi @ file:///home/conda/feedstock_root/build_artifacts/certifi_1700303426725/work/certifi
cffi==1.16.0
charset-normalizer @ file:///home/conda/feedstock_root/build_artifacts/charset-normalizer_1698833585322/work
click==8.1.7
cloudpathlib==0.16.0
colorama==0.4.6
colorlog==6.7.0
confection==0.1.3
contourpy==1.2.0
coqpit==0.0.17
cycler==0.12.1
cymem==2.0.8
Cython==3.0.5
dateparser==1.1.8
decorator==4.4.2
docopt==0.6.2
einops==0.7.0
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
encodec==0.1.1
ffmpeg-python==0.2.0
filelock @ file:///home/conda/feedstock_root/build_artifacts/filelock_1698714947081/work
fire==0.5.0
Flask==3.0.0
fonttools==4.45.0
frozenlist==1.4.0
fsspec==2023.10.0
future==0.18.3
g2pkk==0.1.2
google-auth==2.23.4
google-auth-oauthlib==1.1.0
greenlet==3.0.1
grpcio==1.59.3
gruut==2.2.3
gruut-ipa==0.13.0
gruut-lang-de==2.0.0
gruut-lang-en==2.0.0
gruut-lang-es==2.0.0
gruut-lang-fr==2.0.2
hangul-romanize==0.1.0
huggingface-hub==0.19.4
HyperPyYAML==1.2.2
idna @ file:///home/conda/feedstock_root/build_artifacts/idna_1663625384323/work
imageio==2.33.0
imageio-ffmpeg==0.4.9
inflect==7.0.0
itsdangerous==2.1.2
jamo==0.4.1
jieba==0.42.1
Jinja2 @ file:///home/conda/feedstock_root/build_artifacts/jinja2_1654302431367/work
joblib==1.3.2
jsonlines==1.2.0
julius==0.2.7
kiwisolver==1.4.5
langcodes==3.3.0
lazy_loader==0.3
librosa==0.10.1
lightning==2.1.2
lightning-utilities==0.10.0
llvmlite==0.41.1
Mako==1.3.0
Markdown==3.5.1
markdown-it-py==3.0.0
MarkupSafe @ file:///D:/bld/markupsafe_1695367436673/work
matplotlib==3.8.2
mdurl==0.1.2
more-itertools==10.1.0
moviepy==1.0.3
mpmath @ file:///home/conda/feedstock_root/build_artifacts/mpmath_1678228039184/work
msgpack==1.0.7
multidict==6.0.4
murmurhash==1.0.10
networkx==2.8.8
nltk==3.8.1
num2words==0.5.13
numba==0.58.1
numpy @ file:///D:/bld/numpy_1694920156760/work/dist/numpy-1.26.0-cp311-cp311-win_amd64.whl#sha256=52e1af97f7d84aafe72cc1aaae3e1c9d52dff69c7ffcc96e2f4f7799fdad7a0c
oauthlib==3.2.2
omegaconf==2.3.0
openai-whisper==20231117
opencv-python==4.8.1.78
optuna==3.4.0
packaging==23.2
pandas==1.5.3
Pillow @ file:///D:/bld/pillow_1697423754480/work
platformdirs==4.0.0
pooch==1.8.0
preshed==3.0.9
primePy==1.3
proglog==0.1.10
protobuf==4.23.4
psutil==5.9.6
pyannote.audio==3.1.0
pyannote.core==5.0.0
pyannote.database==5.0.1
pyannote.metrics==3.2.1
pyannote.pipeline==3.0.1
pyasn1==0.5.1
pyasn1-modules==0.3.0
pycparser==2.21
pydantic==2.5.2
pydantic_core==2.14.5
pydub==0.25.1
Pygments==2.17.2
pymp3==0.1.9
pynndescent==0.5.11
pyparsing==3.1.1
pypinyin==0.49.0
pysbd==0.3.4
PySocks @ file:///D:/bld/pysocks_1661604991356/work
PySoundFile==0.9.0.post1
python-crfsuite==0.9.9
python-dateutil==2.8.2
pytorch-lightning==2.1.2
pytorch-metric-learning==2.3.0
pytube==15.0.0
pytz==2023.3.post1
PyYAML @ file:///D:/bld/pyyaml_1695373635661/work
regex==2023.10.3
requests @ file:///home/conda/feedstock_root/build_artifacts/requests_1684774241324/work
requests-oauthlib==1.3.1
rich==13.7.0
rsa==4.9
ruamel.yaml==0.18.5
ruamel.yaml.clib==0.2.8
sacremoses==0.1.1
safetensors==0.4.0
scikit-learn==1.3.2
scipy==1.11.4
semver==3.0.2
sentencepiece==0.1.99
shellingham==1.5.4
six==1.16.0
smart-open==6.4.0
sortedcontainers==2.4.0
soundfile==0.12.1
soxr==0.3.7
spacy==3.7.2
spacy-legacy==3.0.12
spacy-loggers==1.0.5
speechbrain==0.5.16
SQLAlchemy==2.0.23
srsly==2.4.8
srt==3.5.3
SudachiDict-core==20230927
SudachiPy==0.6.7
sympy @ file:///home/conda/feedstock_root/build_artifacts/sympy_1684180539862/work
tabulate==0.9.0
tensorboard==2.15.1
tensorboard-data-server==0.7.2
tensorboardX==2.6.2.2
termcolor==2.4.0
thinc==8.2.1
threadpoolctl==3.2.0
tiktoken==0.5.1
tokenizers==0.15.0
torch==2.1.1
torch-audiomentations==0.11.0
torch-pitch-shift==1.2.4
torch-time-stretch==1.0.3
torchaudio==2.1.1
torchmetrics==1.2.0
torchvision==0.16.1
tqdm==4.66.1
trainer==0.0.32
transformers==4.35.2
TTS==0.21.3
typer==0.9.0
typing_extensions @ file:///home/conda/feedstock_root/build_artifacts/typing_extensions_1695040754690/work
tzdata==2023.3
tzlocal==5.2
umap-learn==0.5.5
Unidecode==1.3.7
urllib3 @ file:///home/conda/feedstock_root/build_artifacts/urllib3_1699933488691/work
wasabi==1.1.2
weasel==0.3.4
Werkzeug==3.0.1
win-inet-pton @ file:///D:/bld/win_inet_pton_1667051142467/work
yarl==1.9.3
translated_video.py
ADDED
@@ -0,0 +1,77 @@
from moviepy.editor import VideoFileClip, AudioFileClip
from pydub import AudioSegment
import srt
import datetime
import ffmpeg
import os
import re

def create_translated_video(original_video_path, translated_audio_path, translated_text_path, output_dir='./translated'):
    # Load original video
    video = VideoFileClip(original_video_path)

    # Load TTS audio
    new_audio = AudioFileClip(translated_audio_path)
    video = video.set_audio(new_audio)
    audio_segment = AudioSegment.from_file(translated_audio_path, format="wav")

    # Check if new audio is shorter to pad with silence
    if new_audio.duration < video.duration:
        silence_duration = (video.duration - new_audio.duration) * 1000  # convert to milliseconds
        silence_segment = AudioSegment.silent(duration=silence_duration)
        audio_segment = audio_segment + silence_segment
        padded_audio_path = os.path.join(output_dir, 'padded_audio.wav')
        audio_segment.export(padded_audio_path, format='wav')
        new_audio = AudioFileClip(padded_audio_path)

    # Generate SRT content
    def parse_translated_text(file_path):
        with open(file_path, 'r') as file:
            content = file.readlines()

        subtitles = []
        timestamp_pattern = re.compile(r'\[(\d+\.\d+)\-(\d+\.\d+)\]')
        for line in content:
            match = timestamp_pattern.match(line)
            if match:
                start_time = datetime.timedelta(seconds=float(match.group(1)))
                end_time = datetime.timedelta(seconds=float(match.group(2)))
                text = line[match.end():].strip()

                subtitle = srt.Subtitle(index=len(subtitles) + 1,
                                        start=start_time,
                                        end=end_time,
                                        content=text)
                subtitles.append(subtitle)

        return srt.compose(subtitles)

    # Generate SRT content
    srt_content = parse_translated_text(translated_text_path)

    # Write to an SRT file
    srt_file = './translated/translated.srt'
    with open(srt_file, 'w', encoding='utf-8') as file:
        file.write(srt_content)

    # Write the final video file
    temp = "./translated/temp.mp4"
    video.write_videofile(temp)

    # Add subtitles
    final_video_file = os.path.join(output_dir, "final_video.mp4")

    # Correct the subtitle filter string for ffmpeg
    subtitle_filter_str = f"subtitles='{srt_file}'"

    try:
        ffmpeg.input(temp).output(final_video_file, vf=subtitle_filter_str).run()
    except ffmpeg.Error as e:
        print(f"Error creating final video: {e}")
        return None

    # Remove temp file
    os.remove(temp)
    return final_video_file
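A minimal usage sketch, matching the call in Step 6 of main.py (file names are illustrative; the ffmpeg binary must be available on PATH for the subtitle burn-in step):

from translated_video import create_translated_video

final_path = create_translated_video('./downloads/my_video.mp4',          # original download
                                     './translated/final_audio.wav',      # dubbed track from tts.py
                                     './translated/translated_text.txt')  # timestamped Spanish text from opus.py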
tts.py
ADDED
@@ -0,0 +1,96 @@
from TTS.api import TTS
from pydub import AudioSegment
import os
import re
import ffmpeg
import shutil
import argparse

def adjust_speed(input_file, speed_factor):
    output_file = input_file.replace(".wav", "_adjusted.wav")
    ffmpeg.input(input_file).filter('atempo', speed_factor).output(output_file, acodec='pcm_s16le').run()
    return output_file

def generate_speech(text, speaker_voice_map, output_file):
    combined_audio = AudioSegment.empty()
    temp_files = []

    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda")

    for line in text.split("\n"):
        if not line.strip():
            continue

        match = re.match(r"\[SPEAKER_(\d+)\] \[(\d+\.\d+)-(\d+\.\d+)\] (.+)", line)
        if not match:
            continue

        speaker_id, start_time, end_time, sentence = match.groups()
        start_time, end_time = float(start_time), float(end_time)
        segment_duration = (end_time - start_time) * 1000  # Duration in milliseconds

        speaker_wav = speaker_voice_map.get(f"SPEAKER_{speaker_id}")
        if not speaker_wav:
            continue

        os.makedirs('./audio/temp', exist_ok=True)
        temp_file_path = f"./audio/temp/temp_output_part_{len(temp_files)}.wav"
        temp_files.append(temp_file_path)

        tts_speed = 1.0
        tts.tts_to_file(text=sentence, file_path=temp_file_path, speaker_wav=speaker_wav, language="es", speed=tts_speed)

        segment_audio = AudioSegment.from_wav(temp_file_path)

        if segment_audio.duration_seconds * 1000 > segment_duration:
            while tts_speed < 2.0 and segment_audio.duration_seconds * 1000 > segment_duration:
                tts_speed += 0.5
                tts.tts_to_file(text=sentence, file_path=temp_file_path, speaker_wav=speaker_wav, language="es", speed=tts_speed)
                segment_audio = AudioSegment.from_wav(temp_file_path)

            if segment_audio.duration_seconds * 1000 > segment_duration:
                required_speed = segment_duration / (segment_audio.duration_seconds * 1000)
                if required_speed < 1.0:
                    required_speed = 1.0 / required_speed
                temp_file_path = adjust_speed(temp_file_path, required_speed)
                segment_audio = AudioSegment.from_wav(temp_file_path)

        if combined_audio.duration_seconds == 0 and start_time > 0:
            combined_audio = AudioSegment.silent(duration=start_time * 1000) + combined_audio

        if segment_audio.duration_seconds * 1000 > segment_duration:
            segment_audio = segment_audio[:segment_duration]
        else:
            segment_audio = segment_audio + AudioSegment.silent(duration=segment_duration - len(segment_audio))

        combined_audio += segment_audio

    combined_audio.export(output_file, format="wav")

    for temp_file in temp_files:
        os.remove(temp_file)

def map_speaker_ids(directory):
    speaker_voice_map = {}
    for file in os.listdir(directory):
        if file.endswith(".wav"):
            speaker_id = file.replace(".wav", "")
            speaker_voice_map[speaker_id] = os.path.join(directory, file)
    return speaker_voice_map

def main(speaker_directory, aligned_text_file, output_audio_file):
    speaker_voice_map = map_speaker_ids(speaker_directory)
    with open(aligned_text_file, 'r') as file:
        translated_text = file.read()
    generate_speech(translated_text, speaker_voice_map, output_audio_file)
    if os.path.exists('./audio/temp'):
        shutil.rmtree('./audio/temp')

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Generate speech from translated text")
    parser.add_argument("speaker_directory", help="Directory containing speaker voice clips")
    parser.add_argument("aligned_text_file", help="Path to the translated and aligned text file")
    parser.add_argument("output_audio_file", help="Path to save the generated speech audio file")
    args = parser.parse_args()

    main(args.speaker_directory, args.aligned_text_file, args.output_audio_file)
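A minimal sketch of calling this module the way main.py does in Step 5; generate_speech assumes a CUDA device for XTTS and expects lines in the [SPEAKER_XX] [start-end] text shape written by diarization.py:

from tts import main as tts_main

tts_main('./audio/diarization',                   # speaker reference WAVs (SPEAKER_00.wav, ...)
         './audio/diarization/aligned_text.txt',  # speaker-aligned translated lines
         './translated/final_audio.wav')          # combined dubbed track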
video_to_text.py
ADDED
@@ -0,0 +1,86 @@
import argparse
from moviepy.editor import VideoFileClip
import whisper
import os
import re

def extract_audio(video_path, audio_dir='./audio'):
    os.makedirs(audio_dir, exist_ok=True)
    base_filename = os.path.splitext(os.path.basename(video_path))[0]
    audio_filename = os.path.join(audio_dir, base_filename + '.wav')
    video_clip = VideoFileClip(video_path)
    video_clip.audio.write_audiofile(audio_filename)
    video_clip.close()
    return audio_filename

def transcribe_audio(audio_path, model_type='base', transcribed_dir='./transcribed'):
    model = whisper.load_model(model_type)
    result = model.transcribe(audio_path)

    os.makedirs(transcribed_dir, exist_ok=True)
    base_filename = os.path.splitext(os.path.basename(audio_path))[0]
    transcribed_filename = os.path.join(transcribed_dir, base_filename + '.txt')

    with open(transcribed_filename, 'w') as file:
        for segment in result['segments']:
            start = segment['start']
            end = segment['end']
            text = segment['text']
            file.write(f"[{start:.2f}-{end:.2f}] {text}\n")

    return transcribed_filename, result['text']

def merge_lines(file_path):
    timestamp_pattern = re.compile(r'\[(\d+\.\d+)-(\d+\.\d+)\]')

    with open(file_path, 'r') as file:
        lines = file.readlines()

    merged_lines = []
    i = 0

    while i < len(lines):
        line = lines[i].strip()
        match = timestamp_pattern.match(line)

        if match:
            start_time = float(match.group(1))
            text = line[match.end():].strip()

            if not (text.endswith('.') or text.endswith('?')):
                if i + 1 < len(lines):
                    next_line = lines[i + 1].strip()
                    next_match = timestamp_pattern.match(next_line)

                    if next_match:
                        end_time = float(next_match.group(2))
                        next_text = next_line[next_match.end():].strip()
                        merged_text = text + ' ' + next_text
                        merged_line = f"[{start_time:.2f}-{end_time:.2f}] {merged_text}\n"
                        merged_lines.append(merged_line)
                        i += 1
                    else:
                        # Next line has no timestamp; keep the current line as-is
                        end_time = float(match.group(2))
                        merged_lines.append(f"[{start_time:.2f}-{end_time:.2f}] {text}\n")
                else:
                    # Last line of the file; keep it instead of dropping it
                    end_time = float(match.group(2))
                    merged_lines.append(f"[{start_time:.2f}-{end_time:.2f}] {text}\n")
            else:
                end_time = float(match.group(2))
                merged_lines.append(f"[{start_time:.2f}-{end_time:.2f}] {text}\n")

        i += 1

    with open(file_path, 'w') as file:
        file.writelines(merged_lines)

    return file_path

def convert_video_to_text(video_file_path, model_type='base'):
    audio_path = extract_audio(video_file_path)
    transcribed_path, _ = transcribe_audio(audio_path, model_type)
    merge_lines(transcribed_path)
    return transcribed_path


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Transcribe audio from video")
    parser.add_argument("video_file", help="Size of the video file path", default=None)
    parser.add_argument("--model", help="Size of the whisper model (e.g., tiny, base, small, medium, large).", default="base")
    args = parser.parse_args()

    convert_video_to_text(args.video_file, args.model)
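A minimal usage sketch (the input path is illustrative); each transcript line comes out as [start-end] text, e.g. [0.00-4.32] Welcome to the channel., which is the format opus.py and diarization.py parse:

from video_to_text import convert_video_to_text

transcript_path = convert_video_to_text('./downloads/my_video.mp4', model_type='base')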
yt_download.py
ADDED
@@ -0,0 +1,53 @@
import argparse
from pytube import YouTube
from tqdm import tqdm
import os

def download_youtube_video(video_url, download_captions=False):
    progress_bar = None

    def progress_function(stream, chunk, bytes_remaining):
        nonlocal progress_bar
        if progress_bar is None:
            progress_bar = tqdm(total=stream.filesize, unit='B', unit_scale=True, desc="Downloading Video")
        current = stream.filesize - bytes_remaining
        progress_bar.n = current
        progress_bar.last_print_n = current
        progress_bar.update()

    if not os.path.exists('./downloads'):
        os.makedirs('./downloads')

    yt = YouTube(
        video_url,
        on_progress_callback=progress_function,
    )

    stream = yt.streams.get_highest_resolution()
    stream.download(output_path='./downloads')
    if progress_bar:
        progress_bar.close()

    if download_captions:
        caption = yt.captions.get('en') or yt.captions.get('a.en')
        if caption:
            caption_convert_to_srt = caption.generate_srt_captions()
            caption_convert_to_srt = caption_convert_to_srt.replace("\n\n", "\n")
            with open(os.path.join('./downloads', f"{yt.title}.srt"), "w", encoding="utf-8") as file:
                file.write(caption_convert_to_srt)
            print(f"Captions saved to 'downloads/{yt.title}.srt'")
        else:
            print("No English captions found for this video.")

def download_video(url, download_captions=False):
    video_path = './downloads/' + YouTube(url).streams.get_highest_resolution().default_filename
    download_youtube_video(url, download_captions)
    return video_path

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Download YouTube video and captions")
    parser.add_argument("video_url", help="YouTube video URL")
    parser.add_argument("--captions", action="store_true", help="Download captions if available")
    args = parser.parse_args()

    download_video(args.video_url, args.captions)
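A minimal usage sketch (placeholder URL); download_video returns the local path that main.py hands to the rest of the pipeline:

from yt_download import download_video

video_path = download_video("https://www.youtube.com/watch?v=<VIDEO_ID>", download_captions=False)
print(video_path)  # e.g. ./downloads/<video title>.mp4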