File size: 2,881 Bytes
fac06d0
 
 
1936f1e
de84263
e4297a8
de84263
 
4b85b27
 
fac06d0
8dffbd8
fac06d0
 
8dffbd8
 
de84263
1936f1e
8dffbd8
1936f1e
de84263
4b85b27
 
 
 
 
 
 
 
 
 
292ce47
4b85b27
de84263
 
 
1936f1e
 
 
 
8dffbd8
 
 
 
 
 
 
 
 
 
 
 
 
 
292ce47
 
 
1936f1e
c031f24
 
 
 
 
292ce47
c031f24
1936f1e
292ce47
 
 
1936f1e
292ce47
 
 
 
 
 
 
 
 
 
 
1936f1e
165b80a
1936f1e
be37f4c
292ce47
8dffbd8
 
292ce47
 
 
 
8dffbd8
292ce47
fac06d0
534a7d7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from transformers import pipeline
import gradio as gr
import time
from video_downloader import download_video, download_video1
from moviepy.editor import AudioFileClip
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
import datetime
import os
from pydub import AudioSegment
from pydub.silence import split_on_silence

# Model id handed to the transformers pipeline factory.
ASR_MODEL_ID = "Artanis1551/whisper_swedish"

# Speech-recognition pipeline, built once at import time and shared by the
# transcription functions below.
pipe = pipeline(task="automatic-speech-recognition", model=ASR_MODEL_ID)


def process_video1(date):
    """Download the video for *date* and transcribe its whole audio track.

    The audio track is written to a WAV file, split on silence with pydub,
    and every resulting chunk is passed to the module-level ``pipe``
    speech-recognition pipeline.

    Args:
        date: Value forwarded unchanged to ``download_video1``; the UI asks
            for a ``YYYY-MM-DD`` string.

    Returns:
        A ``(video_path, transcription)`` tuple. ``video_path`` is whatever
        ``download_video1`` returned; ``transcription`` is the text of every
        chunk, each followed by a newline and a space (empty string when no
        chunk is found).
    """
    import io
    import tempfile

    video_path = download_video1(date)

    # Use a private scratch directory instead of "audio_<date>.wav" in the
    # working directory: concurrent requests no longer overwrite each other,
    # the user-supplied date never becomes part of a written path, and the
    # file is removed even when a later step raises.
    with tempfile.TemporaryDirectory() as workdir:
        audio_path = os.path.join(workdir, "audio.wav")

        # Close the clip explicitly so its reader is released promptly.
        clip = AudioFileClip(video_path)
        try:
            clip.write_audiofile(audio_path)
        finally:
            clip.close()

        # from_wav is called inside the block, before the scratch file is removed.
        segment = AudioSegment.from_wav(audio_path)

    chunks = split_on_silence(segment, min_silence_len=500, silence_thresh=-40)

    # Export each chunk to an in-memory WAV and hand the bytes to the
    # pipeline; no per-chunk files are created, so nothing is left to clean up.
    texts = []
    for chunk in chunks:
        buffer = io.BytesIO()
        chunk.export(buffer, format="wav")
        texts.append(pipe(buffer.getvalue())["text"])

    # Same output format as before: every chunk's text is followed by "\n ".
    transcription = "".join(f"{text}\n " for text in texts)

    return video_path, transcription


# Gradio UI wired to process_video1: one date textbox in, the video and an
# editable transcript textbox out.
iface = gr.Interface(
    fn=process_video1,
    inputs=[
        gr.inputs.Textbox(label="Date with format YYYY-MM-DD"),
    ],
    outputs=[
        gr.outputs.Video(),
        gr.Textbox(lines=1000, max_lines=1000, interactive=True),
    ],
    title="Transcribe Swedish Parliament Decisions",
    # Was misspelled "desription", so the text never reached the
    # interface's description field.
    description="This app transcribes the top Swedish Parliament decision video from the given date.",
)


def process_video(date):
    """Download the video for *date* and transcribe its first 30 seconds.

    A 30-second sub-clip is cut from the start of the downloaded video, its
    audio is written to a WAV file, split on silence with pydub, and every
    chunk is passed to the module-level ``pipe`` speech-recognition pipeline.

    Args:
        date: Value forwarded unchanged to ``download_video`` and embedded in
            the name of the sub-clip file (``short_<date>.mp4``).

    Returns:
        A ``(short_video_path, transcription)`` tuple. ``transcription`` is
        the text of every chunk, each followed by a single space (empty
        string when no chunk is found).
    """
    import io
    import tempfile

    # Download the video
    video_path = download_video(date)

    # Extract the first 30 seconds of the video. This file is part of the
    # return value, so it is deliberately kept on disk.
    short_video_path = f"short_{date}.mp4"
    ffmpeg_extract_subclip(video_path, 0, 30, targetname=short_video_path)

    # The intermediate WAV lives in a private scratch directory: concurrent
    # requests cannot overwrite each other's audio, and the file is removed
    # even when a later step raises.
    with tempfile.TemporaryDirectory() as workdir:
        audio_path = os.path.join(workdir, "audio.wav")

        # Close the clip explicitly so its reader is released promptly.
        clip = AudioFileClip(short_video_path)
        try:
            clip.write_audiofile(audio_path)
        finally:
            clip.close()

        # from_wav is called inside the block, before the scratch file is removed.
        segment = AudioSegment.from_wav(audio_path)

    chunks = split_on_silence(segment, min_silence_len=500, silence_thresh=-40)

    # Export each chunk to an in-memory WAV and hand the bytes to the
    # pipeline; no per-chunk files are created, so nothing is left to clean up.
    texts = []
    for chunk in chunks:
        buffer = io.BytesIO()
        chunk.export(buffer, format="wav")
        texts.append(pipe(buffer.getvalue())["text"])

    # Same output format as before: every chunk's text is followed by " ".
    transcription = "".join(f"{text} " for text in texts)

    return short_video_path, transcription


# iface = gr.Interface(
#     fn=process_video,
#     inputs=gr.inputs.Textbox(label="Date with format YYYYMMDD"),
#     outputs=[
#         gr.outputs.Video(),
#         gr.Textbox(lines=1000, max_lines=1000, interactive=True),
#     ],
#     title="Romanian Transcription Test",
# )

# Launch the Gradio interface; this runs at import time, so importing the
# module also starts the app.
iface.launch()