FarhadMadadzade
committed on
Commit
•
80f7e89
1
Parent(s):
bdc6930
final push with only romanian parliament
Browse files
- app.py +2 -65
- requirements.txt +0 -1
- video_downloader.py +0 -60
app.py
CHANGED
@@ -1,62 +1,15 @@
|
|
1 |
from transformers import pipeline
|
2 |
import gradio as gr
|
3 |
-
import
|
4 |
-
from
|
5 |
-
from moviepy.editor import AudioFileClip, VideoFileClip
|
6 |
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
|
7 |
-
import datetime
|
8 |
import os
|
9 |
from pydub import AudioSegment
|
10 |
-
from pydub.silence import split_on_silence
|
11 |
import re
|
12 |
|
13 |
pipe = pipeline("automatic-speech-recognition", model="Artanis1551/whisper_romanian")
|
14 |
|
15 |
|
16 |
-
def process_video1(date):
|
17 |
-
# If the date is not in YYYY-MM-DD format, return an error message
|
18 |
-
date_pattern = re.compile(r"\b\d{4}-\d{2}-\d{2}\b")
|
19 |
-
if not date_pattern.match(date):
|
20 |
-
video_path = download_youtube_video(
|
21 |
-
"https://www.youtube.com/watch?v=dQw4w9WgXcQ"
|
22 |
-
)
|
23 |
-
transcription = "Please enter a date in the format YYYY-MM-DD."
|
24 |
-
|
25 |
-
return video_path, transcription
|
26 |
-
try:
|
27 |
-
video_path = download_video1(date)
|
28 |
-
|
29 |
-
# Get the duration of the video
|
30 |
-
video = VideoFileClip(video_path)
|
31 |
-
duration = video.duration
|
32 |
-
|
33 |
-
# If the video is longer than 30 seconds, only take the first 30 seconds
|
34 |
-
if duration > 30:
|
35 |
-
video_path = f"short_{date}.mp4"
|
36 |
-
ffmpeg_extract_subclip(video_path, 0, 30, targetname=video_path)
|
37 |
-
|
38 |
-
# Extract audio from the video
|
39 |
-
audio_path = f"audio_{date}.wav"
|
40 |
-
AudioFileClip(video_path).write_audiofile(audio_path)
|
41 |
-
|
42 |
-
audio = AudioSegment.from_wav(audio_path)
|
43 |
-
chunks = split_on_silence(audio, min_silence_len=500, silence_thresh=-40)
|
44 |
-
|
45 |
-
with open(audio_path, "rb") as audio_file:
|
46 |
-
audio = audio_file.read()
|
47 |
-
transcription = pipe(audio)["text"]
|
48 |
-
|
49 |
-
# Remove the audio file
|
50 |
-
os.remove(audio_path)
|
51 |
-
except:
|
52 |
-
video_path = download_youtube_video(
|
53 |
-
"https://www.youtube.com/watch?v=dQw4w9WgXcQ"
|
54 |
-
)
|
55 |
-
transcription = "No decision was made on this date."
|
56 |
-
|
57 |
-
return video_path, transcription
|
58 |
-
|
59 |
-
|
60 |
def process_video(date):
|
61 |
# If the date is not in YYYY-MM-DD format, return an error message
|
62 |
date_pattern = re.compile(r"\b\d{4}\d{2}\d{2}\b")
|
@@ -107,20 +60,4 @@ iface = gr.Interface(
|
|
107 |
+ "video will be used if it is longer than that.",
|
108 |
)
|
109 |
|
110 |
-
|
111 |
-
# iface = gr.Interface(
|
112 |
-
# fn=process_video1,
|
113 |
-
# inputs=[
|
114 |
-
# gr.inputs.Textbox(label="Date with format YYYY-MM-DD"),
|
115 |
-
# ],
|
116 |
-
# outputs=[
|
117 |
-
# gr.outputs.Video(),
|
118 |
-
# gr.Textbox(lines=100, max_lines=100, interactive=True),
|
119 |
-
# ],
|
120 |
-
# title="Transcribe Swedish Parliament Decisions",
|
121 |
-
# description="This app transcribes the top Swedish Parliament decision"
|
122 |
-
# + " video from the given date. Only the first 30 seconds of the "
|
123 |
-
# + "video will be used if it is longer than that.",
|
124 |
-
# )
|
125 |
-
|
126 |
iface.launch()
|
|
|
1 |
from transformers import pipeline
|
2 |
import gradio as gr
|
3 |
+
from video_downloader import download_video, download_youtube_video
|
4 |
+
from moviepy.editor import AudioFileClip
|
|
|
5 |
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
|
|
|
6 |
import os
|
7 |
from pydub import AudioSegment
|
|
|
8 |
import re
|
9 |
|
10 |
pipe = pipeline("automatic-speech-recognition", model="Artanis1551/whisper_romanian")
|
11 |
|
12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
def process_video(date):
|
14 |
# If the date is not in YYYY-MM-DD format, return an error message
|
15 |
date_pattern = re.compile(r"\b\d{4}\d{2}\d{2}\b")
|
|
|
60 |
+ "video will be used if it is longer than that.",
|
61 |
)
|
62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
iface.launch()
|
requirements.txt
CHANGED
@@ -8,5 +8,4 @@ torch
|
|
8 |
urllib3
|
9 |
moviepy
|
10 |
pydub
|
11 |
-
beautifulsoup4
|
12 |
pytube
|
|
|
8 |
urllib3
|
9 |
moviepy
|
10 |
pydub
|
|
|
11 |
pytube
|
video_downloader.py
CHANGED
@@ -1,30 +1,9 @@
|
|
1 |
import urllib.request
|
2 |
-
import requests
|
3 |
-
from bs4 import BeautifulSoup
|
4 |
from pytube import YouTube
|
5 |
import os
|
6 |
import glob
|
7 |
|
8 |
|
9 |
-
def get_response(url):
|
10 |
-
try:
|
11 |
-
response = requests.get(url)
|
12 |
-
response.raise_for_status()
|
13 |
-
except requests.exceptions.HTTPError as e:
|
14 |
-
if e.response.status_code == 404:
|
15 |
-
print("No video exists for the given date range.")
|
16 |
-
return None
|
17 |
-
else:
|
18 |
-
print(f"An error occurred while getting the webpage: {e}")
|
19 |
-
return None
|
20 |
-
except Exception as e:
|
21 |
-
print(f"An unexpected error occurred: {e}")
|
22 |
-
return None
|
23 |
-
|
24 |
-
soup = BeautifulSoup(response.text, "html.parser")
|
25 |
-
return soup
|
26 |
-
|
27 |
-
|
28 |
def download_video(date):
|
29 |
# Delete any existing .mp4 files
|
30 |
for mp4_file in glob.glob("*.mp4"):
|
@@ -45,45 +24,6 @@ def download_video(date):
|
|
45 |
print(f"An unexpected error occurred: {e}")
|
46 |
|
47 |
|
48 |
-
def download_video1(date):
|
49 |
-
# Delete any existing .mp4 files
|
50 |
-
for mp4_file in glob.glob("*.mp4"):
|
51 |
-
os.remove(mp4_file)
|
52 |
-
|
53 |
-
# Get the webpage
|
54 |
-
url = f"https://www.riksdagen.se/sv/sok/?avd=webbtv&from={date}&tom={date}&doktyp=kam-vo"
|
55 |
-
|
56 |
-
soup = get_response(url)
|
57 |
-
# Find the download link
|
58 |
-
try:
|
59 |
-
dateparse = date.replace("-", "")
|
60 |
-
video_page = [
|
61 |
-
a["href"]
|
62 |
-
for a in soup.find_all("a", href=True)
|
63 |
-
if a.get("aria-label") and dateparse in a["href"]
|
64 |
-
][0]
|
65 |
-
# go to video_page and get all links
|
66 |
-
soup = get_response(video_page)
|
67 |
-
video_link = [
|
68 |
-
a["href"]
|
69 |
-
for a in soup.find_all("a", href=True)
|
70 |
-
if a["href"].startswith("https://mhdownload.riksdagen.se")
|
71 |
-
][0]
|
72 |
-
print(video_link)
|
73 |
-
except IndexError:
|
74 |
-
print("No video exists for the given date range.")
|
75 |
-
return None
|
76 |
-
|
77 |
-
# Download the video
|
78 |
-
video_path = f"video_{date}.mp4"
|
79 |
-
try:
|
80 |
-
urllib.request.urlretrieve(video_link, video_path)
|
81 |
-
return video_path
|
82 |
-
except Exception as e:
|
83 |
-
print(f"An error occurred while downloading the video: {e}")
|
84 |
-
return None
|
85 |
-
|
86 |
-
|
87 |
def download_youtube_video(url):
|
88 |
try:
|
89 |
youtube = YouTube(url)
|
|
|
1 |
import urllib.request
|
|
|
|
|
2 |
from pytube import YouTube
|
3 |
import os
|
4 |
import glob
|
5 |
|
6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
def download_video(date):
|
8 |
# Delete any existing .mp4 files
|
9 |
for mp4_file in glob.glob("*.mp4"):
|
|
|
24 |
print(f"An unexpected error occurred: {e}")
|
25 |
|
26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
def download_youtube_video(url):
|
28 |
try:
|
29 |
youtube = YouTube(url)
|