FarhadMadadzade
committed on
Commit
•
1936f1e
1
Parent(s):
6902590
trying with swedish
Browse files- app.py +43 -16
- requirements.txt +2 -1
- video_downloader.py +55 -0
app.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
from transformers import pipeline
|
2 |
import gradio as gr
|
3 |
import time
|
4 |
-
from video_downloader import download_video
|
5 |
from moviepy.editor import AudioFileClip
|
6 |
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
|
7 |
import datetime
|
@@ -9,20 +9,15 @@ import os
|
|
9 |
from pydub import AudioSegment
|
10 |
from pydub.silence import split_on_silence
|
11 |
|
12 |
-
pipe = pipeline("automatic-speech-recognition", model="
|
13 |
|
14 |
|
15 |
-
def process_video(
|
16 |
-
|
17 |
-
video_path = download_video(date)
|
18 |
|
19 |
-
# Extract
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
# Extract audio from the short video
|
24 |
-
audio_path = f"audio_{date}.wav"
|
25 |
-
AudioFileClip(short_video_path).write_audiofile(audio_path)
|
26 |
|
27 |
# Split the audio into chunks
|
28 |
audio = AudioSegment.from_wav(audio_path)
|
@@ -34,22 +29,54 @@ def process_video(date):
|
|
34 |
chunk.export(f"chunk{i}.wav", format="wav")
|
35 |
with open(f"chunk{i}.wav", "rb") as audio_file:
|
36 |
audio = audio_file.read()
|
37 |
-
transcription += pipe(audio)["text"] + "\n\n
|
38 |
os.remove(f"chunk{i}.wav")
|
39 |
|
40 |
# Remove the audio file
|
41 |
os.remove(audio_path)
|
42 |
-
|
43 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
|
45 |
|
46 |
iface = gr.Interface(
|
47 |
fn=process_video,
|
48 |
-
inputs=gr.inputs.Textbox(label="Date with format YYYYMMDD"),
|
|
|
|
|
|
|
|
|
49 |
outputs=[
|
50 |
gr.outputs.Video(),
|
51 |
gr.Textbox(lines=1000, max_lines=1000, interactive=True),
|
52 |
],
|
|
|
53 |
title="Romanian Transcription Test",
|
54 |
)
|
55 |
|
|
|
1 |
from transformers import pipeline
|
2 |
import gradio as gr
|
3 |
import time
|
4 |
+
from video_downloader import download_video, download_video1
|
5 |
from moviepy.editor import AudioFileClip
|
6 |
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
|
7 |
import datetime
|
|
|
9 |
from pydub import AudioSegment
|
10 |
from pydub.silence import split_on_silence
|
11 |
|
12 |
+
pipe = pipeline("automatic-speech-recognition", model="Sleepyp00/whisper-small-Swedish")
|
13 |
|
14 |
|
15 |
+
def process_video(from_date, to_date):
|
16 |
+
video_path = download_video1(from_date, to_date)
|
|
|
17 |
|
18 |
+
# Extract audio from the video
|
19 |
+
audio_path = f"audio_{from_date}_{to_date}.wav"
|
20 |
+
AudioFileClip(video_path).write_audiofile(audio_path)
|
|
|
|
|
|
|
|
|
21 |
|
22 |
# Split the audio into chunks
|
23 |
audio = AudioSegment.from_wav(audio_path)
|
|
|
29 |
chunk.export(f"chunk{i}.wav", format="wav")
|
30 |
with open(f"chunk{i}.wav", "rb") as audio_file:
|
31 |
audio = audio_file.read()
|
32 |
+
transcription += pipe(audio)["text"] + "\n\n"
|
33 |
os.remove(f"chunk{i}.wav")
|
34 |
|
35 |
# Remove the audio file
|
36 |
os.remove(audio_path)
|
37 |
+
|
38 |
+
return video_path, transcription
|
39 |
+
|
40 |
+
|
41 |
+
# def process_video(date):
|
42 |
+
# # Download the video
|
43 |
+
# video_path = download_video(date)
|
44 |
+
|
45 |
+
# # Extract audio from the video
|
46 |
+
# audio_path = f"audio_{date}.wav"
|
47 |
+
# AudioFileClip(video_path).write_audiofile(audio_path)
|
48 |
+
|
49 |
+
# # Split the audio into chunks
|
50 |
+
# audio = AudioSegment.from_wav(audio_path)
|
51 |
+
# chunks = split_on_silence(audio, min_silence_len=500, silence_thresh=-40)
|
52 |
+
|
53 |
+
# # Transcribe each chunk
|
54 |
+
# transcription = ""
|
55 |
+
# for i, chunk in enumerate(chunks):
|
56 |
+
# chunk.export(f"chunk{i}.wav", format="wav")
|
57 |
+
# with open(f"chunk{i}.wav", "rb") as audio_file:
|
58 |
+
# audio = audio_file.read()
|
59 |
+
# transcription += pipe(audio)["text"] + " "
|
60 |
+
# os.remove(f"chunk{i}.wav")
|
61 |
+
|
62 |
+
# # Remove the audio file
|
63 |
+
# os.remove(audio_path)
|
64 |
+
|
65 |
+
# return video_path, transcription
|
66 |
|
67 |
|
68 |
# Gradio UI: two date text boxes in, the downloaded video plus its
# transcription out.  The input order must match the parameter order of
# process_video(from_date, to_date).
iface = gr.Interface(
    fn=process_video,
    inputs=[
        # Top-level component classes are used throughout; the gr.inputs /
        # gr.outputs namespaces are deprecated aliases of the same widgets.
        gr.Textbox(label="From date with format YYYY-MM-DD"),
        gr.Textbox(label="To date with format YYYY-MM-DD"),
    ],
    outputs=[
        gr.Video(),
        gr.Textbox(lines=1000, max_lines=1000, interactive=True),
    ],
    # NOTE(review): live=True re-runs process_video whenever an input
    # changes, so a download + transcription may be triggered while a date
    # is still being typed -- confirm this is intended.
    live=True,
    # The ASR pipeline loaded in this file is a Swedish Whisper model, so the
    # title names that language.
    title="Swedish Transcription Test",
)
|
82 |
|
requirements.txt
CHANGED
@@ -7,4 +7,5 @@ transformers
|
|
7 |
torch
|
8 |
urllib3
|
9 |
moviepy
|
10 |
-
pydub
|
|
|
|
7 |
torch
|
8 |
urllib3
|
9 |
moviepy
|
10 |
+
pydub
|
11 |
+
beautifulsoup4
|
video_downloader.py
CHANGED
@@ -1,6 +1,8 @@
|
|
1 |
import urllib.request
|
2 |
import os
|
3 |
import glob
|
|
|
|
|
4 |
|
5 |
|
6 |
def download_video(date):
|
@@ -21,3 +23,56 @@ def download_video(date):
|
|
21 |
print(f"An error occurred while downloading the video: {e}")
|
22 |
except Exception as e:
|
23 |
print(f"An unexpected error occurred: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import urllib.request
|
2 |
import os
|
3 |
import glob
|
4 |
+
import requests
|
5 |
+
from bs4 import BeautifulSoup
|
6 |
|
7 |
|
8 |
def download_video(date):
|
|
|
23 |
print(f"An error occurred while downloading the video: {e}")
|
24 |
except Exception as e:
|
25 |
print(f"An unexpected error occurred: {e}")
|
26 |
+
|
27 |
+
|
28 |
+
def get_response(url, timeout=30):
    """Fetch *url* and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.  Without
            a timeout ``requests.get`` can block forever on a stalled
            connection.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        ``html.parser`` backend, or ``None`` if the request failed.  On
        failure a message describing the problem is printed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        # A 404 gets its own message; every other HTTP error is reported
        # verbatim.
        if e.response is not None and e.response.status_code == 404:
            print("No video exists for the given date range.")
        else:
            print(f"An error occurred while getting the webpage: {e}")
        return None
    except Exception as e:
        # Best-effort helper: connection errors, timeouts, malformed URLs
        # etc. are reported and signalled to the caller via None.
        print(f"An unexpected error occurred: {e}")
        return None

    return BeautifulSoup(response.text, "html.parser")
|
45 |
+
|
46 |
+
|
47 |
+
def download_video1(from_date, to_date):
    """Download the first "Beslut" web-TV video found for a date range.

    The riksdagen.se search page is queried for the given range, the first
    link whose ``aria-label`` starts with "Beslut" is followed, and the first
    ``https://mhdownload.riksdagen.se`` link on that page is saved locally.

    Args:
        from_date: Start of the range, inserted as the ``from`` query value.
        to_date: End of the range, inserted as the ``tom`` query value.

    Returns:
        The local path ``video_{from_date}_{to_date}.mp4`` on success, or
        ``None`` if a page could not be fetched, no matching link was found,
        or the download failed (a message is printed in each case).
    """
    from urllib.parse import urlencode, urljoin

    # Encode the user-supplied dates so stray characters cannot corrupt the
    # query string; ordinary YYYY-MM-DD values are left unchanged.
    query = urlencode(
        {
            "avd": "webbtv",
            "from": from_date,
            "tom": to_date,
            "doktyp": "kam-vo",
        }
    )
    url = f"https://www.riksdagen.se/sv/sok/?{query}"

    soup = get_response(url)
    if soup is None:
        # get_response has already printed the reason for the failure.
        return None

    # Find the page that hosts the video.
    video_page = next(
        (
            a["href"]
            for a in soup.find_all("a", href=True)
            if a.get("aria-label", "").startswith("Beslut")
        ),
        None,
    )
    if video_page is None:
        print("No video exists for the given date range.")
        return None

    # The href may be relative to the search page; resolve it before fetching.
    soup = get_response(urljoin(url, video_page))
    if soup is None:
        return None

    # Find the direct download link on the video page.
    video_link = next(
        (
            a["href"]
            for a in soup.find_all("a", href=True)
            if a["href"].startswith("https://mhdownload.riksdagen.se")
        ),
        None,
    )
    if video_link is None:
        print("No video exists for the given date range.")
        return None
    print(video_link)

    # Download the video
    video_path = f"video_{from_date}_{to_date}.mp4"
    try:
        urllib.request.urlretrieve(video_link, video_path)
    except Exception as e:
        # Best-effort: report the failure and let the caller handle None.
        print(f"An error occurred while downloading the video: {e}")
        return None
    return video_path
|