FarhadMadadzade committed on
Commit
1936f1e
1 Parent(s): 6902590

trying with swedish

Browse files
Files changed (3) hide show
  1. app.py +43 -16
  2. requirements.txt +2 -1
  3. video_downloader.py +55 -0
app.py CHANGED
@@ -1,7 +1,7 @@
1
  from transformers import pipeline
2
  import gradio as gr
3
  import time
4
- from video_downloader import download_video
5
  from moviepy.editor import AudioFileClip
6
  from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
7
  import datetime
@@ -9,20 +9,15 @@ import os
9
  from pydub import AudioSegment
10
  from pydub.silence import split_on_silence
11
 
12
- pipe = pipeline("automatic-speech-recognition", model="gigant/whisper-medium-romanian")
13
 
14
 
15
- def process_video(date):
16
- # Download the video
17
- video_path = download_video(date)
18
 
19
- # Extract the first 30 seconds of the video
20
- short_video_path = f"short_{date}.mp4"
21
- ffmpeg_extract_subclip(video_path, 30, 50, targetname=short_video_path)
22
-
23
- # Extract audio from the short video
24
- audio_path = f"audio_{date}.wav"
25
- AudioFileClip(short_video_path).write_audiofile(audio_path)
26
 
27
  # Split the audio into chunks
28
  audio = AudioSegment.from_wav(audio_path)
@@ -34,22 +29,54 @@ def process_video(date):
34
  chunk.export(f"chunk{i}.wav", format="wav")
35
  with open(f"chunk{i}.wav", "rb") as audio_file:
36
  audio = audio_file.read()
37
- transcription += pipe(audio)["text"] + "\n\n "
38
  os.remove(f"chunk{i}.wav")
39
 
40
  # Remove the audio file
41
  os.remove(audio_path)
42
- print(transcription)
43
- return short_video_path, transcription
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
 
46
  iface = gr.Interface(
47
  fn=process_video,
48
- inputs=gr.inputs.Textbox(label="Date with format YYYYMMDD"),
 
 
 
 
49
  outputs=[
50
  gr.outputs.Video(),
51
  gr.Textbox(lines=1000, max_lines=1000, interactive=True),
52
  ],
 
53
  title="Romanian Transcription Test",
54
  )
55
 
 
1
  from transformers import pipeline
2
  import gradio as gr
3
  import time
4
+ from video_downloader import download_video, download_video1
5
  from moviepy.editor import AudioFileClip
6
  from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
7
  import datetime
 
9
  from pydub import AudioSegment
10
  from pydub.silence import split_on_silence
11
 
12
+ pipe = pipeline("automatic-speech-recognition", model="Sleepyp00/whisper-small-Swedish")
13
 
14
 
15
+ def process_video(from_date, to_date):
16
+ video_path = download_video1(from_date, to_date)
 
17
 
18
+ # Extract audio from the video
19
+ audio_path = f"audio_{from_date}_{to_date}.wav"
20
+ AudioFileClip(video_path).write_audiofile(audio_path)
 
 
 
 
21
 
22
  # Split the audio into chunks
23
  audio = AudioSegment.from_wav(audio_path)
 
29
  chunk.export(f"chunk{i}.wav", format="wav")
30
  with open(f"chunk{i}.wav", "rb") as audio_file:
31
  audio = audio_file.read()
32
+ transcription += pipe(audio)["text"] + "\n\n"
33
  os.remove(f"chunk{i}.wav")
34
 
35
  # Remove the audio file
36
  os.remove(audio_path)
37
+
38
+ return video_path, transcription
39
+
40
+
41
+ # def process_video(date):
42
+ # # Download the video
43
+ # video_path = download_video(date)
44
+
45
+ # # Extract audio from the video
46
+ # audio_path = f"audio_{date}.wav"
47
+ # AudioFileClip(video_path).write_audiofile(audio_path)
48
+
49
+ # # Split the audio into chunks
50
+ # audio = AudioSegment.from_wav(audio_path)
51
+ # chunks = split_on_silence(audio, min_silence_len=500, silence_thresh=-40)
52
+
53
+ # # Transcribe each chunk
54
+ # transcription = ""
55
+ # for i, chunk in enumerate(chunks):
56
+ # chunk.export(f"chunk{i}.wav", format="wav")
57
+ # with open(f"chunk{i}.wav", "rb") as audio_file:
58
+ # audio = audio_file.read()
59
+ # transcription += pipe(audio)["text"] + " "
60
+ # os.remove(f"chunk{i}.wav")
61
+
62
+ # # Remove the audio file
63
+ # os.remove(audio_path)
64
+
65
+ # return video_path, transcription
66
 
67
 
68
  iface = gr.Interface(
69
  fn=process_video,
70
+ # inputs=gr.inputs.Textbox(label="Date with format YYYYMMDD"),
71
+ inputs=[
72
+ gr.inputs.Textbox(label="From date with format YYYY-MM-DD"),
73
+ gr.inputs.Textbox(label="Date with format YYYY-MM-DD"),
74
+ ],
75
  outputs=[
76
  gr.outputs.Video(),
77
  gr.Textbox(lines=1000, max_lines=1000, interactive=True),
78
  ],
79
+ live=True,
80
  title="Romanian Transcription Test",
81
  )
82
 
requirements.txt CHANGED
@@ -7,4 +7,5 @@ transformers
7
  torch
8
  urllib3
9
  moviepy
10
- pydub
 
 
7
  torch
8
  urllib3
9
  moviepy
10
+ pydub
11
+ beautifulsoup4
video_downloader.py CHANGED
@@ -1,6 +1,8 @@
1
  import urllib.request
2
  import os
3
  import glob
 
 
4
 
5
 
6
  def download_video(date):
@@ -21,3 +23,56 @@ def download_video(date):
21
  print(f"An error occurred while downloading the video: {e}")
22
  except Exception as e:
23
  print(f"An unexpected error occurred: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import urllib.request
2
  import os
3
  import glob
4
+ import requests
5
+ from bs4 import BeautifulSoup
6
 
7
 
8
  def download_video(date):
 
23
  print(f"An error occurred while downloading the video: {e}")
24
  except Exception as e:
25
  print(f"An unexpected error occurred: {e}")
26
+
27
+
28
def get_response(url, timeout=30):
    """Fetch *url* and return its body parsed as a ``BeautifulSoup`` tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on a stalled connection.

    Returns:
        The parsed document (``html.parser`` backend), or ``None`` when the
        request fails. A short diagnostic is printed on every failure path.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        # A 404 gets its own message; every other HTTP status is reported
        # with the exception text.
        if e.response is not None and e.response.status_code == 404:
            print("No video exists for the given date range.")
        else:
            print(f"An error occurred while getting the webpage: {e}")
        return None
    except requests.exceptions.RequestException as e:
        # Connection errors, timeouts, malformed URLs, etc.
        print(f"An unexpected error occurred: {e}")
        return None

    return BeautifulSoup(response.text, "html.parser")
45
+
46
+
47
def download_video1(from_date, to_date):
    """Download the first "Beslut" web-TV video found for a date range.

    The function queries the riksdagen.se search page for the given range,
    follows the first link whose ``aria-label`` starts with ``"Beslut"``,
    and on that page picks the first link pointing at
    ``https://mhdownload.riksdagen.se``. That file is saved locally.

    Args:
        from_date: Start of the range, inserted as the ``from`` query value.
        to_date: End of the range, inserted as the ``tom`` query value.

    Returns:
        The local path ``video_{from_date}_{to_date}.mp4`` on success, or
        ``None`` if a page could not be fetched, no matching link was found,
        or the download failed. A diagnostic is printed in each failure case.
    """
    from urllib.parse import urlencode, urljoin

    # urlencode escapes the user-supplied dates; for plain dates the result
    # is identical to the hand-built query string.
    query = urlencode(
        {"avd": "webbtv", "from": from_date, "tom": to_date, "doktyp": "kam-vo"}
    )
    url = f"https://www.riksdagen.se/sv/sok/?{query}"

    # get_response prints its own diagnostic and returns None on failure.
    soup = get_response(url)
    if soup is None:
        return None

    video_page = _first_href(
        soup, lambda a: (a.get("aria-label") or "").startswith("Beslut")
    )
    if video_page is None:
        print("No video exists for the given date range.")
        return None

    # The href may be site-relative; resolve it against the search URL.
    # Absolute hrefs pass through urljoin unchanged.
    soup = get_response(urljoin(url, video_page))
    if soup is None:
        return None

    video_link = _first_href(
        soup, lambda a: a["href"].startswith("https://mhdownload.riksdagen.se")
    )
    if video_link is None:
        print("No video exists for the given date range.")
        return None
    print(video_link)

    # Download the video
    video_path = f"video_{from_date}_{to_date}.mp4"
    try:
        urllib.request.urlretrieve(video_link, video_path)
    except Exception as e:
        print(f"An error occurred while downloading the video: {e}")
        return None
    return video_path


def _first_href(soup, predicate):
    """Return the ``href`` of the first ``<a href>`` tag accepted by *predicate*, else ``None``."""
    return next(
        (a["href"] for a in soup.find_all("a", href=True) if predicate(a)),
        None,
    )