FarhadMadadzade committed on
Commit
326258c
1 Parent(s): 6319f19

trying romanian

Browse files
Files changed (2) hide show
  1. app.py +68 -9
  2. video_downloader.py +20 -3
app.py CHANGED
@@ -1,7 +1,7 @@
1
  from transformers import pipeline
2
  import gradio as gr
3
  import time
4
- from video_downloader import download_video1, download_youtube_video
5
  from moviepy.editor import AudioFileClip, VideoFileClip
6
  from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
7
  import datetime
@@ -10,7 +10,7 @@ from pydub import AudioSegment
10
  from pydub.silence import split_on_silence
11
  import re
12
 
13
- pipe = pipeline("automatic-speech-recognition", model="Artanis1551/whisper_swedish")
14
 
15
 
16
  def process_video1(date):
@@ -62,19 +62,78 @@ def process_video1(date):
62
  return video_path, transcription
63
 
64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  iface = gr.Interface(
66
- fn=process_video1,
67
- inputs=[
68
- gr.inputs.Textbox(label="Date with format YYYY-MM-DD"),
69
- ],
70
  outputs=[
71
  gr.outputs.Video(),
72
  gr.Textbox(lines=100, max_lines=100, interactive=True),
73
  ],
74
- title="Transcribe Swedish Parliament Decisions",
75
- description="This app transcribes the top Swedish Parliament decision"
76
- + " video from the given date. Only the first 30 seconds of the "
77
  + "video will be used if it is longer than that.",
78
  )
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  iface.launch()
 
1
  from transformers import pipeline
2
  import gradio as gr
3
  import time
4
+ from video_downloader import download_video, download_video1, download_youtube_video
5
  from moviepy.editor import AudioFileClip, VideoFileClip
6
  from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
7
  import datetime
 
10
  from pydub.silence import split_on_silence
11
  import re
12
 
13
+ pipe = pipeline("automatic-speech-recognition", model="Artanis1551/whisper_romanian")
14
 
15
 
16
  def process_video1(date):
 
62
  return video_path, transcription
63
 
64
 
65
def process_video(date):
    """Download, trim and transcribe the parliament video for ``date``.

    The video is fetched with ``download_video``, cut to its first 30 seconds,
    converted to WAV, split on silence, and each chunk is passed through the
    module-level ``pipe`` speech-recognition pipeline.

    Args:
        date: Date string made of exactly eight digits (``YYYYMMDD``), the
            same format the Gradio textbox label asks for. Surrounding
            whitespace is ignored.

    Returns:
        A ``(video_path, transcription)`` tuple. On success ``video_path`` is
        the 30-second clip that was transcribed. If the date is malformed or
        any processing step fails, ``video_path`` is whatever
        ``download_youtube_video`` returns for the placeholder URL and
        ``transcription`` is a user-facing message.
    """
    # Function-scope import: the file's full import list is not visible from
    # this block, and the cleanup below needs ``os``.
    import os

    placeholder_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"

    date = (date or "").strip()
    # fullmatch (not match) so trailing junk such as "20230101/../x" cannot
    # slip through and end up inside the file names built below.
    if not re.fullmatch(r"[0-9]{8}", date):
        return (
            download_youtube_video(placeholder_url),
            "Please enter a date in the format YYYYMMDD.",
        )

    short_video_path = f"short_{date}.mp4"
    audio_path = f"audio_{date}.wav"
    try:
        # download_video reports failure by returning None rather than raising.
        video_path = download_video(date)
        if video_path is None:
            raise FileNotFoundError(f"no video available for {date}")

        # Keep only the first 30 seconds, then pull its audio track out as WAV.
        ffmpeg_extract_subclip(video_path, 0, 30, targetname=short_video_path)
        AudioFileClip(short_video_path).write_audiofile(audio_path)

        recording = AudioSegment.from_wav(audio_path)
        chunks = split_on_silence(recording, min_silence_len=500, silence_thresh=-40)

        texts = []
        for i, chunk in enumerate(chunks):
            chunk_path = f"chunk{i}.wav"
            try:
                chunk.export(chunk_path, format="wav")
                with open(chunk_path, "rb") as chunk_file:
                    texts.append(pipe(chunk_file.read())["text"])
            finally:
                # Remove the temporary chunk even if transcription failed.
                if os.path.exists(chunk_path):
                    os.remove(chunk_path)

        # Each chunk's text is followed by a single space, as before.
        transcription = "".join(f"{text} " for text in texts)
    except Exception as exc:
        # UI boundary: report the problem and show the placeholder video
        # instead of returning a clip path that was never produced.
        print(f"An error occurred while processing the video: {exc}")
        return (
            download_youtube_video(placeholder_url),
            "No decision was made on this date.",
        )
    finally:
        # The intermediate WAV is never needed after this call.
        if os.path.exists(audio_path):
            os.remove(audio_path)

    return short_video_path, transcription
108
+
109
+
110
  iface = gr.Interface(
111
+ fn=process_video,
112
+ inputs=gr.inputs.Textbox(label="Date with format YYYYMMDD"),
 
 
113
  outputs=[
114
  gr.outputs.Video(),
115
  gr.Textbox(lines=100, max_lines=100, interactive=True),
116
  ],
117
+ title="Romanian Transcription Test",
118
+ description="This app transcribes videos from the Romanian Parliament"
119
+ + " on a given date. Only the first 30 seconds of the "
120
  + "video will be used if it is longer than that.",
121
  )
122
 
123
+
124
+ # iface = gr.Interface(
125
+ # fn=process_video1,
126
+ # inputs=[
127
+ # gr.inputs.Textbox(label="Date with format YYYY-MM-DD"),
128
+ # ],
129
+ # outputs=[
130
+ # gr.outputs.Video(),
131
+ # gr.Textbox(lines=100, max_lines=100, interactive=True),
132
+ # ],
133
+ # title="Transcribe Swedish Parliament Decisions",
134
+ # description="This app transcribes the top Swedish Parliament decision"
135
+ # + " video from the given date. Only the first 30 seconds of the "
136
+ # + "video will be used if it is longer than that.",
137
+ # )
138
+
139
  iface.launch()
video_downloader.py CHANGED
@@ -23,6 +23,26 @@ def get_response(url):
23
  return soup
24
 
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  def download_video1(date):
27
  # Get the webpage
28
  url = f"https://www.riksdagen.se/sv/sok/?avd=webbtv&from={date}&tom={date}&doktyp=kam-vo"
@@ -67,6 +87,3 @@ def download_youtube_video(url):
67
  return video_path
68
  except Exception as e:
69
  print(f"An error occurred while downloading the video: {e}")
70
-
71
-
72
- download_youtube_video("https://www.youtube.com/watch?v=dQw4w9WgXcQ")
 
23
  return soup
24
 
25
 
26
def download_video(date):
    """Download the cdep.ro committee video for ``date`` into the working dir.

    Args:
        date: Date string made of exactly eight ASCII digits (``YYYYMMDD``).
            The first four characters select the year directory in the URL.

    Returns:
        The local file name ``video_<date>.mp4`` on success, or ``None`` when
        the date is malformed, no video exists (HTTP 404), or the download
        fails for any other reason. Failures are reported with ``print``.
    """
    # ``date`` is interpolated into both a URL and a local file path, so
    # reject anything that is not exactly eight ASCII digits before touching
    # the file system or the network.
    is_valid = (
        isinstance(date, str)
        and len(date) == 8
        and all(ch in "0123456789" for ch in date)
    )
    if not is_valid:
        print("No video exists for the given date.")
        return None

    # Delete any existing .mp4 files so earlier downloads do not accumulate.
    for mp4_file in glob.glob("*.mp4"):
        try:
            os.remove(mp4_file)
        except OSError as e:
            # Best-effort cleanup: a stale file must not block a new download.
            print(f"Could not remove {mp4_file}: {e}")

    year = date[:4]
    url = f"https://www.cdep.ro/u02/comisii/{year}/cp46_{date}.mp4"
    target = f"video_{date}.mp4"
    try:
        urllib.request.urlretrieve(url, target)
    except urllib.error.HTTPError as e:
        if e.code == 404:
            print("No video exists for the given date.")
        else:
            print(f"An error occurred while downloading the video: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    else:
        print("Video downloaded successfully.")
        return target

    # A failed transfer can leave a truncated file behind; do not keep it.
    if os.path.exists(target):
        try:
            os.remove(target)
        except OSError as e:
            print(f"Could not remove {target}: {e}")
    return None
44
+
45
+
46
  def download_video1(date):
47
  # Get the webpage
48
  url = f"https://www.riksdagen.se/sv/sok/?avd=webbtv&from={date}&tom={date}&doktyp=kam-vo"
 
87
  return video_path
88
  except Exception as e:
89
  print(f"An error occurred while downloading the video: {e}")