Spaces:

ID2223-labs
/

romanian_parliament_transcription

Sleeping

App Files Files Community

FarhadMadadzade committed on Dec 8, 2023

Commit

1936f1e

•

1 Parent(s): 6902590

trying with swedish

Browse files

Files changed (3) hide show

app.py +43 -16
requirements.txt +2 -1
video_downloader.py +55 -0

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from transformers import pipeline
 import gradio as gr
 import time
-from video_downloader import download_video
 from moviepy.editor import AudioFileClip
 from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
 import datetime
@@ -9,20 +9,15 @@ import os
 from pydub import AudioSegment
 from pydub.silence import split_on_silence
-pipe = pipeline("automatic-speech-recognition", model="gigant/whisper-medium-romanian")
-def process_video(date):
-    # Download the video
-    video_path = download_video(date)
-    # Extract the first 30 seconds of the video
-    short_video_path = f"short_{date}.mp4"
-    ffmpeg_extract_subclip(video_path, 30, 50, targetname=short_video_path)
-    # Extract audio from the short video
-    audio_path = f"audio_{date}.wav"
-    AudioFileClip(short_video_path).write_audiofile(audio_path)
     # Split the audio into chunks
     audio = AudioSegment.from_wav(audio_path)
@@ -34,22 +29,54 @@ def process_video(date):
         chunk.export(f"chunk{i}.wav", format="wav")
         with open(f"chunk{i}.wav", "rb") as audio_file:
             audio = audio_file.read()
-        transcription += pipe(audio)["text"] + "\n\n "
         os.remove(f"chunk{i}.wav")
     # Remove the audio file
     os.remove(audio_path)
-    print(transcription)
-    return short_video_path, transcription
 iface = gr.Interface(
     fn=process_video,
-    inputs=gr.inputs.Textbox(label="Date with format YYYYMMDD"),
     outputs=[
         gr.outputs.Video(),
         gr.Textbox(lines=1000, max_lines=1000, interactive=True),
     ],
     title="Romanian Transcription Test",
 )

 from transformers import pipeline
 import gradio as gr
 import time
+from video_downloader import download_video, download_video1
 from moviepy.editor import AudioFileClip
 from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
 import datetime
 from pydub import AudioSegment
 from pydub.silence import split_on_silence
+pipe = pipeline("automatic-speech-recognition", model="Sleepyp00/whisper-small-Swedish")
+def process_video(from_date, to_date):
+    video_path = download_video1(from_date, to_date)
+    # Extract audio from the video
+    audio_path = f"audio_{from_date}_{to_date}.wav"
+    AudioFileClip(video_path).write_audiofile(audio_path)
     # Split the audio into chunks
     audio = AudioSegment.from_wav(audio_path)
         chunk.export(f"chunk{i}.wav", format="wav")
         with open(f"chunk{i}.wav", "rb") as audio_file:
             audio = audio_file.read()
+        transcription += pipe(audio)["text"] + "\n\n"
         os.remove(f"chunk{i}.wav")
     # Remove the audio file
     os.remove(audio_path)
+    return video_path, transcription
+# def process_video(date):
+#     # Download the video
+#     video_path = download_video(date)
+#     # Extract audio from the video
+#     audio_path = f"audio_{date}.wav"
+#     AudioFileClip(video_path).write_audiofile(audio_path)
+#     # Split the audio into chunks
+#     audio = AudioSegment.from_wav(audio_path)
+#     chunks = split_on_silence(audio, min_silence_len=500, silence_thresh=-40)
+#     # Transcribe each chunk
+#     transcription = ""
+#     for i, chunk in enumerate(chunks):
+#         chunk.export(f"chunk{i}.wav", format="wav")
+#         with open(f"chunk{i}.wav", "rb") as audio_file:
+#             audio = audio_file.read()
+#         transcription += pipe(audio)["text"] + " "
+#         os.remove(f"chunk{i}.wav")
+#     # Remove the audio file
+#     os.remove(audio_path)
+#     return video_path, transcription
 iface = gr.Interface(
     fn=process_video,
+    # inputs=gr.inputs.Textbox(label="Date with format YYYYMMDD"),
+    # The two textboxes are passed positionally to
+    # process_video(from_date, to_date), so each label names its bound.
+    inputs=[
+        gr.inputs.Textbox(label="From date with format YYYY-MM-DD"),
+        gr.inputs.Textbox(label="To date with format YYYY-MM-DD"),
+    ],
     outputs=[
         gr.outputs.Video(),
         gr.Textbox(lines=1000, max_lines=1000, interactive=True),
     ],
+    live=True,
     title="Romanian Transcription Test",
 )

requirements.txt CHANGED Viewed

@@ -7,4 +7,5 @@ transformers
 torch
 urllib3
 moviepy
-pydub

 torch
 urllib3
 moviepy
+pydub
+beautifulsoup4

video_downloader.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import urllib.request
 import os
 import glob
 def download_video(date):
@@ -21,3 +23,56 @@ def download_video(date):
             print(f"An error occurred while downloading the video: {e}")
     except Exception as e:
         print(f"An unexpected error occurred: {e}")

 import urllib.request
 import os
 import glob
+import requests
+from bs4 import BeautifulSoup
 def download_video(date):
             print(f"An error occurred while downloading the video: {e}")
     except Exception as e:
         print(f"An unexpected error occurred: {e}")
+def get_response(url):
+    """Fetch ``url`` and return its HTML parsed into a BeautifulSoup tree.
+
+    Args:
+        url: Address of the page to download.
+
+    Returns:
+        The parsed page, or ``None`` when the request fails. A diagnostic is
+        printed in that case: a 404 is reported as a missing video, any other
+        HTTP error or unexpected exception is reported with its message.
+    """
+    try:
+        # Without a timeout requests waits indefinitely on a stalled server,
+        # which would freeze the caller; a timeout error lands in the generic
+        # handler below and yields None like every other failure.
+        response = requests.get(url, timeout=30)
+        response.raise_for_status()
+    except requests.exceptions.HTTPError as e:
+        if e.response.status_code == 404:
+            print("No video exists for the given date range.")
+        else:
+            print(f"An error occurred while getting the webpage: {e}")
+        return None
+    except Exception as e:
+        print(f"An unexpected error occurred: {e}")
+        return None
+    return BeautifulSoup(response.text, "html.parser")
+def download_video1(from_date, to_date):
+    """Download the first "Beslut" web-TV video found for a date range.
+
+    Queries the riksdagen.se search page for the range, follows the first
+    result link whose ``aria-label`` starts with "Beslut", and downloads the
+    first ``https://mhdownload.riksdagen.se`` link found on that page.
+
+    Args:
+        from_date: Start of the range; inserted into the ``from`` query
+            parameter and into the output file name.
+        to_date: End of the range; inserted into the ``tom`` query parameter
+            and into the output file name.
+
+    Returns:
+        The path ``video_{from_date}_{to_date}.mp4`` of the saved file, or
+        ``None`` if a page could not be fetched, no matching link was found,
+        or the download itself failed.
+    """
+    from urllib.parse import urljoin
+
+    # Get the search results page
+    url = f"https://www.riksdagen.se/sv/sok/?avd=webbtv&from={from_date}&tom={to_date}&doktyp=kam-vo"
+    soup = get_response(url)
+    if soup is None:
+        # get_response already printed the reason; without this guard the
+        # find_all call below would raise AttributeError on None.
+        return None
+    # Find the first result that links to a "Beslut" video page
+    video_page = next(
+        (
+            a["href"]
+            for a in soup.find_all("a", href=True)
+            if a.get("aria-label") and a["aria-label"].startswith("Beslut")
+        ),
+        None,
+    )
+    if video_page is None:
+        print("No video exists for the given date range.")
+        return None
+    # Go to the video page; urljoin leaves an absolute href untouched and
+    # resolves a relative one against the search URL.
+    soup = get_response(urljoin(url, video_page))
+    if soup is None:
+        return None
+    # Find the download link
+    video_link = next(
+        (
+            a["href"]
+            for a in soup.find_all("a", href=True)
+            if a["href"].startswith("https://mhdownload.riksdagen.se")
+        ),
+        None,
+    )
+    if video_link is None:
+        print("No video exists for the given date range.")
+        return None
+    print(video_link)
+    # Download the video
+    video_path = f"video_{from_date}_{to_date}.mp4"
+    try:
+        urllib.request.urlretrieve(video_link, video_path)
+        return video_path
+    except Exception as e:
+        print(f"An error occurred while downloading the video: {e}")
+        return None