vivekvar committed on
Commit
aa38cc5
1 Parent(s): 7571c15

Update app.py: replace the Streamlit front end with Gradio interfaces

Files changed (1)
app.py +74 -85
app.py CHANGED
@@ -1,85 +1,74 @@
- import streamlit as st
- from transformers import WhisperProcessor, WhisperForConditionalGeneration, RagTokenizer, RagRetriever, RagSequenceForGeneration
- import torch
- import soundfile as sf
- import librosa
- from moviepy.editor import VideoFileClip
- import os
-
- # Load Whisper base model and processor
- whisper_model_name = "openai/whisper-base"
- whisper_processor = WhisperProcessor.from_pretrained(whisper_model_name)
- whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name)
-
- # Load RAG sequence model and tokenizer
- rag_model_name = "facebook/rag-sequence-nq"
- rag_tokenizer = RagTokenizer.from_pretrained(rag_model_name)
- rag_retriever = RagRetriever.from_pretrained(rag_model_name, index_name="exact", use_dummy_dataset=True, trust_remote_code=True)
- rag_model = RagSequenceForGeneration.from_pretrained(rag_model_name, retriever=rag_retriever)
-
- def transcribe_audio(audio_path, language="ru"):
-     speech, rate = librosa.load(audio_path, sr=16000)
-     inputs = whisper_processor(speech, return_tensors="pt", sampling_rate=16000)
-     input_features = whisper_processor.feature_extractor(speech, return_tensors="pt", sampling_rate=16000).input_features
-     predicted_ids = whisper_model.generate(input_features, forced_decoder_ids=whisper_processor.get_decoder_prompt_ids(language=language, task="translate"))
-     transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-     return transcription
-
- def translate_and_summarize(text):
-     inputs = rag_tokenizer(text, return_tensors="pt")
-     input_ids = inputs["input_ids"]
-     attention_mask = inputs["attention_mask"]
-     outputs = rag_model.generate(input_ids=input_ids, attention_mask=attention_mask)
-     return rag_tokenizer.batch_decode(outputs, skip_special_tokens=True)
-
- def extract_audio_from_video(video_path, output_audio_path):
-     video_clip = VideoFileClip(video_path)
-     audio_clip = video_clip.audio
-     if audio_clip is not None:
-         audio_clip.write_audiofile(output_audio_path)
-         return output_audio_path
-     else:
-         return None
-
- st.title("Audio and Video Transcription & Summarization")
-
- # Audio Upload Section
- st.header("Upload an Audio File")
- audio_file = st.file_uploader("Choose an audio file...", type=["wav", "mp3", "m4a"])
-
- if audio_file is not None:
-     audio_path = os.path.join("/tmp", audio_file.name)
-     with open(audio_path, "wb") as f:
-         f.write(audio_file.getbuffer())
-
-     st.audio(audio_file)
-     st.write("Transcribing audio...")
-     transcription = transcribe_audio(audio_path)
-     st.write("Transcription:", transcription)
-
-     st.write("Translating and summarizing...")
-     summary = translate_and_summarize(transcription)
-     st.write("Translated Summary:", summary)
- # Video Upload Section
- st.header("Upload a Video File")
- video_file = st.file_uploader("Choose a video file...", type=["mp4", "mkv", "avi", "mov"])
-
- if video_file is not None:
-     video_path = os.path.join("/tmp", video_file.name)
-     with open(video_path, "wb") as f:
-         f.write(video_file.getbuffer())
-
-     st.video(video_file)
-     st.write("Extracting audio from video...")
-     audio_path = extract_audio_from_video(video_path, "/tmp/extracted_audio.wav")
-
-     if audio_path is not None:
-         st.write("Transcribing audio...")
-         transcription = transcribe_audio(audio_path)
-         st.write("Transcription:", transcription)
-
-         st.write("Translating and summarizing...")
-         summary = translate_and_summarize(transcription)
-         st.write("Translated Summary:", summary)
-     else:
-         st.write("No audio track found in the video file.")
 
+ import gradio as gr
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration, RagTokenizer, RagRetriever, RagSequenceForGeneration
+ import torch
+ import soundfile as sf
+ import librosa
+ from moviepy.editor import VideoFileClip
+ import os
+
+ # Load Whisper base model and processor
+ whisper_model_name = "openai/whisper-base"
+ whisper_processor = WhisperProcessor.from_pretrained(whisper_model_name)
+ whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name)
+
+ # Load RAG sequence model and tokenizer
+ rag_model_name = "facebook/rag-sequence-nq"
+ rag_tokenizer = RagTokenizer.from_pretrained(rag_model_name)
+ rag_retriever = RagRetriever.from_pretrained(rag_model_name, index_name="exact", use_dummy_dataset=True)
+ rag_model = RagSequenceForGeneration.from_pretrained(rag_model_name, retriever=rag_retriever)
+
+ def transcribe_audio(audio_path, language="ru"):
+     # Load audio at 16 kHz, extract log-mel features, and decode with Whisper
+     speech, rate = librosa.load(audio_path, sr=16000)
+     input_features = whisper_processor.feature_extractor(speech, return_tensors="pt", sampling_rate=16000).input_features
+     predicted_ids = whisper_model.generate(input_features, forced_decoder_ids=whisper_processor.get_decoder_prompt_ids(language=language, task="translate"))
+     transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+     return transcription
+
+ def translate_and_summarize(text):
+     inputs = rag_tokenizer(text, return_tensors="pt")
+     input_ids = inputs["input_ids"]
+     attention_mask = inputs["attention_mask"]
+     outputs = rag_model.generate(input_ids=input_ids, attention_mask=attention_mask)
+     # batch_decode returns a list; return the single generated string
+     return rag_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
+
+ def extract_audio_from_video(video_path, output_audio_path):
+     video_clip = VideoFileClip(video_path)
+     audio_clip = video_clip.audio
+     if audio_clip is not None:
+         audio_clip.write_audiofile(output_audio_path)
+         return output_audio_path
+     else:
+         return None
+
+ def transcribe_audio_interface(audio_path):
+     # gr.Audio(type="filepath") passes the upload as a temporary file path,
+     # so no manual copy to /tmp is needed
+     return transcribe_audio(audio_path)
+
+ def summarize_text_interface(text):
+     summary = translate_and_summarize(text)
+     return summary
+
+ def summarize_video_interface(video_path):
+     # gr.Video() likewise passes the upload as a temporary file path
+     audio_path = extract_audio_from_video(video_path, "/tmp/extracted_audio.wav")
+     if audio_path is not None:
+         transcription = transcribe_audio(audio_path)
+         summary = translate_and_summarize(transcription)
+         return summary
+     else:
+         return "No audio track found in the video file."
+
+ # Create interfaces
+ audio_transcription_interface = gr.Interface(transcribe_audio_interface, inputs=gr.Audio(type="filepath"), outputs="text", title="Audio Transcription")
+ text_summarization_interface = gr.Interface(summarize_text_interface, inputs="text", outputs="text", title="Text Summarization")
+ video_summarization_interface = gr.Interface(summarize_video_interface, inputs=gr.Video(), outputs="text", title="Video Summarization")
+
+ # Launch all three interfaces from a single app: launch() blocks the script,
+ # so three sequential launch() calls would never get past the first one
+ gr.TabbedInterface(
+     [audio_transcription_interface, text_summarization_interface, video_summarization_interface],
+     ["Audio Transcription", "Text Summarization", "Video Summarization"],
+ ).launch()
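
For a quick local check, the core pipeline can be exercised without the Gradio UI. A minimal sketch, assuming the models above are loaded and a local 16 kHz test file sample.wav exists (the filename is a hypothetical placeholder, not part of the Space):

    # Smoke test of the Whisper -> RAG pipeline; "sample.wav" is a placeholder path
    text = transcribe_audio("sample.wav", language="ru")  # Whisper "translate" task emits English text
    print("Transcription:", text)
    print("Summary:", translate_and_summarize(text))      # RAG generates from the transcript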