vivekvar committed on
Commit
4f0d0be
1 Parent(s): 79fffce

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +85 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration, RagTokenizer, RagRetriever, RagSequenceForGeneration
3
+ import torch
4
+ import soundfile as sf
5
+ import librosa
6
+ from moviepy.editor import VideoFileClip
7
+ import os
8
+
9
# Speech model: the Whisper "base" checkpoint together with its processor.
whisper_model_name = "openai/whisper-base"
whisper_processor = WhisperProcessor.from_pretrained(whisper_model_name)
whisper_model = WhisperForConditionalGeneration.from_pretrained(
    whisper_model_name
)

# Text model: RAG-sequence checkpoint, its tokenizer, and a retriever that is
# configured here with the "exact" index over the dummy dataset.
rag_model_name = "facebook/rag-sequence-nq"
rag_tokenizer = RagTokenizer.from_pretrained(rag_model_name)
rag_retriever = RagRetriever.from_pretrained(
    rag_model_name,
    index_name="exact",
    use_dummy_dataset=True,
    trust_remote_code=True,
)
rag_model = RagSequenceForGeneration.from_pretrained(
    rag_model_name,
    retriever=rag_retriever,
)
19
+
20
def transcribe_audio(audio_path, language="ru"):
    """Run Whisper on an audio file and return the decoded text.

    The decoder prompt is built with ``task="translate"``, so the model is
    asked to translate the speech rather than transcribe it verbatim.

    Args:
        audio_path: Path of the audio file to load.
        language: Language code of the speech, forwarded to the Whisper
            decoder prompt. Defaults to ``"ru"``.

    Returns:
        The decoded text of the first generated sequence.
    """
    sampling_rate = 16000
    # librosa resamples to the rate the feature extractor is told to expect.
    speech, _ = librosa.load(audio_path, sr=sampling_rate)

    # Extract the input features once; the previous version also ran the full
    # processor on the same audio and discarded the result.
    input_features = whisper_processor.feature_extractor(
        speech, return_tensors="pt", sampling_rate=sampling_rate
    ).input_features

    forced_decoder_ids = whisper_processor.get_decoder_prompt_ids(
        language=language, task="translate"
    )
    predicted_ids = whisper_model.generate(
        input_features, forced_decoder_ids=forced_decoder_ids
    )
    return whisper_processor.batch_decode(
        predicted_ids, skip_special_tokens=True
    )[0]
27
+
28
def translate_and_summarize(text):
    """Feed ``text`` to the RAG model and return the decoded generations.

    Args:
        text: Input text to tokenize and pass to ``rag_model.generate``.

    Returns:
        The result of ``rag_tokenizer.batch_decode`` on the generated ids,
        with special tokens skipped.
    """
    encoded = rag_tokenizer(text, return_tensors="pt")
    generated_ids = rag_model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
    )
    return rag_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
34
+
35
def extract_audio_from_video(video_path, output_audio_path):
    """Write the audio track of a video file to ``output_audio_path``.

    Args:
        video_path: Path of the video file to open.
        output_audio_path: Destination path for the extracted audio.

    Returns:
        ``output_audio_path`` when the video has an audio track, otherwise
        ``None``.
    """
    video_clip = VideoFileClip(video_path)
    try:
        audio_clip = video_clip.audio
        if audio_clip is None:
            return None
        audio_clip.write_audiofile(output_audio_path)
        return output_audio_path
    finally:
        # Always release the clip, including when writing the audio fails;
        # previously it was never closed.
        video_clip.close()
43
+
44
st.title("Audio and Video Transcription & Summarization")


def _save_upload(uploaded_file):
    """Write an uploaded file under /tmp and return the path it was saved to."""
    # The upload's name comes from the client. Keep only its final path
    # component so that an absolute name or one containing "../" cannot make
    # os.path.join point outside /tmp.
    safe_name = os.path.basename(uploaded_file.name) or "upload"
    saved_path = os.path.join("/tmp", safe_name)
    with open(saved_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    return saved_path


def _transcribe_and_report(audio_path):
    """Transcribe ``audio_path``, then summarize, showing each result."""
    st.write("Transcribing audio...")
    transcription = transcribe_audio(audio_path)
    st.write("Transcription:", transcription)

    st.write("Translating and summarizing...")
    summary = translate_and_summarize(transcription)
    st.write("Translated Summary:", summary)


# Audio Upload Section
st.header("Upload an Audio File")
audio_file = st.file_uploader("Choose an audio file...", type=["wav", "mp3", "m4a"])

if audio_file is not None:
    audio_path = _save_upload(audio_file)

    st.audio(audio_file)
    _transcribe_and_report(audio_path)

# Video Upload Section
st.header("Upload a Video File")
video_file = st.file_uploader("Choose a video file...", type=["mp4", "mkv", "avi", "mov"])

if video_file is not None:
    video_path = _save_upload(video_file)

    st.video(video_file)
    st.write("Extracting audio from video...")
    # NOTE(review): the extracted audio always goes to the same fixed path;
    # confirm whether concurrent sessions can overwrite each other's file.
    audio_path = extract_audio_from_video(video_path, "/tmp/extracted_audio.wav")

    if audio_path is not None:
        _transcribe_and_report(audio_path)
    else:
        st.write("No audio track found in the video file.")
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ faiss-cpu
2
+ torch
3
+ transformers
4
+ soundfile
5
+ librosa
6
+ datasets
7
+ moviepy
8
+ gradio