Spaces:
Paused
Paused
File size: 2,344 Bytes
29685bd 5167b0f 29685bd 5167b0f 29685bd 5167b0f 29685bd 5167b0f 29685bd 5167b0f 29685bd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
import gradio as gr
import yt_dlp as ydlp
from transformers import pipeline
from whispercpp import Whisper
# Summarization pipeline, created once at module import and shared by every
# call to summarizing(); it loads the "knkarthick/MEETING_SUMMARY" model.
summarizer = pipeline("summarization", model="knkarthick/MEETING_SUMMARY")
def download_audio(youtube_url, output_folder='.'):
    """Download the audio of a YouTube video and convert it to WAV via yt-dlp.

    Args:
        youtube_url: URL handed to yt-dlp for download.
        output_folder: Directory that receives the file; defaults to the
            current working directory.

    Returns:
        The path of the converted file, ``<output_folder>/audio.wav``, so
        callers do not have to hard-code the filename themselves.
    """
    # The output template carries no extension; the FFmpegExtractAudio
    # post-processor writes its result next to it with the ".wav" suffix
    # (the rest of this file reads "audio.wav" for the default folder).
    base_path = f'{output_folder}/audio'
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
        'outtmpl': base_path,
    }
    with ydlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([youtube_url])
    return f'{base_path}.wav'
# Shared Whisper instance used by transcribe_sum_youtube(), built once at
# import. NOTE(review): 'tiny' presumably selects the model size — confirm
# against the whispercpp documentation.
w = Whisper('tiny')
def process_general_transcription(transcription):
formatted_transcription = []
for line in transcription:
if line.startswith('[') and line.endswith(']'):
formatted_transcription.append(f'\n--- {line[1:-1].upper()} ---\n')
else:
formatted_transcription.append(line)
transcript_str = "\n".join(formatted_transcription)
return transcript_str
def chunk_to_tokens(text, n):
    """Split *text* into chunks of whitespace-separated words.

    The chunk size starts from a cap of 512 words (or the total word count,
    if smaller) and is reduced by ``n`` percent, never dropping below one
    word per chunk. "Tokens" here are words produced by ``str.split()``,
    not model tokens.

    Args:
        text: The text to split.
        n: Percentage by which to shrink the chunk size. Values outside
            [0, 100] are clamped to that range.

    Returns:
        A list of chunk strings, each with its words joined by single spaces;
        an empty list when *text* contains no words.
    """
    words = text.split()
    # Clamp so a negative percentage cannot push the chunk size past the
    # 512-word cap (e.g. n=-100 would otherwise double it to 1024).
    percent = min(max(n, 0), 100)
    max_chunk_size = min(len(words), 512)
    chunk_size = max(1, int(max_chunk_size * (1 - percent / 100)))
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]
def summarizing(text, n):
    """Summarize *text* chunk by chunk and return the summaries, one per line.

    The text is split with chunk_to_tokens(text, n); each chunk is passed to
    the module-level summarizer, and every resulting 'summary_text' is
    followed by a newline in the returned string.
    """
    return "".join(
        summarizer(piece)[0]['summary_text'] + '\n'
        for piece in chunk_to_tokens(text, n)
    )
def transcribe_sum_youtube(youtube_url, n):
    """Download a YouTube video's audio, transcribe it, and summarize the text.

    Args:
        youtube_url: URL of the video to process.
        n: Percentage forwarded to summarizing() to control chunk size.

    Returns:
        The summary string produced by summarizing().
    """
    # Writes the converted audio into the current directory.
    download_audio(youtube_url)
    segments = w.extract_text(w.transcribe("audio.wav"))
    return summarizing(process_general_transcription(segments), n)
# Gradio UI: a URL textbox and a percentage slider feed
# transcribe_sum_youtube, whose returned summary fills the output textbox.
with gr.Blocks() as demo:
    gr.Markdown(
        """
# CPP Whisperer - Transcribe YouTube Videos
""")
    with gr.Row():
        with gr.Column():
            inp = gr.Textbox(label="Youtube Url", placeholder="Insert YT Url here")
            # gr.Slider takes minimum/maximum/step; the previous
            # min_value/max_value/step_size keywords are not Slider
            # parameters, so the intended range and step were never applied.
            inp2 = gr.Slider(
                label="Summarization Percentage",
                minimum=0,
                maximum=100,
                step=1,
            )
            result_button_transcribe = gr.Button('Transcribe and Summarize')
        with gr.Column():
            out = gr.Textbox(label="Transcribed and Summarize Text")
    # Event listeners are registered inside the Blocks context.
    result_button_transcribe.click(
        transcribe_sum_youtube, inputs=[inp, inp2], outputs=out
    )
demo.launch()
|