Spaces:
Build error
Build error
import streamlit as st | |
from models import BagOfModels, SoundToText, TextToSummary | |
from settings import MODEL_PARSER | |
# Settings object imported from the project's `settings` module; it is later
# expanded with `vars(args)` when a model is loaded.
args = MODEL_PARSER

# Browser-tab title, wide layout and the "About" entry of the app menu.
_ABOUT_TEXT = """This is a simple GUI for OpenAI's Whisper."""
_MENU_ITEMS = {"About": _ABOUT_TEXT}
st.set_page_config(
    page_title="TTS Applications | Incore Solutions",
    layout="wide",
    menu_items=_MENU_ITEMS,
)
def open_instructions() -> None:
    """Render the contents of ``instructions.md`` on the page.

    The file is opened by relative path, read in full and passed to
    ``st.write``.

    Raises:
        OSError: If ``instructions.md`` cannot be opened; the error is not
            handled here.
    """
    # Decode explicitly as UTF-8 so the markdown renders the same regardless
    # of the host's locale-dependent default encoding.
    with open("instructions.md", "r", encoding="utf-8") as f:
        st.write(f.read())
# Render input type selection on the sidebar & the form.
# The selector sits outside the form, while the source widget that depends on
# it is created inside the form below.
input_type = st.sidebar.selectbox("Input Type", ["YouTube", "File"])

with st.sidebar.form("input_form"):
    # Only the widget for the chosen input type is created, so exactly one of
    # `youtube_url` / `input_file` is defined after this form.
    if input_type == "YouTube":
        youtube_url = st.text_input("Youtube URL")
    elif input_type == "File":
        input_file = st.file_uploader("File", type=["mp3", "wav"])

    # Offer only the registered models whose name mentions "whisper".
    whisper_models = [
        name for name in BagOfModels.get_model_names() if "whisper" in name
    ]
    # Keep preselecting the second entry, but fall back to the first one when
    # fewer than two whisper models are available (index 1 would be out of
    # range for the selectbox).
    default_index = 1 if len(whisper_models) > 1 else 0
    whisper_model = st.selectbox(
        "Whisper model", options=whisper_models, index=default_index
    )

    summary = st.checkbox("summarize")
    if summary:
        min_sum = st.number_input(
            "Minimum words in the summary", min_value=1, step=1
        )
        requested_max = st.number_input(
            "Maximum words in the summary", min_value=2, step=1
        )
        # The upper bound must never fall below the lower bound. Taking the
        # larger of the two keeps the user's maximum when it is valid and
        # raises it to `min_sum` otherwise (the smaller value would always
        # collapse the maximum onto the minimum).
        max_sum = max(min_sum, requested_max)
    st.form_submit_button(label="Save settings")

# Separate one-button form whose submit flag starts the transcription.
with st.sidebar.form("save settings"):
    transcribe = st.form_submit_button(label="Transcribe!")
# Runs only on the rerun triggered by the "Transcribe!" button.
if transcribe:
    # Validate the chosen input first; `source` stays None when it is unusable.
    source = None
    if input_type == "YouTube":
        if youtube_url and youtube_url.startswith("http"):
            source = youtube_url
        else:
            st.error("Please enter a valid YouTube URL")
            open_instructions()
    elif input_type == "File":
        if input_file:
            source = input_file
        else:
            st.error("Please upload a file")

    if source is not None:
        # Load the selected whisper model with the project settings and keep
        # the prediction object in session state so it survives reruns.
        model = BagOfModels.load_model(whisper_model, **vars(args))
        st.session_state.transcription = model.predict_stt(
            source=source,
            source_type=input_type,
            model_task="stt",
        )
# Display the stored transcription, if one exists in this session.
if "transcription" in st.session_state:
    result = st.session_state.transcription
    # NOTE(review): presumably runs (or refreshes) the whisper transcription
    # on the stored object -- confirm against the models module.
    result.whisper()

    # Two side-by-side columns: transcript details and the source video.
    transcription_col, media_col = st.columns(2)

    with transcription_col:
        st.markdown("#### Audio")
        with open(result.audio_path, "rb") as audio_file:
            audio_bytes = audio_file.read()
        st.audio(audio_bytes)

        st.markdown("---")
        st.markdown(f"#### Transcription (whisper model - `{whisper_model}`)")
        st.markdown(f"##### Language: `{result.language}`")

        # Plain transcript text taken from the raw model output.
        raw_panel = st.expander("Raw output")
        raw_panel.markdown(result.raw_output["text"])

        if summary:
            summary_panel = st.expander("summarized output")
            # CURRENTLY ONLY SUPPORTS 1024 WORD TOKENS -> TODO: FIND METHOD TO
            # INCREASE SUMMARY FOR LONGER VIDS -> 1024 * 4 = aprox 800 words
            # within 1024 range
            excerpt = str(result.text[: 1024 * 4])
            summarizer = TextToSummary(excerpt, min_sum, max_sum)
            summaries = summarizer.get_summary()
            summary_panel.markdown(summaries[0]["summary_text"])

        # One markdown line per segment, prefixed with its rounded
        # start/end timestamps.
        timed_panel = st.expander("time_annotated_output")
        for segment in result.segments:
            start = round(segment["start"], 1)
            end = round(segment["end"], 1)
            timed_panel.markdown(f"""[{start} - {end}] - {segment["text"]}""")

    # The original video is embedded only for YouTube inputs; uploaded files
    # leave this column empty.
    with media_col:
        if input_type == "YouTube":
            st.markdown("---")
            st.markdown("#### Original YouTube Video")
            st.video(result.source)