# Streamlit front end for speech-to-text transcription with an optional summary.
#
# The script is executed top to bottom on every Streamlit rerun:
#   1. configure the page,
#   2. render the sidebar forms (input source, model choice, summary options),
#   3. when "Transcribe!" is pressed, build a transcription object and keep it in
#      ``st.session_state``,
#   4. render the audio, raw text, optional summary and time-annotated segments.

import streamlit as st

from models import BagOfModels, SoundToText, TextToSummary
from settings import MODEL_PARSER

# Settings object whose attributes are forwarded as keyword arguments to
# ``BagOfModels.load_model`` (via ``vars(args)``).
args = MODEL_PARSER

st.set_page_config(
    page_title="TTS Applications | Incore Solutions",
    layout="wide",
    menu_items={
        "About": """This is a simple GUI for OpenAI's Whisper.""",
    },
)


def open_instructions():
    """Write the contents of ``instructions.md`` to the page."""
    # Explicit encoding so the file reads the same regardless of platform locale.
    with open("instructions.md", "r", encoding="utf-8") as f:
        st.write(f.read())


# Render input type selection on the sidebar & the form
input_type = st.sidebar.selectbox("Input Type", ["YouTube", "File"])

with st.sidebar.form("input_form"):
    # Only one of ``youtube_url`` / ``input_file`` is defined per run; the code
    # below reads them under the same ``input_type`` condition.
    if input_type == "YouTube":
        youtube_url = st.text_input("Youtube URL")
    elif input_type == "File":
        input_file = st.file_uploader("File", type=["mp3", "wav"])

    # Offer only the models whose name contains "whisper".
    whisper_model = st.selectbox(
        "Whisper model",
        options=[
            name for name in BagOfModels.get_model_names() if "whisper" in name
        ],
        index=1,
    )

    summary = st.checkbox("summarize")
    if summary:
        min_sum = st.number_input(
            "Minimum words in the summary", min_value=1, step=1
        )
        # The upper bound must never fall below the lower bound, so take the
        # larger of the two values (the previous ``min`` discarded the user's
        # maximum whenever it exceeded the minimum).
        max_sum = max(
            min_sum,
            st.number_input("Maximum words in the summary", min_value=2, step=1),
        )

    st.form_submit_button(label="Save settings")

with st.sidebar.form("save settings"):
    transcribe = st.form_submit_button(label="Transcribe!")

if transcribe:
    if input_type == "YouTube":
        if youtube_url and youtube_url.startswith("http"):
            model = BagOfModels.load_model(whisper_model, **vars(args))
            st.session_state.transcription = model.predict_stt(
                source=youtube_url, source_type=input_type, model_task="stt"
            )
        else:
            st.error("Please enter a valid YouTube URL")
            open_instructions()
    elif input_type == "File":
        if input_file:
            model = BagOfModels.load_model(whisper_model, **vars(args))
            st.session_state.transcription = model.predict_stt(
                source=input_file, source_type=input_type, model_task="stt"
            )
        else:
            st.error("Please upload a file")

# The transcription lives in session state so it survives reruns triggered by
# other widgets.
if "transcription" in st.session_state:
    # NOTE(review): presumably runs (or refreshes) the Whisper transcription and
    # populates the attributes read below -- confirm against ``models``.
    st.session_state.transcription.whisper()

    # create two columns to separate page and youtube video
    transcription_col, media_col = st.columns(2)

    with transcription_col:
        st.markdown("#### Audio")
        with open(st.session_state.transcription.audio_path, "rb") as f:
            st.audio(f.read())
        st.markdown("---")
        st.markdown(f"#### Transcription (whisper model - `{whisper_model}`)")
        st.markdown(
            f"##### Language: `{st.session_state.transcription.language}`"
        )

        # Trim raw transcribed output off tokens to simplify
        raw_output = st.expander("Raw output")
        raw_output.markdown(st.session_state.transcription.raw_output["text"])

        if summary:
            summarized_output = st.expander("summarized output")
            # Only the first 1024 * 4 characters are summarised. Per the original
            # author's note this is roughly 800 words and is meant to stay within
            # a 1024-token limit.
            # TODO: find a method to summarise longer transcriptions.
            text_summary = TextToSummary(
                str(st.session_state.transcription.text[: 1024 * 4]),
                min_sum,
                max_sum,
            ).get_summary()
            summarized_output.markdown(text_summary[0]["summary_text"])

        # Show transcription in format with timers added to text
        time_annotated_output = st.expander("time_annotated_output")
        for segment in st.session_state.transcription.segments:
            time_annotated_output.markdown(
                f"""[{round(segment["start"], 1)} - {round(segment["end"], 1)}] - {segment["text"]}"""
            )

    # Show input youtube video
    with media_col:
        if input_type == "YouTube":
            st.markdown("---")
            st.markdown("#### Original YouTube Video")
            st.video(st.session_state.transcription.source)