import streamlit as st import whisper import tempfile # Function to transcribe audio and detect language def transcribe_and_detect_language(audio_file): model = whisper.load_model("base").to("cpu").float() # Ensure model is in full precision # Load and process audio audio = whisper.load_audio(audio_file) audio = whisper.pad_or_trim(audio) # Convert to log-Mel spectrogram in full precision mel = whisper.log_mel_spectrogram(audio).to(model.device).float() # Convert to float32 # Detect the spoken language _, probs = model.detect_language(mel) detected_language = max(probs, key=probs.get) # Decode the audio options = whisper.DecodingOptions() result = whisper.decode(model, mel, options) return detected_language, result.text # Streamlit UI st.title("Speech to Text with Whisper") # File uploader widget uploaded_file = st.file_uploader("Upload an audio file", type=['wav', 'mp3']) if uploaded_file is not None: with tempfile.NamedTemporaryFile(delete=False) as tmp_file: tmp_file.write(uploaded_file.getvalue()) with st.spinner('Processing...'): language, transcribed_text = transcribe_and_detect_language(tmp_file.name) st.write(f"Detected language: {language}") st.text_area("Transcribed Text:", value=transcribed_text, height=300)