import os

import gradio as gr
import librosa
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# Read the Hugging Face access token from the environment; it is required
# to download the model and processor weights.
hf_token = os.getenv("hf_token")
if hf_token is None:
    raise ValueError(
        "Hugging Face token not found. Please set the 'hf_token' environment variable."
    )

# Load the Whisper processor (feature extractor + tokenizer) and the
# Javanese fine-tuned Whisper model.
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small", language="Indonesian", task="transcribe", token=hf_token
)
model = WhisperForConditionalGeneration.from_pretrained(
    "avalonai/whisper-small-jv", token=hf_token
)


def transcribe(audio_choice, audio_file):
    # Prefer a freshly uploaded/recorded file; fall back to the sample dropdown.
    if audio_file is not None:
        audio_path = audio_file
    elif audio_choice is not None:
        audio_path = audio_choice
    else:
        return "No audio file provided. Please upload an audio file or select a sample."

    try:
        # Whisper expects 16 kHz mono audio; librosa resamples on load.
        audio, sampling_rate = librosa.load(audio_path, sr=16000)
    except Exception as e:
        return f"Failed to load audio: {e}"

    # Convert the waveform into log-mel input features and generate token ids.
    audio_input = processor(audio, return_tensors="pt", sampling_rate=16000)
    input_features = audio_input.input_features
    with torch.no_grad():
        generated_ids = model.generate(input_features)
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return transcription[0]


# Collect the bundled sample clips for the dropdown.
audio_samples = [
    os.path.join("audio_sample", f)
    for f in os.listdir("audio_sample")
    if f.endswith((".wav", ".mp3", ".m4a"))
]

audio_choice = gr.Dropdown(label="Select a Sample Audio", choices=audio_samples)
# Allow both microphone recording and file upload, matching the label.
audio_input = gr.Audio(
    sources=["microphone", "upload"],
    type="filepath",
    label="Upload Audio or Use Microphone",
)

iface = gr.Interface(
    fn=transcribe,
    inputs=[audio_choice, audio_input],
    outputs="text",
    title="Javanese Speech-to-Text Demo",
    description=(
        "This is a platform for testing Avalon AI's speech-to-text model for the "
        "Javanese language. Try it by speaking a sentence or by selecting one of "
        "the audio samples."
    ),
)

iface.launch()
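
# ---------------------------------------------------------------------------
# A minimal sketch for smoke-testing the model pipeline without the Gradio UI.
# The clip path "audio_sample/example.wav" is hypothetical; substitute any
# local 16 kHz-compatible audio file. Uncomment and run before iface.launch():
#
#   audio, _ = librosa.load("audio_sample/example.wav", sr=16000)
#   features = processor(
#       audio, return_tensors="pt", sampling_rate=16000
#   ).input_features
#   with torch.no_grad():
#       ids = model.generate(features)
#   print(processor.batch_decode(ids, skip_special_tokens=True)[0])
# ---------------------------------------------------------------------------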