import gradio as gr
import numpy as np
import torch
import librosa
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
from transformers import pipeline

# Convert the sample rate of the input audio and transcribe it with Whisper
def modelo1(audio):
    # Gradio's "audio" input delivers a (sample_rate, data) tuple
    sample_rate, audio_data = audio
    # Make sure audio_data is a NumPy array
    if not isinstance(audio_data, np.ndarray):
        audio_data = np.array(audio_data)
    # Convert stereo audio to mono; Gradio returns stereo as (samples, channels)
    if audio_data.ndim == 2:
        audio_data = np.mean(audio_data, axis=1)
    # Normalize integer PCM samples to float32 in [-1, 1]
    if np.issubdtype(audio_data.dtype, np.integer):
        audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
    else:
        audio_data = audio_data.astype(np.float32)
    # Resample to the 16 kHz rate Whisper expects
    if sample_rate != 16000:
        audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
    # Use audio_data as input for the model; device=-1 runs on the CPU
    whisper = pipeline("automatic-speech-recognition", model="openai/whisper-medium", device=-1)
    result = whisper({"raw": audio_data, "sampling_rate": 16000})
    return result["text"]

# Generate an image from the transcribed text with Stable Diffusion
def modelo2(text):
    model_id = "stabilityai/stable-diffusion-2-1"
    pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
    # Use the DPMSolverMultistepScheduler (DPM-Solver++) scheduler here instead
    pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
    pipe = pipe.to("cuda")
    image = pipe(text).images[0]
    return image

# Chain the two models: audio -> text -> image
def execution(audio):
    text = modelo1(audio)
    return modelo2(text)

if __name__ == "__main__":
    # Wire the full audio -> image pipeline into the interface
    demo = gr.Interface(fn=execution, inputs="audio", outputs="image")
    demo.launch()