import io

import gradio as gr
import torch
from diffusers import DPMSolverMultistepScheduler, StableDiffusionPipeline
from huggingsound import SpeechRecognitionModel
from pydub import AudioSegment


def convert_sampling_rate(audio, target_sr=48000):
    """Resample WAV audio to *target_sr* Hz and return its raw PCM bytes.

    Args:
        audio: Either raw WAV bytes, or a ``(data, sample_rate)`` tuple.
        target_sr: Desired output sample rate in Hz (default 48000).

    Returns:
        bytes: The resampled audio's raw PCM data.
    """
    if isinstance(audio, tuple):
        audio_data, sample_rate = audio
        # NOTE(review): this assumes audio_data is an int (it calls
        # int.bit_length / int.to_bytes). Gradio's "audio" input normally
        # yields (sample_rate, numpy_array), so this branch likely needs
        # rework — confirm against the actual caller.
        audio_data = audio_data.to_bytes((audio_data.bit_length() + 7) // 8, 'big')
    else:
        audio_data = audio
    segment = AudioSegment.from_wav(io.BytesIO(audio_data))
    segment = segment.set_frame_rate(target_sr)
    # BUG FIX: original returned `audio.raw_dataa` (typo) which raises
    # AttributeError; pydub's AudioSegment exposes `raw_data`.
    return segment.raw_data


def modelo1(audio):
    """Transcribe English speech to text with a wav2vec2 model.

    Args:
        audio: Audio input as accepted by ``convert_sampling_rate``.

    Returns:
        The transcription result from huggingsound.
    """
    # Resample the incoming audio before transcription.
    audio = convert_sampling_rate(audio)
    model = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-english")
    # NOTE(review): huggingsound's transcribe() expects a list of audio file
    # paths, not raw bytes — verify this call against the library docs.
    transcriptions = model.transcribe(audio)
    return transcriptions


def modelo2(text):
    """Generate an image from a text prompt with Stable Diffusion 2.1.

    Args:
        text: The text prompt to render.

    Returns:
        PIL.Image.Image: The first generated image.
    """
    model_id = "stabilityai/stable-diffusion-2-1"
    # Use the DPMSolverMultistepScheduler (DPM-Solver++) scheduler here
    # instead of the pipeline's default.
    # BUG FIX: `torch` was used here without being imported at the top of
    # the file, which raised NameError at runtime.
    pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
    pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
    pipe = pipe.to("cuda")
    image = pipe(text).images[0]
    return image


def execution(audio):
    """Full pipeline: speech -> transcription -> generated image."""
    modelo1res = modelo1(audio)
    modelo2res = modelo2(modelo1res)
    return modelo2res


if __name__ == "__main__":
    demo = gr.Interface(fn=execution, inputs="audio", outputs="image")
    demo.launch()