Spaces:
Sleeping
Sleeping
import gradio as gr | |
import numpy as np | |
from huggingsound import SpeechRecognitionModel | |
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler | |
from transformers import pipeline | |
import librosa | |
# Whisper pipeline, created on first use and reused so the model is loaded once.
_WHISPER_PIPELINE = None


def modelo1(audio):
    """Transcribe a Gradio audio input to text with Whisper.

    Args:
        audio: ``(sample_rate, samples)`` tuple as delivered by a Gradio
            ``"audio"`` input, or ``None`` when nothing was recorded.
            ``samples`` may be mono ``(n,)`` or multi-channel
            ``(n, channels)``; integer PCM or floating point.

    Returns:
        The transcribed text as a string, or ``""`` when there is no audio.
    """
    global _WHISPER_PIPELINE

    if audio is None:
        return ""

    # Gradio hands over the sample rate first, then the samples.
    sample_rate, audio_data = audio
    audio_data = np.asarray(audio_data)
    if audio_data.size == 0:
        return ""

    # Integer PCM is scaled into [-1, 1]; the model expects float waveforms.
    if np.issubdtype(audio_data.dtype, np.integer):
        full_scale = float(np.iinfo(audio_data.dtype).max)
        audio_data = audio_data.astype(np.float32) / full_scale
    else:
        audio_data = audio_data.astype(np.float32)

    # Downmix multi-channel audio to mono by averaging across channels.
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)

    if _WHISPER_PIPELINE is None:
        # device=-1 runs the model on the CPU.
        _WHISPER_PIPELINE = pipeline(
            'automatic-speech-recognition',
            model='openai/whisper-medium',
            device=-1,
        )
    whisper = _WHISPER_PIPELINE

    # Resample to the rate the model's feature extractor was built for.
    target_rate = whisper.feature_extractor.sampling_rate
    if sample_rate != target_rate:
        audio_data = librosa.resample(
            audio_data, orig_sr=sample_rate, target_sr=target_rate
        )

    result = whisper({"sampling_rate": target_rate, "raw": audio_data})
    return result["text"]
# Stable Diffusion pipeline, created on first use and reused across calls.
_SD_PIPELINE = None


def modelo2(text):
    """Generate an image from a text prompt with Stable Diffusion 2.1.

    Args:
        text: The prompt string, or a dict carrying the prompt under the
            ``"text"`` key (the shape of a speech-recognition pipeline
            result), so either form can be passed straight in.

    Returns:
        The first image produced by the diffusion pipeline.

    Raises:
        TypeError: If no string prompt can be obtained from ``text``.
    """
    global _SD_PIPELINE

    prompt = text.get("text") if isinstance(text, dict) else text
    if not isinstance(prompt, str):
        raise TypeError(
            "modelo2 expects a prompt string or a dict with a 'text' string, "
            f"got {type(text).__name__}"
        )

    if _SD_PIPELINE is None:
        # Imported here because the file's import block does not provide torch.
        import torch

        model_id = "stabilityai/stable-diffusion-2-1"
        # Half precision only on a GPU; fall back to float32 on CPU-only hosts.
        use_cuda = torch.cuda.is_available()
        dtype = torch.float16 if use_cuda else torch.float32

        pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=dtype)
        # Swap in the DPMSolverMultistepScheduler (DPM-Solver++) scheduler.
        pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
        _SD_PIPELINE = pipe.to("cuda" if use_cuda else "cpu")

    return _SD_PIPELINE(prompt).images[0]
def execution(audio):
    """Run the full chain: pass ``audio`` through ``modelo1``, then feed that
    result to ``modelo2`` and return what ``modelo2`` produces."""
    return modelo2(modelo1(audio))
if __name__ == "__main__":
    # Build the web UI around modelo1 (audio in, text out) and start serving.
    demo = gr.Interface(
        fn=modelo1,
        inputs="audio",
        outputs="text",
    )
    demo.launch()