# AudioToImage — Hugging Face Space entry point (app.py)
import gradio as gr
import librosa
import numpy as np
import torch
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
from huggingsound import SpeechRecognitionModel
from transformers import pipeline
# Transcribe the Gradio audio input to text with Whisper.
def modelo1(audio):
    """Transcribe a Gradio audio input to text using the Whisper pipeline.

    Parameters
    ----------
    audio : tuple
        Gradio's ``"audio"`` component yields ``(sample_rate, data)`` where
        ``data`` is an int16 NumPy array of shape ``(n,)`` (mono) or
        ``(n, channels)`` (stereo).

    Returns
    -------
    str
        The transcribed text.
    """
    # Gradio returns (sample_rate, data); the original code unpacked the
    # tuple in the opposite order, which broke everything downstream.
    sample_rate, audio_data = audio
    # Ensure audio_data is a NumPy array.
    if not isinstance(audio_data, np.ndarray):
        audio_data = np.array(audio_data)
    # Whisper expects float32 samples normalized to [-1, 1]; Gradio supplies
    # integer PCM, so scale by the dtype's max value.
    if np.issubdtype(audio_data.dtype, np.integer):
        audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
    else:
        audio_data = audio_data.astype(np.float32)
    # Gradio delivers stereo as (samples, channels): average over channels.
    if audio_data.ndim == 2:
        audio_data = np.mean(audio_data, axis=1)
    # Whisper models are trained on 16 kHz audio; resample if needed.
    if sample_rate != 16000:
        audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
        sample_rate = 16000
    # device=-1 runs the pipeline on CPU.
    whisper = pipeline('automatic-speech-recognition', model='openai/whisper-medium', device=-1)
    # The ASR pipeline takes a dict with the raw array and its sampling rate;
    # the original positional call (array, rate) is not a valid signature.
    result = whisper({"array": audio_data, "sampling_rate": sample_rate})
    # Return the plain transcription string, not the {"text": ...} dict.
    return result["text"]
def modelo2(text):
    """Generate an image from a text prompt with Stable Diffusion 2.1.

    Parameters
    ----------
    text : str | dict
        The prompt. A raw ASR-pipeline dict (``{"text": ...}``) is also
        accepted for robustness, since that is what the upstream speech
        model emits.

    Returns
    -------
    PIL.Image.Image
        The first generated image.
    """
    # Accept the raw ASR output dict as well as a plain string.
    if isinstance(text, dict):
        text = text.get("text", "")
    model_id = "stabilityai/stable-diffusion-2-1"
    # NOTE: torch.float16 requires `import torch` at module level — the
    # original file never imported it, so this line raised NameError.
    pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
    # Use the DPMSolverMultistepScheduler (DPM-Solver++) instead of the default.
    pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
    pipe = pipe.to("cuda")
    image = pipe(text).images[0]
    return image
def execution(audio):
    """Run the full audio -> transcription -> image pipeline.

    Parameters
    ----------
    audio : tuple
        The ``(sample_rate, data)`` tuple from Gradio's audio input.

    Returns
    -------
    PIL.Image.Image
        The image generated from the transcribed speech.
    """
    transcription = modelo1(audio)
    # modelo1 may return the raw ASR dict ({"text": ...}); extract the
    # string so the diffusion prompt is always plain text.
    if isinstance(transcription, dict):
        transcription = transcription.get("text", "")
    return modelo2(transcription)
if __name__ == "__main__":
demo = gr.Interface(fn=modelo1, inputs="audio", outputs="text")
demo.launch()