# nanograd-engine / sd_inference.py
# Web-page header carried over from the hosting site: "Esmail-AGumaan's picture",
# commit message "Upload 13 files", commit 64e1ee8 (verified).
from nanograd.models.stable_diffusion import model_loader
from nanograd.models.stable_diffusion import pipeline
from PIL import Image
from pathlib import Path
from transformers import CLIPTokenizer
import torch
# Device used for inference. "cpu" is the default; an accelerator is chosen
# only when the matching ALLOW_* flag below is switched on AND the backend is
# actually usable at runtime.
DEVICE = "cpu"

ALLOW_CUDA = False
ALLOW_MPS = False

# torch.backends.mps is missing on older PyTorch builds, so look it up
# defensively instead of assuming the attribute exists.
_mps_backend = getattr(torch.backends, "mps", None)

# The opt-in flags are tested first so that no backend is probed unless asked for.
# Only the runtime check is_available() is consulted for MPS: the deprecated
# torch.has_mps flag reports build-time support only and could select "mps"
# on a machine where the device cannot be used.
if ALLOW_CUDA and torch.cuda.is_available():
    DEVICE = "cuda"
elif ALLOW_MPS and _mps_backend is not None and _mps_backend.is_available():
    DEVICE = "mps"

print(f"Using device: {DEVICE}")
# Directory holding the tokenizer files and the model checkpoint, resolved
# relative to the current working directory (as the original string paths were).
# Built with pathlib so the separators are correct on every platform and no
# backslash escape sequences are involved.
_SD_DATA_DIR = Path("nanograd") / "models" / "stable_diffusion" / "sd_data"

# Tokenizer is constructed from the local vocabulary and merges files.
tokenizer = CLIPTokenizer(
    str(_SD_DATA_DIR / "tokenizer_vocab.json"),
    merges_file=str(_SD_DATA_DIR / "tokenizer_merges.txt"),
)

# model_file stays a plain string, as it was before, for any code that reads it.
model_file = str(_SD_DATA_DIR / "v1-5-pruned-emaonly.ckpt")

# Load the model weights from the checkpoint onto the selected device.
models = model_loader.preload_models_from_standard_weights(model_file, DEVICE)
## TEXT TO IMAGE
# The prompt is read interactively from standard input at module level.
prompt = input("Enter your prompt: ")
# Example of a fixed prompt that can be used instead of the interactive one:
# prompt = "A cat stretching on the floor, highly detailed, ultra sharp, cinematic, 100mm lens, 8k resolution."
# Forwarded to pipeline.generate as `uncond_prompt`.
# NOTE(review): presumably acts as the negative prompt - confirm in pipeline.
uncond_prompt = ""
# Forwarded to pipeline.generate as `do_cfg` / `cfg_scale`.
# NOTE(review): presumably toggles classifier-free guidance and sets its weight - confirm in pipeline.
do_cfg = True
cfg_scale = 8 # min: 1, max: 14
## IMAGE TO IMAGE
# None means no input image is passed to the pipeline.
input_image = None
# Uncomment the two lines below to enable image-to-image generation.
# image_path = "../images/dog.jpg"
# input_image = Image.open(image_path)
# Higher values mean more noise is added to the input image, so the result will be further from the input image.
strength = 0.9
## SAMPLER
# Forwarded to pipeline.generate as `sampler_name`, `n_inference_steps` and `seed`.
sampler = "ddpm"
num_inference_steps = 50
seed = 42
def run():
    """Generate one image from the module-level settings and save it as a PNG.

    All generation parameters (prompt, guidance settings, sampler, seed,
    loaded models, tokenizer and device) are read from module-level names and
    forwarded to ``pipeline.generate``. The returned array is converted to a
    PIL image and written to ``nanograd/models/stable_diffusion/output/c.png``
    relative to the current working directory.

    Returns:
        None. The saved file path is printed to standard output.
    """
    # Build the output path with pathlib so the separators are correct on
    # every platform; a backslash-separated string is not split into
    # directories on POSIX systems.
    output_path = Path("nanograd") / "models" / "stable_diffusion" / "output" / "c.png"

    # Create the target directory before the expensive generation step, so a
    # missing or unwritable directory fails immediately rather than after
    # the image has been computed.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    output_image = pipeline.generate(
        prompt=prompt,
        uncond_prompt=uncond_prompt,
        input_image=input_image,
        strength=strength,
        do_cfg=do_cfg,
        cfg_scale=cfg_scale,
        sampler_name=sampler,
        n_inference_steps=num_inference_steps,
        seed=seed,
        models=models,
        device=DEVICE,
        idle_device="cpu",
        tokenizer=tokenizer,
    )

    # pipeline.generate's result is handed to Image.fromarray, i.e. it is
    # expected to be array-like image data.
    Image.fromarray(output_image).save(output_path)
    print(f"Image saved as {output_path}")


# Run the generation only when executed as a script, not on import.
if __name__ == "__main__":
    run()