how to use in cpu only

#33
by jtc1246 - opened

how to use in cpu only

unfortunately, this model is too big for inference on cpu. Even most gpus are not enough

Have to disagree with @Aplenty888 , if you have at least 32GB RAM. I ran on CPU all day yesterday before I realized I could do sequential CPU-offloading and run way faster using GPU (but lots of shuffling model components back and forth) - note the commented out #pipeline.enable_sequential_cpu_offload() line below. Anyway:

from diffusers import FluxPipeline, FluxTransformer2DModel
import torch
import os

# Configuration
MODEL_NAME_OR_DIR = "black-forest-labs/FLUX.1-dev"
IMAGE_OUTPUT_DIR = "./outputs/10_1"
# The prefix already ends with "_"; the image index is appended directly to it.
IMAGE_PREFIX = "sign_"
DEVICE = torch.device("cpu")
# NOTE: this flag is not read by the script as written. Offloading is toggled
# by un-commenting the pipeline.enable_sequential_cpu_offload() line below.
USE_CPU_OFFLOAD = True
# Seed for the random generator. The generator used to be hard-coded to 42
# while this constant (then 0) was ignored; 42 keeps the output unchanged.
SEED = 42
IMAGE_WIDTH = 1024
IMAGE_HEIGHT = 768
NUM_STEPS = 30
NUM_IMAGES = 4
CFG = 3.5
PROMPT = 'photo of a man on a beach holding a sign that says "Premature optimization is the root of all evil - test your shit!"'

# Create the output directory up front so a missing folder is not discovered
# only after the (very slow) CPU generation has finished.
os.makedirs(IMAGE_OUTPUT_DIR, exist_ok=True)

print("Loading model...")
# NOTE(review): Hub pipeline repos usually keep the transformer weights under
# subfolder="transformer" - confirm this call resolves without it.
transformer = FluxTransformer2DModel.from_pretrained(
    MODEL_NAME_OR_DIR,
    torch_dtype=torch.bfloat16,
    use_safetensors=True,
)
print("Creating pipeline...")
# The pipeline is loaded from MODEL_NAME_OR_DIR (previously a duplicated
# literal), so switching models only requires editing the constant above.
# NOTE(review): local_dir / local_dir_use_symlinks / ignore_patterns look like
# huggingface_hub download options - confirm from_pretrained honours them.
pipeline = FluxPipeline.from_pretrained(
    MODEL_NAME_OR_DIR,
    transformer=transformer,
    torch_dtype=torch.bfloat16,
    use_safetensors=True,
    local_dir="./models/dev/",
    local_dir_use_symlinks=False,
    ignore_patterns=["flux1-dev.sft", "flux1-dev.safetensors"],
).to(DEVICE)
#pipeline.enable_sequential_cpu_offload()
print("Generating image...")
# Params:
# prompt - The prompt or prompts to guide the image generation. If not defined, one has to pass prompt_embeds instead.
# prompt_2 - The prompt or prompts to be sent to tokenizer_2 and text_encoder_2. If not defined, prompt will be used instead.
# height - The height in pixels of the generated image. This is set to 1024 by default for the best results.
# width - The width in pixels of the generated image. This is set to 1024 by default for the best results.
# num_inference_steps - The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference.
# timesteps - Custom timesteps to use for the denoising process with schedulers which support a timesteps argument in their set_timesteps method. If not defined, the default behavior when num_inference_steps is passed will be used. Must be in descending order.
# guidance_scale - Guidance scale as defined in Classifier-Free Diffusion Guidance (https://arxiv.org/abs/2207.12598). guidance_scale is defined as w of equation 2 of the Imagen paper (https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting guidance_scale > 1. Higher guidance scale encourages generating images that are closely linked to the text prompt, usually at the expense of lower image quality.
# num_images_per_prompt - The number of images to generate per prompt.
# generator - One or a list of torch generator(s) (https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic.
# latents - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will be generated by sampling using the supplied random generator.
# prompt_embeds - Pre-generated text embeddings. Can be used to easily tweak text inputs, e.g. prompt weighting. If not provided, text embeddings will be generated from the prompt input argument.
# pooled_prompt_embeds - Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, e.g. prompt weighting. If not provided, pooled text embeddings will be generated from the prompt input argument.
# output_type - The output format of the generated image. Choose between PIL (https://pillow.readthedocs.io/en/stable/): PIL.Image.Image or np.array.
# return_dict - Whether or not to return a ~pipelines.flux.FluxPipelineOutput instead of a plain tuple.
# joint_attention_kwargs - A kwargs dictionary that, if specified, is passed along to the AttentionProcessor as defined under self.processor in diffusers.models.attention_processor (https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
# callback_on_step_end - A function that is called at the end of each denoising step during inference. The function is called with the following arguments: callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict). callback_kwargs will include a list of all tensors as specified by callback_on_step_end_tensor_inputs.
# callback_on_step_end_tensor_inputs - The list of tensor inputs for the callback_on_step_end function. The tensors specified in the list will be passed as the callback_kwargs argument. You will only be able to include variables listed in the ._callback_tensor_inputs attribute of your pipeline class.
# max_sequence_length - Maximum sequence length to use with the prompt.
# Returns:
# ~pipelines.flux.FluxPipelineOutput if return_dict is True, otherwise a tuple. When returning a tuple, the first element is a list with the generated images.
images = pipeline(
    prompt=PROMPT,
    guidance_scale=CFG,
    num_inference_steps=NUM_STEPS,
    height=IMAGE_HEIGHT,
    width=IMAGE_WIDTH,
    max_sequence_length=512,
    generator=torch.manual_seed(SEED),
    num_images_per_prompt=NUM_IMAGES,
).images
for i, image in enumerate(images):
    print("Saving image...")
    # IMAGE_PREFIX already carries the trailing "_", so no extra separator.
    path = os.path.join(IMAGE_OUTPUT_DIR, f"{IMAGE_PREFIX}{i}.png")
    image.save(path)
print("Done.")

By any chance, would you have the same kind of code, but for Flux-schnell?
Thank you!

@shokru In the code, just replace MODEL_NAME_OR_DIR = "black-forest-labs/FLUX.1-dev" with MODEL_NAME_OR_DIR = "black-forest-labs/FLUX.1-schnell".
Also change NUM_STEPS to a value from 1 to 8 (more is better but slower) and change CFG to 0.

All right, I managed to make it work with the code below.
... but it took 2 hours to generate just 4 images; so even though it's possible on CPU, it's quite long.
(the images were quite good to be honest)

from diffusers import FluxPipeline, FluxTransformer2DModel
import torch

import os

# Configuration
MODEL_NAME_OR_DIR = "black-forest-labs/FLUX.1-schnell"
IMAGE_OUTPUT_DIR = "./outputs/10_1"
# The prefix already ends with "_"; the image index is appended directly to it.
IMAGE_PREFIX = "sign_"
DEVICE = torch.device("cpu")
# NOTE: this flag is not read anywhere in this script; the pipeline is simply
# moved to DEVICE and pipeline.enable_sequential_cpu_offload() is never called.
USE_CPU_OFFLOAD = True
SEED = 42
IMAGE_WIDTH = 1024
IMAGE_HEIGHT = 768
NUM_STEPS = 8
NUM_IMAGES = 4
CFG = 0
PROMPT = 'A fierce fire ram in the night with a full moon behind.'

# Create the output directory before the slow generation step so that a
# missing folder cannot waste hours of CPU time.
os.makedirs(IMAGE_OUTPUT_DIR, exist_ok=True)

# Load from MODEL_NAME_OR_DIR (previously a duplicated literal) so the
# constant above is the single place to switch models.
# NOTE(review): local_dir / local_dir_use_symlinks / ignore_patterns look like
# huggingface_hub download options - confirm from_pretrained honours them.
pipeline = FluxPipeline.from_pretrained(
    MODEL_NAME_OR_DIR,
    torch_dtype=torch.float32,
    use_safetensors=True,
    local_dir="./models/schnell/",
    local_dir_use_symlinks=False,
    ignore_patterns=["flux1-schnell.sft", "flux1-schnell.safetensors"],
).to(DEVICE)

images = pipeline(
    prompt=PROMPT,
    guidance_scale=CFG,
    num_inference_steps=NUM_STEPS,
    height=IMAGE_HEIGHT,
    width=IMAGE_WIDTH,
    max_sequence_length=512,
    # Seed comes from SEED (same value as the former hard-coded 42).
    generator=torch.manual_seed(SEED),
    num_images_per_prompt=NUM_IMAGES,
).images

# Persist the results: IMAGE_OUTPUT_DIR / IMAGE_PREFIX were configured but the
# generated images were previously never written to disk.
for i, image in enumerate(images):
    image.save(os.path.join(IMAGE_OUTPUT_DIR, f"{IMAGE_PREFIX}{i}.png"))

@shokru Yeah, it will take a very, very long time on CPU. Even weak GPUs are usually better than the best CPUs for running models. I would honestly recommend the nf4v2 quantized version of the model, which I believe uses about 8 GB of VRAM and speeds up generation a bit. You can save further VRAM by using https://huggingface.co/madebyollin/taef1, which is basically a much smaller replacement for the Flux VAE. This will save maybe another 1-2 GB of VRAM.

If you don't have a GPU with 8 GB of VRAM, just use something like Kaggle or Colab for free GPUs.

What about GPUs outside NVIDIA/CUDA?
Quite frustrating for mac users...

@shokru For Macs you can use this model: https://huggingface.co/argmaxinc/mlx-FLUX.1-schnell-4bit-quantized
It also seems to provide example code.

from diffusers import FluxPipeline, FluxTransformer2DModel
import torch

# Configuration

MODEL_NAME_OR_DIR = "black-forest-labs/FLUX.1-schnell"
IMAGE_OUTPUT_DIR = "/content/1"
IMAGE_PREFIX = "sign_"
DEVICE = torch.device("cpu")

# If True, uses pipeline.enable_sequential_cpu_offload(). Make sure device is CPU.
# NOTE: this flag is not read anywhere in this script; the pipeline is simply
# moved to DEVICE.

USE_CPU_OFFLOAD = True
SEED = 42
IMAGE_WIDTH = 1024
IMAGE_HEIGHT = 768
NUM_STEPS = 8
NUM_IMAGES = 1
CFG = 0
PROMPT = 'A fierce fire ram in the night with a full moon behind.'

# Load from MODEL_NAME_OR_DIR (previously a duplicated literal) so the
# constant above is the single place to switch models.
# NOTE(review): local_dir / local_dir_use_symlinks / ignore_patterns look like
# huggingface_hub download options - confirm from_pretrained honours them.
pipeline = FluxPipeline.from_pretrained(
    MODEL_NAME_OR_DIR,
    torch_dtype=torch.float32,
    use_safetensors=True,
    local_dir="/content/1",
    local_dir_use_symlinks=False,
    ignore_patterns=["flux1-schnell.sft", "flux1-schnell.safetensors"],
).to(DEVICE)

images = pipeline(
    prompt=PROMPT,
    guidance_scale=CFG,
    num_inference_steps=NUM_STEPS,
    height=IMAGE_HEIGHT,
    width=IMAGE_WIDTH,
    max_sequence_length=512,
    # Seed comes from SEED (same value as the former hard-coded 42).
    generator=torch.manual_seed(SEED),
    num_images_per_prompt=NUM_IMAGES,
).images
# Show the first generated image inline in the notebook.
from IPython.display import display
display(images[0])

https://colab.research.google.com/drive/1WOQu-QkBTGN33IzVn2qG_Hd3tIbSHnVf#scrollTo=cp4KHpuHAVYo
download.png

Screenshot 2024-09-21 172131.png

from folder

from diffusers import FluxPipeline
import torch
from IPython.display import display

# Settings

# Change this path to the local folder that contains the model files.
LOCAL_MODEL_PATH = "/content/1/huggingface/hub/models--black-forest-labs--FLUX.1-schnell/snapshots/741f7c3ce8b383c54771c7003378a50191e9efe9"
IMAGE_OUTPUT_DIR = "/content/1"
IMAGE_PREFIX = "sign_"
DEVICE = torch.device("cpu")
SEED = 42
IMAGE_WIDTH = 1024
IMAGE_HEIGHT = 768
NUM_STEPS = 8
NUM_IMAGES = 1
CFG = 0
PROMPT = 'teen girl with teen boy and cat with dog'

# Load the model from the local folder.

pipeline = FluxPipeline.from_pretrained(
    LOCAL_MODEL_PATH,
    torch_dtype=torch.float32,
    use_safetensors=True,
    local_files_only=True,  # ensures that only local files are used
    device_map="balanced",  # use the "balanced" strategy instead of "auto"
)

# Move the model to the CPU if it is not there already.
# NOTE(review): combining device_map with an explicit .to() may be rejected by
# some diffusers versions - confirm, and drop one of the two if so.

pipeline = pipeline.to(DEVICE)

# Generate the images.

images = pipeline(
    prompt=PROMPT,
    guidance_scale=CFG,
    num_inference_steps=NUM_STEPS,
    height=IMAGE_HEIGHT,
    width=IMAGE_WIDTH,
    max_sequence_length=512,
    generator=torch.manual_seed(SEED),
    num_images_per_prompt=NUM_IMAGES,
).images

# Display the first image.

display(images[0])

# You can also save the images if you wish.

for i, image in enumerate(images):
    image.save(f"{IMAGE_OUTPUT_DIR}/{IMAGE_PREFIX}{i}.png")

# Print the library versions for verification.
# The version string lives in __version__; the double underscores had been
# stripped, leaving torch.version (a module) and diffusers.version.

print(f"PyTorch version: {torch.__version__}")
import diffusers
print(f"Diffusers version: {diffusers.__version__}")

Sign up or log in to comment