stabilityai/stable-diffusion-xl-base-1.0 · Has anyone tried stabilityai/stable-diffusion-xl-base-1.0 with enable_attention_slicing() with success?

Has anyone successfully used enable_attention_slicing() with this model?
I seem to be having a problem: the generation seems to go fine until it reaches the upsampling step of the VAE decode, which crashes with a CUDA out-of-memory error!
For the record, this is my config:

Python      : 3.11.4 (main, Jul  5 2023, 14:15:25) [GCC 11.2.0]
numpy       : 1.24.3
torch       : 2.2.0+cu118
torchvision : 0.17.0+cu118
diffusers   : 0.26.2
transformers: 4.37.2
matplotlib  : 3.7.1
PIL         : 9.4.0
using stabilityai/stable-diffusion-xl-base-1.0...

And this is the error message I get:

658 prompt = "a beautiful day in a lush forest"
--> 659 result = text2image(prompt)
    660 # doesnt work on the sdxl_turbo model
    661 if 'sdxl-turbo' not in repo_model_name:

File ~/anaconda3/lib/python3.11/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
    112 @functools.wraps(func)
    113 def decorate_context(*args, **kwargs):
    114     with ctx_factory():
--> 115         return func(*args, **kwargs)

File ~/anaconda3/lib/python3.11/site-packages/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py:1297, in StableDiffusionXLPipeline.__call__(self, prompt, prompt_2, height, width, num_inference_steps, timesteps, denoising_end, guidance_scale, negative_prompt, negative_prompt_2, num_images_per_prompt, eta, generator, latents, prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds, ip_adapter_image, output_type, return_dict, cross_attention_kwargs, guidance_rescale, original_size, crops_coords_top_left, target_size, negative_original_size, negative_crops_coords_top_left, negative_target_size, clip_skip, callback_on_step_end, callback_on_step_end_tensor_inputs, **kwargs)
   1294     self.upcast_vae()
   1295     latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
-> 1297 image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
   1299 # cast back to fp16 if needed
   1300 if needs_upcasting:

File ~/anaconda3/lib/python3.11/site-packages/diffusers/utils/accelerate_utils.py:46, in apply_forward_hook.<locals>.wrapper(self, *args, **kwargs)
     44 if hasattr(self, "_hf_hook") and hasattr(self._hf_hook, "pre_forward"):
     45     self._hf_hook.pre_forward(self)
---> 46 return method(self, *args, **kwargs)

File ~/anaconda3/lib/python3.11/site-packages/diffusers/models/autoencoders/autoencoder_kl.py:302, in AutoencoderKL.decode(self, z, return_dict, generator)
    300     decoded = torch.cat(decoded_slices)
    301 else:
--> 302     decoded = self._decode(z).sample
    304 if not return_dict:
    305     return (decoded,)

File ~/anaconda3/lib/python3.11/site-packages/diffusers/models/autoencoders/autoencoder_kl.py:273, in AutoencoderKL._decode(self, z, return_dict)
    270     return self.tiled_decode(z, return_dict=return_dict)
    272 z = self.post_quant_conv(z)
--> 273 dec = self.decoder(z)
    275 if not return_dict:
    276     return (dec,)

File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs)
   1509     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1510 else:
-> 1511     return self._call_impl(*args, **kwargs)

File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1520, in Module._call_impl(self, *args, **kwargs)
   1515 # If we don't have any hooks, we want to skip the rest of the logic in
   1516 # this function, and just call forward.
   1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1518         or _global_backward_pre_hooks or _global_backward_hooks
   1519         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1520     return forward_call(*args, **kwargs)
   1522 try:
   1523     result = None

File ~/anaconda3/lib/python3.11/site-packages/diffusers/models/autoencoders/vae.py:338, in Decoder.forward(self, sample, latent_embeds)
    336     # up
    337     for up_block in self.up_blocks:
--> 338         sample = up_block(sample, latent_embeds)
    340 # post-process
    341 if latent_embeds is None:

File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs)
   1509     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1510 else:
-> 1511     return self._call_impl(*args, **kwargs)

File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1520, in Module._call_impl(self, *args, **kwargs)
   1515 # If we don't have any hooks, we want to skip the rest of the logic in
   1516 # this function, and just call forward.
   1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1518         or _global_backward_pre_hooks or _global_backward_hooks
   1519         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1520     return forward_call(*args, **kwargs)
   1522 try:
   1523     result = None

File ~/anaconda3/lib/python3.11/site-packages/diffusers/models/unets/unet_2d_blocks.py:2619, in UpDecoderBlock2D.forward(self, hidden_states, temb, scale)
   2617 if self.upsamplers is not None:
   2618     for upsampler in self.upsamplers:
-> 2619         hidden_states = upsampler(hidden_states)
   2621 return hidden_states

File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs)
   1509     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1510 else:
-> 1511     return self._call_impl(*args, **kwargs)

File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1520, in Module._call_impl(self, *args, **kwargs)
   1515 # If we don't have any hooks, we want to skip the rest of the logic in
   1516 # this function, and just call forward.
   1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1518         or _global_backward_pre_hooks or _global_backward_hooks
   1519         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1520     return forward_call(*args, **kwargs)
   1522 try:
   1523     result = None

File ~/anaconda3/lib/python3.11/site-packages/diffusers/models/upsampling.py:172, in Upsample2D.forward(self, hidden_states, output_size, scale)
    170 if self.interpolate:
    171     if output_size is None:
--> 172         hidden_states = F.interpolate(hidden_states, scale_factor=2.0, mode="nearest")
    173     else:
    174         hidden_states = F.interpolate(hidden_states, size=output_size, mode="nearest")

File ~/anaconda3/lib/python3.11/site-packages/torch/nn/functional.py:4001, in interpolate(input, size, scale_factor, mode, align_corners, recompute_scale_factor, antialias)
   3999     return torch._C._nn.upsample_nearest1d(input, output_size, scale_factors)
   4000 if input.dim() == 4 and mode == "nearest":
-> 4001     return torch._C._nn.upsample_nearest2d(input, output_size, scale_factors)
   4002 if input.dim() == 5 and mode == "nearest":
   4003     return torch._C._nn.upsample_nearest3d(input, output_size, scale_factors)

OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU 0 has a total capacity of 9.77 GiB of which 965.69 MiB is free. Including non-PyTorch memory, this process has 8.29 GiB memory in use. Of the allocated memory 7.47 GiB is allocated by PyTorch, and 553.85 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

Thanks a lot in advance

stabilityai
/

stable-diffusion-xl-base-1.0

Has anyone tried stabilityai/stable-diffusion-xl-base-1.0 with enable_attention_slicing() with success?