Has anyone tried stabilityai/stable-diffusion-xl-base-1.0 with enable_attention_slicing() with success?
#149
by
coderx7
- opened
Has anyone successfully used enable_attention_slicing() with this model?
I seem to be having a problem: the generation seems to go fine until it reaches upsampling, where it crashes with a CUDA out-of-memory error!
For the record this is my config:
Python : 3.11.4 (main, Jul 5 2023, 14:15:25) [GCC 11.2.0]
numpy : 1.24.3
torch : 2.2.0+cu118
torchvision : 0.17.0+cu118
diffusers : 0.26.2
transformers: 4.37.2
matplotlib : 3.7.1
PIL : 9.4.0
using stabilityai/stable-diffusion-xl-base-1.0...
And this is the error message I get:
658 prompt = "a beautiful day in a lush forest"
--> 659 result = text2image(prompt)
660 # doesnt work on the sdxl_turbo model
661 if 'sdxl-turbo' not in repo_model_name:
File ~/anaconda3/lib/python3.11/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)
File ~/anaconda3/lib/python3.11/site-packages/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py:1297, in StableDiffusionXLPipeline.__call__(self, prompt, prompt_2, height, width, num_inference_steps, timesteps, denoising_end, guidance_scale, negative_prompt, negative_prompt_2, num_images_per_prompt, eta, generator, latents, prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds, ip_adapter_image, output_type, return_dict, cross_attention_kwargs, guidance_rescale, original_size, crops_coords_top_left, target_size, negative_original_size, negative_crops_coords_top_left, negative_target_size, clip_skip, callback_on_step_end, callback_on_step_end_tensor_inputs, **kwargs)
1294 self.upcast_vae()
1295 latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
-> 1297 image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
1299 # cast back to fp16 if needed
1300 if needs_upcasting:
File ~/anaconda3/lib/python3.11/site-packages/diffusers/utils/accelerate_utils.py:46, in apply_forward_hook.<locals>.wrapper(self, *args, **kwargs)
44 if hasattr(self, "_hf_hook") and hasattr(self._hf_hook, "pre_forward"):
45 self._hf_hook.pre_forward(self)
---> 46 return method(self, *args, **kwargs)
File ~/anaconda3/lib/python3.11/site-packages/diffusers/models/autoencoders/autoencoder_kl.py:302, in AutoencoderKL.decode(self, z, return_dict, generator)
300 decoded = torch.cat(decoded_slices)
301 else:
--> 302 decoded = self._decode(z).sample
304 if not return_dict:
305 return (decoded,)
File ~/anaconda3/lib/python3.11/site-packages/diffusers/models/autoencoders/autoencoder_kl.py:273, in AutoencoderKL._decode(self, z, return_dict)
270 return self.tiled_decode(z, return_dict=return_dict)
272 z = self.post_quant_conv(z)
--> 273 dec = self.decoder(z)
275 if not return_dict:
276 return (dec,)
File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs)
1509 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1510 else:
-> 1511 return self._call_impl(*args, **kwargs)
File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1520, in Module._call_impl(self, *args, **kwargs)
1515 # If we don't have any hooks, we want to skip the rest of the logic in
1516 # this function, and just call forward.
1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1518 or _global_backward_pre_hooks or _global_backward_hooks
1519 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1520 return forward_call(*args, **kwargs)
1522 try:
1523 result = None
File ~/anaconda3/lib/python3.11/site-packages/diffusers/models/autoencoders/vae.py:338, in Decoder.forward(self, sample, latent_embeds)
336 # up
337 for up_block in self.up_blocks:
--> 338 sample = up_block(sample, latent_embeds)
340 # post-process
341 if latent_embeds is None:
File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs)
1509 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1510 else:
-> 1511 return self._call_impl(*args, **kwargs)
File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1520, in Module._call_impl(self, *args, **kwargs)
1515 # If we don't have any hooks, we want to skip the rest of the logic in
1516 # this function, and just call forward.
1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1518 or _global_backward_pre_hooks or _global_backward_hooks
1519 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1520 return forward_call(*args, **kwargs)
1522 try:
1523 result = None
File ~/anaconda3/lib/python3.11/site-packages/diffusers/models/unets/unet_2d_blocks.py:2619, in UpDecoderBlock2D.forward(self, hidden_states, temb, scale)
2617 if self.upsamplers is not None:
2618 for upsampler in self.upsamplers:
-> 2619 hidden_states = upsampler(hidden_states)
2621 return hidden_states
File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs)
1509 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1510 else:
-> 1511 return self._call_impl(*args, **kwargs)
File ~/anaconda3/lib/python3.11/site-packages/torch/nn/modules/module.py:1520, in Module._call_impl(self, *args, **kwargs)
1515 # If we don't have any hooks, we want to skip the rest of the logic in
1516 # this function, and just call forward.
1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1518 or _global_backward_pre_hooks or _global_backward_hooks
1519 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1520 return forward_call(*args, **kwargs)
1522 try:
1523 result = None
File ~/anaconda3/lib/python3.11/site-packages/diffusers/models/upsampling.py:172, in Upsample2D.forward(self, hidden_states, output_size, scale)
170 if self.interpolate:
171 if output_size is None:
--> 172 hidden_states = F.interpolate(hidden_states, scale_factor=2.0, mode="nearest")
173 else:
174 hidden_states = F.interpolate(hidden_states, size=output_size, mode="nearest")
File ~/anaconda3/lib/python3.11/site-packages/torch/nn/functional.py:4001, in interpolate(input, size, scale_factor, mode, align_corners, recompute_scale_factor, antialias)
3999 return torch._C._nn.upsample_nearest1d(input, output_size, scale_factors)
4000 if input.dim() == 4 and mode == "nearest":
-> 4001 return torch._C._nn.upsample_nearest2d(input, output_size, scale_factors)
4002 if input.dim() == 5 and mode == "nearest":
4003 return torch._C._nn.upsample_nearest3d(input, output_size, scale_factors)
OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU 0 has a total capacity of 9.77 GiB of which 965.69 MiB is free. Including non-PyTorch memory, this process has 8.29 GiB memory in use. Of the allocated memory 7.47 GiB is allocated by PyTorch, and 553.85 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Thanks a lot in advance