ginipick committed on
Commit
29f4edc
1 Parent(s): baa12c4

Update app.py

Files changed (1)
  1. app.py +1 -1729
app.py CHANGED
@@ -1,1731 +1,3 @@
1
- # Copyright 2023 The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- import gradio as gr
16
- import spaces
17
-
18
- import argparse
19
- import inspect
20
  import os
21
- from typing import Any, Callable, Dict, List, Optional, Tuple, Union
22
- import matplotlib.pyplot as plt
23
- from PIL import Image
24
-
25
- import torch
26
- import torch.nn.functional as F
27
- import numpy as np
28
- import random
29
- import warnings
30
- from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
31
- from utils import *
32
- import hashlib
33
-
34
- from diffusers.image_processor import VaeImageProcessor
35
- from diffusers.loaders import (
36
- FromSingleFileMixin,
37
- LoraLoaderMixin,
38
- TextualInversionLoaderMixin,
39
- )
40
- from diffusers.models import AutoencoderKL, UNet2DConditionModel
41
- from diffusers.models.attention_processor import (
42
- AttnProcessor2_0,
43
- LoRAAttnProcessor2_0,
44
- LoRAXFormersAttnProcessor,
45
- XFormersAttnProcessor,
46
- )
47
- from diffusers.models.lora import adjust_lora_scale_text_encoder
48
- from diffusers.schedulers import KarrasDiffusionSchedulers
49
- from diffusers.utils import (
50
- is_accelerate_available,
51
- is_accelerate_version,
52
- is_invisible_watermark_available,
53
- logging,
54
- replace_example_docstring,
55
- )
56
- from diffusers.utils.torch_utils import randn_tensor
57
- from diffusers.pipelines.pipeline_utils import DiffusionPipeline
58
- from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput
59
- from accelerate.utils import set_seed
60
- from tqdm import tqdm
61
- if is_invisible_watermark_available():
62
- from .watermark import StableDiffusionXLWatermarker
63
-
64
- logger = logging.get_logger(__name__) # pylint: disable=invalid-name
65
-
66
- EXAMPLE_DOC_STRING = """
67
- Examples:
68
- ```py
69
- >>> import torch
70
- >>> from diffusers import StableDiffusionXLPipeline
71
-
72
- >>> pipe = StableDiffusionXLPipeline.from_pretrained(
73
- ... "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
74
- ... )
75
- >>> pipe = pipe.to("cuda")
76
-
77
- >>> prompt = "a photo of an astronaut riding a horse on mars"
78
- >>> image = pipe(prompt).images[0]
79
- ```
80
- """
81
-
82
-
83
-
84
- def gaussian_kernel(kernel_size=3, sigma=1.0, channels=3):
85
- x_coord = torch.arange(kernel_size)
86
- gaussian_1d = torch.exp(-(x_coord - (kernel_size - 1) / 2) ** 2 / (2 * sigma ** 2))
87
- gaussian_1d = gaussian_1d / gaussian_1d.sum()
88
- gaussian_2d = gaussian_1d[:, None] * gaussian_1d[None, :]
89
- kernel = gaussian_2d[None, None, :, :].repeat(channels, 1, 1, 1)
90
-
91
- return kernel
92
-
93
- def gaussian_filter(latents, kernel_size=3, sigma=1.0):
94
- channels = latents.shape[1]
95
- kernel = gaussian_kernel(kernel_size, sigma, channels).to(latents.device, latents.dtype)
96
- blurred_latents = F.conv2d(latents, kernel, padding=kernel_size//2, groups=channels)
97
- return blurred_latents
98
-
99
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
100
- def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
101
- """
102
- Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
103
- Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
104
- """
105
- std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
106
- std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
107
- # rescale the results from guidance (fixes overexposure)
108
- noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
109
- # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
110
- noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
111
- return noise_cfg
112
-
113
-
114
- class AccDiffusionSDXLPipeline(DiffusionPipeline, FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin):
115
- """
116
- Pipeline for text-to-image generation using Stable Diffusion XL.
117
-
118
- This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
119
- library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
120
-
121
- In addition the pipeline inherits the following loading methods:
122
- - *LoRA*: [`StableDiffusionXLPipeline.load_lora_weights`]
123
- - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`]
124
-
125
- as well as the following saving methods:
126
- - *LoRA*: [`loaders.StableDiffusionXLPipeline.save_lora_weights`]
127
-
128
- Args:
129
- vae ([`AutoencoderKL`]):
130
- Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
131
- text_encoder ([`CLIPTextModel`]):
132
- Frozen text-encoder. Stable Diffusion XL uses the text portion of
133
- [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
134
- the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
135
- text_encoder_2 ([`CLIPTextModelWithProjection`]):
136
- Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
137
- [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
138
- specifically the
139
- [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
140
- variant.
141
- tokenizer (`CLIPTokenizer`):
142
- Tokenizer of class
143
- [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
144
- tokenizer_2 (`CLIPTokenizer`):
145
- Second Tokenizer of class
146
- [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
147
- unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
148
- scheduler ([`SchedulerMixin`]):
149
- A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
150
- [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
151
- force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
152
- Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
153
- `stabilityai/stable-diffusion-xl-base-1-0`.
154
- add_watermarker (`bool`, *optional*):
155
- Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to
156
- watermark output images. If not defined, it will default to True if the package is installed, otherwise no
157
- watermarker will be used.
158
- """
159
- model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
160
-
161
- def __init__(
162
- self,
163
- vae: AutoencoderKL,
164
- text_encoder: CLIPTextModel,
165
- text_encoder_2: CLIPTextModelWithProjection,
166
- tokenizer: CLIPTokenizer,
167
- tokenizer_2: CLIPTokenizer,
168
- unet: UNet2DConditionModel,
169
- scheduler: KarrasDiffusionSchedulers,
170
- force_zeros_for_empty_prompt: bool = True,
171
- add_watermarker: Optional[bool] = None,
172
- ):
173
- super().__init__()
174
-
175
- self.register_modules(
176
- vae=vae,
177
- text_encoder=text_encoder,
178
- text_encoder_2=text_encoder_2,
179
- tokenizer=tokenizer,
180
- tokenizer_2=tokenizer_2,
181
- unet=unet,
182
- scheduler=scheduler,
183
- )
184
- self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
185
- self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
186
- self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
187
- self.default_sample_size = self.unet.config.sample_size
188
-
189
- add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
190
-
191
- if add_watermarker:
192
- self.watermark = StableDiffusionXLWatermarker()
193
- else:
194
- self.watermark = None
195
-
196
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
197
- def enable_vae_slicing(self):
198
- r"""
199
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
200
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
201
- """
202
- self.vae.enable_slicing()
203
-
204
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
205
- def disable_vae_slicing(self):
206
- r"""
207
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
208
- computing decoding in one step.
209
- """
210
- self.vae.disable_slicing()
211
-
212
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
213
- def enable_vae_tiling(self):
214
- r"""
215
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
216
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
217
- processing larger images.
218
- """
219
- self.vae.enable_tiling()
220
-
221
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
222
- def disable_vae_tiling(self):
223
- r"""
224
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
225
- computing decoding in one step.
226
- """
227
- self.vae.disable_tiling()
228
-
229
- def encode_prompt(
230
- self,
231
- prompt: str,
232
- prompt_2: Optional[str] = None,
233
- device: Optional[torch.device] = None,
234
- num_images_per_prompt: int = 1,
235
- do_classifier_free_guidance: bool = True,
236
- negative_prompt: Optional[str] = None,
237
- negative_prompt_2: Optional[str] = None,
238
- prompt_embeds: Optional[torch.FloatTensor] = None,
239
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
240
- pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
241
- negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
242
- lora_scale: Optional[float] = None,
243
- ):
244
- r"""
245
- Encodes the prompt into text encoder hidden states.
246
-
247
- Args:
248
- prompt (`str` or `List[str]`, *optional*):
249
- prompt to be encoded
250
- prompt_2 (`str` or `List[str]`, *optional*):
251
- The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
252
- used in both text-encoders
253
- device: (`torch.device`):
254
- torch device
255
- num_images_per_prompt (`int`):
256
- number of images that should be generated per prompt
257
- do_classifier_free_guidance (`bool`):
258
- whether to use classifier free guidance or not
259
- negative_prompt (`str` or `List[str]`, *optional*):
260
- The prompt or prompts not to guide the image generation. If not defined, one has to pass
261
- `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
262
- less than `1`).
263
- negative_prompt_2 (`str` or `List[str]`, *optional*):
264
- The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
265
- `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
266
- prompt_embeds (`torch.FloatTensor`, *optional*):
267
- Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
268
- provided, text embeddings will be generated from `prompt` input argument.
269
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
270
- Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
271
- weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
272
- argument.
273
- pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
274
- Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
275
- If not provided, pooled text embeddings will be generated from `prompt` input argument.
276
- negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
277
- Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
278
- weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
279
- input argument.
280
- lora_scale (`float`, *optional*):
281
- A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
282
- """
283
- device = device or self._execution_device
284
-
285
- # set lora scale so that monkey patched LoRA
286
- # function of text encoder can correctly access it
287
- if lora_scale is not None and isinstance(self, LoraLoaderMixin):
288
- self._lora_scale = lora_scale
289
-
290
- # dynamically adjust the LoRA scale
291
- adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
292
- adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
293
-
294
- if prompt is not None and isinstance(prompt, str):
295
- batch_size = 1
296
- elif prompt is not None and isinstance(prompt, list):
297
- batch_size = len(prompt)
298
- else:
299
- batch_size = prompt_embeds.shape[0]
300
-
301
- # Define tokenizers and text encoders
302
- tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
303
- text_encoders = (
304
- [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
305
- )
306
-
307
- if prompt_embeds is None:
308
- prompt_2 = prompt_2 or prompt
309
- # textual inversion: process multi-vector tokens if necessary
310
- prompt_embeds_list = []
311
- prompts = [prompt, prompt_2]
312
- for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
313
- if isinstance(self, TextualInversionLoaderMixin):
314
- prompt = self.maybe_convert_prompt(prompt, tokenizer)
315
-
316
- text_inputs = tokenizer(
317
- prompt,
318
- padding="max_length",
319
- max_length=tokenizer.model_max_length,
320
- truncation=True,
321
- return_tensors="pt",
322
- )
323
-
324
- text_input_ids = text_inputs.input_ids
325
- untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
326
-
327
- if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
328
- text_input_ids, untruncated_ids
329
- ):
330
- removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
331
- logger.warning(
332
- "The following part of your input was truncated because CLIP can only handle sequences up to"
333
- f" {tokenizer.model_max_length} tokens: {removed_text}"
334
- )
335
-
336
- prompt_embeds = text_encoder(
337
- text_input_ids.to(device),
338
- output_hidden_states=True,
339
- )
340
-
341
- # We are only ever interested in the pooled output of the final text encoder
342
- pooled_prompt_embeds = prompt_embeds[0]
343
- prompt_embeds = prompt_embeds.hidden_states[-2]
344
-
345
- prompt_embeds_list.append(prompt_embeds)
346
-
347
- prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
348
-
349
- # get unconditional embeddings for classifier free guidance
350
- zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
351
- if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
352
- negative_prompt_embeds = torch.zeros_like(prompt_embeds)
353
- negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
354
- elif do_classifier_free_guidance and negative_prompt_embeds is None:
355
- negative_prompt = negative_prompt or ""
356
- negative_prompt_2 = negative_prompt_2 or negative_prompt
357
-
358
- uncond_tokens: List[str]
359
- if prompt is not None and type(prompt) is not type(negative_prompt):
360
- raise TypeError(
361
- f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !="
362
- f" {type(prompt)}."
363
- )
364
- elif isinstance(negative_prompt, str):
365
- uncond_tokens = [negative_prompt, negative_prompt_2]
366
- elif batch_size != len(negative_prompt):
367
- raise ValueError(
368
- f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
369
- f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
370
- " the batch size of `prompt`."
371
- )
372
- else:
373
- uncond_tokens = [negative_prompt, negative_prompt_2]
374
-
375
- negative_prompt_embeds_list = []
376
- for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
377
- if isinstance(self, TextualInversionLoaderMixin):
378
- negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer)
379
-
380
- max_length = prompt_embeds.shape[1]
381
- uncond_input = tokenizer(
382
- negative_prompt,
383
- padding="max_length",
384
- max_length=max_length,
385
- truncation=True,
386
- return_tensors="pt",
387
- )
388
-
389
- negative_prompt_embeds = text_encoder(
390
- uncond_input.input_ids.to(device),
391
- output_hidden_states=True,
392
- )
393
- # We are only ever interested in the pooled output of the final text encoder
394
- negative_pooled_prompt_embeds = negative_prompt_embeds[0]
395
- negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
396
-
397
- negative_prompt_embeds_list.append(negative_prompt_embeds)
398
-
399
- negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
400
-
401
- prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
402
- bs_embed, seq_len, _ = prompt_embeds.shape
403
- # duplicate text embeddings for each generation per prompt, using mps friendly method
404
- prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
405
- prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
406
-
407
- if do_classifier_free_guidance:
408
- # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
409
- seq_len = negative_prompt_embeds.shape[1]
410
- negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
411
- negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
412
- negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
413
-
414
- pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
415
- bs_embed * num_images_per_prompt, -1
416
- )
417
- if do_classifier_free_guidance:
418
- negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
419
- bs_embed * num_images_per_prompt, -1
420
- )
421
-
422
- return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
423
-
424
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
425
- def prepare_extra_step_kwargs(self, generator, eta):
426
- # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
427
- # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
428
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
429
- # and should be between [0, 1]
430
-
431
- accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
432
- extra_step_kwargs = {}
433
- if accepts_eta:
434
- extra_step_kwargs["eta"] = eta
435
-
436
- # check if the scheduler accepts generator
437
- accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
438
- if accepts_generator:
439
- extra_step_kwargs["generator"] = generator
440
- return extra_step_kwargs
441
-
442
- def check_inputs(
443
- self,
444
- prompt,
445
- prompt_2,
446
- height,
447
- width,
448
- callback_steps,
449
- negative_prompt=None,
450
- negative_prompt_2=None,
451
- prompt_embeds=None,
452
- negative_prompt_embeds=None,
453
- pooled_prompt_embeds=None,
454
- negative_pooled_prompt_embeds=None,
455
- num_images_per_prompt=None,
456
- ):
457
- if height % 8 != 0 or width % 8 != 0:
458
- raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
459
-
460
- if (callback_steps is None) or (
461
- callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
462
- ):
463
- raise ValueError(
464
- f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
465
- f" {type(callback_steps)}."
466
- )
467
-
468
- if prompt is not None and prompt_embeds is not None:
469
- raise ValueError(
470
- f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
471
- " only forward one of the two."
472
- )
473
- elif prompt_2 is not None and prompt_embeds is not None:
474
- raise ValueError(
475
- f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
476
- " only forward one of the two."
477
- )
478
- elif prompt is None and prompt_embeds is None:
479
- raise ValueError(
480
- "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
481
- )
482
- elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
483
- raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
484
- elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
485
- raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
486
-
487
- if negative_prompt is not None and negative_prompt_embeds is not None:
488
- raise ValueError(
489
- f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
490
- f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
491
- )
492
- elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
493
- raise ValueError(
494
- f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
495
- f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
496
- )
497
-
498
- if prompt_embeds is not None and negative_prompt_embeds is not None:
499
- if prompt_embeds.shape != negative_prompt_embeds.shape:
500
- raise ValueError(
501
- "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
502
- f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
503
- f" {negative_prompt_embeds.shape}."
504
- )
505
-
506
- if prompt_embeds is not None and pooled_prompt_embeds is None:
507
- raise ValueError(
508
- "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
509
- )
510
-
511
- if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
512
- raise ValueError(
513
- "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
514
- )
515
-
516
- if max(height, width) % 1024 != 0:
517
- raise ValueError(f"the larger one of `height` and `width` has to be divisible by 1024 but are {height} and {width}.")
518
-
519
- if num_images_per_prompt != 1:
520
- warnings.warn("num_images_per_prompt != 1 is not supported by AccDiffusion and will be ignored.")
521
- num_images_per_prompt = 1
522
-
523
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
524
- def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
525
- shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
526
- if isinstance(generator, list) and len(generator) != batch_size:
527
- raise ValueError(
528
- f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
529
- f" size of {batch_size}. Make sure the batch size matches the length of the generators."
530
- )
531
-
532
- if latents is None:
533
- latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
534
- else:
535
- latents = latents.to(device)
536
-
537
- # scale the initial noise by the standard deviation required by the scheduler
538
- latents = latents * self.scheduler.init_noise_sigma
539
- return latents
540
-
541
- def _get_add_time_ids(self, original_size, crops_coords_top_left, target_size, dtype):
542
- add_time_ids = list(original_size + crops_coords_top_left + target_size)
543
-
544
- passed_add_embed_dim = (
545
- self.unet.config.addition_time_embed_dim * len(add_time_ids) + self.text_encoder_2.config.projection_dim
546
- )
547
- expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
548
-
549
- if expected_add_embed_dim != passed_add_embed_dim:
550
- raise ValueError(
551
- f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. \
552
- The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
553
- )
554
-
555
- add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
556
- return add_time_ids
557
-
558
- def get_views(self, height, width, window_size=128, stride=64, random_jitter=False):
559
- # Here, we define the mappings F_i (see Eq. 7 in the MultiDiffusion paper https://arxiv.org/abs/2302.08113)
560
- # if panorama's height/width < window_size, num_blocks of height/width should return 1
561
- height //= self.vae_scale_factor
562
- width //= self.vae_scale_factor
563
- num_blocks_height = int((height - window_size) / stride - 1e-6) + 2 if height > window_size else 1
564
- num_blocks_width = int((width - window_size) / stride - 1e-6) + 2 if width > window_size else 1
565
- total_num_blocks = int(num_blocks_height * num_blocks_width)
566
- views = []
567
- for i in range(total_num_blocks):
568
- h_start = int((i // num_blocks_width) * stride)
569
- h_end = h_start + window_size
570
- w_start = int((i % num_blocks_width) * stride)
571
- w_end = w_start + window_size
572
-
573
- if h_end > height:
574
- h_start = int(h_start + height - h_end)
575
- h_end = int(height)
576
- if w_end > width:
577
- w_start = int(w_start + width - w_end)
578
- w_end = int(width)
579
- if h_start < 0:
580
- h_end = int(h_end - h_start)
581
- h_start = 0
582
- if w_start < 0:
583
- w_end = int(w_end - w_start)
584
- w_start = 0
585
-
586
- if random_jitter:
587
- jitter_range = (window_size - stride) // 4
588
- w_jitter = 0
589
- h_jitter = 0
590
- if (w_start != 0) and (w_end != width):
591
- w_jitter = random.randint(-jitter_range, jitter_range)
592
- elif (w_start == 0) and (w_end != width):
593
- w_jitter = random.randint(-jitter_range, 0)
594
- elif (w_start != 0) and (w_end == width):
595
- w_jitter = random.randint(0, jitter_range)
596
-
597
- if (h_start != 0) and (h_end != height):
598
- h_jitter = random.randint(-jitter_range, jitter_range)
599
- elif (h_start == 0) and (h_end != height):
600
- h_jitter = random.randint(-jitter_range, 0)
601
- elif (h_start != 0) and (h_end == height):
602
- h_jitter = random.randint(0, jitter_range)
603
- # When using jitter, the noise will be padded by jitter_range, so we need to add it to the view.
604
- h_start = h_start + h_jitter + jitter_range
605
- h_end = h_end + h_jitter + jitter_range
606
- w_start = w_start + w_jitter + jitter_range
607
- w_end = w_end + w_jitter + jitter_range
608
-
609
- views.append((h_start, h_end, w_start, w_end))
610
- return views
611
-
612
-
613
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
614
- def upcast_vae(self):
615
- dtype = self.vae.dtype
616
- self.vae.to(dtype=torch.float32)
617
- use_torch_2_0_or_xformers = isinstance(
618
- self.vae.decoder.mid_block.attentions[0].processor,
619
- (
620
- AttnProcessor2_0,
621
- XFormersAttnProcessor,
622
- LoRAXFormersAttnProcessor,
623
- LoRAAttnProcessor2_0,
624
- ),
625
- )
626
- # if xformers or torch_2_0 is used attention block does not need
627
- # to be in float32 which can save lots of memory
628
- if use_torch_2_0_or_xformers:
629
- self.vae.post_quant_conv.to(dtype)
630
- self.vae.decoder.conv_in.to(dtype)
631
- self.vae.decoder.mid_block.to(dtype)
632
-
633
-
634
- def register_attention_control(self, controller):
635
- attn_procs = {}
636
- cross_att_count = 0
637
- ori_attn_processors = self.unet.attn_processors
638
- for name in self.unet.attn_processors.keys():
639
- if name.startswith("mid_block"):
640
- place_in_unet = "mid"
641
- elif name.startswith("up_blocks"):
642
- place_in_unet = "up"
643
- elif name.startswith("down_blocks"):
644
- place_in_unet = "down"
645
- else:
646
- continue
647
- cross_att_count += 1
648
- attn_procs[name] = P2PCrossAttnProcessor(controller=controller, place_in_unet=place_in_unet)
649
-
650
- self.unet.set_attn_processor(attn_procs)
651
- controller.num_att_layers = cross_att_count
652
- return ori_attn_processors
653
-
654
- def recover_attention_control(self, ori_attn_processors):
655
- self.unet.set_attn_processor(ori_attn_processors)
656
-
657
-
658
-
659
- # Override to properly handle the loading and unloading of the additional text encoder.
660
- def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs):
661
- # We could have accessed the unet config from `lora_state_dict()` too. We pass
662
- # it here explicitly to be able to tell that it's coming from an SDXL
663
- # pipeline.
664
-
665
- # Remove any existing hooks.
666
- if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
667
- from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module
668
- else:
669
- raise ImportError("Offloading requires `accelerate v0.17.0` or higher.")
670
-
671
- is_model_cpu_offload = False
672
- is_sequential_cpu_offload = False
673
- recursive = False
674
- for _, component in self.components.items():
675
- if isinstance(component, torch.nn.Module):
676
- if hasattr(component, "_hf_hook"):
677
- is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload)
678
- is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook)
679
- logger.info(
680
- "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again."
681
- )
682
- recursive = is_sequential_cpu_offload
683
- remove_hook_from_module(component, recurse=recursive)
684
- state_dict, network_alphas = self.lora_state_dict(
685
- pretrained_model_name_or_path_or_dict,
686
- unet_config=self.unet.config,
687
- **kwargs,
688
- )
689
- self.load_lora_into_unet(state_dict, network_alphas=network_alphas, unet=self.unet)
690
-
691
- text_encoder_state_dict = {k: v for k, v in state_dict.items() if "text_encoder." in k}
692
- if len(text_encoder_state_dict) > 0:
693
- self.load_lora_into_text_encoder(
694
- text_encoder_state_dict,
695
- network_alphas=network_alphas,
696
- text_encoder=self.text_encoder,
697
- prefix="text_encoder",
698
- lora_scale=self.lora_scale,
699
- )
700
-
701
- text_encoder_2_state_dict = {k: v for k, v in state_dict.items() if "text_encoder_2." in k}
702
- if len(text_encoder_2_state_dict) > 0:
703
- self.load_lora_into_text_encoder(
704
- text_encoder_2_state_dict,
705
- network_alphas=network_alphas,
706
- text_encoder=self.text_encoder_2,
707
- prefix="text_encoder_2",
708
- lora_scale=self.lora_scale,
709
- )
710
-
711
- # Offload back.
712
- if is_model_cpu_offload:
713
- self.enable_model_cpu_offload()
714
- elif is_sequential_cpu_offload:
715
- self.enable_sequential_cpu_offload()
716
-
717
- @classmethod
718
- def save_lora_weights(
719
- self,
720
- save_directory: Union[str, os.PathLike],
721
- unet_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None,
722
- text_encoder_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None,
723
- text_encoder_2_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None,
724
- is_main_process: bool = True,
725
- weight_name: str = None,
726
- save_function: Callable = None,
727
- safe_serialization: bool = True,
728
- ):
729
- state_dict = {}
730
-
731
- def pack_weights(layers, prefix):
732
- layers_weights = layers.state_dict() if isinstance(layers, torch.nn.Module) else layers
733
- layers_state_dict = {f"{prefix}.{module_name}": param for module_name, param in layers_weights.items()}
734
- return layers_state_dict
735
-
736
- if not (unet_lora_layers or text_encoder_lora_layers or text_encoder_2_lora_layers):
737
- raise ValueError(
738
- "You must pass at least one of `unet_lora_layers`, `text_encoder_lora_layers` or `text_encoder_2_lora_layers`."
739
- )
740
-
741
- if unet_lora_layers:
742
- state_dict.update(pack_weights(unet_lora_layers, "unet"))
743
-
744
- if text_encoder_lora_layers and text_encoder_2_lora_layers:
745
- state_dict.update(pack_weights(text_encoder_lora_layers, "text_encoder"))
746
- state_dict.update(pack_weights(text_encoder_2_lora_layers, "text_encoder_2"))
747
-
748
- self.write_lora_layers(
749
- state_dict=state_dict,
750
- save_directory=save_directory,
751
- is_main_process=is_main_process,
752
- weight_name=weight_name,
753
- save_function=save_function,
754
- safe_serialization=safe_serialization,
755
- )
756
-
757
- def _remove_text_encoder_monkey_patch(self):
758
- self._remove_text_encoder_monkey_patch_classmethod(self.text_encoder)
759
- self._remove_text_encoder_monkey_patch_classmethod(self.text_encoder_2)
760
-
761
- @torch.no_grad()
762
- @replace_example_docstring(EXAMPLE_DOC_STRING)
763
- def __call__(
764
- self,
765
- prompt: Union[str, List[str]] = None,
766
- prompt_2: Optional[Union[str, List[str]]] = None,
767
- height: Optional[int] = None,
768
- width: Optional[int] = None,
769
- num_inference_steps: int = 50,
770
- denoising_end: Optional[float] = None,
771
- guidance_scale: float = 5.0,
772
- negative_prompt: Optional[Union[str, List[str]]] = None,
773
- negative_prompt_2: Optional[Union[str, List[str]]] = None,
774
- num_images_per_prompt: Optional[int] = 1,
775
- eta: float = 0.0,
776
- generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
777
- latents: Optional[torch.FloatTensor] = None,
778
- prompt_embeds: Optional[torch.FloatTensor] = None,
779
- negative_prompt_embeds: Optional[torch.FloatTensor] = None,
780
- pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
781
- negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
782
- output_type: Optional[str] = "pil",
783
- return_dict: bool = False,
784
- callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
785
- callback_steps: int = 1,
786
- cross_attention_kwargs: Optional[Dict[str, Any]] = None,
787
- guidance_rescale: float = 0.0,
788
- original_size: Optional[Tuple[int, int]] = None,
789
- crops_coords_top_left: Tuple[int, int] = (0, 0),
790
- target_size: Optional[Tuple[int, int]] = None,
791
- negative_original_size: Optional[Tuple[int, int]] = None,
792
- negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
793
- negative_target_size: Optional[Tuple[int, int]] = None,
794
- ################### AccDiffusion specific parameters ####################
795
- image_lr: Optional[torch.FloatTensor] = None,
796
- view_batch_size: int = 16,
797
- multi_decoder: bool = True,
798
- stride: Optional[int] = 64,
799
- cosine_scale_1: Optional[float] = 3.,
800
- cosine_scale_2: Optional[float] = 1.,
801
- cosine_scale_3: Optional[float] = 1.,
802
- sigma: Optional[float] = 1.0,
803
- lowvram: bool = False,
804
- multi_guidance_scale: Optional[float] = 7.5,
805
- use_guassian: bool = True,
806
- upscale_mode: Union[str, List[str]] = 'bicubic_latent',
807
- use_multidiffusion: bool = True,
808
- use_dilated_sampling : bool = True,
809
- use_skip_residual: bool = True,
810
- use_progressive_upscaling: bool = True,
811
- shuffle: bool = False,
812
- result_path: str = './outputs/AccDiffusion',
813
- debug: bool = False,
814
- use_md_prompt: bool = False,
815
- attn_res=None,
816
- save_attention_map: bool = False,
817
- seed: Optional[int] = None,
818
- c : Optional[float] = 0.3,
819
- ):
820
- r"""
821
- Function invoked when calling the pipeline for generation.
822
-
823
- Args:
824
- prompt (`str` or `List[str]`, *optional*):
825
- The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
826
- instead.
827
- prompt_2 (`str` or `List[str]`, *optional*):
828
- The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
829
- used in both text-encoders
830
- height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
831
- The height in pixels of the generated image. This is set to 1024 by default for the best results.
832
- Anything below 512 pixels won't work well for
833
- [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
834
- and checkpoints that are not specifically fine-tuned on low resolutions.
835
- width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
836
- The width in pixels of the generated image. This is set to 1024 by default for the best results.
837
- Anything below 512 pixels won't work well for
838
- [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
839
- and checkpoints that are not specifically fine-tuned on low resolutions.
840
- num_inference_steps (`int`, *optional*, defaults to 50):
841
- The number of denoising steps. More denoising steps usually lead to a higher quality image at the
842
- expense of slower inference.
843
- denoising_end (`float`, *optional*):
844
- When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
845
- completed before it is intentionally prematurely terminated. As a result, the returned sample will
846
- still retain a substantial amount of noise as determined by the discrete timesteps selected by the
847
- scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
848
- "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
849
- Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
850
- guidance_scale (`float`, *optional*, defaults to 5.0):
851
- Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
852
- `guidance_scale` is defined as `w` of equation 2. of [Imagen
853
- Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
854
- 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
855
- usually at the expense of lower image quality.
856
- negative_prompt (`str` or `List[str]`, *optional*):
857
- The prompt or prompts not to guide the image generation. If not defined, one has to pass
858
- `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
859
- less than `1`).
860
- negative_prompt_2 (`str` or `List[str]`, *optional*):
861
- The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
862
- `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
863
- num_images_per_prompt (`int`, *optional*, defaults to 1):
864
- The number of images to generate per prompt.
865
- eta (`float`, *optional*, defaults to 0.0):
866
- Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
867
- [`schedulers.DDIMScheduler`], will be ignored for others.
868
- generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
869
- One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
870
- to make generation deterministic.
871
- latents (`torch.FloatTensor`, *optional*):
872
- Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
873
- generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
874
- tensor will be generated by sampling using the supplied random `generator`.
875
- prompt_embeds (`torch.FloatTensor`, *optional*):
876
- Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
877
- provided, text embeddings will be generated from `prompt` input argument.
878
- negative_prompt_embeds (`torch.FloatTensor`, *optional*):
879
- Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
880
- weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
881
- argument.
882
- pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
883
- Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
884
- If not provided, pooled text embeddings will be generated from `prompt` input argument.
885
- negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
886
- Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
887
- weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
888
- input argument.
889
- output_type (`str`, *optional*, defaults to `"pil"`):
890
- The output format of the generated image. Choose between
891
- [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
892
- return_dict (`bool`, *optional*, defaults to `False`):
893
- Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
894
- of a plain tuple.
895
- callback (`Callable`, *optional*):
896
- A function that will be called every `callback_steps` steps during inference. The function will be
897
- called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
898
- callback_steps (`int`, *optional*, defaults to 1):
899
- The frequency at which the `callback` function will be called. If not specified, the callback will be
900
- called at every step.
901
- cross_attention_kwargs (`dict`, *optional*):
902
- A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
903
- `self.processor` in
904
- [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
905
- guidance_rescale (`float`, *optional*, defaults to 0.0):
906
- Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
907
- Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
908
- [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
909
- Guidance rescale factor should fix overexposure when using zero terminal SNR.
910
- original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
911
- If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
912
- `original_size` defaults to `(width, height)` if not specified. Part of SDXL's micro-conditioning as
913
- explained in section 2.2 of
914
- [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
915
- crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
916
- `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
917
- `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
918
- `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
919
- [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
920
- target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
921
- For most cases, `target_size` should be set to the desired height and width of the generated image. If
922
- not specified it will default to `(width, height)`. Part of SDXL's micro-conditioning as explained in
923
- section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
924
- negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
925
- To negatively condition the generation process based on a specific image resolution. Part of SDXL's
926
- micro-conditioning as explained in section 2.2 of
927
- [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
928
- information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
929
- negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
930
- To negatively condition the generation process based on specific crop coordinates. Part of SDXL's
931
- micro-conditioning as explained in section 2.2 of
932
- [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
933
- information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
934
- negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
935
- To negatively condition the generation process based on a target image resolution. It should be the same
936
- as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
937
- [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
938
- information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
939
- ################### AccDiffusion specific parameters ####################
940
- # We build AccDiffusion on top of the DemoFusion pipeline (see paper: https://arxiv.org/pdf/2311.16973.pdf)
941
- image_lr (`torch.FloatTensor`, *optional*, defaults to None):
942
- Low-resolution image input for upscaling.
943
- view_batch_size (`int`, defaults to 16):
944
- The batch size for multiple denoising paths. Typically, a larger batch size can result in higher
945
- efficiency but comes with increased GPU memory requirements.
946
- multi_decoder (`bool`, defaults to True):
947
- Determine whether to use a tiled decoder. Generally, when the resolution exceeds 3072x3072,
948
- a tiled decoder becomes necessary.
949
- stride (`int`, defaults to 64):
950
- The stride of moving local patches. A smaller stride is better for alleviating seam issues,
951
- but it also introduces additional computational overhead and inference time.
952
- cosine_scale_1 (`float`, defaults to 3):
953
- Control the strength of skip-residual. For specific impacts, please refer to Appendix C
954
- in the DemoFusion paper (see paper: https://arxiv.org/pdf/2311.16973.pdf).
955
- cosine_scale_2 (`float`, defaults to 1):
956
- Control the strength of dilated sampling. For specific impacts, please refer to Appendix C
957
- in the DemoFusion paper (see paper: https://arxiv.org/pdf/2311.16973.pdf).
958
- cosine_scale_3 (`float`, defaults to 1):
959
- Control the strength of the gaussian filter. For specific impacts, please refer to Appendix C
960
- in the DemoFusion paper (see paper: https://arxiv.org/pdf/2311.16973.pdf).
961
- sigma (`float`, defaults to 1):
962
- The standard deviation of the gaussian filter.
963
- show_image (`bool`, defaults to False):
964
- Determine whether to show intermediate results during generation.
965
- lowvram (`bool`, defaults to False):
966
- Try to fit in 8 GB of VRAM, with xformers installed.
967
-
968
- Examples:
969
-
970
- Returns:
971
- a `list` with the generated images at each phase.
972
- """
973
-
974
- if debug :
975
- num_inference_steps = 1
976
-
977
- # 0. Default height and width to unet
978
- height = height or self.default_sample_size * self.vae_scale_factor
979
- width = width or self.default_sample_size * self.vae_scale_factor
980
-
981
- x1_size = self.default_sample_size * self.vae_scale_factor
982
-
983
- height_scale = height / x1_size
984
- width_scale = width / x1_size
985
- scale_num = int(max(height_scale, width_scale))
986
- aspect_ratio = min(height_scale, width_scale) / max(height_scale, width_scale)
987
-
988
- original_size = original_size or (height, width)
989
- target_size = target_size or (height, width)
990
-
991
- if attn_res is None:
992
- attn_res = int(np.ceil(self.default_sample_size * self.vae_scale_factor / 32)), int(np.ceil(self.default_sample_size * self.vae_scale_factor / 32))
993
- self.attn_res = attn_res
994
-
995
- if lowvram:
996
- attention_map_device = torch.device("cpu")
997
- else:
998
- attention_map_device = self.device
999
-
1000
- self.controller = create_controller(
1001
- prompt, cross_attention_kwargs, num_inference_steps, tokenizer=self.tokenizer, device=attention_map_device, attn_res=self.attn_res
1002
- )
1003
-
1004
- if save_attention_map or use_md_prompt:
1005
- ori_attn_processors = self.register_attention_control(self.controller) # add attention controller
1006
-
1007
- # 1. Check inputs. Raise error if not correct
1008
- self.check_inputs(
1009
- prompt,
1010
- prompt_2,
1011
- height,
1012
- width,
1013
- callback_steps,
1014
- negative_prompt,
1015
- negative_prompt_2,
1016
- prompt_embeds,
1017
- negative_prompt_embeds,
1018
- pooled_prompt_embeds,
1019
- negative_pooled_prompt_embeds,
1020
- num_images_per_prompt,
1021
- )
1022
-
1023
- # 2. Define call parameters
1024
- if prompt is not None and isinstance(prompt, str):
1025
- batch_size = 1
1026
- elif prompt is not None and isinstance(prompt, list):
1027
- batch_size = len(prompt)
1028
- else:
1029
- batch_size = prompt_embeds.shape[0]
1030
-
1031
- device = self._execution_device
1032
- self.lowvram = lowvram
1033
- if self.lowvram:
1034
- self.vae.cpu()
1035
- self.unet.cpu()
1036
- self.text_encoder.to(device)
1037
- self.text_encoder_2.to(device)
1038
- # image_lr.cpu()
1039
-
1040
- # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
1041
- # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
1042
- # corresponds to doing no classifier free guidance.
1043
- do_classifier_free_guidance = guidance_scale > 1.0
1044
-
1045
- # 3. Encode input prompt
1046
- text_encoder_lora_scale = (
1047
- cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
1048
- )
1049
-
1050
- (
1051
- prompt_embeds,
1052
- negative_prompt_embeds,
1053
- pooled_prompt_embeds,
1054
- negative_pooled_prompt_embeds,
1055
- ) = self.encode_prompt(
1056
- prompt=prompt,
1057
- prompt_2=prompt_2,
1058
- device=device,
1059
- num_images_per_prompt=num_images_per_prompt,
1060
- do_classifier_free_guidance=do_classifier_free_guidance,
1061
- negative_prompt=negative_prompt,
1062
- negative_prompt_2=negative_prompt_2,
1063
- prompt_embeds=prompt_embeds,
1064
- negative_prompt_embeds=negative_prompt_embeds,
1065
- pooled_prompt_embeds=pooled_prompt_embeds,
1066
- negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
1067
- lora_scale=text_encoder_lora_scale,
1068
- )
1069
-
1070
- # 4. Prepare timesteps
1071
- self.scheduler.set_timesteps(num_inference_steps, device=device)
1072
-
1073
- timesteps = self.scheduler.timesteps
1074
-
1075
- # 5. Prepare latent variables
1076
- num_channels_latents = self.unet.config.in_channels
1077
- latents = self.prepare_latents(
1078
- batch_size * num_images_per_prompt,
1079
- num_channels_latents,
1080
- height // scale_num,
1081
- width // scale_num,
1082
- prompt_embeds.dtype,
1083
- device,
1084
- generator,
1085
- latents,
1086
- )
1087
-
1088
-
1089
- # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
1090
- extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
1091
-
1092
- # 7. Prepare added time ids & embeddings
1093
- add_text_embeds = pooled_prompt_embeds
1094
-
1095
- add_time_ids = self._get_add_time_ids(
1096
- original_size, crops_coords_top_left, target_size, dtype=prompt_embeds.dtype
1097
- )
1098
-
1099
- if negative_original_size is not None and negative_target_size is not None:
1100
- negative_add_time_ids = self._get_add_time_ids(
1101
- negative_original_size,
1102
- negative_crops_coords_top_left,
1103
- negative_target_size,
1104
- dtype=prompt_embeds.dtype,
1105
- )
1106
- else:
1107
- negative_add_time_ids = add_time_ids
1108
-
1109
- if do_classifier_free_guidance:
1110
- prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0).to(device)
1111
- add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0).to(device)
1112
- add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0).to(device).repeat(batch_size * num_images_per_prompt, 1)
1113
-
1114
- del negative_prompt_embeds, negative_pooled_prompt_embeds, negative_add_time_ids
1115
-
1116
- # 8. Denoising loop
1117
- num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
1118
-
1119
-
1120
- # 7.1 Apply denoising_end
1121
- if denoising_end is not None and isinstance(denoising_end, float) and denoising_end > 0 and denoising_end < 1:
1122
- discrete_timestep_cutoff = int(
1123
- round(
1124
- self.scheduler.config.num_train_timesteps
1125
- - (denoising_end * self.scheduler.config.num_train_timesteps)
1126
- )
1127
- )
1128
- num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
1129
- timesteps = timesteps[:num_inference_steps]
1130
-
1131
- output_images = []
1132
-
1133
- ###################################################### Phase Initialization ########################################################
1134
-
1135
- if self.lowvram:
1136
- self.text_encoder.cpu()
1137
- self.text_encoder_2.cpu()
1138
-
1139
- if image_lr == None:
1140
- print("### Phase 1 Denoising ###")
1141
- with self.progress_bar(total=num_inference_steps) as progress_bar:
1142
- for i, t in enumerate(timesteps):
1143
-
1144
- if self.lowvram:
1145
- self.vae.cpu()
1146
- self.unet.to(device)
1147
-
1148
- latents_for_view = latents
1149
-
1150
- # expand the latents if we are doing classifier free guidance
1151
- latent_model_input = (
1152
- latents.repeat_interleave(2, dim=0)
1153
- if do_classifier_free_guidance
1154
- else latents
1155
- )
1156
- latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
1157
-
1158
- # predict the noise residual
1159
- added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
1160
-
1161
- noise_pred = self.unet(
1162
- latent_model_input,
1163
- t,
1164
- encoder_hidden_states=prompt_embeds,
1165
- # cross_attention_kwargs=cross_attention_kwargs,
1166
- added_cond_kwargs=added_cond_kwargs,
1167
- return_dict=False,
1168
- )[0]
1169
-
1170
- # perform guidance
1171
- if do_classifier_free_guidance:
1172
- noise_pred_uncond, noise_pred_text = noise_pred[::2], noise_pred[1::2]
1173
- noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
1174
-
1175
- if do_classifier_free_guidance and guidance_rescale > 0.0:
1176
- # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
1177
- noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
1178
-
1179
- # compute the previous noisy sample x_t -> x_t-1
1180
- latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
1181
-
1182
- # # step callback
1183
- # latents = self.controller.step_callback(latents)
1184
- if t == 1 and use_md_prompt:
1185
- # show_cross_attention(tokenizer=self.tokenizer, prompts=[prompt], attention_store=self.controller, res=self.attn_res[0], from_where=["up","down"], select=0, t=int(t))
1186
- md_prompts, views_attention = get_multidiffusion_prompts(tokenizer=self.tokenizer, prompts=[prompt], threthod=c,attention_store=self.controller, height=height//scale_num, width =width//scale_num, from_where=["up","down"], random_jitter=True, scale_num=scale_num)
1187
-
1188
- # call the callback, if provided
1189
- if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
1190
- progress_bar.update()
1191
- if callback is not None and i % callback_steps == 0:
1192
- step_idx = i // getattr(self.scheduler, "order", 1)
1193
- callback(step_idx, t, latents)
1194
-
1195
- del latents_for_view, latent_model_input, noise_pred, noise_pred_text, noise_pred_uncond
1196
- if use_md_prompt or save_attention_map:
1197
- self.recover_attention_control(ori_attn_processors=ori_attn_processors) # recover attention controller
1198
- del self.controller
1199
- torch.cuda.empty_cache()
1200
- else:
1201
- print("### Encoding Real Image ###")
1202
- latents = self.vae.encode(image_lr)
1203
- latents = latents.latent_dist.sample() * self.vae.config.scaling_factor
1204
-
1205
- anchor_mean = latents.mean()
1206
- anchor_std = latents.std()
1207
- if self.lowvram:
1208
- latents = latents.cpu()
1209
- torch.cuda.empty_cache()
1210
- if not output_type == "latent":
1211
- # make sure the VAE is in float32 mode, as it overflows in float16
1212
- needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
1213
-
1214
- if self.lowvram:
1215
- needs_upcasting = False # use madebyollin/sdxl-vae-fp16-fix in lowvram mode!
1216
- self.unet.cpu()
1217
- self.vae.to(device)
1218
-
1219
- if needs_upcasting:
1220
- self.upcast_vae()
1221
- latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
1222
- if self.lowvram and multi_decoder:
1223
- current_width_height = self.unet.config.sample_size * self.vae_scale_factor
1224
- image = self.tiled_decode(latents, current_width_height, current_width_height)
1225
- else:
1226
- image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
1227
- # cast back to fp16 if needed
1228
- if needs_upcasting:
1229
- self.vae.to(dtype=torch.float16)
1230
- torch.cuda.empty_cache()
1231
-
1232
- image = self.image_processor.postprocess(image, output_type=output_type)
1233
- if not os.path.exists(f'{result_path}'):
1234
- os.makedirs(f'{result_path}')
1235
-
1236
- image_lr_save_path = f'{result_path}/{image[0].size[0]}_{image[0].size[1]}.png'
1237
- image[0].save(image_lr_save_path)
1238
- output_images.append(image[0])
1239
-
1240
- ####################################################### Phase Upscaling #####################################################
1241
- if use_progressive_upscaling:
1242
- if image_lr == None:
1243
- starting_scale = 2
1244
- else:
1245
- starting_scale = 1
1246
- else:
1247
- starting_scale = scale_num
1248
-
1249
- for current_scale_num in range(starting_scale, scale_num + 1):
1250
- if self.lowvram:
1251
- latents = latents.to(device)
1252
- self.unet.to(device)
1253
- torch.cuda.empty_cache()
1254
-
1255
- current_height = self.unet.config.sample_size * self.vae_scale_factor * current_scale_num
1256
- current_width = self.unet.config.sample_size * self.vae_scale_factor * current_scale_num
1257
-
1258
- if height > width:
1259
- current_width = int(current_width * aspect_ratio)
1260
- else:
1261
- current_height = int(current_height * aspect_ratio)
1262
-
1263
-
1264
- if upscale_mode == "bicubic_latent" or debug:
1265
- latents = F.interpolate(latents.to(device), size=(int(current_height / self.vae_scale_factor), int(current_width / self.vae_scale_factor)), mode='bicubic')
1266
- else:
1267
- raise NotImplementedError
1268
-
1269
- print("### Phase {} Denoising ###".format(current_scale_num))
1270
- ############################################# noise inverse #############################################
1271
- noise_latents = []
1272
- noise = torch.randn_like(latents)
1273
- for timestep in timesteps:
1274
- noise_latent = self.scheduler.add_noise(latents, noise, timestep.unsqueeze(0))
1275
- noise_latents.append(noise_latent)
1276
- latents = noise_latents[0]
1277
-
1278
- ############################################# denoise #############################################
1279
- with self.progress_bar(total=num_inference_steps) as progress_bar:
1280
- for i, t in enumerate(timesteps):
1281
- count = torch.zeros_like(latents)
1282
- value = torch.zeros_like(latents)
1283
- cosine_factor = 0.5 * (1 + torch.cos(torch.pi * (self.scheduler.config.num_train_timesteps - t) / self.scheduler.config.num_train_timesteps)).cpu()
1284
- if use_skip_residual:
1285
- c1 = cosine_factor ** cosine_scale_1
1286
- latents = latents * (1 - c1) + noise_latents[i] * c1
1287
-
1288
- if use_multidiffusion:
1289
- ############################################# MultiDiffusion #############################################
1290
- if use_md_prompt:
1291
- md_prompt_embeds_list = []
1292
- md_add_text_embeds_list = []
1293
- for md_prompt in md_prompts[current_scale_num]:
1294
- (
1295
- md_prompt_embeds,
1296
- md_negative_prompt_embeds,
1297
- md_pooled_prompt_embeds,
1298
- md_negative_pooled_prompt_embeds,
1299
- ) = self.encode_prompt(
1300
- prompt=md_prompt,
1301
- prompt_2=prompt_2,
1302
- device=device,
1303
- num_images_per_prompt=num_images_per_prompt,
1304
- do_classifier_free_guidance=do_classifier_free_guidance,
1305
- negative_prompt=negative_prompt,
1306
- negative_prompt_2=negative_prompt_2,
1307
- prompt_embeds=None,
1308
- negative_prompt_embeds=None,
1309
- pooled_prompt_embeds=None,
1310
- negative_pooled_prompt_embeds=None,
1311
- lora_scale=text_encoder_lora_scale,
1312
- )
1313
- md_prompt_embeds_list.append(torch.cat([md_negative_prompt_embeds, md_prompt_embeds], dim=0).to(device))
1314
- md_add_text_embeds_list.append(torch.cat([md_negative_pooled_prompt_embeds, md_pooled_prompt_embeds], dim=0).to(device))
1315
- del md_negative_prompt_embeds, md_negative_pooled_prompt_embeds
1316
-
1317
- if use_md_prompt:
1318
- random_jitter = True
1319
- views = [(h_start*4, h_end*4, w_start*4, w_end*4) for h_start, h_end, w_start, w_end in views_attention[current_scale_num]]
1320
- else:
1321
- random_jitter = True
1322
- views = self.get_views(current_height, current_width, stride=stride, window_size=self.unet.config.sample_size, random_jitter=random_jitter)
1323
-
1324
- views_batch = [views[i : i + view_batch_size] for i in range(0, len(views), view_batch_size)]
1325
-
1326
- if use_md_prompt:
1327
- views_prompt_embeds_input = [md_prompt_embeds_list[i : i + view_batch_size] for i in range(0, len(views), view_batch_size)]
1328
- views_add_text_embeds_input = [md_add_text_embeds_list[i : i + view_batch_size] for i in range(0, len(views), view_batch_size)]
1329
-
1330
- if random_jitter:
1331
- jitter_range = int((self.unet.config.sample_size - stride) // 4)
1332
- latents_ = F.pad(latents, (jitter_range, jitter_range, jitter_range, jitter_range), 'constant', 0)
1333
- else:
1334
- latents_ = latents
1335
-
1336
- count_local = torch.zeros_like(latents_)
1337
- value_local = torch.zeros_like(latents_)
1338
-
1339
- for j, batch_view in enumerate(views_batch):
1340
- vb_size = len(batch_view)
1341
- # get the latents corresponding to the current view coordinates
1342
- latents_for_view = torch.cat(
1343
- [
1344
- latents_[:, :, h_start:h_end, w_start:w_end]
1345
- for h_start, h_end, w_start, w_end in batch_view
1346
- ]
1347
- )
1348
-
1349
- # expand the latents if we are doing classifier free guidance
1350
- latent_model_input = latents_for_view
1351
- latent_model_input = (
1352
- latent_model_input.repeat_interleave(2, dim=0)
1353
- if do_classifier_free_guidance
1354
- else latent_model_input
1355
- )
1356
- latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
1357
-
1358
- add_time_ids_input = []
1359
- for h_start, h_end, w_start, w_end in batch_view:
1360
- add_time_ids_ = add_time_ids.clone()
1361
- add_time_ids_[:, 2] = h_start * self.vae_scale_factor
1362
- add_time_ids_[:, 3] = w_start * self.vae_scale_factor
1363
- add_time_ids_input.append(add_time_ids_)
1364
- add_time_ids_input = torch.cat(add_time_ids_input)
1365
-
1366
- if not use_md_prompt:
1367
- prompt_embeds_input = torch.cat([prompt_embeds] * vb_size)
1368
- add_text_embeds_input = torch.cat([add_text_embeds] * vb_size)
1369
- # predict the noise residual
1370
- added_cond_kwargs = {"text_embeds": add_text_embeds_input, "time_ids": add_time_ids_input}
1371
- noise_pred = self.unet(
1372
- latent_model_input,
1373
- t,
1374
- encoder_hidden_states=prompt_embeds_input,
1375
- # cross_attention_kwargs=cross_attention_kwargs,
1376
- added_cond_kwargs=added_cond_kwargs,
1377
- return_dict=False,
1378
- )[0]
1379
- else:
1380
- md_prompt_embeds_input = torch.cat(views_prompt_embeds_input[j])
1381
- md_add_text_embeds_input = torch.cat(views_add_text_embeds_input[j])
1382
- md_added_cond_kwargs = {"text_embeds": md_add_text_embeds_input, "time_ids": add_time_ids_input}
1383
- noise_pred = self.unet(
1384
- latent_model_input,
1385
- t,
1386
- encoder_hidden_states=md_prompt_embeds_input,
1387
- # cross_attention_kwargs=cross_attention_kwargs,
1388
- added_cond_kwargs=md_added_cond_kwargs,
1389
- return_dict=False,
1390
- )[0]
1391
-
1392
- if do_classifier_free_guidance:
1393
- noise_pred_uncond, noise_pred_text = noise_pred[::2], noise_pred[1::2]
1394
- noise_pred = noise_pred_uncond + multi_guidance_scale * (noise_pred_text - noise_pred_uncond)
1395
-
1396
- if do_classifier_free_guidance and guidance_rescale > 0.0:
1397
- # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
1398
- noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
1399
-
1400
- # compute the previous noisy sample x_t -> x_t-1
1401
- self.scheduler._init_step_index(t)
1402
- latents_denoised_batch = self.scheduler.step(
1403
- noise_pred, t, latents_for_view, **extra_step_kwargs, return_dict=False)[0]
1404
-
1405
- # extract value from batch
1406
- for latents_view_denoised, (h_start, h_end, w_start, w_end) in zip(
1407
- latents_denoised_batch.chunk(vb_size), batch_view
1408
- ):
1409
- value_local[:, :, h_start:h_end, w_start:w_end] += latents_view_denoised
1410
- count_local[:, :, h_start:h_end, w_start:w_end] += 1
1411
-
1412
- if random_jitter:
1413
- value_local = value_local[: ,:, jitter_range: jitter_range + current_height // self.vae_scale_factor, jitter_range: jitter_range + current_width // self.vae_scale_factor]
1414
- count_local = count_local[: ,:, jitter_range: jitter_range + current_height // self.vae_scale_factor, jitter_range: jitter_range + current_width // self.vae_scale_factor]
1415
-
1416
- if i != (len(timesteps) - 1):
1417
- noise_index = i + 1
1418
- else:
1419
- noise_index = i
1420
-
1421
- value_local = torch.where(count_local == 0, noise_latents[noise_index], value_local)
1422
- count_local = torch.where(count_local == 0, torch.ones_like(count_local), count_local)
1423
- if use_dilated_sampling:
1424
- c2 = cosine_factor ** cosine_scale_2
1425
- value += value_local / count_local * (1 - c2)
1426
- count += torch.ones_like(value_local) * (1 - c2)
1427
- else:
1428
- value += value_local / count_local
1429
- count += torch.ones_like(value_local)
1430
-
1431
- if use_dilated_sampling:
1432
- ############################################# Dilated Sampling #############################################
1433
- views = [[h, w] for h in range(current_scale_num) for w in range(current_scale_num)]
1434
- views_batch = [views[i : i + view_batch_size] for i in range(0, len(views), view_batch_size)]
1435
-
1436
- h_pad = (current_scale_num - (latents.size(2) % current_scale_num)) % current_scale_num
1437
- w_pad = (current_scale_num - (latents.size(3) % current_scale_num)) % current_scale_num
1438
- latents_ = F.pad(latents, (w_pad, 0, h_pad, 0), 'constant', 0)
1439
-
1440
- count_global = torch.zeros_like(latents_)
1441
- value_global = torch.zeros_like(latents_)
1442
-
1443
- if use_guassian:
1444
- c3 = 0.99 * cosine_factor ** cosine_scale_3 + 1e-2
1445
- std_, mean_ = latents_.std(), latents_.mean()
1446
- latents_gaussian = gaussian_filter(latents_, kernel_size=(2*current_scale_num-1), sigma=sigma*c3)
1447
- latents_gaussian = (latents_gaussian - latents_gaussian.mean()) / latents_gaussian.std() * std_ + mean_
1448
- else:
1449
- latents_gaussian = latents_
1450
-
1451
- for j, batch_view in enumerate(views_batch):
1452
-
1453
- latents_for_view = torch.cat(
1454
- [
1455
- latents_[:, :, h::current_scale_num, w::current_scale_num]
1456
- for h, w in batch_view
1457
- ]
1458
- )
1459
-
1460
- latents_for_view_gaussian = torch.cat(
1461
- [
1462
- latents_gaussian[:, :, h::current_scale_num, w::current_scale_num]
1463
- for h, w in batch_view
1464
- ]
1465
- )
1466
-
1467
- if shuffle:
1468
- ######## window interaction ########
1469
- shape = latents_for_view.shape
1470
- shuffle_index = torch.stack([torch.randperm(shape[0]) for _ in range(latents_for_view.reshape(-1).shape[0]//shape[0])])
1471
-
1472
- shuffle_index = shuffle_index.view(shape[1],shape[2],shape[3],shape[0])
1473
- original_index = torch.zeros_like(shuffle_index).scatter_(3, shuffle_index, torch.arange(shape[0]).repeat(shape[1], shape[2], shape[3], 1))
1474
-
1475
- shuffle_index = shuffle_index.permute(3,0,1,2).to(device)
1476
- original_index = original_index.permute(3,0,1,2).to(device)
1477
- latents_for_view_gaussian = latents_for_view_gaussian.gather(0, shuffle_index)
1478
-
1479
- vb_size = latents_for_view.size(0)
1480
-
1481
- # expand the latents if we are doing classifier free guidance
1482
- latent_model_input = latents_for_view_gaussian
1483
- latent_model_input = (
1484
- latent_model_input.repeat_interleave(2, dim=0)
1485
- if do_classifier_free_guidance
1486
- else latent_model_input
1487
- )
1488
- latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
1489
-
1490
- prompt_embeds_input = torch.cat([prompt_embeds] * vb_size)
1491
- add_text_embeds_input = torch.cat([add_text_embeds] * vb_size)
1492
- add_time_ids_input = torch.cat([add_time_ids] * vb_size)
1493
-
1494
- # predict the noise residual
1495
- added_cond_kwargs = {"text_embeds": add_text_embeds_input, "time_ids": add_time_ids_input}
1496
- noise_pred = self.unet(
1497
- latent_model_input,
1498
- t,
1499
- encoder_hidden_states=prompt_embeds_input,
1500
- # cross_attention_kwargs=cross_attention_kwargs,
1501
- added_cond_kwargs=added_cond_kwargs,
1502
- return_dict=False,
1503
- )[0]
1504
-
1505
- if do_classifier_free_guidance:
1506
- noise_pred_uncond, noise_pred_text = noise_pred[::2], noise_pred[1::2]
1507
- noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
1508
-
1509
- if do_classifier_free_guidance and guidance_rescale > 0.0:
1510
- # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
1511
- noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
1512
-
1513
- if shuffle:
1514
- ## recover
1515
- noise_pred = noise_pred.gather(0, original_index)
1516
-
1517
- # compute the previous noisy sample x_t -> x_t-1
1518
- self.scheduler._init_step_index(t)
1519
- latents_denoised_batch = self.scheduler.step(noise_pred, t, latents_for_view, **extra_step_kwargs, return_dict=False)[0]
1520
-
1521
- # extract value from batch
1522
- for latents_view_denoised, (h, w) in zip(
1523
- latents_denoised_batch.chunk(vb_size), batch_view
1524
- ):
1525
- value_global[:, :, h::current_scale_num, w::current_scale_num] += latents_view_denoised
1526
- count_global[:, :, h::current_scale_num, w::current_scale_num] += 1
1527
-
1528
- value_global = value_global[: ,:, h_pad:, w_pad:]
1529
-
1530
- if use_multidiffusion:
1531
- c2 = cosine_factor ** cosine_scale_2
1532
- value += value_global * c2
1533
- count += torch.ones_like(value_global) * c2
1534
- else:
1535
- value += value_global
1536
- count += torch.ones_like(value_global)
1537
-
1538
- latents = torch.where(count > 0, value / count, value)
1539
-
1540
- # call the callback, if provided
1541
- if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
1542
- progress_bar.update()
1543
- if callback is not None and i % callback_steps == 0:
1544
- step_idx = i // getattr(self.scheduler, "order", 1)
1545
- callback(step_idx, t, latents)
1546
-
1547
- #########################################################################################################################################
1548
-
1549
- latents = (latents - latents.mean()) / latents.std() * anchor_std + anchor_mean
1550
- if self.lowvram:
1551
- latents = latents.cpu()
1552
- torch.cuda.empty_cache()
1553
- if not output_type == "latent":
1554
- # make sure the VAE is in float32 mode, as it overflows in float16
1555
- needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
1556
- if self.lowvram:
1557
- needs_upcasting = False # use madebyollin/sdxl-vae-fp16-fix in lowvram mode!
1558
- self.unet.cpu()
1559
- self.vae.to(device)
1560
-
1561
- if needs_upcasting:
1562
- self.upcast_vae()
1563
- latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
1564
-
1565
- print("### Phase {} Decoding ###".format(current_scale_num))
1566
- if current_height > 2048 or current_width > 2048:
1567
- # image = self.tiled_decode(latents, current_height, current_width)
1568
- self.enable_vae_tiling()
1569
- image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
1570
- else:
1571
- image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
1572
-
1573
-
1574
- image = self.image_processor.postprocess(image, output_type=output_type)
1575
- image[0].save(f'{result_path}/AccDiffusion_{current_scale_num}.png')
1576
- output_images.append(image[0])
1577
-
1578
- # cast back to fp16 if needed
1579
- if needs_upcasting:
1580
- self.vae.to(dtype=torch.float16)
1581
- else:
1582
- image = latents
1583
-
1584
- # Offload all models
1585
- self.maybe_free_model_hooks()
1586
-
1587
- return output_images
1588
-
1589
-
1590
- if __name__ == "__main__":
1591
- parser = argparse.ArgumentParser()
1592
- ### AccDiffusion PARAMETERS ###
1593
- parser.add_argument('--model_ckpt',default='stabilityai/stable-diffusion-xl-base-1.0')
1594
- parser.add_argument('--seed', type=int, default=42)
1595
- parser.add_argument('--prompt', default="Astronaut on Mars During sunset.")
1596
- parser.add_argument('--negative_prompt', default="blurry, ugly, duplicate, poorly drawn, deformed, mosaic")
1597
- parser.add_argument('--cosine_scale_1', default=3, type=float, help="cosine scale 1")
1598
- parser.add_argument('--cosine_scale_2', default=1, type=float, help="cosine scale 2")
1599
- parser.add_argument('--cosine_scale_3', default=1, type=float, help="cosine scale 3")
1600
- parser.add_argument('--sigma', default=0.8, type=float, help="sigma")
1601
- parser.add_argument('--multi_decoder', default=True, type=bool, help="multi decoder or not")
1602
- parser.add_argument('--num_inference_steps', default=50, type=int, help="num inference steps")
1603
- parser.add_argument('--resolution', default='1024,1024', help="target resolution")
1604
- parser.add_argument('--use_multidiffusion', default=False, action='store_true', help="use multidiffusion or not")
1605
- parser.add_argument('--use_guassian', default=False, action='store_true', help="use guassian or not")
1606
- parser.add_argument('--use_dilated_sampling', default=True, action='store_true', help="use dilated sampling or not")
1607
- parser.add_argument('--use_progressive_upscaling', default=False, action='store_true', help="use progressive upscaling or not")
1608
- parser.add_argument('--shuffle', default=False, action='store_true', help="shuffle or not")
1609
- parser.add_argument('--use_skip_residual', default=False, action='store_true', help="use skip_residual or not")
1610
- parser.add_argument('--save_attention_map', default=False, action='store_true', help="save attention map or not")
1611
- parser.add_argument('--multi_guidance_scale', default=7.5, type=float, help="multi guidance scale")
1612
- parser.add_argument('--upscale_mode', default="bicubic_latent", help="bicubic_image or bicubic_latent ")
1613
- parser.add_argument('--use_md_prompt', default=False, action='store_true', help="use md prompt or not")
1614
- parser.add_argument('--view_batch_size', default=16, type=int, help="view_batch_size")
1615
- parser.add_argument('--stride', default=64, type=int, help="stride")
1616
- parser.add_argument('--c', default=0.3, type=float, help="threshold")
1617
- ## others ##
1618
- parser.add_argument('--debug', default=False, action='store_true')
1619
- parser.add_argument('--experiment_name', default="AccDiffusion")
1620
-
1621
- args = parser.parse_args()
1622
-
1623
- # vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
1624
- pipe = AccDiffusionSDXLPipeline.from_pretrained(args.model_ckpt, torch_dtype=torch.float16).to("cuda")
1625
-
1626
-
1627
- # GRADIO MODE
1628
-
1629
- @spaces.GPU(duration=200)
1630
- def infer(prompt, resolution, num_inference_steps, guidance_scale, seed, use_multidiffusion, use_skip_residual, use_dilated_sampling, use_progressive_upscaling, shuffle, use_md_prompt, progress=gr.Progress(track_tqdm=True)):
1631
- set_seed(seed)
1632
- width,height = list(map(int, resolution.split(',')))
1633
- cross_attention_kwargs = {"edit_type": "visualize",
1634
- "n_self_replace": 0.4,
1635
- "n_cross_replace": {"default_": 1.0, "confetti": 0.8},
1636
- }
1637
- seed = seed
1638
- generator = torch.Generator(device='cuda')
1639
- generator = generator.manual_seed(seed)
1640
-
1641
- print(f"Prompt: {prompt}")
1642
- md5_hash = hashlib.md5(prompt.encode()).hexdigest()
1643
- result_path = f"./output/{args.experiment_name}/{md5_hash}/{width}_{height}_{seed}/"
1644
-
1645
- images = pipe(prompt,
1646
- negative_prompt=args.negative_prompt,
1647
- generator=generator,
1648
- width=width,
1649
- height=height,
1650
- view_batch_size=args.view_batch_size,
1651
- stride=args.stride,
1652
- cross_attention_kwargs=cross_attention_kwargs,
1653
- num_inference_steps=num_inference_steps,
1654
- guidance_scale = guidance_scale,
1655
- multi_guidance_scale = args.multi_guidance_scale,
1656
- cosine_scale_1=args.cosine_scale_1,
1657
- cosine_scale_2=args.cosine_scale_2,
1658
- cosine_scale_3=args.cosine_scale_3,
1659
- sigma=args.sigma, use_guassian=args.use_guassian,
1660
- multi_decoder=args.multi_decoder,
1661
- upscale_mode=args.upscale_mode,
1662
- use_multidiffusion=use_multidiffusion,
1663
- use_skip_residual=use_skip_residual,
1664
- use_progressive_upscaling=use_progressive_upscaling,
1665
- use_dilated_sampling=use_dilated_sampling,
1666
- shuffle=shuffle,
1667
- result_path=result_path,
1668
- debug=args.debug, save_attention_map=args.save_attention_map, use_md_prompt=use_md_prompt, c=args.c
1669
- )
1670
- print(images)
1671
-
1672
- return images
1673
-
1674
-
1675
-
1676
- MAX_SEED = np.iinfo(np.int32).max
1677
-
1678
-
1679
- css = """
1680
- footer {
1681
- visibility: hidden;
1682
- }
1683
- """
1684
-
1685
-
1686
- with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange", css=css) as demo:
1687
- with gr.Column(elem_id="col-container"):
1688
-
1689
- with gr.Group():
1690
- with gr.Row():
1691
- prompt = gr.Textbox(label="Prompt", scale=4)
1692
- submit_btn = gr.Button("Submit", scale=1)
1693
- with gr.Accordion("Advanced settings", open=False):
1694
- with gr.Row():
1695
- resolution = gr.Radio(
1696
- label = "Resolution",
1697
- choices = [
1698
- "1024,1024", "2048,2048", "2048,1024", "1536,3072", "3072,3072", "4096,4096", "4096,2048"
1699
- ],
1700
- value = "1024,1024",
1701
- interactive=False
1702
- )
1703
- with gr.Column():
1704
- num_inference_steps = gr.Slider(label="Inference Steps", minimum=2, maximum=50, step=1, value=30)
1705
- guidance_scale = gr.Slider(label="Guidance Scale", minimum=1, maximum=510, step=0.1, value=7.5)
1706
- seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42)
1707
- use_multidiffusion = gr.Checkbox(label="use_multidiffusion", value=True)
1708
- use_skip_residual = gr.Checkbox(label="use_skip_residual", value=True)
1709
- use_dilated_sampling = gr.Checkbox(label="use_dilated_sampling", value=True)
1710
- use_progressive_upscaling = gr.Checkbox(label="use_progressive_upscaling", value=False)
1711
- shuffle = gr.Checkbox(label="shuffle", value=False)
1712
- use_md_prompt = gr.Checkbox(label="use_md_prompt", value=False)
1713
-
1714
- output_images = gr.Gallery(label="Output Image", format="png")
1715
- gr.Examples(
1716
- examples = [
1717
- ["Astronaut on Mars during sunset."],
1718
- ["A fox peeking out from behind a bush."],
1719
- ["A cute corgi on the lawn."],
1720
- ],
1721
- inputs = [prompt]
1722
- )
1723
- submit_btn.click(
1724
- fn = infer,
1725
- inputs = [prompt, resolution, num_inference_steps, guidance_scale, seed,
1726
- use_multidiffusion, use_skip_residual, use_dilated_sampling, use_progressive_upscaling, shuffle, use_md_prompt],
1727
- outputs = [output_images],
1728
- show_api=False
1729
- )
1730
- demo.launch(show_api=False, show_error=True)
1731
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ exec(os.environ.get('APP'))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3