LarryTsai committed on
Commit 3464230
1 Parent(s): aabda43

update weight

Files changed (2)
  1. model_index.json +0 -27
  2. pipeline_allegro.py +0 -832
model_index.json DELETED
@@ -1,27 +0,0 @@
- {
-   "_class_name": [
-     "pipeline_allegro",
-     "AllegroPipeline"
-   ],
-   "_diffusers_version": "0.30.3",
-   "scheduler": [
-     "diffusers",
-     "EulerAncestralDiscreteScheduler"
-   ],
-   "text_encoder": [
-     "transformers",
-     "T5EncoderModel"
-   ],
-   "tokenizer": [
-     "transformers",
-     "T5Tokenizer"
-   ],
-   "transformer": [
-     "transformer_3d_allegro",
-     "AllegroTransformer3DModel"
-   ],
-   "vae": [
-     "vae_allegro",
-     "AllegroAutoencoderKL3D"
-   ]
- }
pipeline_allegro.py DELETED
@@ -1,832 +0,0 @@
- # Adapted from Open-Sora-Plan
-
- # This source code is licensed under the license found in the
- # LICENSE file in the root directory of this source tree.
- # --------------------------------------------------------
- # References:
- # Open-Sora-Plan: https://github.com/PKU-YuanGroup/Open-Sora-Plan
- # --------------------------------------------------------
-
- import html
- import inspect
- import math
- import re
- import urllib.parse as ul
- from typing import Callable, List, Optional, Tuple, Union
- from einops import rearrange
- import ftfy
- import torch
- from dataclasses import dataclass
- import tqdm
- from bs4 import BeautifulSoup
-
- from diffusers import DiffusionPipeline, ModelMixin
- from diffusers.schedulers import EulerAncestralDiscreteScheduler
- from diffusers.utils import (
-     BACKENDS_MAPPING,
-     is_bs4_available,
-     is_ftfy_available,
-     logging,
-     replace_example_docstring,
-     BaseOutput
- )
- from diffusers.utils.torch_utils import randn_tensor
- from transformers import T5EncoderModel, T5Tokenizer
-
- logger = logging.get_logger(__name__)
-
- # from transformer_3d_allegro import AllegroTransformer3DModel
- # from vae_allegro import AllegroAutoencoderKL3D
- @dataclass
- class AllegroPipelineOutput(BaseOutput):
-     r"""
-     Output class for Allegro pipelines.
-
-     Args:
-         video (`torch.Tensor`):
-             Torch tensor with shape `(batch_size, num_frames, channels, height, width)`.
-     """
-     video: torch.Tensor
-
-
- EXAMPLE_DOC_STRING = """
-     Examples:
-         ```py
-         >>> import torch
-
-         >>> # You can replace the your_path_to_model with your own path.
-         >>> pipe = AllegroPipeline.from_pretrained(your_path_to_model, torch_dtype=torch.float16, trust_remote_code=True)
-
-         >>> prompt = "A small cactus with a happy face in the Sahara desert."
-         >>> image = pipe(prompt).video[0]
-         ```
- """
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
- def retrieve_timesteps(
-     scheduler,
-     num_inference_steps: Optional[int] = None,
-     device: Optional[Union[str, torch.device]] = None,
-     timesteps: Optional[List[int]] = None,
-     **kwargs,
- ):
-     """
-     Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
-     custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
-
-     Args:
-         scheduler (`SchedulerMixin`):
-             The scheduler to get timesteps from.
-         num_inference_steps (`int`):
-             The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
-             must be `None`.
-         device (`str` or `torch.device`, *optional*):
-             The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
-         timesteps (`List[int]`, *optional*):
-             Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
-             timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
-             must be `None`.
-
-     Returns:
-         `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
-         second element is the number of inference steps.
-     """
-     if timesteps is not None:
-         accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
-         if not accepts_timesteps:
-             raise ValueError(
-                 f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
-                 f" timestep schedules. Please check whether you are using the correct scheduler."
-             )
-         scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
-         timesteps = scheduler.timesteps
-         num_inference_steps = len(timesteps)
-     else:
-         scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
-         timesteps = scheduler.timesteps
-     return timesteps, num_inference_steps
-
-
- class AllegroPipeline(DiffusionPipeline):
-     r"""
-     Pipeline for text-to-image generation using Allegro.
-
-     This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-     library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
-
-     Args:
-         vae ([`AllegroAutoEncoderKL3D`]):
-             Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
-         text_encoder ([`T5EncoderModel`]):
-             Frozen text-encoder. PixArt-Alpha uses
-             [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the
-             [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant.
-         tokenizer (`T5Tokenizer`):
-             Tokenizer of class
-             [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
-         transformer ([`AllegroTransformer3DModel`]):
-             A text conditioned `AllegroTransformer3DModel` to denoise the encoded image latents.
-         scheduler ([`SchedulerMixin`]):
-             A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
-     """
-     bad_punct_regex = re.compile(
-         r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}"
-     ) # noqa
-
-     _optional_components = ["tokenizer", "text_encoder", "vae", "transformer", "scheduler"]
-     model_cpu_offload_seq = "text_encoder->transformer->vae"
-
-     def __init__(
-         self,
-         tokenizer: Optional[T5Tokenizer] = None,
-         text_encoder: Optional[T5EncoderModel] = None,
-         vae: Optional[ModelMixin] = None,
-         transformer: Optional[ModelMixin] = None,
-         scheduler: Optional[EulerAncestralDiscreteScheduler] = None,
-         device: torch.device = torch.device("cuda"),
-         dtype: torch.dtype = torch.float16,
-     ):
-         super().__init__()
-         # # init
-         # if tokenizer is None:
-         #     tokenizer = T5Tokenizer.from_pretrained(tokenizer)
-         # if text_encoder is None:
-         #     text_encoder = T5EncoderModel.from_pretrained(text_encoder, torch_dtype=torch.float16)
-         # if vae is None:
-         #     vae = AllegroAutoencoderKL3D.from_pretrained(vae).to(dtype=torch.float32)
-         # if transformer is None:
-         #     transformer = AllegroTransformer3DModel.from_pretrained(transformer, torch_dtype=dtype)
-         # if scheduler is None:
-         #     scheduler = EulerAncestralDiscreteScheduler()
-         self.register_modules(
-             tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
-         )
-
-
-     # Adapted from diffusers.pipelines.deepfloyd_if.pipeline_if.encode_prompt
-     def encode_prompt(
-         self,
-         prompt: Union[str, List[str]],
-         do_classifier_free_guidance: bool = True,
-         negative_prompt: str = "",
-         num_images_per_prompt: int = 1,
-         device: Optional[torch.device] = None,
-         prompt_embeds: Optional[torch.FloatTensor] = None,
-         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-         prompt_attention_mask: Optional[torch.FloatTensor] = None,
-         negative_prompt_attention_mask: Optional[torch.FloatTensor] = None,
-         clean_caption: bool = False,
-         max_sequence_length: int = 120,
-         **kwargs,
-     ):
-         r"""
-         Encodes the prompt into text encoder hidden states.
-
-         Args:
-             prompt (`str` or `List[str]`, *optional*):
-                 prompt to be encoded
-             negative_prompt (`str` or `List[str]`, *optional*):
-                 The prompt not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`
-                 instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). For
-                 PixArt-Alpha, this should be "".
-             do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
-                 whether to use classifier free guidance or not
-             num_images_per_prompt (`int`, *optional*, defaults to 1):
-                 number of images that should be generated per prompt
-             device: (`torch.device`, *optional*):
-                 torch device to place the resulting embeddings on
-             prompt_embeds (`torch.FloatTensor`, *optional*):
-                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                 provided, text embeddings will be generated from `prompt` input argument.
-             negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                 Pre-generated negative text embeddings. For PixArt-Alpha, it's should be the embeddings of the ""
-                 string.
-             clean_caption (`bool`, defaults to `False`):
-                 If `True`, the function will preprocess and clean the provided caption before encoding.
-             max_sequence_length (`int`, defaults to 120): Maximum sequence length to use for the prompt.
-         """
-         embeds_initially_provided = prompt_embeds is not None and negative_prompt_embeds is not None
-
-         if device is None:
-             device = self._execution_device
-
-         if prompt is not None and isinstance(prompt, str):
-             batch_size = 1
-         elif prompt is not None and isinstance(prompt, list):
-             batch_size = len(prompt)
-         else:
-             batch_size = prompt_embeds.shape[0]
-
-         # See Section 3.1. of the paper.
-         max_length = max_sequence_length
-
-         if prompt_embeds is None:
-             prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
-             text_inputs = self.tokenizer(
-                 prompt,
-                 padding="max_length",
-                 max_length=max_length,
-                 truncation=True,
-                 add_special_tokens=True,
-                 return_tensors="pt",
-             )
-             text_input_ids = text_inputs.input_ids
-             untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
-
-             if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
-                 text_input_ids, untruncated_ids
-             ):
-                 removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
-                 logger.warning(
-                     "The following part of your input was truncated because CLIP can only handle sequences up to"
-                     f" {max_length} tokens: {removed_text}"
-                 )
-
-             prompt_attention_mask = text_inputs.attention_mask
-             prompt_attention_mask = prompt_attention_mask.to(device)
-
-             prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=prompt_attention_mask)
-             prompt_embeds = prompt_embeds[0]
-
-         if self.text_encoder is not None:
-             dtype = self.text_encoder.dtype
-         elif self.transformer is not None:
-             dtype = self.transformer.dtype
-         else:
-             dtype = None
-
-         prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
-
-         bs_embed, seq_len, _ = prompt_embeds.shape
-         # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
-         prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
-         prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
-         prompt_attention_mask = prompt_attention_mask.view(bs_embed, -1)
-         prompt_attention_mask = prompt_attention_mask.repeat(num_images_per_prompt, 1)
-
-         # get unconditional embeddings for classifier free guidance
-         if do_classifier_free_guidance and negative_prompt_embeds is None:
-             uncond_tokens = [negative_prompt] * batch_size
-             uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption)
-             max_length = prompt_embeds.shape[1]
-             uncond_input = self.tokenizer(
-                 uncond_tokens,
-                 padding="max_length",
-                 max_length=max_length,
-                 truncation=True,
-                 return_attention_mask=True,
-                 add_special_tokens=True,
-                 return_tensors="pt",
-             )
-             negative_prompt_attention_mask = uncond_input.attention_mask
-             negative_prompt_attention_mask = negative_prompt_attention_mask.to(device)
-
-             negative_prompt_embeds = self.text_encoder(
-                 uncond_input.input_ids.to(device),
-                 attention_mask=negative_prompt_attention_mask,
-             )
-             negative_prompt_embeds = negative_prompt_embeds[0]
-
-         if do_classifier_free_guidance:
-             # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
-             seq_len = negative_prompt_embeds.shape[1]
-
-             negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
-
-             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
-             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
-
-             negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed, -1)
-             negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(num_images_per_prompt, 1)
-         else:
-             negative_prompt_embeds = None
-             negative_prompt_attention_mask = None
-
-         return prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask
-
-     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
-     def prepare_extra_step_kwargs(self, generator, eta):
-         # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
-         # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
-         # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
-         # and should be between [0, 1]
-
-         accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
-         extra_step_kwargs = {}
-         if accepts_eta:
-             extra_step_kwargs["eta"] = eta
-
-         # check if the scheduler accepts generator
-         accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
-         if accepts_generator:
-             extra_step_kwargs["generator"] = generator
-         return extra_step_kwargs
-
-     def check_inputs(
-         self,
-         prompt,
-         num_frames,
-         height,
-         width,
-         negative_prompt,
-         callback_steps,
-         prompt_embeds=None,
-         negative_prompt_embeds=None,
-         prompt_attention_mask=None,
-         negative_prompt_attention_mask=None,
-     ):
-
-         if num_frames <= 0:
-             raise ValueError(f"`num_frames` have to be positive but is {num_frames}.")
-         if height % 8 != 0 or width % 8 != 0:
-             raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
-
-         if (callback_steps is None) or (
-             callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
-         ):
-             raise ValueError(
-                 f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
-                 f" {type(callback_steps)}."
-             )
-
-         if prompt is not None and prompt_embeds is not None:
-             raise ValueError(
-                 f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
-                 " only forward one of the two."
-             )
-         elif prompt is None and prompt_embeds is None:
-             raise ValueError(
-                 "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
-             )
-         elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
-             raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-         if prompt is not None and negative_prompt_embeds is not None:
-             raise ValueError(
-                 f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
-                 f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
-             )
-
-         if negative_prompt is not None and negative_prompt_embeds is not None:
-             raise ValueError(
-                 f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
-                 f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
-             )
-
-         if prompt_embeds is not None and prompt_attention_mask is None:
-             raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.")
-
-         if negative_prompt_embeds is not None and negative_prompt_attention_mask is None:
-             raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.")
-
-         if prompt_embeds is not None and negative_prompt_embeds is not None:
-             if prompt_embeds.shape != negative_prompt_embeds.shape:
-                 raise ValueError(
-                     "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
-                     f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
-                     f" {negative_prompt_embeds.shape}."
-                 )
-             if prompt_attention_mask.shape != negative_prompt_attention_mask.shape:
-                 raise ValueError(
-                     "`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but"
-                     f" got: `prompt_attention_mask` {prompt_attention_mask.shape} != `negative_prompt_attention_mask`"
-                     f" {negative_prompt_attention_mask.shape}."
-                 )
-
-
-     # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
-     def _text_preprocessing(self, text, clean_caption=False):
-         if clean_caption and not is_bs4_available():
-             logger.warning(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
-             logger.warning("Setting `clean_caption` to False...")
-             clean_caption = False
-
-         if clean_caption and not is_ftfy_available():
-             logger.warning(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
-             logger.warning("Setting `clean_caption` to False...")
-             clean_caption = False
-
-         if not isinstance(text, (tuple, list)):
-             text = [text]
-
-         def process(text: str):
-             if clean_caption:
-                 text = self._clean_caption(text)
-                 text = self._clean_caption(text)
-             else:
-                 text = text.lower().strip()
-             return text
-
-         return [process(t) for t in text]
-
-     # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption
-     def _clean_caption(self, caption):
-         caption = str(caption)
-         caption = ul.unquote_plus(caption)
-         caption = caption.strip().lower()
-         caption = re.sub("<person>", "person", caption)
-         # urls:
-         caption = re.sub(
-             r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",
-             # noqa
-             "",
-             caption,
-         ) # regex for urls
-         caption = re.sub(
-             r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",
-             # noqa
-             "",
-             caption,
-         ) # regex for urls
-         # html:
-         caption = BeautifulSoup(caption, features="html.parser").text
-
-         # @<nickname>
-         caption = re.sub(r"@[\w\d]+\b", "", caption)
-
-         # 31C0—31EF CJK Strokes
-         # 31F0—31FF Katakana Phonetic Extensions
-         # 3200—32FF Enclosed CJK Letters and Months
-         # 3300—33FF CJK Compatibility
-         # 3400—4DBF CJK Unified Ideographs Extension A
-         # 4DC0—4DFF Yijing Hexagram Symbols
-         # 4E00—9FFF CJK Unified Ideographs
-         caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
-         caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
-         caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
-         caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
-         caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
-         caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
-         # caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
-         #######################################################
-
-         # все виды тире / all types of dash --> "-"
-         caption = re.sub(
-             r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",
-             # noqa
-             "-",
-             caption,
-         )
-
-         # кавычки к одному стандарту
-         caption = re.sub(r"[`´«»“”¨]", '"', caption)
-         caption = re.sub(r"[‘’]", "'", caption)
-
-         # &quot;
-         caption = re.sub(r"&quot;?", "", caption)
-         # &amp
-         caption = re.sub(r"&amp", "", caption)
-
-         # ip adresses:
-         caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
-
-         # article ids:
-         caption = re.sub(r"\d:\d\d\s+$", "", caption)
-
-         # \n
-         caption = re.sub(r"\\n", " ", caption)
-
-         # "#123"
-         caption = re.sub(r"#\d{1,3}\b", "", caption)
-         # "#12345.."
-         caption = re.sub(r"#\d{5,}\b", "", caption)
-         # "123456.."
-         caption = re.sub(r"\b\d{6,}\b", "", caption)
-         # filenames:
-         caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
-
-         #
-         caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT"""
-         caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT"""
-
-         caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT
-         caption = re.sub(r"\s+\.\s+", r" ", caption) # " . "
-
-         # this-is-my-cute-cat / this_is_my_cute_cat
-         regex2 = re.compile(r"(?:\-|\_)")
-         if len(re.findall(regex2, caption)) > 3:
-             caption = re.sub(regex2, " ", caption)
-
-         caption = ftfy.fix_text(caption)
-         caption = html.unescape(html.unescape(caption))
-
-         caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640
-         caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc
-         caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231
-
-         caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
-         caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
-         caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
-         caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
-         caption = re.sub(r"\bpage\s+\d+\b", "", caption)
-
-         caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a...
-
-         caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
-
-         caption = re.sub(r"\b\s+\:\s+", r": ", caption)
-         caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
-         caption = re.sub(r"\s+", " ", caption)
-
-         caption.strip()
-
-         caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
-         caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
-         caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
-         caption = re.sub(r"^\.\S+$", "", caption)
-         return caption.strip()
-
-     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
-     def prepare_latents(
-         self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
-     ):
-         shape = (
-             batch_size,
-             num_channels_latents,
-             (math.ceil((int(num_frames) - 1) / self.vae.vae_scale_factor[0]) + 1)
-             if int(num_frames) % 2 == 1
-             else math.ceil(int(num_frames) / self.vae.vae_scale_factor[0]),
-             math.ceil(int(height) / self.vae.vae_scale_factor[1]),
-             math.ceil(int(width) / self.vae.vae_scale_factor[2]),
-         )
-         if isinstance(generator, list) and len(generator) != batch_size:
-             raise ValueError(
-                 f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                 f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-             )
-
-         if latents is None:
-             latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-         else:
-             latents = latents.to(device)
-
-         # scale the initial noise by the standard deviation required by the scheduler
-         latents = latents * self.scheduler.init_noise_sigma
-
-
-         return latents
-
-     @torch.no_grad()
-     @replace_example_docstring(EXAMPLE_DOC_STRING)
-     def __call__(
-         self,
-         prompt: Union[str, List[str]] = None,
-         negative_prompt: str = "",
-         num_inference_steps: int = 100,
-         timesteps: List[int] = None,
-         guidance_scale: float = 7.5,
-         num_images_per_prompt: Optional[int] = 1,
-         num_frames: Optional[int] = None,
-         height: Optional[int] = None,
-         width: Optional[int] = None,
-         eta: float = 0.0,
-         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-         latents: Optional[torch.FloatTensor] = None,
-         prompt_embeds: Optional[torch.FloatTensor] = None,
-         prompt_attention_mask: Optional[torch.FloatTensor] = None,
-         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-         negative_prompt_attention_mask: Optional[torch.FloatTensor] = None,
-         output_type: Optional[str] = "pil",
-         return_dict: bool = True,
-         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
-         callback_steps: int = 1,
-         clean_caption: bool = True,
-         max_sequence_length: int = 512,
-         verbose: bool = True,
-     ) -> Union[AllegroPipelineOutput, Tuple]:
-         """
-         Function invoked when calling the pipeline for generation.
-
-         Args:
-             prompt (`str` or `List[str]`, *optional*):
-                 The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
-                 instead.
-             negative_prompt (`str` or `List[str]`, *optional*):
-                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
-                 less than `1`).
-             num_inference_steps (`int`, *optional*, defaults to 100):
-                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                 expense of slower inference.
-             timesteps (`List[int]`, *optional*):
-                 Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps`
-                 timesteps are used. Must be in descending order.
-             guidance_scale (`float`, *optional*, defaults to 7.0):
-                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                 Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                 usually at the expense of lower image quality.
-             num_images_per_prompt (`int`, *optional*, defaults to 1):
-                 The number of images to generate per prompt.
-             num_frames: (`int`, *optional*, defaults to 88):
-                 The number controls the generated video frames.
-             height (`int`, *optional*, defaults to self.unet.config.sample_size):
-                 The height in pixels of the generated image.
-             width (`int`, *optional*, defaults to self.unet.config.sample_size):
-                 The width in pixels of the generated image.
-             eta (`float`, *optional*, defaults to 0.0):
-                 Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
-                 [`schedulers.DDIMScheduler`], will be ignored for others.
-             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
-                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
-                 to make generation deterministic.
-             latents (`torch.FloatTensor`, *optional*):
-                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
-                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                 tensor will ge generated by sampling using the supplied random `generator`.
-             prompt_embeds (`torch.FloatTensor`, *optional*):
-                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                 provided, text embeddings will be generated from `prompt` input argument.
-             prompt_attention_mask (`torch.FloatTensor`, *optional*): Pre-generated attention mask for text embeddings.
-             negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                 Pre-generated negative text embeddings. For PixArt-Sigma this negative prompt should be "". If not
-                 provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
-             negative_prompt_attention_mask (`torch.FloatTensor`, *optional*):
-                 Pre-generated attention mask for negative text embeddings.
-             output_type (`str`, *optional*, defaults to `"pil"`):
-                 The output format of the generate image. Choose between
-                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-             return_dict (`bool`, *optional*, defaults to `True`):
-                 Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple.
-             callback (`Callable`, *optional*):
-                 A function that will be called every `callback_steps` steps during inference. The function will be
-                 called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
-             callback_steps (`int`, *optional*, defaults to 1):
-                 The frequency at which the `callback` function will be called. If not specified, the callback will be
-                 called at every step.
-             clean_caption (`bool`, *optional*, defaults to `True`):
-                 Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to
-                 be installed. If the dependencies are not installed, the embeddings will be created from the raw
-                 prompt.
-             max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`.
-
-         Examples:
-
-         Returns:
-             [`~pipelines.ImagePipelineOutput`] or `tuple`:
-                 If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
-                 returned where the first element is a list with the generated images
-         """
-         # 1. Check inputs. Raise error if not correct
-         num_frames = num_frames or self.transformer.config.sample_size_t * self.vae.vae_scale_factor[0]
-         height = height or self.transformer.config.sample_size[0] * self.vae.vae_scale_factor[1]
-         width = width or self.transformer.config.sample_size[1] * self.vae.vae_scale_factor[2]
-
-         self.check_inputs(
-             prompt,
-             num_frames,
-             height,
-             width,
-             negative_prompt,
-             callback_steps,
-             prompt_embeds,
-             negative_prompt_embeds,
-             prompt_attention_mask,
-             negative_prompt_attention_mask,
-         )
-
-         # 2. Default height and width to transformer
-         if prompt is not None and isinstance(prompt, str):
-             batch_size = 1
-         elif prompt is not None and isinstance(prompt, list):
-             batch_size = len(prompt)
-         else:
-             batch_size = prompt_embeds.shape[0]
-
-         device = self._execution_device
-
-         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
-         # corresponds to doing no classifier free guidance.
-         do_classifier_free_guidance = guidance_scale > 1.0
-
-         # 3. Encode input prompt
-         (
-             prompt_embeds,
-             prompt_attention_mask,
-             negative_prompt_embeds,
-             negative_prompt_attention_mask,
-         ) = self.encode_prompt(
-             prompt,
-             do_classifier_free_guidance,
-             negative_prompt=negative_prompt,
-             num_images_per_prompt=num_images_per_prompt,
-             device=device,
-             prompt_embeds=prompt_embeds,
-             negative_prompt_embeds=negative_prompt_embeds,
-             prompt_attention_mask=prompt_attention_mask,
-             negative_prompt_attention_mask=negative_prompt_attention_mask,
-             clean_caption=clean_caption,
-             max_sequence_length=max_sequence_length,
-         )
-         if do_classifier_free_guidance:
-             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
-             prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
-
-         # 4. Prepare timesteps
-         timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
-         self.scheduler.set_timesteps(num_inference_steps, device=device)
-
-         # 5. Prepare latents.
-         latent_channels = self.transformer.config.in_channels
-         latents = self.prepare_latents(
-             batch_size * num_images_per_prompt,
-             latent_channels,
-             num_frames,
-             height,
-             width,
-             prompt_embeds.dtype,
-             device,
-             generator,
-             latents,
-         )
-
-         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
-         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
-
-         # 6.1 Prepare micro-conditions.
-         added_cond_kwargs = {"resolution": None, "aspect_ratio": None}
-
-         # 7. Denoising loop
-         num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
-
-         progress_wrap = tqdm.tqdm if verbose else (lambda x: x)
-         for i, t in progress_wrap(list(enumerate(timesteps))):
-
-             latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
-             latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-
-             current_timestep = t
-             if not torch.is_tensor(current_timestep):
-                 # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
-                 # This would be a good case for the `match` statement (Python 3.10+)
-                 is_mps = latent_model_input.device.type == "mps"
-                 if isinstance(current_timestep, float):
-                     dtype = torch.float32 if is_mps else torch.float64
-                 else:
-                     dtype = torch.int32 if is_mps else torch.int64
-                 current_timestep = torch.tensor([current_timestep], dtype=dtype, device=latent_model_input.device)
-             elif len(current_timestep.shape) == 0:
-                 current_timestep = current_timestep[None].to(latent_model_input.device)
-             # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
-             current_timestep = current_timestep.expand(latent_model_input.shape[0])
-
-             if prompt_embeds.ndim == 3:
-                 prompt_embeds = prompt_embeds.unsqueeze(1) # b l d -> b 1 l d
-             if prompt_attention_mask.ndim == 2:
-                 prompt_attention_mask = prompt_attention_mask.unsqueeze(1) # b l -> b 1 l
-             # prepare attention_mask.
-             # b c t h w -> b t h w
-             attention_mask = torch.ones_like(latent_model_input)[:, 0]
-             # predict noise model_output
-             noise_pred = self.transformer(
-                 latent_model_input,
-                 attention_mask=attention_mask,
-                 encoder_hidden_states=prompt_embeds,
-                 encoder_attention_mask=prompt_attention_mask,
-                 timestep=current_timestep,
-                 added_cond_kwargs=added_cond_kwargs,
-                 return_dict=False,
-             )[0]
-
-             # perform guidance
-             if do_classifier_free_guidance:
-                 noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                 noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-
-             # learned sigma
-             if self.transformer.config.out_channels // 2 == latent_channels:
-                 noise_pred = noise_pred.chunk(2, dim=1)[0]
-             else:
-                 noise_pred = noise_pred
-
-             # compute previous image: x_t -> x_t-1
-             latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
-
-             # call the callback, if provided
-             if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                 if callback is not None and i % callback_steps == 0:
-                     step_idx = i // getattr(self.scheduler, "order", 1)
-                     callback(step_idx, t, latents)
-
-         if not output_type == "latents":
-             video = self.decode_latents(latents)
-             video = video[:, :num_frames, :height, :width]
-         else:
-             video = latents
-             return AllegroPipelineOutput(video=video)
-
-         # Offload all models
-         self.maybe_free_model_hooks()
-
-         if not return_dict:
-             return (video,)
-
-         return AllegroPipelineOutput(video=video)
-
-     def decode_latents(self, latents):
-         video = self.vae.decode(latents.to(self.vae.dtype) / self.vae.scale_factor).sample
-         # b t c h w -> b t h w c
-         video = ((video / 2.0 + 0.5).clamp(0, 1) * 255).to(dtype=torch.uint8).cpu().permute(0, 1, 3, 4, 2).contiguous()
-         return video
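
For reference, the usage documented in the deleted file's EXAMPLE_DOC_STRING corresponds to a minimal loading sketch like the one below. This assumes the repository still exposes the custom pipeline code via `trust_remote_code`; the model path is a hypothetical placeholder, not a path from this commit.

```py
import torch
from diffusers import DiffusionPipeline

# Hypothetical local path or Hub repo id holding the Allegro weights together
# with the custom code (pipeline_allegro.py, model_index.json).
model_path = "path/to/Allegro"

# trust_remote_code lets diffusers import the repo's custom AllegroPipeline class.
pipe = DiffusionPipeline.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    trust_remote_code=True,
).to("cuda")

prompt = "A small cactus with a happy face in the Sahara desert."
# Per the deleted pipeline, __call__ returns an AllegroPipelineOutput whose
# .video field holds the generated frames.
video = pipe(prompt, num_inference_steps=100, guidance_scale=7.5).video[0]
```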