resource-consumption #3

by pcuenq - opened

Files changed (43)
  1. .gitignore +1 -0
  2. app.py +103 -133
  3. outputs/000000.mp4 +0 -0
  4. outputs/000001.mp4 +0 -0
  5. outputs/000002.mp4 +0 -0
  6. outputs/000003.mp4 +0 -0
  7. outputs/000004.mp4 +0 -3
  8. outputs/000005.mp4 +0 -0
  9. outputs/simple_video_sample/svd_xt/000000.mp4 +0 -0
  10. scripts/__pycache__/__init__.cpython-310.pyc +0 -0
  11. scripts/util/__pycache__/__init__.cpython-310.pyc +0 -0
  12. scripts/util/detection/__pycache__/__init__.cpython-310.pyc +0 -0
  13. scripts/util/detection/__pycache__/nsfw_and_watermark_dectection.cpython-310.pyc +0 -0
  14. sgm/__pycache__/__init__.cpython-310.pyc +0 -0
  15. sgm/__pycache__/util.cpython-310.pyc +0 -0
  16. sgm/inference/__pycache__/helpers.cpython-310.pyc +0 -0
  17. sgm/models/__pycache__/__init__.cpython-310.pyc +0 -0
  18. sgm/models/__pycache__/autoencoder.cpython-310.pyc +0 -0
  19. sgm/models/__pycache__/diffusion.cpython-310.pyc +0 -0
  20. sgm/modules/__pycache__/__init__.cpython-310.pyc +0 -0
  21. sgm/modules/__pycache__/attention.cpython-310.pyc +0 -0
  22. sgm/modules/__pycache__/ema.cpython-310.pyc +0 -0
  23. sgm/modules/__pycache__/video_attention.cpython-310.pyc +0 -0
  24. sgm/modules/autoencoding/__pycache__/__init__.cpython-310.pyc +0 -0
  25. sgm/modules/autoencoding/__pycache__/temporal_ae.cpython-310.pyc +0 -0
  26. sgm/modules/autoencoding/regularizers/__pycache__/__init__.cpython-310.pyc +0 -0
  27. sgm/modules/autoencoding/regularizers/__pycache__/base.cpython-310.pyc +0 -0
  28. sgm/modules/diffusionmodules/__pycache__/__init__.cpython-310.pyc +0 -0
  29. sgm/modules/diffusionmodules/__pycache__/denoiser.cpython-310.pyc +0 -0
  30. sgm/modules/diffusionmodules/__pycache__/denoiser_scaling.cpython-310.pyc +0 -0
  31. sgm/modules/diffusionmodules/__pycache__/discretizer.cpython-310.pyc +0 -0
  32. sgm/modules/diffusionmodules/__pycache__/guiders.cpython-310.pyc +0 -0
  33. sgm/modules/diffusionmodules/__pycache__/model.cpython-310.pyc +0 -0
  34. sgm/modules/diffusionmodules/__pycache__/openaimodel.cpython-310.pyc +0 -0
  35. sgm/modules/diffusionmodules/__pycache__/sampling.cpython-310.pyc +0 -0
  36. sgm/modules/diffusionmodules/__pycache__/sampling_utils.cpython-310.pyc +0 -0
  37. sgm/modules/diffusionmodules/__pycache__/util.cpython-310.pyc +0 -0
  38. sgm/modules/diffusionmodules/__pycache__/video_model.cpython-310.pyc +0 -0
  39. sgm/modules/diffusionmodules/__pycache__/wrappers.cpython-310.pyc +0 -0
  40. sgm/modules/distributions/__pycache__/__init__.cpython-310.pyc +0 -0
  41. sgm/modules/distributions/__pycache__/distributions.cpython-310.pyc +0 -0
  42. sgm/modules/encoders/__pycache__/__init__.cpython-310.pyc +0 -0
  43. sgm/modules/encoders/__pycache__/modules.cpython-310.pyc +0 -0
.gitignore ADDED
@@ -0,0 +1 @@
+__pycache__/
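
Note: the new `.gitignore` entry only prevents freshly generated byte-compiled files from being tracked; the `__pycache__` directories already committed are deleted further down in this diff. A minimal local clean-up sketch (not part of the PR, paths assumed relative to the repo root):

    import pathlib
    import shutil

    # Remove every accidentally committed __pycache__ directory under the repo.
    for cache_dir in list(pathlib.Path(".").rglob("__pycache__")):
        shutil.rmtree(cache_dir, ignore_errors=True)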
app.py CHANGED
@@ -69,158 +69,130 @@ model, filter = load_model(
 )
 
 def sample(
-    input_path: str = "assets/test_image.png",  # Can either be image file or folder with image files
+    image: Image,
     seed: Optional[int] = None,
     randomize_seed: bool = True,
     motion_bucket_id: int = 127,
     fps_id: int = 6,
     version: str = "svd_xt",
     cond_aug: float = 0.02,
-    decoding_t: int = 7,  # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
+    decoding_t: int = 5,  # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
     device: str = "cuda",
     output_folder: str = "outputs",
     progress=gr.Progress(track_tqdm=True)
 ):
-    """
-    Simple script to generate a single sample conditioned on an image `input_path` or multiple images, one for each
-    image file in folder `input_path`. If you run out of VRAM, try decreasing `decoding_t`.
-    """
     if(randomize_seed):
         seed = random.randint(0, max_64_bit_int)
 
     torch.manual_seed(seed)
 
-    path = Path(input_path)
-    all_img_paths = []
-    if path.is_file():
-        if any([input_path.endswith(x) for x in ["jpg", "jpeg", "png"]]):
-            all_img_paths = [input_path]
-        else:
-            raise ValueError("Path is not valid image file.")
-    elif path.is_dir():
-        all_img_paths = sorted(
-            [
-                f
-                for f in path.iterdir()
-                if f.is_file() and f.suffix.lower() in [".jpg", ".jpeg", ".png"]
-            ]
-        )
-        if len(all_img_paths) == 0:
-            raise ValueError("Folder does not contain any images.")
-    else:
-        raise ValueError
-
-    for input_img_path in all_img_paths:
-        with Image.open(input_img_path) as image:
-            if image.mode == "RGBA":
-                image = image.convert("RGB")
-            w, h = image.size
-
-            if h % 64 != 0 or w % 64 != 0:
-                width, height = map(lambda x: x - x % 64, (w, h))
-                image = image.resize((width, height))
-                print(
-                    f"WARNING: Your image is of size {h}x{w} which is not divisible by 64. We are resizing to {height}x{width}!"
-                )
-
-            image = ToTensor()(image)
-            image = image * 2.0 - 1.0
-
-        image = image.unsqueeze(0).to(device)
-        H, W = image.shape[2:]
-        assert image.shape[1] == 3
-        F = 8
-        C = 4
-        shape = (num_frames, C, H // F, W // F)
-        if (H, W) != (576, 1024):
-            print(
-                "WARNING: The conditioning frame you provided is not 576x1024. This leads to suboptimal performance as model was only trained on 576x1024. Consider increasing `cond_aug`."
-            )
-        if motion_bucket_id > 255:
-            print(
-                "WARNING: High motion bucket! This may lead to suboptimal performance."
-            )
-
-        if fps_id < 5:
-            print("WARNING: Small fps value! This may lead to suboptimal performance.")
-
-        if fps_id > 30:
-            print("WARNING: Large fps value! This may lead to suboptimal performance.")
-
-        value_dict = {}
-        value_dict["motion_bucket_id"] = motion_bucket_id
-        value_dict["fps_id"] = fps_id
-        value_dict["cond_aug"] = cond_aug
-        value_dict["cond_frames_without_noise"] = image
-        value_dict["cond_frames"] = image + cond_aug * torch.randn_like(image)
-        value_dict["cond_aug"] = cond_aug
-
-        with torch.no_grad():
-            with torch.autocast(device):
-                batch, batch_uc = get_batch(
-                    get_unique_embedder_keys_from_conditioner(model.conditioner),
-                    value_dict,
-                    [1, num_frames],
-                    T=num_frames,
-                    device=device,
-                )
-                c, uc = model.conditioner.get_unconditional_conditioning(
-                    batch,
-                    batch_uc=batch_uc,
-                    force_uc_zero_embeddings=[
-                        "cond_frames",
-                        "cond_frames_without_noise",
-                    ],
-                )
-
-                for k in ["crossattn", "concat"]:
-                    uc[k] = repeat(uc[k], "b ... -> b t ...", t=num_frames)
-                    uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=num_frames)
-                    c[k] = repeat(c[k], "b ... -> b t ...", t=num_frames)
-                    c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=num_frames)
-
-                randn = torch.randn(shape, device=device)
-
-                additional_model_inputs = {}
-                additional_model_inputs["image_only_indicator"] = torch.zeros(
-                    2, num_frames
-                ).to(device)
-                additional_model_inputs["num_video_frames"] = batch["num_video_frames"]
-
-                def denoiser(input, sigma, c):
-                    return model.denoiser(
-                        model.model, input, sigma, c, **additional_model_inputs
-                    )
-
-                samples_z = model.sampler(denoiser, randn, cond=c, uc=uc)
-                model.en_and_decode_n_samples_a_time = decoding_t
-                samples_x = model.decode_first_stage(samples_z)
-                samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
-
-                os.makedirs(output_folder, exist_ok=True)
-                base_count = len(glob(os.path.join(output_folder, "*.mp4")))
-                video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")
-                writer = cv2.VideoWriter(
-                    video_path,
-                    cv2.VideoWriter_fourcc(*"mp4v"),
-                    fps_id + 1,
-                    (samples.shape[-1], samples.shape[-2]),
-                )
-
-                samples = embed_watermark(samples)
-                samples = filter(samples)
-                vid = (
-                    (rearrange(samples, "t c h w -> t h w c") * 255)
-                    .cpu()
-                    .numpy()
-                    .astype(np.uint8)
-                )
-                for frame in vid:
-                    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
-                    writer.write(frame)
-                writer.release()
-
-    return video_path, seed
+    if image.mode == "RGBA":
+        image = image.convert("RGB")
+    w, h = image.size
+
+    if h % 64 != 0 or w % 64 != 0:
+        width, height = map(lambda x: x - x % 64, (w, h))
+        image = image.resize((width, height))
+        print(
+            f"WARNING: Your image is of size {h}x{w} which is not divisible by 64. We are resizing to {height}x{width}!"
+        )
+
+    image = ToTensor()(image)
+    image = image * 2.0 - 1.0
+    image = image.unsqueeze(0).to(device)
+    H, W = image.shape[2:]
+    assert image.shape[1] == 3
+    F = 8
+    C = 4
+    shape = (num_frames, C, H // F, W // F)
+    if (H, W) != (576, 1024):
+        print(
+            "WARNING: The conditioning frame you provided is not 576x1024. This leads to suboptimal performance as model was only trained on 576x1024. Consider increasing `cond_aug`."
+        )
+    if motion_bucket_id > 255:
+        print(
+            "WARNING: High motion bucket! This may lead to suboptimal performance."
+        )
+
+    if fps_id < 5:
+        print("WARNING: Small fps value! This may lead to suboptimal performance.")
+
+    if fps_id > 30:
+        print("WARNING: Large fps value! This may lead to suboptimal performance.")
+
+    value_dict = {}
+    value_dict["motion_bucket_id"] = motion_bucket_id
+    value_dict["fps_id"] = fps_id
+    value_dict["cond_aug"] = cond_aug
+    value_dict["cond_frames_without_noise"] = image
+    value_dict["cond_frames"] = image + cond_aug * torch.randn_like(image)
+    value_dict["cond_aug"] = cond_aug
+
+    with torch.no_grad():
+        with torch.autocast(device):
+            batch, batch_uc = get_batch(
+                get_unique_embedder_keys_from_conditioner(model.conditioner),
+                value_dict,
+                [1, num_frames],
+                T=num_frames,
+                device=device,
+            )
+            c, uc = model.conditioner.get_unconditional_conditioning(
+                batch,
+                batch_uc=batch_uc,
+                force_uc_zero_embeddings=[
+                    "cond_frames",
+                    "cond_frames_without_noise",
+                ],
+            )
+
+            for k in ["crossattn", "concat"]:
+                uc[k] = repeat(uc[k], "b ... -> b t ...", t=num_frames)
+                uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=num_frames)
+                c[k] = repeat(c[k], "b ... -> b t ...", t=num_frames)
+                c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=num_frames)
+
+            randn = torch.randn(shape, device=device)
+
+            additional_model_inputs = {}
+            additional_model_inputs["image_only_indicator"] = torch.zeros(
+                2, num_frames
+            ).to(device)
+            additional_model_inputs["num_video_frames"] = batch["num_video_frames"]
+
+            def denoiser(input, sigma, c):
+                return model.denoiser(
+                    model.model, input, sigma, c, **additional_model_inputs
+                )
+
+            samples_z = model.sampler(denoiser, randn, cond=c, uc=uc)
+            model.en_and_decode_n_samples_a_time = decoding_t
+            samples_x = model.decode_first_stage(samples_z)
+            samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
+
+            os.makedirs(output_folder, exist_ok=True)
+            base_count = len(glob(os.path.join(output_folder, "*.mp4")))
+            video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")
+            writer = cv2.VideoWriter(
+                video_path,
+                cv2.VideoWriter_fourcc(*"mp4v"),
+                fps_id + 1,
+                (samples.shape[-1], samples.shape[-2]),
+            )
+
+            samples = embed_watermark(samples)
+            samples = filter(samples)
+            vid = (
+                (rearrange(samples, "t c h w -> t h w c") * 255)
+                .cpu()
+                .numpy()
+                .astype(np.uint8)
+            )
+            for frame in vid:
+                frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
+                writer.write(frame)
+            writer.release()
+            return video_path, seed
 
 def get_unique_embedder_keys_from_conditioner(conditioner):
     return list(set([x.input_key for x in conditioner.embedders]))
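
Note: the two resource-relevant edits in this hunk are the new `image: Image` parameter (the Gradio callback now receives the upload in memory instead of re-reading it from disk) and the lower `decoding_t` default (7 → 5). `decoding_t` is how many frames `model.decode_first_stage` pushes through the VAE per pass, so peak VRAM grows roughly linearly with it. A minimal sketch of that chunked-decoding idea, with a hypothetical `decode_fn` standing in for the model's first-stage decoder:

    import torch

    def decode_in_chunks(decode_fn, latents: torch.Tensor, chunk: int) -> torch.Tensor:
        # Decode (t, c, h, w) latents at most `chunk` frames per call, so peak
        # activation memory scales with `chunk` rather than with all t frames.
        outs = []
        for i in range(0, latents.shape[0], chunk):
            outs.append(decode_fn(latents[i : i + chunk]))
        return torch.cat(outs, dim=0)

Lowering the chunk size trades a few extra decoder passes for a smaller activation peak, which is why the comment on the parameter suggests reducing it when VRAM runs out.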
 
@@ -266,8 +238,7 @@ def get_batch(keys, value_dict, N, T, device):
             batch_uc[key] = torch.clone(batch[key])
     return batch, batch_uc
 
-def resize_image(image_path, output_size=(1024, 576)):
-    image = Image.open(image_path)
+def resize_image(image, output_size=(1024, 576)):
     # Calculate aspect ratios
     target_aspect = output_size[0] / output_size[1]  # Aspect ratio of the desired size
     image_aspect = image.width / image.height  # Aspect ratio of the original image
 
@@ -296,7 +267,6 @@ def resize_image(image_path, output_size=(1024, 576)):
 
     # Crop the image
     cropped_image = resized_image.crop((left, top, right, bottom))
-
    return cropped_image
 
 with gr.Blocks() as demo:
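
Note: `resize_image` now receives the PIL image directly rather than opening a path itself. For reference, a self-contained sketch of the resize-then-center-crop it performs (illustrative names; the resampling filter is an assumption, not taken from the diff):

    from PIL import Image

    def resize_and_center_crop(image: Image.Image, output_size=(1024, 576)) -> Image.Image:
        target_aspect = output_size[0] / output_size[1]
        image_aspect = image.width / image.height
        if image_aspect > target_aspect:
            # Wider than the target: match heights, crop the excess width.
            new_height = output_size[1]
            new_width = round(new_height * image_aspect)
        else:
            # Taller than the target: match widths, crop the excess height.
            new_width = output_size[0]
            new_height = round(new_width / image_aspect)
        resized = image.resize((new_width, new_height), Image.LANCZOS)
        left = (new_width - output_size[0]) // 2
        top = (new_height - output_size[1]) // 2
        return resized.crop((left, top, left + output_size[0], top + output_size[1]))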
 
@@ -305,7 +275,7 @@ with gr.Blocks() as demo:
     ''')
     with gr.Row():
         with gr.Column():
-            image = gr.Image(label="Upload your image", type="filepath")
+            image = gr.Image(label="Upload your image", type="pil")
             generate_btn = gr.Button("Generate")
         video = gr.Video()
     with gr.Accordion("Advanced options", open=False):
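
Note: `type="pil"` makes Gradio hand the callback a `PIL.Image.Image` instead of writing the upload to a temporary file (`type="filepath"`), which is what lets `sample` drop the `Path`/`Image.open` handling above. A minimal sketch of the difference, using a hypothetical `describe` callback:

    import gradio as gr
    from PIL import Image

    def describe(image: Image.Image) -> str:
        # With type="pil" the callback gets the decoded image; no file I/O needed.
        return f"{image.width}x{image.height}, mode={image.mode}"

    with gr.Blocks() as demo:
        image = gr.Image(label="Upload your image", type="pil")
        btn = gr.Button("Describe")
        out = gr.Textbox()
        btn.click(describe, inputs=image, outputs=out)

    demo.launch()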
outputs/000000.mp4 DELETED
Binary file (297 kB)
 
outputs/000001.mp4 DELETED
Binary file (297 kB)
 
outputs/000002.mp4 DELETED
Binary file (255 kB)
 
outputs/000003.mp4 DELETED
Binary file (288 kB)
 
outputs/000004.mp4 DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c2cc34e39dd8c5d2022de56d1d83936ac2b7a286ab0351895f1b83e00a9e2fa7
-size 1574414

outputs/000005.mp4 DELETED
Binary file (265 kB)
 
outputs/simple_video_sample/svd_xt/000000.mp4 DELETED
Binary file (298 kB)
 
scripts/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (154 Bytes)
 
scripts/util/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (159 Bytes)
 
scripts/util/detection/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (169 Bytes)
 
scripts/util/detection/__pycache__/nsfw_and_watermark_dectection.cpython-310.pyc DELETED
Binary file (3.9 kB)
 
sgm/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (330 Bytes)
 
sgm/__pycache__/util.cpython-310.pyc DELETED
Binary file (9.45 kB)
 
sgm/inference/__pycache__/helpers.cpython-310.pyc DELETED
Binary file (8.87 kB)
 
sgm/models/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (260 Bytes)
 
sgm/models/__pycache__/autoencoder.cpython-310.pyc DELETED
Binary file (19.2 kB)
 
sgm/models/__pycache__/diffusion.cpython-310.pyc DELETED
Binary file (10.9 kB)
 
sgm/modules/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (321 Bytes)
 
sgm/modules/__pycache__/attention.cpython-310.pyc DELETED
Binary file (18 kB)
 
sgm/modules/__pycache__/ema.cpython-310.pyc DELETED
Binary file (3.22 kB)
 
sgm/modules/__pycache__/video_attention.cpython-310.pyc DELETED
Binary file (6.27 kB)
 
sgm/modules/autoencoding/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (171 Bytes)
 
sgm/modules/autoencoding/__pycache__/temporal_ae.cpython-310.pyc DELETED
Binary file (8.48 kB)
 
sgm/modules/autoencoding/regularizers/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (1.5 kB)
 
sgm/modules/autoencoding/regularizers/__pycache__/base.cpython-310.pyc DELETED
Binary file (2.04 kB)
 
sgm/modules/diffusionmodules/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (175 Bytes)
 
sgm/modules/diffusionmodules/__pycache__/denoiser.cpython-310.pyc DELETED
Binary file (3.09 kB)
 
sgm/modules/diffusionmodules/__pycache__/denoiser_scaling.cpython-310.pyc DELETED
Binary file (2.45 kB)
 
sgm/modules/diffusionmodules/__pycache__/discretizer.cpython-310.pyc DELETED
Binary file (3 kB)
 
sgm/modules/diffusionmodules/__pycache__/guiders.cpython-310.pyc DELETED
Binary file (3.96 kB)
 
sgm/modules/diffusionmodules/__pycache__/model.cpython-310.pyc DELETED
Binary file (16.5 kB)
 
sgm/modules/diffusionmodules/__pycache__/openaimodel.cpython-310.pyc DELETED
Binary file (21.7 kB)
 
sgm/modules/diffusionmodules/__pycache__/sampling.cpython-310.pyc DELETED
Binary file (11.8 kB)
 
sgm/modules/diffusionmodules/__pycache__/sampling_utils.cpython-310.pyc DELETED
Binary file (1.53 kB)
 
sgm/modules/diffusionmodules/__pycache__/util.cpython-310.pyc DELETED
Binary file (11.7 kB)
 
sgm/modules/diffusionmodules/__pycache__/video_model.cpython-310.pyc DELETED
Binary file (8.21 kB)
 
sgm/modules/diffusionmodules/__pycache__/wrappers.cpython-310.pyc DELETED
Binary file (1.69 kB)
 
sgm/modules/distributions/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (172 Bytes)
 
sgm/modules/distributions/__pycache__/distributions.cpython-310.pyc DELETED
Binary file (3.77 kB)
 
sgm/modules/encoders/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (167 Bytes)
 
sgm/modules/encoders/__pycache__/modules.cpython-310.pyc DELETED
Binary file (29.5 kB)