tsqn committed
Commit 3bc93fb
1 parent: 7b2877a

Update app.py

Files changed (1)
  1. app.py +92 -74
app.py CHANGED
@@ -2,6 +2,7 @@ import spaces
 
 import torch
 import torchvision.transforms.functional as TF
+import tomesd
 import numpy as np
 import random
 import os
@@ -21,6 +22,10 @@ from aspect_ratio_template import aspect_ratios
 
 # global variable
 base_model_path = 'SG161222/RealVisXL_V5.0'
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+torch.backends.cudnn.benchmark = True
+torch.set_grad_enabled(False)
 face_detector = FaceAnalysis2(providers=['CPUExecutionProvider', 'CUDAExecutionProvider'], allowed_modules=['detection', 'recognition'])
 face_detector.prepare(ctx_id=0, det_size=(640, 640))
 
@@ -64,6 +69,11 @@ pipe = PhotoMakerStableDiffusionXLAdapterPipeline.from_pretrained(
     variant="fp16",
 ).to(device)
 
+pipe.unet = pipe.unet.to(device=device, dtype=torch_dtype)
+pipe.text_encoder = pipe.text_encoder.to(device=device, dtype=torch_dtype)
+pipe.text_encoder_2 = pipe.text_encoder_2.to(device=device, dtype=torch_dtype)
+pipe.vae = pipe.vae.to(device=device, dtype=torch_dtype)
+
 pipe.load_photomaker_adapter(
     os.path.dirname(photomaker_ckpt),
     subfolder="",
@@ -78,6 +88,11 @@ pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
 pipe.fuse_lora()
 pipe.to(device)
 
+pipe.enable_vae_slicing()
+pipe.enable_vae_tiling()
+pipe.enable_xformers_memory_efficient_attention()
+
+torch.cuda.empty_cache()
 
 @spaces.GPU(duration=120)
 def generate_image(
@@ -97,82 +112,85 @@ def generate_image(
     adapter_conditioning_factor,
     progress=gr.Progress(track_tqdm=True)
 ):
-    if use_doodle:
-        sketch_image = sketch_image["composite"]
-        r, g, b, a = sketch_image.split()
-        sketch_image = a.convert("RGB")
-        sketch_image = TF.to_tensor(sketch_image) > 0.5 # Inversion
-        sketch_image = TF.to_pil_image(sketch_image.to(torch.float32))
-        adapter_conditioning_scale = adapter_conditioning_scale
-        adapter_conditioning_factor = adapter_conditioning_factor
-    else:
-        adapter_conditioning_scale = 0.
-        adapter_conditioning_factor = 0.
-        sketch_image = None
-
-    # check the trigger word
-    image_token_id = pipe.tokenizer.convert_tokens_to_ids(pipe.trigger_word)
-    input_ids = pipe.tokenizer.encode(prompt)
-    if image_token_id not in input_ids:
-        raise gr.Error(f"Cannot find the trigger word '{pipe.trigger_word}' in text prompt! Please refer to step 2️⃣")
-
-    if input_ids.count(image_token_id) > 1:
-        raise gr.Error(f"Cannot use multiple trigger words '{pipe.trigger_word}' in text prompt!")
-
-    # determine output dimensions by the aspect ratio
-    output_w, output_h = aspect_ratios[aspect_ratio_name]
-    print(f"[Debug] Generate image using aspect ratio [{aspect_ratio_name}] => {output_w} x {output_h}")
-
-    # apply the style template
-    prompt, negative_prompt = apply_style(style_name, prompt, negative_prompt)
-
-    if upload_images is None:
-        raise gr.Error(f"Cannot find any input face image! Please refer to step 1️⃣")
-
-    input_id_images = []
-    for img in upload_images:
-        input_id_images.append(load_image(img))
+    with torch.inference_mode():
+        torch.cuda.empty_cache()
+        if use_doodle:
+            sketch_image = sketch_image["composite"]
+            r, g, b, a = sketch_image.split()
+            sketch_image = a.convert("RGB")
+            sketch_image = TF.to_tensor(sketch_image) > 0.5 # Inversion
+            sketch_image = TF.to_pil_image(sketch_image.to(torch.float32))
+            adapter_conditioning_scale = adapter_conditioning_scale
+            adapter_conditioning_factor = adapter_conditioning_factor
+        else:
+            adapter_conditioning_scale = 0.
+            adapter_conditioning_factor = 0.
+            sketch_image = None
 
-    id_embed_list = []
-
-    for img in input_id_images:
-        img = np.array(img)
-        img = img[:, :, ::-1]
-        faces = analyze_faces(face_detector, img)
-        if len(faces) > 0:
-            id_embed_list.append(torch.from_numpy((faces[0]['embedding'])))
-
-    if len(id_embed_list) == 0:
-        raise gr.Error(f"No face detected, please update the input face image(s)")
+        # check the trigger word
+        image_token_id = pipe.tokenizer.convert_tokens_to_ids(pipe.trigger_word)
+        input_ids = pipe.tokenizer.encode(prompt)
+        if image_token_id not in input_ids:
+            raise gr.Error(f"Cannot find the trigger word '{pipe.trigger_word}' in text prompt! Please refer to step 2️⃣")
+
+        if input_ids.count(image_token_id) > 1:
+            raise gr.Error(f"Cannot use multiple trigger words '{pipe.trigger_word}' in text prompt!")
+
+        # determine output dimensions by the aspect ratio
+        output_w, output_h = aspect_ratios[aspect_ratio_name]
+        print(f"[Debug] Generate image using aspect ratio [{aspect_ratio_name}] => {output_w} x {output_h}")
+
+        # apply the style template
+        prompt, negative_prompt = apply_style(style_name, prompt, negative_prompt)
+
+        if upload_images is None:
+            raise gr.Error(f"Cannot find any input face image! Please refer to step 1️⃣")
+
+        input_id_images = []
+        for img in upload_images:
+            input_id_images.append(load_image(img))
+
+        id_embed_list = []
+
+        for img in input_id_images:
+            img = np.array(img)
+            img = img[:, :, ::-1]
+            faces = analyze_faces(face_detector, img)
+            if len(faces) > 0:
+                id_embed_list.append(torch.from_numpy((faces[0]['embedding'])))
+
+        if len(id_embed_list) == 0:
+            raise gr.Error(f"No face detected, please update the input face image(s)")
+
+        id_embeds = torch.stack(id_embed_list)
+
+        generator = torch.Generator(device=device).manual_seed(seed)
 
-    id_embeds = torch.stack(id_embed_list)
-
-    generator = torch.Generator(device=device).manual_seed(seed)
-
-    print("Start inference...")
-    print(f"[Debug] Seed: {seed}")
-    print(f"[Debug] Prompt: {prompt}, \n[Debug] Neg Prompt: {negative_prompt}")
-    start_merge_step = int(float(style_strength_ratio) / 100 * num_steps)
-    if start_merge_step > 30:
-        start_merge_step = 30
-    print(start_merge_step)
-    images = pipe(
-        prompt=prompt,
-        width=output_w,
-        height=output_h,
-        input_id_images=input_id_images,
-        negative_prompt=negative_prompt,
-        num_images_per_prompt=num_outputs,
-        num_inference_steps=num_steps,
-        start_merge_step=start_merge_step,
-        generator=generator,
-        guidance_scale=guidance_scale,
-        id_embeds=id_embeds,
-        image=sketch_image,
-        adapter_conditioning_scale=adapter_conditioning_scale,
-        adapter_conditioning_factor=adapter_conditioning_factor,
-    ).images
-    return images, gr.update(visible=True)
+        print("Start inference...")
+        print(f"[Debug] Seed: {seed}")
+        print(f"[Debug] Prompt: {prompt}, \n[Debug] Neg Prompt: {negative_prompt}")
+        start_merge_step = int(float(style_strength_ratio) / 100 * num_steps)
+        if start_merge_step > 30:
+            start_merge_step = 30
+        print(start_merge_step)
+        tomesd.apply_patch(pipe, ratio=0.5)
+        images = pipe(
+            prompt=prompt,
+            width=output_w,
+            height=output_h,
+            input_id_images=input_id_images,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_outputs,
+            num_inference_steps=num_steps,
+            start_merge_step=start_merge_step,
+            generator=generator,
+            guidance_scale=guidance_scale,
+            id_embeds=id_embeds,
+            image=sketch_image,
+            adapter_conditioning_scale=adapter_conditioning_scale,
+            adapter_conditioning_factor=adapter_conditioning_factor,
+        ).images
+        return images, gr.update(visible=True)
 
 def swap_to_gallery(images):
     return gr.update(value=images, visible=True), gr.update(visible=True), gr.update(visible=False)
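
Taken together, the added lines enable TF32 matmuls, cuDNN autotuning, xformers memory-efficient attention, VAE slicing/tiling, and tomesd token merging, and wrap inference in torch.inference_mode(). The sketch below is a minimal illustration of how the same switches compose on a plain diffusers SDXL pipeline; the StableDiffusionXLPipeline class, checkpoint name, and prompt are stand-ins for illustration, not the Space's PhotoMaker adapter setup.

# Illustrative sketch only: a plain SDXL pipeline with the same inference tweaks.
# The checkpoint name and prompt are placeholders, not this Space's configuration.
import torch
import tomesd
from diffusers import StableDiffusionXLPipeline

torch.backends.cuda.matmul.allow_tf32 = True       # allow TF32 matmuls on Ampere+ GPUs
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.benchmark = True              # autotune conv kernels for fixed input shapes
torch.set_grad_enabled(False)                      # inference only, no autograd bookkeeping

device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",    # placeholder base model
    torch_dtype=torch.float16,
    variant="fp16",
).to(device)

pipe.enable_vae_slicing()                          # decode latents slice by slice to cut peak VRAM
pipe.enable_vae_tiling()                           # tile the VAE for large output resolutions
pipe.enable_xformers_memory_efficient_attention()  # requires the xformers package
tomesd.apply_patch(pipe, ratio=0.5)                # merge ~50% of redundant tokens before attention

with torch.inference_mode():
    torch.cuda.empty_cache()
    image = pipe("a photo of an astronaut riding a horse",
                 num_inference_steps=30).images[0]
image.save("out.png")

The empty_cache() calls and the inference_mode() wrapper mainly bound peak VRAM between requests, while token merging at ratio=0.5 trades a small amount of detail for faster attention.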