Disty0 commited on
Commit
06e9a10
1 Parent(s): 5cafa9f

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +31 -10
README.md CHANGED
@@ -87,8 +87,8 @@ import diffusers
87
 
88
  device = "cuda"
89
  dtype = torch.float16
 
90
 
91
- model_path = "Disty0/sotediffusion-v2"
92
 
93
  def get_timestep_ratio_conditioning(t, alphas_cumprod):
94
  s = torch.tensor([0.008]) # diffusers uses 0.003 while the original is 0.008
@@ -100,9 +100,8 @@ def get_timestep_ratio_conditioning(t, alphas_cumprod):
100
  ratio = (((var * min_var) ** 0.5).acos() / (torch.pi * 0.5)) * (1 + s) - s
101
  return ratio
102
 
103
- pipe = diffusers.AutoPipelineForText2Image.from_pretrained(model_path, text_encoder=None, torch_dtype=dtype)
104
 
105
- # diffusers bugs
106
  pipe.prior_pipe.get_timestep_ratio_conditioning = get_timestep_ratio_conditioning
107
  pipe.prior_pipe.scheduler.config.clip_sample = False
108
 
@@ -120,20 +119,43 @@ pipe = pipe.to(device, dtype=dtype)
120
  pipe.prior_pipe = pipe.prior_pipe.to(device, dtype=dtype)
121
 
122
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  prompt = "1girl, solo, looking at viewer, open mouth, blue eyes, medium breasts, blonde hair, gloves, dress, bow, hair between eyes, bare shoulders, upper body, hair bow, indoors, elbow gloves, hand on own chest, bridal gauntlets, candlestand, smile, rim lighting, from side, castle interior, looking side,"
124
- quality_prompt = "very aesthetic, best quality, newest"
125
  negative_prompt = "very displeasing, displeasing, worst quality, bad quality, low quality, realistic, monochrome, comic, sketch, oldest, early, artist name, signature, blurry, simple background, upside down,"
126
-
127
  num_images_per_prompt=1
128
 
129
- # Encode prompts and quality prompts seperately:
130
  # device, batch_size, num_images_per_prompt, cfg, prompt
131
  prompt_embeds, prompt_embeds_pooled, _, _ = pipe.prior_pipe.encode_prompt(device, 1, num_images_per_prompt, False, prompt=prompt)
132
  quality_prompt_embeds, _, _, _ = pipe.prior_pipe.encode_prompt(device, 1, num_images_per_prompt, False, prompt=quality_prompt)
133
 
134
  negative_prompt_embeds, negative_prompt_embeds_pooled, _, _ = pipe.prior_pipe.encode_prompt(device, 1, num_images_per_prompt, False, prompt=negative_prompt)
135
- empty_prompt_embeds, _, _, _ = pipe.prior_pipe.encode_prompt(device, 1, num_images_per_prompt, False, prompt="")
136
- empty_prompt_embeds = torch.nn.functional.normalize(empty_prompt_embeds)
137
 
138
  prompt_embeds = torch.cat([prompt_embeds, quality_prompt_embeds], dim=1)
139
  negative_prompt_embeds = torch.cat([negative_prompt_embeds, empty_prompt_embeds], dim=1)
@@ -143,7 +165,7 @@ pipe.prior_pipe.maybe_free_model_hooks()
143
  output = pipe(
144
  width=1024,
145
  height=1536,
146
- decoder_guidance_scale=1.0,
147
  prior_guidance_scale=7.0,
148
  prior_num_inference_steps=30,
149
  num_inference_steps=10,
@@ -277,7 +299,6 @@ aesthetic tags, quality tags, date tags, custom tags, rating tags, character, se
277
  Add "realistic" tag to the negatives when this happens.
278
  - Far shot eyes and hands can be bad.
279
  - Still has a lot more room for more training.
280
- - Diffusers outputs aren't as good as ComfyUI outputs.
281
 
282
 
283
  ## License
 
87
 
88
  device = "cuda"
89
  dtype = torch.float16
90
+ model_path = "/mnt/DataSSD/AI/SoteDiffusion/Wuerstchen3/diffusers/sotediffusion-v2"
91
 
 
92
 
93
  def get_timestep_ratio_conditioning(t, alphas_cumprod):
94
  s = torch.tensor([0.008]) # diffusers uses 0.003 while the original is 0.008
 
100
  ratio = (((var * min_var) ** 0.5).acos() / (torch.pi * 0.5)) * (1 + s) - s
101
  return ratio
102
 
 
103
 
104
+ pipe = diffusers.AutoPipelineForText2Image.from_pretrained(model_path, text_encoder=None, torch_dtype=dtype)
105
  pipe.prior_pipe.get_timestep_ratio_conditioning = get_timestep_ratio_conditioning
106
  pipe.prior_pipe.scheduler.config.clip_sample = False
107
 
 
119
  pipe.prior_pipe = pipe.prior_pipe.to(device, dtype=dtype)
120
 
121
 
122
+ def encode_empty_prompt(
123
+ prior_pipe,
124
+ device,
125
+ batch_size,
126
+ num_images_per_prompt,
127
+ ):
128
+
129
+ text_inputs = prior_pipe.tokenizer(
130
+ "",
131
+ padding="max_length",
132
+ max_length=prior_pipe.tokenizer.model_max_length,
133
+ truncation=True,
134
+ return_tensors="pt",
135
+ )
136
+
137
+ # Don't use attention mask for empty prompt
138
+ text_encoder_output = prior_pipe.text_encoder(
139
+ text_inputs.input_ids.to(device), attention_mask=None, output_hidden_states=True
140
+ )
141
+ prompt_embeds = text_encoder_output.hidden_states[-1]
142
+ prompt_embeds = prompt_embeds.to(dtype=prior_pipe.text_encoder.dtype, device=device)
143
+ prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
144
+ return prompt_embeds
145
+
146
+
147
  prompt = "1girl, solo, looking at viewer, open mouth, blue eyes, medium breasts, blonde hair, gloves, dress, bow, hair between eyes, bare shoulders, upper body, hair bow, indoors, elbow gloves, hand on own chest, bridal gauntlets, candlestand, smile, rim lighting, from side, castle interior, looking side,"
148
+ quality_prompt = "extremely aesthetic, best quality, newest"
149
  negative_prompt = "very displeasing, displeasing, worst quality, bad quality, low quality, realistic, monochrome, comic, sketch, oldest, early, artist name, signature, blurry, simple background, upside down,"
 
150
  num_images_per_prompt=1
151
 
152
+ # Encode prompts and quality prompts separately:
153
  # device, batch_size, num_images_per_prompt, cfg, prompt
154
  prompt_embeds, prompt_embeds_pooled, _, _ = pipe.prior_pipe.encode_prompt(device, 1, num_images_per_prompt, False, prompt=prompt)
155
  quality_prompt_embeds, _, _, _ = pipe.prior_pipe.encode_prompt(device, 1, num_images_per_prompt, False, prompt=quality_prompt)
156
 
157
  negative_prompt_embeds, negative_prompt_embeds_pooled, _, _ = pipe.prior_pipe.encode_prompt(device, 1, num_images_per_prompt, False, prompt=negative_prompt)
158
+ empty_prompt_embeds = encode_empty_prompt(pipe.prior_pipe, device, 1, num_images_per_prompt)
 
159
 
160
  prompt_embeds = torch.cat([prompt_embeds, quality_prompt_embeds], dim=1)
161
  negative_prompt_embeds = torch.cat([negative_prompt_embeds, empty_prompt_embeds], dim=1)
 
165
  output = pipe(
166
  width=1024,
167
  height=1536,
168
+ decoder_guidance_scale=1.2,
169
  prior_guidance_scale=7.0,
170
  prior_num_inference_steps=30,
171
  num_inference_steps=10,
 
299
  Add "realistic" tag to the negatives when this happens.
300
  - Far shot eyes and hands can be bad.
301
  - Still has a lot more room for more training.
 
302
 
303
 
304
  ## License