Update README.md
Browse files
README.md
CHANGED
@@ -87,8 +87,8 @@ import diffusers
|
|
87 |
|
88 |
device = "cuda"
|
89 |
dtype = torch.float16
|
|
|
90 |
|
91 |
-
model_path = "Disty0/sotediffusion-v2"
|
92 |
|
93 |
def get_timestep_ratio_conditioning(t, alphas_cumprod):
|
94 |
s = torch.tensor([0.008]) # diffusers uses 0.003 while the original is 0.008
|
@@ -100,9 +100,8 @@ def get_timestep_ratio_conditioning(t, alphas_cumprod):
|
|
100 |
ratio = (((var * min_var) ** 0.5).acos() / (torch.pi * 0.5)) * (1 + s) - s
|
101 |
return ratio
|
102 |
|
103 |
-
pipe = diffusers.AutoPipelineForText2Image.from_pretrained(model_path, text_encoder=None, torch_dtype=dtype)
|
104 |
|
105 |
-
|
106 |
pipe.prior_pipe.get_timestep_ratio_conditioning = get_timestep_ratio_conditioning
|
107 |
pipe.prior_pipe.scheduler.config.clip_sample = False
|
108 |
|
@@ -120,20 +119,43 @@ pipe = pipe.to(device, dtype=dtype)
|
|
120 |
pipe.prior_pipe = pipe.prior_pipe.to(device, dtype=dtype)
|
121 |
|
122 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
123 |
prompt = "1girl, solo, looking at viewer, open mouth, blue eyes, medium breasts, blonde hair, gloves, dress, bow, hair between eyes, bare shoulders, upper body, hair bow, indoors, elbow gloves, hand on own chest, bridal gauntlets, candlestand, smile, rim lighting, from side, castle interior, looking side,"
|
124 |
-
quality_prompt = "
|
125 |
negative_prompt = "very displeasing, displeasing, worst quality, bad quality, low quality, realistic, monochrome, comic, sketch, oldest, early, artist name, signature, blurry, simple background, upside down,"
|
126 |
-
|
127 |
num_images_per_prompt=1
|
128 |
|
129 |
-
# Encode prompts and quality prompts
|
130 |
# device, batch_size, num_images_per_prompt, cfg, prompt
|
131 |
prompt_embeds, prompt_embeds_pooled, _, _ = pipe.prior_pipe.encode_prompt(device, 1, num_images_per_prompt, False, prompt=prompt)
|
132 |
quality_prompt_embeds, _, _, _ = pipe.prior_pipe.encode_prompt(device, 1, num_images_per_prompt, False, prompt=quality_prompt)
|
133 |
|
134 |
negative_prompt_embeds, negative_prompt_embeds_pooled, _, _ = pipe.prior_pipe.encode_prompt(device, 1, num_images_per_prompt, False, prompt=negative_prompt)
|
135 |
-
empty_prompt_embeds
|
136 |
-
empty_prompt_embeds = torch.nn.functional.normalize(empty_prompt_embeds)
|
137 |
|
138 |
prompt_embeds = torch.cat([prompt_embeds, quality_prompt_embeds], dim=1)
|
139 |
negative_prompt_embeds = torch.cat([negative_prompt_embeds, empty_prompt_embeds], dim=1)
|
@@ -143,7 +165,7 @@ pipe.prior_pipe.maybe_free_model_hooks()
|
|
143 |
output = pipe(
|
144 |
width=1024,
|
145 |
height=1536,
|
146 |
-
decoder_guidance_scale=1.
|
147 |
prior_guidance_scale=7.0,
|
148 |
prior_num_inference_steps=30,
|
149 |
num_inference_steps=10,
|
@@ -277,7 +299,6 @@ aesthetic tags, quality tags, date tags, custom tags, rating tags, character, se
|
|
277 |
Add "realistic" tag to the negatives when this happens.
|
278 |
- Far shot eyes and hands can be bad.
|
279 |
- Still has a lot more room for more training.
|
280 |
-
- Diffusers outputs aren't as good as ComfyUI outputs.
|
281 |
|
282 |
|
283 |
## License
|
|
|
87 |
|
88 |
device = "cuda"
|
89 |
dtype = torch.float16
|
90 |
+
model_path = "/mnt/DataSSD/AI/SoteDiffusion/Wuerstchen3/diffusers/sotediffusion-v2"
|
91 |
|
|
|
92 |
|
93 |
def get_timestep_ratio_conditioning(t, alphas_cumprod):
|
94 |
s = torch.tensor([0.008]) # diffusers uses 0.003 while the original is 0.008
|
|
|
100 |
ratio = (((var * min_var) ** 0.5).acos() / (torch.pi * 0.5)) * (1 + s) - s
|
101 |
return ratio
|
102 |
|
|
|
103 |
|
104 |
+
pipe = diffusers.AutoPipelineForText2Image.from_pretrained(model_path, text_encoder=None, torch_dtype=dtype)
|
105 |
pipe.prior_pipe.get_timestep_ratio_conditioning = get_timestep_ratio_conditioning
|
106 |
pipe.prior_pipe.scheduler.config.clip_sample = False
|
107 |
|
|
|
119 |
pipe.prior_pipe = pipe.prior_pipe.to(device, dtype=dtype)
|
120 |
|
121 |
|
122 |
+
def encode_empty_prompt(
|
123 |
+
prior_pipe,
|
124 |
+
device,
|
125 |
+
batch_size,
|
126 |
+
num_images_per_prompt,
|
127 |
+
):
|
128 |
+
|
129 |
+
text_inputs = prior_pipe.tokenizer(
|
130 |
+
"",
|
131 |
+
padding="max_length",
|
132 |
+
max_length=prior_pipe.tokenizer.model_max_length,
|
133 |
+
truncation=True,
|
134 |
+
return_tensors="pt",
|
135 |
+
)
|
136 |
+
|
137 |
+
# Don't use attention mask for empty prompt
|
138 |
+
text_encoder_output = prior_pipe.text_encoder(
|
139 |
+
text_inputs.input_ids.to(device), attention_mask=None, output_hidden_states=True
|
140 |
+
)
|
141 |
+
prompt_embeds = text_encoder_output.hidden_states[-1]
|
142 |
+
prompt_embeds = prompt_embeds.to(dtype=prior_pipe.text_encoder.dtype, device=device)
|
143 |
+
prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
|
144 |
+
return prompt_embeds
|
145 |
+
|
146 |
+
|
147 |
prompt = "1girl, solo, looking at viewer, open mouth, blue eyes, medium breasts, blonde hair, gloves, dress, bow, hair between eyes, bare shoulders, upper body, hair bow, indoors, elbow gloves, hand on own chest, bridal gauntlets, candlestand, smile, rim lighting, from side, castle interior, looking side,"
|
148 |
+
quality_prompt = "extremely aesthetic, best quality, newest"
|
149 |
negative_prompt = "very displeasing, displeasing, worst quality, bad quality, low quality, realistic, monochrome, comic, sketch, oldest, early, artist name, signature, blurry, simple background, upside down,"
|
|
|
150 |
num_images_per_prompt=1
|
151 |
|
152 |
+
# Encode prompts and quality prompts separately:
|
153 |
# device, batch_size, num_images_per_prompt, cfg, prompt
|
154 |
prompt_embeds, prompt_embeds_pooled, _, _ = pipe.prior_pipe.encode_prompt(device, 1, num_images_per_prompt, False, prompt=prompt)
|
155 |
quality_prompt_embeds, _, _, _ = pipe.prior_pipe.encode_prompt(device, 1, num_images_per_prompt, False, prompt=quality_prompt)
|
156 |
|
157 |
negative_prompt_embeds, negative_prompt_embeds_pooled, _, _ = pipe.prior_pipe.encode_prompt(device, 1, num_images_per_prompt, False, prompt=negative_prompt)
|
158 |
+
empty_prompt_embeds = encode_empty_prompt(pipe.prior_pipe, device, 1, num_images_per_prompt)
|
|
|
159 |
|
160 |
prompt_embeds = torch.cat([prompt_embeds, quality_prompt_embeds], dim=1)
|
161 |
negative_prompt_embeds = torch.cat([negative_prompt_embeds, empty_prompt_embeds], dim=1)
|
|
|
165 |
output = pipe(
|
166 |
width=1024,
|
167 |
height=1536,
|
168 |
+
decoder_guidance_scale=1.2,
|
169 |
prior_guidance_scale=7.0,
|
170 |
prior_num_inference_steps=30,
|
171 |
num_inference_steps=10,
|
|
|
299 |
Add "realistic" tag to the negatives when this happens.
|
300 |
- Far shot eyes and hands can be bad.
|
301 |
- Still has a lot more room for more training.
|
|
|
302 |
|
303 |
|
304 |
## License
|