Update README.md
Browse files
README.md
CHANGED
@@ -87,8 +87,8 @@ import diffusers
|
|
87 |
|
88 |
device = "cuda"
|
89 |
dtype = torch.float16
|
|
|
90 |
|
91 |
-
model_path = "Disty0/sotediffusion-v2"
|
92 |
|
93 |
def get_timestep_ratio_conditioning(t, alphas_cumprod):
|
94 |
s = torch.tensor([0.008]) # diffusers uses 0.003 while the original is 0.008
|
@@ -100,9 +100,8 @@ def get_timestep_ratio_conditioning(t, alphas_cumprod):
|
|
100 |
ratio = (((var * min_var) ** 0.5).acos() / (torch.pi * 0.5)) * (1 + s) - s
|
101 |
return ratio
|
102 |
|
103 |
-
pipe = diffusers.AutoPipelineForText2Image.from_pretrained(model_path, text_encoder=None, torch_dtype=dtype)
|
104 |
|
105 |
-
|
106 |
pipe.prior_pipe.get_timestep_ratio_conditioning = get_timestep_ratio_conditioning
|
107 |
pipe.prior_pipe.scheduler.config.clip_sample = False
|
108 |
|
@@ -120,20 +119,43 @@ pipe = pipe.to(device, dtype=dtype)
|
|
120 |
pipe.prior_pipe = pipe.prior_pipe.to(device, dtype=dtype)
|
121 |
|
122 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
123 |
prompt = "1girl, solo, looking at viewer, open mouth, blue eyes, medium breasts, blonde hair, gloves, dress, bow, hair between eyes, bare shoulders, upper body, hair bow, indoors, elbow gloves, hand on own chest, bridal gauntlets, candlestand, smile, rim lighting, from side, castle interior, looking side,"
|
124 |
-
quality_prompt = "
|
125 |
negative_prompt = "very displeasing, displeasing, worst quality, bad quality, low quality, realistic, monochrome, comic, sketch, oldest, early, artist name, signature, blurry, simple background, upside down,"
|
126 |
-
|
127 |
num_images_per_prompt=1
|
128 |
|
129 |
-
# Encode prompts and quality prompts
|
130 |
# device, batch_size, num_images_per_prompt, cfg, prompt
|
131 |
prompt_embeds, prompt_embeds_pooled, _, _ = pipe.prior_pipe.encode_prompt(device, 1, num_images_per_prompt, False, prompt=prompt)
|
132 |
quality_prompt_embeds, _, _, _ = pipe.prior_pipe.encode_prompt(device, 1, num_images_per_prompt, False, prompt=quality_prompt)
|
133 |
|
134 |
negative_prompt_embeds, negative_prompt_embeds_pooled, _, _ = pipe.prior_pipe.encode_prompt(device, 1, num_images_per_prompt, False, prompt=negative_prompt)
|
135 |
-
empty_prompt_embeds
|
136 |
-
empty_prompt_embeds = torch.nn.functional.normalize(empty_prompt_embeds)
|
137 |
|
138 |
prompt_embeds = torch.cat([prompt_embeds, quality_prompt_embeds], dim=1)
|
139 |
negative_prompt_embeds = torch.cat([negative_prompt_embeds, empty_prompt_embeds], dim=1)
|
@@ -143,7 +165,7 @@ pipe.prior_pipe.maybe_free_model_hooks()
|
|
143 |
output = pipe(
|
144 |
width=1024,
|
145 |
height=1536,
|
146 |
-
decoder_guidance_scale=1.
|
147 |
prior_guidance_scale=7.0,
|
148 |
prior_num_inference_steps=30,
|
149 |
num_inference_steps=10,
|
@@ -277,7 +299,6 @@ aesthetic tags, quality tags, date tags, custom tags, rating tags, character, se
|
|
277 |
Add "realistic" tag to the negatives when this happens.
|
278 |
- Far shot eyes and hands can be bad.
|
279 |
- Still has a lot more room for more training.
|
280 |
-
- Diffusers outputs aren't as good as ComfyUI outputs.
|
281 |
|
282 |
|
283 |
## License
|
|
|
87 |
|
88 |
device = "cuda"
|
89 |
dtype = torch.float16
|
90 |
+
model_path = "/mnt/DataSSD/AI/SoteDiffusion/Wuerstchen3/diffusers/sotediffusion-v2"
|
91 |
|
|
|
92 |
|
93 |
def get_timestep_ratio_conditioning(t, alphas_cumprod):
|
94 |
s = torch.tensor([0.008]) # diffusers uses 0.003 while the original is 0.008
|
|
|
100 |
ratio = (((var * min_var) ** 0.5).acos() / (torch.pi * 0.5)) * (1 + s) - s
|
101 |
return ratio
|
102 |
|
|
|
103 |
|
104 |
+
pipe = diffusers.AutoPipelineForText2Image.from_pretrained(model_path, text_encoder=None, torch_dtype=dtype)
|
105 |
pipe.prior_pipe.get_timestep_ratio_conditioning = get_timestep_ratio_conditioning
|
106 |
pipe.prior_pipe.scheduler.config.clip_sample = False
|
107 |
|
|
|
119 |
pipe.prior_pipe = pipe.prior_pipe.to(device, dtype=dtype)
|
120 |
|
121 |
|
122 |
+
def encode_empty_prompt(
|
123 |
+
prior_pipe,
|
124 |
+
device,
|
125 |
+
batch_size,
|
126 |
+
num_images_per_prompt,
|
127 |
+
):
|
128 |
+
|
129 |
+
text_inputs = prior_pipe.tokenizer(
|
130 |
+
"",
|
131 |
+
padding="max_length",
|
132 |
+
max_length=prior_pipe.tokenizer.model_max_length,
|
133 |
+
truncation=True,
|
134 |
+
return_tensors="pt",
|
135 |
+
)
|
136 |
+
|
137 |
+
# Don't use attention mask for empty prompt
|
138 |
+
text_encoder_output = prior_pipe.text_encoder(
|
139 |
+
text_inputs.input_ids.to(device), attention_mask=None, output_hidden_states=True
|
140 |
+
)
|
141 |
+
prompt_embeds = text_encoder_output.hidden_states[-1]
|
142 |
+
prompt_embeds = prompt_embeds.to(dtype=prior_pipe.text_encoder.dtype, device=device)
|
143 |
+
prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
|
144 |
+
return prompt_embeds
|
145 |
+
|
146 |
+
|
147 |
prompt = "1girl, solo, looking at viewer, open mouth, blue eyes, medium breasts, blonde hair, gloves, dress, bow, hair between eyes, bare shoulders, upper body, hair bow, indoors, elbow gloves, hand on own chest, bridal gauntlets, candlestand, smile, rim lighting, from side, castle interior, looking side,"
|
148 |
+
quality_prompt = "extremely aesthetic, best quality, newest"
|
149 |
negative_prompt = "very displeasing, displeasing, worst quality, bad quality, low quality, realistic, monochrome, comic, sketch, oldest, early, artist name, signature, blurry, simple background, upside down,"
|
|
|
150 |
num_images_per_prompt=1
|
151 |
|
152 |
+
# Encode prompts and quality prompts separately:
|
153 |
# device, batch_size, num_images_per_prompt, cfg, prompt
|
154 |
prompt_embeds, prompt_embeds_pooled, _, _ = pipe.prior_pipe.encode_prompt(device, 1, num_images_per_prompt, False, prompt=prompt)
|
155 |
quality_prompt_embeds, _, _, _ = pipe.prior_pipe.encode_prompt(device, 1, num_images_per_prompt, False, prompt=quality_prompt)
|
156 |
|
157 |
negative_prompt_embeds, negative_prompt_embeds_pooled, _, _ = pipe.prior_pipe.encode_prompt(device, 1, num_images_per_prompt, False, prompt=negative_prompt)
|
158 |
+
empty_prompt_embeds = encode_empty_prompt(pipe.prior_pipe, device, 1, num_images_per_prompt)
|
|
|
159 |
|
160 |
prompt_embeds = torch.cat([prompt_embeds, quality_prompt_embeds], dim=1)
|
161 |
negative_prompt_embeds = torch.cat([negative_prompt_embeds, empty_prompt_embeds], dim=1)
|
|
|
165 |
output = pipe(
|
166 |
width=1024,
|
167 |
height=1536,
|
168 |
+
decoder_guidance_scale=1.2,
|
169 |
prior_guidance_scale=7.0,
|
170 |
prior_num_inference_steps=30,
|
171 |
num_inference_steps=10,
|
|
|
299 |
Add "realistic" tag to the negatives when this happens.
|
300 |
- Far shot eyes and hands can be bad.
|
301 |
- Still has a lot more room for more training.
|
|
|
302 |
|
303 |
|
304 |
## License
|