Create evo_nishikie_v1.py
#1
by
yuki-imajuku
- opened
- .gitattributes +0 -1
- README.md +30 -65
- config.json +0 -57
- diffusion_pytorch_model.safetensors +0 -3
- evo_nishikie_v1.py +31 -31
- requirements.txt +0 -9
- test.jpg +0 -3
.gitattributes
CHANGED
@@ -33,4 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
-
test.jpg filter=lfs diff=lfs merge=lfs -text
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
README.md
CHANGED
@@ -3,22 +3,26 @@ library_name: diffusers
|
|
3 |
license: apache-2.0
|
4 |
language:
|
5 |
- ja
|
6 |
-
pipeline_tag:
|
7 |
tags:
|
8 |
- stable-diffusion
|
9 |
---
|
10 |
-
# 🐟 Evo-Nishikie-v1
|
11 |
|
12 |
-
🤗 [Models](https://huggingface.co/SakanaAI) | 📝 [Blog](
|
13 |
|
14 |
|
15 |
-
**
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
Please refer to our [blog](https://sakana.ai/evo-ukiyoe/) for more details.
|
21 |
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
|
24 |
## Usage
|
@@ -31,53 +35,21 @@ Use the code below to get started with the model.
|
|
31 |
|
32 |
1. Git clone this model card
|
33 |
```
|
34 |
-
git clone https://huggingface.co/SakanaAI/Evo-Nishikie-v1
|
35 |
-
```
|
36 |
-
2. Install git-lfs if you don't have it yet.
|
37 |
-
```
|
38 |
-
sudo apt install git-lfs
|
39 |
-
git lfs install
|
40 |
```
|
41 |
-
|
42 |
```
|
43 |
-
|
44 |
-
conda activate evo-nishikie
|
45 |
-
```
|
46 |
-
4. Install packages
|
47 |
-
```
|
48 |
-
cd Evo-Nishikie-v1
|
49 |
pip install -r requirements.txt
|
50 |
```
|
51 |
-
|
52 |
```python
|
53 |
-
import
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
#Get image from URL
|
60 |
-
#url = "https://huggingface.co/spaces/SakanaAI/Evo-Nishikie/resolve/main/sample2.jpg"
|
61 |
-
#original_image = Image.open(BytesIO(requests.get(url).content))
|
62 |
-
|
63 |
-
#Use local image
|
64 |
-
original_image = Image.open('test.jpg')
|
65 |
-
|
66 |
-
# Generate
|
67 |
-
device = "cuda"
|
68 |
-
pipe, processor = load_evo_nishikie(device)
|
69 |
-
images = pipe(
|
70 |
-
prompt="着物を着た女性が、赤ん坊を抱え、もう一人の子どもが手押し車を引いています。背景には木があります。最高品質の輻の浮世絵。超詳細。",
|
71 |
-
negative_prompt="暗い",
|
72 |
-
image=processor(original_image),
|
73 |
-
guidance_scale=7.0,
|
74 |
-
controlnet_conditioning_scale=0.8,
|
75 |
-
num_inference_steps=35,
|
76 |
-
num_images_per_prompt=1,
|
77 |
-
output_type="pil",
|
78 |
-
).images
|
79 |
-
images[0].save("out.png")
|
80 |
-
|
81 |
```
|
82 |
|
83 |
</details>
|
@@ -89,15 +61,14 @@ Use the code below to get started with the model.
|
|
89 |
<!-- Provide a longer summary of what this model is. -->
|
90 |
|
91 |
- **Developed by:** [Sakana AI](https://sakana.ai/)
|
92 |
-
- **Model type:** Diffusion-based
|
93 |
- **Language(s):** Japanese
|
94 |
-
- **Blog:** https://sakana.ai/
|
95 |
|
96 |
|
97 |
## License
|
98 |
The Python script included in this repository is licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0).
|
99 |
-
Please note that the license for the model/pipeline generated by this script is inherited from the source models.
|
100 |
-
The sample images used in the code are licensed under CC BY-SA 4.0 and belong to the Pre-Modern Japanese Text dataset, owned by the National Institute of Japanese Literature and curated by the ROIS-DS Center for Open Data in the Humanities.
|
101 |
|
102 |
## Uses
|
103 |
This model is provided for research and development purposes only and should be considered as an experimental prototype.
|
@@ -109,17 +80,11 @@ Users must fully understand the risks associated with the use of this model and
|
|
109 |
|
110 |
## Acknowledgement
|
111 |
|
112 |
-
|
113 |
-
- [SDXL](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
|
114 |
-
- [Juggernaut-XL-v9](https://huggingface.co/RunDiffusion/Juggernaut-XL-v9)
|
115 |
-
- [SDXL-DPO](https://huggingface.co/mhdang/dpo-sdxl-text2image-v1)
|
116 |
-
- [JSDXL](https://huggingface.co/stabilityai/japanese-stable-diffusion-xl)
|
117 |
|
118 |
|
119 |
## Citation
|
120 |
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
author = {Clanuwat, Tarin and Shing, Makoto and Imajuku, Yuki and Kitamoto, Asanobu and Akama, Ryo}
|
125 |
-
}
|
|
|
3 |
license: apache-2.0
|
4 |
language:
|
5 |
- ja
|
6 |
+
pipeline_tag: text-to-image
|
7 |
tags:
|
8 |
- stable-diffusion
|
9 |
---
|
10 |
+
# 🐟 Evo-Ukiyoe-v1
|
11 |
|
12 |
+
🤗 [Models](https://huggingface.co/SakanaAI) | 📝 [Blog](TODO) | 🐦 [Twitter](https://twitter.com/SakanaAILabs)
|
13 |
|
14 |
|
15 |
+
**EvoSDXL-JP-v1** is an experimental education-purpose Japanese SDXL Lightning.
|
16 |
+
This model was created using the Evolutionary Model Merge method.
|
17 |
+
Please refer to our [report](https://arxiv.org/abs/2403.13187) and [blog](https://sakana.ai/evosdxl-jp/) for more details.
|
18 |
+
This model was produced by merging the following models.
|
19 |
+
We are grateful to the developers of the source models.
|
|
|
20 |
|
21 |
+
- [SDXL](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
|
22 |
+
- [Juggernaut-XL-v9](https://huggingface.co/RunDiffusion/Juggernaut-XL-v9)
|
23 |
+
- [SDXL-DPO](https://huggingface.co/mhdang/dpo-sdxl-text2image-v1)
|
24 |
+
- [JSDXL](https://huggingface.co/stabilityai/japanese-stable-diffusion-xl)
|
25 |
+
- [SDXL-Lightning](https://huggingface.co/ByteDance/SDXL-Lightning)
|
26 |
|
27 |
|
28 |
## Usage
|
|
|
35 |
|
36 |
1. Git clone this model card
|
37 |
```
|
38 |
+
git clone https://huggingface.co/SakanaAI/EvoSDXL-JP-v1
|
|
|
|
|
|
|
|
|
|
|
39 |
```
|
40 |
+
2. Install packages
|
41 |
```
|
42 |
+
cd EvoSDXL-JP-v1
|
|
|
|
|
|
|
|
|
|
|
43 |
pip install -r requirements.txt
|
44 |
```
|
45 |
+
3. Run
|
46 |
```python
|
47 |
+
from evosdxl_jp_v1 import load_evosdxl_jp
|
48 |
+
|
49 |
+
prompt = "柴犬"
|
50 |
+
pipe = load_evosdxl_jp(device="cuda")
|
51 |
+
images = pipe(prompt, num_inference_steps=4, guidance_scale=0).images
|
52 |
+
images[0].save("image.png")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
```
|
54 |
|
55 |
</details>
|
|
|
61 |
<!-- Provide a longer summary of what this model is. -->
|
62 |
|
63 |
- **Developed by:** [Sakana AI](https://sakana.ai/)
|
64 |
+
- **Model type:** Diffusion-based text-to-image generative model
|
65 |
- **Language(s):** Japanese
|
66 |
+
- **Blog:** https://sakana.ai/TODO
|
67 |
|
68 |
|
69 |
## License
|
70 |
The Python script included in this repository is licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0).
|
71 |
+
Please note that the license for the model/pipeline generated by this script is inherited from the source models.
|
|
|
72 |
|
73 |
## Uses
|
74 |
This model is provided for research and development purposes only and should be considered as an experimental prototype.
|
|
|
80 |
|
81 |
## Acknowledgement
|
82 |
|
83 |
+
We would like to thank the developers of the source models for their contributions and for making their work available.
|
|
|
|
|
|
|
|
|
84 |
|
85 |
|
86 |
## Citation
|
87 |
|
88 |
+
```bibtex
|
89 |
+
TODO
|
90 |
+
```
|
|
|
|
config.json
DELETED
@@ -1,57 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"_class_name": "ControlNetModel",
|
3 |
-
"_diffusers_version": "0.29.0",
|
4 |
-
"act_fn": "silu",
|
5 |
-
"addition_embed_type": "text_time",
|
6 |
-
"addition_embed_type_num_heads": 64,
|
7 |
-
"addition_time_embed_dim": 256,
|
8 |
-
"attention_head_dim": [
|
9 |
-
5,
|
10 |
-
10,
|
11 |
-
20
|
12 |
-
],
|
13 |
-
"block_out_channels": [
|
14 |
-
320,
|
15 |
-
640,
|
16 |
-
1280
|
17 |
-
],
|
18 |
-
"class_embed_type": null,
|
19 |
-
"conditioning_channels": 3,
|
20 |
-
"conditioning_embedding_out_channels": [
|
21 |
-
16,
|
22 |
-
32,
|
23 |
-
96,
|
24 |
-
256
|
25 |
-
],
|
26 |
-
"controlnet_conditioning_channel_order": "rgb",
|
27 |
-
"cross_attention_dim": 2048,
|
28 |
-
"down_block_types": [
|
29 |
-
"DownBlock2D",
|
30 |
-
"CrossAttnDownBlock2D",
|
31 |
-
"CrossAttnDownBlock2D"
|
32 |
-
],
|
33 |
-
"downsample_padding": 1,
|
34 |
-
"encoder_hid_dim": null,
|
35 |
-
"encoder_hid_dim_type": null,
|
36 |
-
"flip_sin_to_cos": true,
|
37 |
-
"freq_shift": 0,
|
38 |
-
"global_pool_conditions": false,
|
39 |
-
"in_channels": 4,
|
40 |
-
"layers_per_block": 2,
|
41 |
-
"mid_block_scale_factor": 1,
|
42 |
-
"mid_block_type": "UNetMidBlock2DCrossAttn",
|
43 |
-
"norm_eps": 1e-05,
|
44 |
-
"norm_num_groups": 32,
|
45 |
-
"num_attention_heads": null,
|
46 |
-
"num_class_embeds": null,
|
47 |
-
"only_cross_attention": false,
|
48 |
-
"projection_class_embeddings_input_dim": 2816,
|
49 |
-
"resnet_time_scale_shift": "default",
|
50 |
-
"transformer_layers_per_block": [
|
51 |
-
1,
|
52 |
-
2,
|
53 |
-
10
|
54 |
-
],
|
55 |
-
"upcast_attention": null,
|
56 |
-
"use_linear_projection": true
|
57 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
diffusion_pytorch_model.safetensors
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:50513516f08fe6185d469218e69cc2d42ba59dc56eb315d50d096e2921ad6ce1
|
3 |
-
size 5004167864
|
|
|
|
|
|
|
|
evo_nishikie_v1.py
CHANGED
@@ -1,22 +1,23 @@
|
|
1 |
import gc
|
|
|
2 |
import os
|
3 |
-
from typing import Dict, List,
|
4 |
|
5 |
-
from PIL import Image
|
6 |
-
from controlnet_aux import
|
7 |
from diffusers import (
|
8 |
ControlNetModel,
|
9 |
StableDiffusionXLControlNetPipeline,
|
10 |
UNet2DConditionModel,
|
11 |
)
|
12 |
from huggingface_hub import hf_hub_download
|
|
|
13 |
import safetensors
|
14 |
import torch
|
15 |
from tqdm import tqdm
|
16 |
from transformers import AutoTokenizer, CLIPTextModelWithProjection
|
17 |
|
18 |
-
|
19 |
-
# Base models
|
20 |
SDXL_REPO = "stabilityai/stable-diffusion-xl-base-1.0"
|
21 |
DPO_REPO = "mhdang/dpo-sdxl-text2image-v1"
|
22 |
JN_REPO = "RunDiffusion/Juggernaut-XL-v9"
|
@@ -29,18 +30,6 @@ UKIYOE_REPO = "SakanaAI/Evo-Ukiyoe-v1"
|
|
29 |
NISHIKIE_REPO = "SakanaAI/Evo-Nishikie-v1"
|
30 |
|
31 |
|
32 |
-
class EvoNishikieConditioningImageProcessor:
|
33 |
-
def __init__(self, device="cpu"):
|
34 |
-
self.lineart_detector = LineartDetector.from_pretrained("lllyasviel/Annotators").to(device)
|
35 |
-
self.image_filter = ImageFilter.MedianFilter(size=3)
|
36 |
-
|
37 |
-
def __call__(self, original_image: Image.Image) -> Image.Image:
|
38 |
-
lineart_image = self.lineart_detector(original_image, coarse=False, image_resolution=1024)
|
39 |
-
lineart_image_filtered = lineart_image.filter(self.image_filter)
|
40 |
-
conditioning_image = lineart_image_filtered.point(lambda p: 255 if p > 40 else 0).convert("L")
|
41 |
-
return conditioning_image
|
42 |
-
|
43 |
-
|
44 |
def load_state_dict(checkpoint_file: Union[str, os.PathLike], device: str = "cpu"):
|
45 |
file_extension = os.path.basename(checkpoint_file).split(".")[-1]
|
46 |
if file_extension == "safetensors":
|
@@ -125,9 +114,7 @@ def split_conv_attn(weights):
|
|
125 |
return {"conv": conv_tensors, "attn": attn_tensors}
|
126 |
|
127 |
|
128 |
-
def load_evo_nishikie(device="cuda"
|
129 |
-
StableDiffusionXLControlNetPipeline, EvoNishikieConditioningImageProcessor
|
130 |
-
]:
|
131 |
# Load base models
|
132 |
sdxl_weights = split_conv_attn(load_from_pretrained(SDXL_REPO, device=device))
|
133 |
dpo_weights = split_conv_attn(
|
@@ -137,7 +124,6 @@ def load_evo_nishikie(device="cuda", processor_device="cpu") -> Tuple[
|
|
137 |
)
|
138 |
jn_weights = split_conv_attn(load_from_pretrained(JN_REPO, device=device))
|
139 |
jsdxl_weights = split_conv_attn(load_from_pretrained(JSDXL_REPO, device=device))
|
140 |
-
|
141 |
# Merge base models
|
142 |
tensors = [sdxl_weights, dpo_weights, jn_weights, jsdxl_weights]
|
143 |
new_conv = merge_models(
|
@@ -158,14 +144,11 @@ def load_evo_nishikie(device="cuda", processor_device="cpu") -> Tuple[
|
|
158 |
0.2198623756106564,
|
159 |
],
|
160 |
)
|
161 |
-
|
162 |
-
# Delete no longer needed variables to free
|
163 |
del sdxl_weights, dpo_weights, jn_weights, jsdxl_weights
|
164 |
gc.collect()
|
165 |
if "cuda" in device:
|
166 |
torch.cuda.empty_cache()
|
167 |
|
168 |
-
# Instantiate UNet
|
169 |
unet_config = UNet2DConditionModel.load_config(SDXL_REPO, subfolder="unet")
|
170 |
unet = UNet2DConditionModel.from_config(unet_config).to(device=device)
|
171 |
unet.load_state_dict({**new_conv, **new_attn})
|
@@ -193,14 +176,31 @@ def load_evo_nishikie(device="cuda", processor_device="cpu") -> Tuple[
|
|
193 |
torch_dtype=torch.float16,
|
194 |
variant="fp16",
|
195 |
)
|
|
|
196 |
|
197 |
# Load Evo-Ukiyoe weights
|
198 |
pipe.load_lora_weights(UKIYOE_REPO)
|
199 |
pipe.fuse_lora(lora_scale=1.0)
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gc
|
2 |
+
from io import BytesIO
|
3 |
import os
|
4 |
+
from typing import Dict, List, Union
|
5 |
|
6 |
+
from PIL import Image
|
7 |
+
from controlnet_aux import CannyDetector
|
8 |
from diffusers import (
|
9 |
ControlNetModel,
|
10 |
StableDiffusionXLControlNetPipeline,
|
11 |
UNet2DConditionModel,
|
12 |
)
|
13 |
from huggingface_hub import hf_hub_download
|
14 |
+
import requests
|
15 |
import safetensors
|
16 |
import torch
|
17 |
from tqdm import tqdm
|
18 |
from transformers import AutoTokenizer, CLIPTextModelWithProjection
|
19 |
|
20 |
+
# Base models (fine-tuned from SDXL-1.0)
|
|
|
21 |
SDXL_REPO = "stabilityai/stable-diffusion-xl-base-1.0"
|
22 |
DPO_REPO = "mhdang/dpo-sdxl-text2image-v1"
|
23 |
JN_REPO = "RunDiffusion/Juggernaut-XL-v9"
|
|
|
30 |
NISHIKIE_REPO = "SakanaAI/Evo-Nishikie-v1"
|
31 |
|
32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
def load_state_dict(checkpoint_file: Union[str, os.PathLike], device: str = "cpu"):
|
34 |
file_extension = os.path.basename(checkpoint_file).split(".")[-1]
|
35 |
if file_extension == "safetensors":
|
|
|
114 |
return {"conv": conv_tensors, "attn": attn_tensors}
|
115 |
|
116 |
|
117 |
+
def load_evo_nishikie(device="cuda") -> StableDiffusionXLControlNetPipeline:
|
|
|
|
|
118 |
# Load base models
|
119 |
sdxl_weights = split_conv_attn(load_from_pretrained(SDXL_REPO, device=device))
|
120 |
dpo_weights = split_conv_attn(
|
|
|
124 |
)
|
125 |
jn_weights = split_conv_attn(load_from_pretrained(JN_REPO, device=device))
|
126 |
jsdxl_weights = split_conv_attn(load_from_pretrained(JSDXL_REPO, device=device))
|
|
|
127 |
# Merge base models
|
128 |
tensors = [sdxl_weights, dpo_weights, jn_weights, jsdxl_weights]
|
129 |
new_conv = merge_models(
|
|
|
144 |
0.2198623756106564,
|
145 |
],
|
146 |
)
|
|
|
|
|
147 |
del sdxl_weights, dpo_weights, jn_weights, jsdxl_weights
|
148 |
gc.collect()
|
149 |
if "cuda" in device:
|
150 |
torch.cuda.empty_cache()
|
151 |
|
|
|
152 |
unet_config = UNet2DConditionModel.load_config(SDXL_REPO, subfolder="unet")
|
153 |
unet = UNet2DConditionModel.from_config(unet_config).to(device=device)
|
154 |
unet.load_state_dict({**new_conv, **new_attn})
|
|
|
176 |
torch_dtype=torch.float16,
|
177 |
variant="fp16",
|
178 |
)
|
179 |
+
pipe = pipe.to(device, dtype=torch.float16)
|
180 |
|
181 |
# Load Evo-Ukiyoe weights
|
182 |
pipe.load_lora_weights(UKIYOE_REPO)
|
183 |
pipe.fuse_lora(lora_scale=1.0)
|
184 |
+
return pipe
|
185 |
+
|
186 |
+
|
187 |
+
if __name__ == "__main__":
|
188 |
+
url = "https://sakana.ai/assets/nedo-grant/nedo_grant.jpeg"
|
189 |
+
original_image = Image.open(
|
190 |
+
BytesIO(requests.get(url).content)
|
191 |
+
).resize((1024, 1024), Image.Resampling.LANCZOS)
|
192 |
+
canny_detector = CannyDetector()
|
193 |
+
canny_image = canny_detector(original_image, image_resolution=1024)
|
194 |
+
pipe: StableDiffusionXLControlNetPipeline = load_evo_nishikie()
|
195 |
+
images = pipe(
|
196 |
+
prompt="銀杏が色づく。草木が生えた地面と青空の富士山。最高品質の輻の浮世絵。",
|
197 |
+
negative_prompt="暗い。",
|
198 |
+
image=canny_image,
|
199 |
+
guidance_scale=8.0,
|
200 |
+
controlnet_conditioning_scale=0.6,
|
201 |
+
num_inference_steps=50,
|
202 |
+
generator=torch.Generator().manual_seed(0),
|
203 |
+
num_images_per_prompt=1,
|
204 |
+
output_type="pil",
|
205 |
+
).images
|
206 |
+
images[0].save("out.png")
|
requirements.txt
DELETED
@@ -1,9 +0,0 @@
|
|
1 |
-
torch
|
2 |
-
torchvision
|
3 |
-
|
4 |
-
accelerate==0.32.0
|
5 |
-
controlnet-aux==0.0.9
|
6 |
-
diffusers==0.29.2
|
7 |
-
sentencepiece==0.2.0
|
8 |
-
transformers==4.42.3
|
9 |
-
peft==0.11.1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
test.jpg
DELETED
Git LFS Details
|