Commit c97bae1 by Jae-Won Chung
Parent(s): abd945c
Updated diffusion benchmark and data
Changed files (this view is limited to 50 files because the commit contains too many changes):
- .gitignore +1 -1
- benchmark/diffusion/image-to-video/pegasus/A100/queue_1gpu.yaml +4 -4
- benchmark/diffusion/image-to-video/pegasus/H100/queue_1gpu.yaml +4 -4
- benchmark/diffusion/image-to-video/scripts/aggregate_leaderboard_models.py +1 -1
- benchmark/diffusion/image-to-video/scripts/benchmark_one_datapoint.py +26 -15
- benchmark/diffusion/image-to-video/scripts/benchmark_one_model.py +41 -35
- benchmark/diffusion/image-to-video/sharegpt4video/extract_first_frame.py +1 -1
- benchmark/diffusion/text-to-image/pegasus/A100/queue_1gpu.yaml +1 -1
- benchmark/diffusion/text-to-image/scripts/aggregate_leaderboard_models.py +2 -1
- benchmark/diffusion/text-to-image/scripts/benchmark_one_datapoint.py +49 -8
- benchmark/diffusion/text-to-image/scripts/benchmark_one_model.py +20 -18
- benchmark/diffusion/text-to-video/pegasus/A100/queue_1gpu.yaml +1 -1
- benchmark/diffusion/text-to-video/pegasus/H100/queue_1gpu.yaml +1 -1
- benchmark/diffusion/text-to-video/scripts/aggregate_leaderboard_models.py +2 -1
- benchmark/diffusion/text-to-video/scripts/benchmark_one_datapoint.py +11 -11
- benchmark/diffusion/text-to-video/scripts/benchmark_one_model.py +37 -35
- data/diffusion/image-to-video/A100-SXM4-40GB/ali-vilab/i2vgen-xl/bs1+steps25+frames16.json +2 -2
- data/diffusion/image-to-video/A100-SXM4-40GB/ali-vilab/i2vgen-xl/bs2+steps25+frames16.json +2 -2
- data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid-xt/bs1+steps25+frames25.json +2 -2
- data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid-xt/bs2+steps25+frames25.json +9 -0
- data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid-xt/bs3+steps25+frames25.json +9 -0
- data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid/bs1+steps25+frames14.json +2 -2
- data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid/bs2+steps25+frames14.json +9 -0
- data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid/bs3+steps25+frames14.json +9 -0
- data/diffusion/image-to-video/H100 80GB HBM3/ali-vilab/i2vgen-xl/bs1+steps25+frames16.json +2 -2
- data/diffusion/image-to-video/H100 80GB HBM3/ali-vilab/i2vgen-xl/bs2+steps25+frames16.json +2 -2
- data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid-xt/bs1+steps25+frames25.json +2 -2
- data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid-xt/bs2+steps25+frames25.json +9 -0
- data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid-xt/bs3+steps25+frames25.json +9 -0
- data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid/bs1+steps25+frames14.json +2 -2
- data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid/bs2+steps25+frames14.json +9 -0
- data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid/bs3+steps25+frames14.json +9 -0
- data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs1+steps25.json +2 -2
- data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs16+steps25.json +2 -2
- data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs2+steps25.json +2 -2
- data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs32+steps25.json +8 -0
- data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs4+steps25.json +2 -2
- data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs8+steps25.json +2 -2
- data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-3/bs1+steps25.json +2 -2
- data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-3/bs2+steps25.json +2 -2
- data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs1+steps25.json +2 -2
- data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs16+steps25.json +2 -2
- data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs2+steps25.json +2 -2
- data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs32+steps25.json +8 -0
- data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs4+steps25.json +2 -2
- data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs64+steps25.json +8 -0
- data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs8+steps25.json +2 -2
- data/diffusion/text-to-image/A100-SXM4-40GB/segmind/SSD-1B/bs1+steps25.json +2 -2
- data/diffusion/text-to-image/A100-SXM4-40GB/segmind/SSD-1B/bs2+steps25.json +2 -2
- data/diffusion/text-to-image/A100-SXM4-40GB/segmind/SSD-1B/bs4+steps25.json +2 -2
.gitignore
CHANGED
@@ -18,4 +18,4 @@ build/
 
 # Data files
 *.log
-…
+figures/
benchmark/diffusion/image-to-video/pegasus/A100/queue_1gpu.yaml
CHANGED
@@ -1,6 +1,6 @@
 - command:
-  - "python scripts/benchmark_one_model.py {{ model }} --result-root results/joule --dataset-path sharegpt4video/sharegpt4video_100.json --gpu-ids {{ gpu }} --batch-sizes …
+  - "python scripts/benchmark_one_model.py {{ model }} --result-root results/joule --dataset-path sharegpt4video/sharegpt4video_100.json --gpu-ids {{ gpu }} --batch-sizes 4 3 2 1 --power-limits 400 --num-inference-steps 1 2 4 8 16 25 30 40 50"
   model:
-  - '--model ali-vilab/i2vgen-xl --num-frames 16 --add-text-prompt'
-  - '--model stabilityai/stable-video-diffusion-img2vid --num-frames 14'
-  - '--model stabilityai/stable-video-diffusion-img2vid-xt --num-frames 25'
+  - '--model ali-vilab/i2vgen-xl --num-frames 16 --add-text-prompt --width 1280 --height 720'
+  - '--model stabilityai/stable-video-diffusion-img2vid --num-frames 14 --width 1024 --height 576'
+  - '--model stabilityai/stable-video-diffusion-img2vid-xt --num-frames 25 --width 1024 --height 576'
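These queue files feed a job runner that fills the {{ model }} and {{ gpu }} placeholders in the command template. As a rough illustration only (assuming plain string substitution of every entry under model: and one GPU id; the runner's exact semantics are not shown in this commit), the A100 entry expands roughly like this:

import itertools

command = (
    "python scripts/benchmark_one_model.py {{ model }} --result-root results/joule "
    "--dataset-path sharegpt4video/sharegpt4video_100.json --gpu-ids {{ gpu }} "
    "--batch-sizes 4 3 2 1 --power-limits 400 --num-inference-steps 1 2 4 8 16 25 30 40 50"
)
models = [
    "--model ali-vilab/i2vgen-xl --num-frames 16 --add-text-prompt --width 1280 --height 720",
    "--model stabilityai/stable-video-diffusion-img2vid --num-frames 14 --width 1024 --height 576",
    "--model stabilityai/stable-video-diffusion-img2vid-xt --num-frames 25 --width 1024 --height 576",
]
gpus = ["0"]  # hypothetical GPU id for illustration

# One benchmark invocation per (model, gpu) pair.
for model, gpu in itertools.product(models, gpus):
    print(command.replace("{{ model }}", model).replace("{{ gpu }}", gpu))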
benchmark/diffusion/image-to-video/pegasus/H100/queue_1gpu.yaml
CHANGED
@@ -1,6 +1,6 @@
 - command:
-  - "python scripts/benchmark_one_model.py {{ model }} --result-root results/joule --dataset-path sharegpt4video/…
+  - "python scripts/benchmark_one_model.py {{ model }} --result-root results/joule --dataset-path sharegpt4video/sharegpt4video_100.json --gpu-ids {{ gpu }} --batch-sizes 4 3 2 1 --power-limits 700 --num-inference-steps 1 2 4 8 16 25 30 40 50"
   model:
-…
+  - "--model ali-vilab/i2vgen-xl --num-frames 16 --add-text-prompt --width 1280 --height 720"
+  - "--model stabilityai/stable-video-diffusion-img2vid --num-frames 14 --width 1024 --height 576"
+  - "--model stabilityai/stable-video-diffusion-img2vid-xt --num-frames 25 --width 1024 --height 576"
benchmark/diffusion/image-to-video/scripts/aggregate_leaderboard_models.py
CHANGED
@@ -15,7 +15,7 @@ def main(results_dir: Path, output_file: Path) -> None:
     for model_dir in sorted(glob(f"{results_dir}/*/*")):
         model_name = "/".join(model_dir.split("/")[-2:])
         print(f"  {model_name}")
-        result_file_cand = glob(f"{model_dir}/bs1+*+results.json")
+        result_file_cand = glob(f"{model_dir}/bs1+*+steps25+results.json")
         assert len(result_file_cand) == 1, model_name
         results_data = json.load(open(result_file_cand[0]))
         denosing_module_name = "unet" if "unet" in results_data["num_parameters"] else "transformer"
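The tightened glob pins each model's leaderboard entry to its batch-size-1, 25-step result. A minimal sketch of the naming convention it appears to expect (the "+results.json" suffix is inferred from the glob pattern rather than shown in this diff):

from glob import glob
from pathlib import Path

# Hypothetical sweep point for illustration.
results_dir = Path("results/joule/ali-vilab/i2vgen-xl")
batch_size, power_limit, num_inference_steps = 1, 400, 25

# benchmark_one_datapoint.py names its outputs after the sweep point,
# e.g. "bs1+pl400+steps25+args.json"; result files are assumed to share
# the same stem with a "+results.json" suffix.
benchmark_name = results_dir / f"bs{batch_size}+pl{power_limit}+steps{num_inference_steps}"
print(f"{benchmark_name}+results.json")

# The aggregator matches any power limit via "*" but fixes bs1 and steps25.
candidates = glob(f"{results_dir}/bs1+*+steps25+results.json")
assert len(candidates) <= 1  # at most one leaderboard result per model here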
benchmark/diffusion/image-to-video/scripts/benchmark_one_datapoint.py
CHANGED
@@ -27,10 +27,10 @@ class Results:
     model: str
     num_parameters: dict[str, int]
     gpu_model: str
-    num_infernece_steps: int
-    num_frames: int
     power_limit: int
     batch_size: int
+    num_inference_steps: int
+    num_frames: int
     num_prompts: int
     total_runtime: float = 0.0
     total_energy: float = 0.0
@@ -80,6 +80,7 @@ def load_text_image_prompts(
     path: str,
     batch_size: int,
     num_batches: int | None = None,
+    image_resize: tuple[int, int] | None = None,
 ) -> tuple[int, list[tuple[list[str], list[Image.Image]]]]:
     """Load the dataset to feed the model and return it as a list of batches of prompts.
 
@@ -93,6 +94,9 @@ def load_text_image_prompts(
     dataset = json.load(open(path))
     assert len(dataset["caption"]) == len(dataset["video_id"])
 
+    dataset["caption"] *= 10
+    dataset["video_id"] *= 10
+
     if num_batches is not None:
         if len(dataset["caption"]) < num_batches * batch_size:
             raise ValueError("Not enough data for the requested number of batches.")
@@ -103,6 +107,8 @@ def load_text_image_prompts(
     dataset["first_frame"] = [
         load_image(str(image_path / f"{video_id}.jpg")) for video_id in dataset["video_id"]
     ]
+    if image_resize is not None:
+        dataset["first_frame"] = [image.resize(image_resize) for image in dataset["first_frame"]]
 
     batched = [
         (dataset["caption"][i : i + batch_size], dataset["first_frame"][i : i + batch_size])
@@ -135,8 +141,8 @@ def benchmark(args: argparse.Namespace) -> None:
 
     results_dir = Path(args.result_root) / args.model
     results_dir.mkdir(parents=True, exist_ok=True)
-    benchmark_name = str(results_dir / f"bs{args.batch_size}+pl{args.power_limit}")
-    video_dir = results_dir / f"bs{args.batch_size}+pl{args.power_limit}+generated"
+    benchmark_name = str(results_dir / f"bs{args.batch_size}+pl{args.power_limit}+steps{args.num_inference_steps}")
+    video_dir = results_dir / f"bs{args.batch_size}+pl{args.power_limit}+steps{args.num_inference_steps}+generated"
     video_dir.mkdir(exist_ok=True)
 
     arg_out_filename = f"{benchmark_name}+args.json"
@@ -150,11 +156,16 @@ def benchmark(args: argparse.Namespace) -> None:
     pynvml.nvmlInit()
     handle = pynvml.nvmlDeviceGetHandleByIndex(0)
     gpu_model = pynvml.nvmlDeviceGetName(handle)
-    pynvml.nvmlDeviceSetPersistenceMode(handle, pynvml.NVML_FEATURE_ENABLED)
-    pynvml.nvmlDeviceSetPowerManagementLimit(handle, args.power_limit * 1000)
+    # pynvml.nvmlDeviceSetPersistenceMode(handle, pynvml.NVML_FEATURE_ENABLED)
+    # pynvml.nvmlDeviceSetPowerManagementLimit(handle, args.power_limit * 1000)
     pynvml.nvmlShutdown()
 
-    num_prompts, batched_prompts = load_text_image_prompts(…
+    num_prompts, batched_prompts = load_text_image_prompts(
+        args.dataset_path,
+        args.batch_size,
+        args.num_batches,
+        (args.width, args.height),
+    )
 
     pipeline = get_pipeline(args.model)
 
@@ -189,7 +200,7 @@ def benchmark(args: argparse.Namespace) -> None:
     fps_param_name = fps_param_name_candidates[0]
 
     torch.cuda.reset_peak_memory_stats(device="cuda:0")
-    zeus_monitor.begin_window("benchmark", …
+    zeus_monitor.begin_window("benchmark", sync_execution=False)
 
     # Build common parameter dict for all batches
     params: dict[str, Any] = dict(
@@ -210,15 +221,15 @@ def benchmark(args: argparse.Namespace) -> None:
         if args.add_text_prompt:
             params["prompt"] = intermediate.prompts
 
-        zeus_monitor.begin_window("batch", …
+        zeus_monitor.begin_window("batch", sync_execution=False)
         frames = pipeline(**params).frames
-        batch_measurements = zeus_monitor.end_window("batch", …
+        batch_measurements = zeus_monitor.end_window("batch", sync_execution=False)
 
         intermediate.frames = frames
         intermediate.batch_latency = batch_measurements.time
         intermediate.batch_energy = batch_measurements.total_energy
 
-    measurements = zeus_monitor.end_window("benchmark", …
+    measurements = zeus_monitor.end_window("benchmark", sync_execution=False)
     peak_memory = torch.cuda.max_memory_allocated(device="cuda:0")
 
     results: list[Result] = []
@@ -255,10 +266,10 @@ def benchmark(args: argparse.Namespace) -> None:
         model=args.model,
         num_parameters=count_parameters(pipeline),
         gpu_model=gpu_model,
-        num_infernece_steps=args.num_inference_steps,
-        num_frames=args.num_frames,
         power_limit=args.power_limit,
         batch_size=args.batch_size,
+        num_inference_steps=args.num_inference_steps,
+        num_frames=args.num_frames,
         num_prompts=num_prompts,
         total_runtime=measurements.time,
         total_energy=measurements.total_energy,
@@ -289,8 +300,8 @@ if __name__ == "__main__":
     parser.add_argument("--num-inference-steps", type=int, default=50, help="The number of denoising steps.")
     parser.add_argument("--num-frames", type=int, default=1, help="The number of frames to generate.")
    parser.add_argument("--fps", type=int, default=16, help="Frames per second for micro-conditioning.")
-    parser.add_argument("--height", type=int, help="Height of the generated video.")
-    parser.add_argument("--width", type=int, help="Width of the generated video.")
+    parser.add_argument("--height", type=int, required=True, help="Height of the generated video.")
+    parser.add_argument("--width", type=int, required=True, help="Width of the generated video.")
     parser.add_argument("--num-batches", type=int, default=None, help="The number of batches to use from the dataset.")
     parser.add_argument("--save-every", type=int, default=10, help="Save generations to file every N prompts.")
     parser.add_argument("--seed", type=int, default=0, help="The seed to use for the RNG.")
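Taken together, the changes above comment out the in-process power-limit setup, key output files by the number of inference steps, and keep the two-level Zeus measurement windows. A minimal sketch of that window pattern, assuming the ZeusMonitor class from the zeus package and with the diffusion pipeline replaced by a placeholder:

from zeus.monitor import ZeusMonitor

def run_pipeline(batch):
    # Placeholder for the actual diffusion pipeline call (pipeline(**params)).
    return [None] * len(batch)

batched_prompts = [["a prompt"] * 2, ["another prompt"] * 2]  # toy batches

monitor = ZeusMonitor(gpu_indices=[0])
monitor.begin_window("benchmark", sync_execution=False)
for batch in batched_prompts:
    monitor.begin_window("batch", sync_execution=False)
    run_pipeline(batch)
    batch_m = monitor.end_window("batch", sync_execution=False)
    print(f"batch: {batch_m.time:.2f} s, {batch_m.total_energy:.1f} J")
whole_m = monitor.end_window("benchmark", sync_execution=False)
print(f"total: {whole_m.time:.2f} s, {whole_m.total_energy:.1f} J")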
benchmark/diffusion/image-to-video/scripts/benchmark_one_model.py
CHANGED
@@ -28,44 +28,48 @@ def main(args: argparse.Namespace) -> None:
     print_and_write(outfile, f"Benchmarking {args.model}\n")
     print_and_write(outfile, f"Batch sizes: {args.batch_sizes}\n")
     print_and_write(outfile, f"Power limits: {args.power_limits}\n")
+    print_and_write(outfile, f"Number of inference steps: {args.num_inference_steps}\n")
 
     for batch_size in args.batch_sizes:
         for power_limit in args.power_limits:
-…
+            for num_inference_steps in args.num_inference_steps:
+                print_and_write(outfile, f"{batch_size=}, {power_limit=}, {num_inference_steps=}\n", flush=True)
+                with subprocess.Popen(
+                    args=[
+                        "docker", "run",
+                        "--gpus", '"device=' + ','.join(args.gpu_ids) + '"',
+                        "--cap-add", "SYS_ADMIN",
+                        "--name", f"leaderboard-i2v-{''.join(args.gpu_ids)}",
+                        "--rm",
+                        "-v", "/data/leaderboard/hfcache:/root/.cache/huggingface",
+                        "-v", f"{os.getcwd()}:/workspace/image-to-video",
+                        "mlenergy/leaderboard:diffusion-i2v",
+                        "--dataset-path", args.dataset_path,
+                        "--result-root", args.result_root,
+                        "--batch-size", batch_size,
+                        "--num-batches", "8",
+                        "--power-limit", power_limit,
+                        "--model", args.model,
+                        "--huggingface-token", hf_token,
+                        "--num-frames", args.num_frames,
+                        "--num-inference-steps", num_inference_steps,
+                        "--width", str(args.width),
+                        "--height", str(args.height),
+                    ] + (["--add-text-prompt"] if args.add_text_prompt else []),
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.STDOUT,
+                    text=True,
+                ) as proc:
+                    if proc.stdout:
+                        i = 0
+                        for line in proc.stdout:
+                            print_and_write(outfile, line, flush=i % 50 == 0)
+                            i += 1
 
-…
+                # If proc exited with non-zero status, it's probably an OOM.
+                # Move on to the next batch size.
+                if proc.returncode != 0:
+                    break
@@ -77,8 +81,10 @@ if __name__ == "__main__":
     parser.add_argument("--batch-sizes", type=str, nargs="+", default=["8", "4", "2", "1"], help="Batch sizes to benchmark")
     parser.add_argument("--power-limits", type=str, nargs="+", default=["400", "300", "200"], help="Power limits to benchmark")
     parser.add_argument("--num-frames", type=str, help="Number of frames to generate")
-    parser.add_argument("--num-inference-steps", type=str, help="Number of …
+    parser.add_argument("--num-inference-steps", type=str, nargs="+", default=["1", "2", "4", "8", "16", "30", "40", "50"], help="Number of inference steps to run")
     parser.add_argument("--add-text-prompt", action="store_true", help="Input text prompt alongside image.")
+    parser.add_argument("--height", type=int, required=True, help="Height of the generated video.")
+    parser.add_argument("--width", type=int, required=True, help="Width of the generated video.")
     parser.add_argument("--dataset-path", type=str, help="Path to the dataset JSON file.")
     args = parser.parse_args()
     main(args)
benchmark/diffusion/image-to-video/sharegpt4video/extract_first_frame.py
CHANGED
@@ -3,7 +3,7 @@ import json
 
 import cv2
 
-DATASET_PATH = "…
+DATASET_PATH = "sharegpt4video_100.json"
 
 
 def main() -> None:
benchmark/diffusion/text-to-image/pegasus/A100/queue_1gpu.yaml
CHANGED
@@ -1,5 +1,5 @@
 - command:
-  - "python scripts/benchmark_one_model.py --model {{ model }} --result-root results/joule --gpu-ids {{ gpu }} --batch-sizes 16 8 4 2 1 --power-limits 400"
+  - "python scripts/benchmark_one_model.py --model {{ model }} --result-root results/joule --gpu-ids {{ gpu }} --batch-sizes 64 32 16 8 4 2 1 --num-inference-steps 1 2 4 8 16 25 30 40 50 --power-limits 400"
   model:
   - stabilityai/stable-diffusion-2-1
   - stabilityai/stable-diffusion-xl-base-1.0
benchmark/diffusion/text-to-image/scripts/aggregate_leaderboard_models.py
CHANGED
@@ -15,7 +15,7 @@ def main(results_dir: Path, output_file: Path) -> None:
     for model_dir in sorted(glob(f"{results_dir}/*/*")):
         model_name = "/".join(model_dir.split("/")[-2:])
         print(f"  {model_name}")
-        result_file_cand = glob(f"{model_dir}/bs1+*+results.json")
+        result_file_cand = glob(f"{model_dir}/bs1+*+steps25+results.json")
         assert len(result_file_cand) == 1, model_name
         results_data = json.load(open(result_file_cand[0]))
         denosing_module_name = "unet" if "unet" in results_data["num_parameters"] else "transformer"
@@ -24,6 +24,7 @@ def main(results_dir: Path, output_file: Path) -> None:
             nickname=model_name.split("/")[-1].replace("-", " ").title(),
             total_params=raw_params_to_readable(sum(results_data["num_parameters"].values())),
             denoising_params=raw_params_to_readable(results_data["num_parameters"][denosing_module_name]),
+            resolution="NA",
         )
         assert model_name not in models
         models[model_name] = model_info
benchmark/diffusion/text-to-image/scripts/benchmark_one_datapoint.py
CHANGED
@@ -1,8 +1,10 @@
 from __future__ import annotations
 
 import os
+import time
 import json
 import argparse
+import multiprocessing as mp
 from pprint import pprint
 from pathlib import Path
 from contextlib import suppress
@@ -11,6 +13,7 @@ from dataclasses import dataclass, field, asdict
 import torch
 import pynvml
 import numpy as np
+import pandas as pd
 from PIL import Image
 from datasets import load_dataset, Dataset
 from transformers.trainer_utils import set_seed
@@ -35,9 +38,9 @@ class Results:
     model: str
     num_parameters: dict[str, int]
     gpu_model: str
-    num_inference_steps: int
     power_limit: int
     batch_size: int
+    num_inference_steps: int
     num_prompts: int
     average_clip_score: float = 0.0
     total_runtime: float = 0.0
@@ -118,6 +121,28 @@ def load_partiprompts(
     return len(batched) * batch_size, batched
 
 
+def power_monitor(csv_path: str, gpu_indices: list[int], chan: mp.SimpleQueue) -> None:
+    pynvml.nvmlInit()
+    handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in gpu_indices]
+
+    fields = [
+        (pynvml.NVML_FI_DEV_POWER_AVERAGE, pynvml.NVML_POWER_SCOPE_GPU),
+        (pynvml.NVML_FI_DEV_POWER_AVERAGE, pynvml.NVML_POWER_SCOPE_MEMORY),
+    ]
+
+    columns = ["timestamp"] + sum([[f"gpu{i}", f"vram{i}"] for i in gpu_indices], [])
+    power: list[list] = []
+    while chan.empty():
+        row = [time.monotonic()]
+        values = [pynvml.nvmlDeviceGetFieldValues(h, fields) for h in handles]
+        for value in values:
+            row.extend((value[0].value.uiVal, value[1].value.uiVal))
+        power.append(row)
+        time.sleep(max(0.0, 0.1 - (time.monotonic() - row[0])))
+
+    pd.DataFrame(power, columns=columns).to_csv(csv_path, index=False)
+
+
 def calculate_clip_score(
     model: CLIPModel,
     processor: CLIPProcessor,
@@ -183,8 +208,8 @@ def benchmark(args: argparse.Namespace) -> None:
 
     results_dir = Path(args.result_root) / args.model
     results_dir.mkdir(parents=True, exist_ok=True)
-    benchmark_name = str(results_dir / f"bs{args.batch_size}+pl{args.power_limit}")
-    image_dir = results_dir / f"bs{args.batch_size}+pl{args.power_limit}+generated"
+    benchmark_name = str(results_dir / f"bs{args.batch_size}+pl{args.power_limit}+steps{args.num_inference_steps}")
+    image_dir = results_dir / f"bs{args.batch_size}+pl{args.power_limit}+steps{args.num_inference_steps}+generated"
     image_dir.mkdir(exist_ok=True)
 
     arg_out_filename = f"{benchmark_name}+args.json"
@@ -222,27 +247,42 @@ def benchmark(args: argparse.Namespace) -> None:
         ResultIntermediateBatched(prompts=batch) for batch in batched_prompts
     ]
 
+    pmon = None
+    pmon_chan = None
+    if args.monitor_power:
+        pmon_chan = mp.SimpleQueue()
+        pmon = mp.get_context("spawn").Process(
+            target=power_monitor,
+            args=(f"{benchmark_name}+power.csv", [g.gpu_index for g in zeus_monitor.gpus.gpus], pmon_chan),
+        )
+        pmon.start()
+
     torch.cuda.reset_peak_memory_stats(device="cuda:0")
-    zeus_monitor.begin_window("benchmark", …
+    zeus_monitor.begin_window("benchmark", sync_execution=False)
 
     for ind, intermediate in enumerate(intermediates):
         print(f"Batch {ind + 1}/{len(intermediates)}")
-        zeus_monitor.begin_window("batch", …
+        zeus_monitor.begin_window("batch", sync_execution=False)
         images = pipeline(
             intermediate.prompts,
             generator=rng,
             num_inference_steps=args.num_inference_steps,
            output_type="np",
         ).images
-        batch_measurements = zeus_monitor.end_window("batch", …
+        batch_measurements = zeus_monitor.end_window("batch", sync_execution=False)
 
         intermediate.images = images
         intermediate.batch_latency = batch_measurements.time
         intermediate.batch_energy = batch_measurements.total_energy
 
-    measurements = zeus_monitor.end_window("benchmark", …
+    measurements = zeus_monitor.end_window("benchmark", sync_execution=False)
     peak_memory = torch.cuda.max_memory_allocated(device="cuda:0")
 
+    if pmon is not None and pmon_chan is not None:
+        pmon_chan.put("stop")
+        pmon.join(timeout=5.0)
+        pmon.terminate()
+
     # Scale images to [0, 256] and convert to uint8
     for intermediate in intermediates:
         intermediate.images = (intermediate.images * 255).astype("uint8")
@@ -292,9 +332,9 @@ def benchmark(args: argparse.Namespace) -> None:
         model=args.model,
         num_parameters=count_parameters(pipeline),
         gpu_model=gpu_model,
-        num_inference_steps=args.num_inference_steps,
         power_limit=args.power_limit,
         batch_size=args.batch_size,
+        num_inference_steps=args.num_inference_steps,
         num_prompts=num_prompts,
         average_clip_score=sum(r.clip_score for r in results) / len(results),
         total_runtime=measurements.time,
@@ -326,6 +366,7 @@ if __name__ == "__main__":
     parser.add_argument("--image-save-every", type=int, default=10, help="Save images to file every N prompts.")
     parser.add_argument("--seed", type=int, default=0, help="The seed to use for the RNG.")
     parser.add_argument("--huggingface-token", type=str, help="The HuggingFace token to use.")
+    parser.add_argument("--monitor-power", default=False, action="store_true", help="Whether to monitor power over time.")
     args = parser.parse_args()
 
     benchmark(args)
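The new --monitor-power path spawns a separate process that samples NVML power at roughly 10 Hz until the main process signals it to stop via a SimpleQueue. A stripped-down sketch of that start/stop protocol (the NVML sampling itself is replaced by plain timestamps so the example stays self-contained):

import time
import multiprocessing as mp

def sampler(csv_path: str, chan) -> None:
    # Poll until the parent puts anything on the queue, then write samples out.
    rows = []
    while chan.empty():
        rows.append(time.monotonic())
        time.sleep(0.1)  # ~10 Hz, as in power_monitor above
    with open(csv_path, "w") as f:
        f.write("\n".join(str(r) for r in rows))

if __name__ == "__main__":
    chan = mp.SimpleQueue()
    proc = mp.get_context("spawn").Process(target=sampler, args=("power.csv", chan))
    proc.start()
    time.sleep(1.0)   # stands in for the benchmark loop
    chan.put("stop")  # any object works; only queue emptiness is checked
    proc.join(timeout=5.0)
    proc.terminate()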
benchmark/diffusion/text-to-image/scripts/benchmark_one_model.py
CHANGED
@@ -28,12 +28,13 @@ def main(args: argparse.Namespace) -> None:
     print_and_write(outfile, f"Benchmarking {args.model}\n")
     print_and_write(outfile, f"Batch sizes: {args.batch_sizes}\n")
     print_and_write(outfile, f"Power limits: {args.power_limits}\n")
+    print_and_write(outfile, f"Number of inference steps: {args.num_inference_steps}\n")
 
     for batch_size in args.batch_sizes:
         for power_limit in args.power_limits:
-…
+            for num_inference_steps in args.num_inference_steps:
+                print_and_write(outfile, f"{batch_size=}, {power_limit=}, {num_inference_steps=}\n", flush=True)
+                cmd=[
                     "docker", "run",
                     "--gpus", '"device=' + ','.join(args.gpu_ids) + '"',
                     "--cap-add", "SYS_ADMIN",
@@ -48,22 +49,21 @@ def main(args: argparse.Namespace) -> None:
                     "--power-limit", power_limit,
                     "--model", args.model,
                     "--huggingface-token", hf_token,
-                    "--num-inference-steps", …
-                ]
-…
-                            i += 1
+                    "--num-inference-steps", num_inference_steps,
+                ]
+                if args.monitor_power:
+                    cmd.append("--monitor-power")
+                with subprocess.Popen(args=cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) as proc:
+                    if proc.stdout:
+                        i = 0
+                        for line in proc.stdout:
+                            print_and_write(outfile, line, flush=i % 50 == 0)
+                            i += 1
 
-…
+                # If proc exited with non-zero status, it's probably an OOM.
+                # Move on to the next batch size.
+                if proc.returncode != 0:
+                    break
@@ -74,5 +74,7 @@ if __name__ == "__main__":
     parser.add_argument("--gpu-ids", type=str, nargs="+", help="GPU IDs to use")
     parser.add_argument("--batch-sizes", type=str, nargs="+", default=["8", "4", "2", "1"], help="Batch sizes to benchmark")
     parser.add_argument("--power-limits", type=str, nargs="+", default=["400", "300", "200"], help="Power limits to benchmark")
+    parser.add_argument("--num-inference-steps", type=str, nargs="+", default=["1", "2", "4", "8", "16", "25", "30", "40", "50"], help="Number of inference steps to run")
+    parser.add_argument("--monitor-power", default=False, action="store_true", help="Whether to monitor power over time.")
     args = parser.parse_args()
     main(args)
benchmark/diffusion/text-to-video/pegasus/A100/queue_1gpu.yaml
CHANGED
@@ -1,5 +1,5 @@
 - command:
-  - "python scripts/benchmark_one_model.py --model {{ model }} --result-root results/joule --dataset-path sharegpt4video/sharegpt4video_100.json --gpu-ids {{ gpu }} --batch-sizes 16 8 4 2 1 --power-limits 400 --num-inference-steps 25 --num-frames 16"
+  - "python scripts/benchmark_one_model.py --model {{ model }} --result-root results/joule --dataset-path sharegpt4video/sharegpt4video_100.json --gpu-ids {{ gpu }} --batch-sizes 32 16 8 4 2 1 --power-limits 400 --num-inference-steps 1 2 4 8 16 25 30 40 50 --num-frames 16"
   model:
   - ali-vilab/text-to-video-ms-1.7b
   - guoyww/animatediff-motion-adapter-v1-5-3
|
benchmark/diffusion/text-to-video/pegasus/H100/queue_1gpu.yaml
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
- command:
|
2 |
-
- "python scripts/benchmark_one_model.py --model {{ model }} --result-root results/joule --dataset-path sharegpt4video/
|
3 |
model:
|
4 |
- ali-vilab/text-to-video-ms-1.7b
|
5 |
- guoyww/animatediff-motion-adapter-v1-5-3
|
|
|
1 |
- command:
|
2 |
+
- "python scripts/benchmark_one_model.py --model {{ model }} --result-root results/joule --dataset-path sharegpt4video/sharegpt4video_100.json --gpu-ids {{ gpu }} --batch-sizes 32 16 8 4 2 1 --power-limits 700 --num-inference-steps 1 2 4 8 16 25 30 40 50 --num-frames 16"
|
3 |
model:
|
4 |
- ali-vilab/text-to-video-ms-1.7b
|
5 |
- guoyww/animatediff-motion-adapter-v1-5-3
|
benchmark/diffusion/text-to-video/scripts/aggregate_leaderboard_models.py
CHANGED
@@ -15,7 +15,7 @@ def main(results_dir: Path, output_file: Path) -> None:
     for model_dir in sorted(glob(f"{results_dir}/*/*")):
         model_name = "/".join(model_dir.split("/")[-2:])
         print(f"  {model_name}")
-        result_file_cand = glob(f"{model_dir}/bs1+*+results.json")
+        result_file_cand = glob(f"{model_dir}/bs1+*+steps25+results.json")
         assert len(result_file_cand) == 1, model_name
         results_data = json.load(open(result_file_cand[0]))
         denosing_module_name = "unet" if "unet" in results_data["num_parameters"] else "transformer"
@@ -24,6 +24,7 @@ def main(results_dir: Path, output_file: Path) -> None:
             nickname=model_name.split("/")[-1].replace("-", " ").title(),
             total_params=raw_params_to_readable(sum(results_data["num_parameters"].values())),
             denoising_params=raw_params_to_readable(results_data["num_parameters"][denosing_module_name]),
+            resolution="NA",
         )
         assert model_name not in models
         models[model_name] = model_info
benchmark/diffusion/text-to-video/scripts/benchmark_one_datapoint.py
CHANGED
@@ -32,10 +32,10 @@ class Results:
     model: str
     num_parameters: dict[str, int]
     gpu_model: str
-    num_inference_steps: int
-    num_frames: int
     power_limit: int
     batch_size: int
+    num_inference_steps: int
+    num_frames: int
     num_prompts: int
     total_runtime: float = 0.0
     total_energy: float = 0.0
@@ -119,7 +119,7 @@ def load_text_prompts(
     Returns:
         Total number of prompts and a list of batches of prompts.
     """
-    dataset = json.load(open(path))["caption"]
+    dataset = json.load(open(path))["caption"] * 10
     if num_batches is not None:
         if len(dataset) < num_batches * batch_size:
             raise ValueError("Dataset is too small for the given number of batches.")
@@ -151,8 +151,8 @@ def benchmark(args: argparse.Namespace) -> None:
 
     results_dir = Path(args.result_root) / args.model
     results_dir.mkdir(parents=True, exist_ok=True)
-    benchmark_name = str(results_dir / f"bs{args.batch_size}+pl{args.power_limit}")
-    video_dir = results_dir / f"bs{args.batch_size}+pl{args.power_limit}+generated"
+    benchmark_name = str(results_dir / f"bs{args.batch_size}+pl{args.power_limit}+steps{args.num_inference_steps}")
+    video_dir = results_dir / f"bs{args.batch_size}+pl{args.power_limit}+steps{args.num_inference_steps}+generated"
     video_dir.mkdir(exist_ok=True)
 
     arg_out_filename = f"{benchmark_name}+args.json"
@@ -190,7 +190,7 @@ def benchmark(args: argparse.Namespace) -> None:
     ]
 
     torch.cuda.reset_peak_memory_stats(device="cuda:0")
-    zeus_monitor.begin_window("benchmark", …
+    zeus_monitor.begin_window("benchmark", sync_execution=False)
 
     # Build common parameter dict for all batches
     params: dict[str, Any] = dict(
@@ -208,15 +208,15 @@ def benchmark(args: argparse.Namespace) -> None:
 
         params["prompt"] = intermediate.prompts
 
-        zeus_monitor.begin_window("batch", …
+        zeus_monitor.begin_window("batch", sync_execution=False)
         frames = pipeline(**params).frames
-        batch_measurements = zeus_monitor.end_window("batch", …
+        batch_measurements = zeus_monitor.end_window("batch", sync_execution=False)
 
         intermediate.frames = frames
         intermediate.batch_latency = batch_measurements.time
         intermediate.batch_energy = batch_measurements.total_energy
 
-    measurements = zeus_monitor.end_window("benchmark", …
+    measurements = zeus_monitor.end_window("benchmark", sync_execution=False)
     peak_memory = torch.cuda.max_memory_allocated(device="cuda:0")
 
     results: list[Result] = []
@@ -253,10 +253,10 @@ def benchmark(args: argparse.Namespace) -> None:
         model=args.model,
         num_parameters=count_parameters(pipeline),
         gpu_model=gpu_model,
-        num_inference_steps=args.num_inference_steps,
-        num_frames=args.num_frames,
         power_limit=args.power_limit,
         batch_size=args.batch_size,
+        num_inference_steps=args.num_inference_steps,
+        num_frames=args.num_frames,
         num_prompts=num_prompts,
         total_runtime=measurements.time,
         total_energy=measurements.total_energy,
benchmark/diffusion/text-to-video/scripts/benchmark_one_model.py
CHANGED
@@ -28,44 +28,46 @@ def main(args: argparse.Namespace) -> None:
     print_and_write(outfile, f"Benchmarking {args.model}\n")
     print_and_write(outfile, f"Batch sizes: {args.batch_sizes}\n")
     print_and_write(outfile, f"Power limits: {args.power_limits}\n")
+    print_and_write(outfile, f"Number of inference steps: {args.num_inference_steps}\n")
 
     for batch_size in args.batch_sizes:
         for power_limit in args.power_limits:
-…
+            for num_inference_steps in args.num_inference_steps:
+                print_and_write(outfile, f"{batch_size=}, {power_limit=}, {num_inference_steps=}\n", flush=True)
+                with subprocess.Popen(
+                    args=[
+                        "docker", "run",
+                        "--gpus", '"device=' + ','.join(args.gpu_ids) + '"',
+                        "--cap-add", "SYS_ADMIN",
+                        "--name", f"leaderboard-t2v-{''.join(args.gpu_ids)}",
+                        "--rm",
+                        "-v", "/data/leaderboard/hfcache:/root/.cache/huggingface",
+                        "-v", f"{os.getcwd()}:/workspace/text-to-video",
+                        "mlenergy/leaderboard:diffusion-t2v",
+                        "--result-root", args.result_root,
+                        "--batch-size", batch_size,
+                        "--num-batches", "10",
+                        "--power-limit", power_limit,
+                        "--model", args.model,
+                        "--dataset-path", args.dataset_path,
+                        "--huggingface-token", hf_token,
+                        "--num-inference-steps", num_inference_steps,
+                        "--num-frames", args.num_frames,
+                    ],
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.STDOUT,
+                    text=True,
+                ) as proc:
+                    if proc.stdout:
+                        i = 0
+                        for line in proc.stdout:
+                            print_and_write(outfile, line, flush=i % 50 == 0)
+                            i += 1
 
-…
+                # If proc exited with non-zero status, it's probably an OOM.
+                # Move on to the next batch size.
+                if proc.returncode != 0:
+                    break
@@ -76,7 +78,7 @@ if __name__ == "__main__":
     parser.add_argument("--gpu-ids", type=str, nargs="+", help="GPU IDs to use")
     parser.add_argument("--batch-sizes", type=str, nargs="+", default=["8", "4", "2", "1"], help="Batch sizes to benchmark")
     parser.add_argument("--power-limits", type=str, nargs="+", default=["400", "300", "200"], help="Power limits to benchmark")
-    parser.add_argument("--num-inference-steps", type=str, …
+    parser.add_argument("--num-inference-steps", type=str, nargs="+", default=["1", "2", "4", "8", "16", "25", "30", "40", "50"], help="Number of denoising steps")
     parser.add_argument("--num-frames", type=str, required=True, help="Number of frames to generate")
     parser.add_argument("--dataset-path", type=str, help="Path to the dataset JSON file.")
     args = parser.parse_args()
data/diffusion/image-to-video/A100-SXM4-40GB/ali-vilab/i2vgen-xl/bs1+steps25+frames16.json
CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "ali-vilab/i2vgen-xl",
     "GPU": "NVIDIA A100-SXM4-40GB",
-    "Energy/video (J)": …
-    "Batch latency (s)": …
+    "Energy/video (J)": 16915.850124999997,
+    "Batch latency (s)": 46.14208295941353,
     "Batch size": 1,
     "Denoising steps": 25,
     "Frames": 16

data/diffusion/image-to-video/A100-SXM4-40GB/ali-vilab/i2vgen-xl/bs2+steps25+frames16.json
CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "ali-vilab/i2vgen-xl",
     "GPU": "NVIDIA A100-SXM4-40GB",
-    "Energy/video (J)": …
-    "Batch latency (s)": …
+    "Energy/video (J)": 16496.045437499997,
+    "Batch latency (s)": 89.03019031882286,
     "Batch size": 2,
     "Denoising steps": 25,
     "Frames": 16

data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid-xt/bs1+steps25+frames25.json
CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "stabilityai/stable-video-diffusion-img2vid-xt",
     "GPU": "NVIDIA A100-SXM4-40GB",
-    "Energy/video (J)": …
-    "Batch latency (s)": 42.…
+    "Energy/video (J)": 15709.767625000095,
+    "Batch latency (s)": 42.397395104169846,
     "Batch size": 1,
     "Denoising steps": 25,
     "Frames": 25

data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid-xt/bs2+steps25+frames25.json
ADDED
@@ -0,0 +1,9 @@
+{
+    "Model": "stabilityai/stable-video-diffusion-img2vid-xt",
+    "GPU": "NVIDIA A100-SXM4-40GB",
+    "Energy/video (J)": 15291.016625000047,
+    "Batch latency (s)": 82.90474811196327,
+    "Batch size": 2,
+    "Denoising steps": 25,
+    "Frames": 25
+}

data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid-xt/bs3+steps25+frames25.json
ADDED
@@ -0,0 +1,9 @@
+{
+    "Model": "stabilityai/stable-video-diffusion-img2vid-xt",
+    "GPU": "NVIDIA A100-SXM4-40GB",
+    "Energy/video (J)": 14761.389999999976,
+    "Batch latency (s)": 120.65004900523594,
+    "Batch size": 3,
+    "Denoising steps": 25,
+    "Frames": 25
+}
data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid/bs1+steps25+frames14.json
CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "stabilityai/stable-video-diffusion-img2vid",
     "GPU": "NVIDIA A100-SXM4-40GB",
-    "Energy/video (J)": …
-    "Batch latency (s)": 24.…
+    "Energy/video (J)": 9066.434124999912,
+    "Batch latency (s)": 24.369865357875824,
     "Batch size": 1,
     "Denoising steps": 25,
     "Frames": 14

data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid/bs2+steps25+frames14.json
ADDED
@@ -0,0 +1,9 @@
+{
+    "Model": "stabilityai/stable-video-diffusion-img2vid",
+    "GPU": "NVIDIA A100-SXM4-40GB",
+    "Energy/video (J)": 8835.22312499996,
+    "Batch latency (s)": 47.65615049004555,
+    "Batch size": 2,
+    "Denoising steps": 25,
+    "Frames": 14
+}

data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid/bs3+steps25+frames14.json
ADDED
@@ -0,0 +1,9 @@
+{
+    "Model": "stabilityai/stable-video-diffusion-img2vid",
+    "GPU": "NVIDIA A100-SXM4-40GB",
+    "Energy/video (J)": 8683.536285714292,
+    "Batch latency (s)": 70.55723374230521,
+    "Batch size": 3,
+    "Denoising steps": 25,
+    "Frames": 14
+}
data/diffusion/image-to-video/H100 80GB HBM3/ali-vilab/i2vgen-xl/bs1+steps25+frames16.json
CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "ali-vilab/i2vgen-xl",
     "GPU": "NVIDIA H100 80GB HBM3",
-    "Energy/video (J)": …
-    "Batch latency (s)": …
+    "Energy/video (J)": 14867.419125000015,
+    "Batch latency (s)": 23.717748790979385,
     "Batch size": 1,
     "Denoising steps": 25,
     "Frames": 16

data/diffusion/image-to-video/H100 80GB HBM3/ali-vilab/i2vgen-xl/bs2+steps25+frames16.json
CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "ali-vilab/i2vgen-xl",
     "GPU": "NVIDIA H100 80GB HBM3",
-    "Energy/video (J)": …
-    "Batch latency (s)": …
+    "Energy/video (J)": 14348.508499999996,
+    "Batch latency (s)": 44.71498331427574,
     "Batch size": 2,
     "Denoising steps": 25,
     "Frames": 16

data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid-xt/bs1+steps25+frames25.json
CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "stabilityai/stable-video-diffusion-img2vid-xt",
     "GPU": "NVIDIA H100 80GB HBM3",
-    "Energy/video (J)": …
-    "Batch latency (s)": 20.…
+    "Energy/video (J)": 13392.813624999952,
+    "Batch latency (s)": 20.788252592086792,
     "Batch size": 1,
     "Denoising steps": 25,
     "Frames": 25

data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid-xt/bs2+steps25+frames25.json
ADDED
@@ -0,0 +1,9 @@
+{
+    "Model": "stabilityai/stable-video-diffusion-img2vid-xt",
+    "GPU": "NVIDIA H100 80GB HBM3",
+    "Energy/video (J)": 12901.83275000006,
+    "Batch latency (s)": 39.99498334527016,
+    "Batch size": 2,
+    "Denoising steps": 25,
+    "Frames": 25
+}

data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid-xt/bs3+steps25+frames25.json
ADDED
@@ -0,0 +1,9 @@
+{
+    "Model": "stabilityai/stable-video-diffusion-img2vid-xt",
+    "GPU": "NVIDIA H100 80GB HBM3",
+    "Energy/video (J)": 12790.552809523862,
+    "Batch latency (s)": 59.380911929266794,
+    "Batch size": 3,
+    "Denoising steps": 25,
+    "Frames": 25
+}
data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid/bs1+steps25+frames14.json
CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "stabilityai/stable-video-diffusion-img2vid",
     "GPU": "NVIDIA H100 80GB HBM3",
-    "Energy/video (J)":
-    "Batch latency (s)": 12.
+    "Energy/video (J)": 7623.074500000104,
+    "Batch latency (s)": 12.191031396389008,
     "Batch size": 1,
     "Denoising steps": 25,
     "Frames": 14
data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid/bs2+steps25+frames14.json
ADDED
@@ -0,0 +1,9 @@
+{
+    "Model": "stabilityai/stable-video-diffusion-img2vid",
+    "GPU": "NVIDIA H100 80GB HBM3",
+    "Energy/video (J)": 7416.721437499975,
+    "Batch latency (s)": 23.368041068315506,
+    "Batch size": 2,
+    "Denoising steps": 25,
+    "Frames": 14
+}
data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid/bs3+steps25+frames14.json
ADDED
@@ -0,0 +1,9 @@
+{
+    "Model": "stabilityai/stable-video-diffusion-img2vid",
+    "GPU": "NVIDIA H100 80GB HBM3",
+    "Energy/video (J)": 7354.00133333333,
+    "Batch latency (s)": 34.5100462777274,
+    "Batch size": 3,
+    "Denoising steps": 25,
+    "Frames": 14
+}
data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs1+steps25.json
CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "kandinsky-community/kandinsky-2-2-decoder",
     "GPU": "NVIDIA A100-SXM4-40GB",
-    "Energy/image (J)":
-    "Batch latency (s)": 1.
+    "Energy/image (J)": 324.06850000005215,
+    "Batch latency (s)": 1.6537675857543945,
     "Batch size": 1,
     "Denoising steps": 25
 }
data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs16+steps25.json
CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "kandinsky-community/kandinsky-2-2-decoder",
     "GPU": "NVIDIA A100-SXM4-40GB",
-    "Energy/image (J)":
-    "Batch latency (s)": 7.
+    "Energy/image (J)": 172.51030000000029,
+    "Batch latency (s)": 7.375234842300415,
     "Batch size": 16,
     "Denoising steps": 25
 }
data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs2+steps25.json
CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "kandinsky-community/kandinsky-2-2-decoder",
     "GPU": "NVIDIA A100-SXM4-40GB",
-    "Energy/image (J)":
-    "Batch latency (s)": 1.
+    "Energy/image (J)": 230.3378000000026,
+    "Batch latency (s)": 1.5861663103103638,
     "Batch size": 2,
     "Denoising steps": 25
 }
data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs32+steps25.json
ADDED
@@ -0,0 +1,8 @@
+{
+    "Model": "kandinsky-community/kandinsky-2-2-decoder",
+    "GPU": "NVIDIA A100-SXM4-40GB",
+    "Energy/image (J)": 163.0797656249997,
+    "Batch latency (s)": 13.998618459701538,
+    "Batch size": 32,
+    "Denoising steps": 25
+}
data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs4+steps25.json
CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "kandinsky-community/kandinsky-2-2-decoder",
     "GPU": "NVIDIA A100-SXM4-40GB",
-    "Energy/image (J)":
-    "Batch latency (s)": 2.
+    "Energy/image (J)": 200.16462499999906,
+    "Batch latency (s)": 2.299217462539673,
     "Batch size": 4,
     "Denoising steps": 25
 }
data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs8+steps25.json
CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "kandinsky-community/kandinsky-2-2-decoder",
     "GPU": "NVIDIA A100-SXM4-40GB",
-    "Energy/image (J)":
-    "Batch latency (s)": 4.
+    "Energy/image (J)": 184.9021625000052,
+    "Batch latency (s)": 4.0124232292175295,
     "Batch size": 8,
     "Denoising steps": 25
 }
data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-3/bs1+steps25.json
CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "kandinsky-community/kandinsky-3",
     "GPU": "NVIDIA A100-SXM4-40GB",
-    "Energy/image (J)":
-    "Batch latency (s)": 3.
+    "Energy/image (J)": 930.2532999999821,
+    "Batch latency (s)": 3.0359585523605346,
     "Batch size": 1,
     "Denoising steps": 25
 }
data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-3/bs2+steps25.json
CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "kandinsky-community/kandinsky-3",
     "GPU": "NVIDIA A100-SXM4-40GB",
-    "Energy/image (J)":
-    "Batch latency (s)": 5.
+    "Energy/image (J)": 895.7575500000036,
+    "Batch latency (s)": 5.261959171295166,
     "Batch size": 2,
     "Denoising steps": 25
 }
data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs1+steps25.json
CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "prompthero/openjourney-v4",
     "GPU": "NVIDIA A100-SXM4-40GB",
-    "Energy/image (J)":
-    "Batch latency (s)":
+    "Energy/image (J)": 227.21699999999254,
+    "Batch latency (s)": 0.9210062503814698,
     "Batch size": 1,
     "Denoising steps": 25
 }
data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs16+steps25.json
CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "prompthero/openjourney-v4",
     "GPU": "NVIDIA A100-SXM4-40GB",
-    "Energy/image (J)":
-    "Batch latency (s)": 6.
+    "Energy/image (J)": 156.51368749999673,
+    "Batch latency (s)": 6.559858226776123,
     "Batch size": 16,
     "Denoising steps": 25
 }
data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs2+steps25.json
CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "prompthero/openjourney-v4",
     "GPU": "NVIDIA A100-SXM4-40GB",
-    "Energy/image (J)":
-    "Batch latency (s)": 1.
+    "Energy/image (J)": 188.78500000000932,
+    "Batch latency (s)": 1.1187455892562865,
     "Batch size": 2,
     "Denoising steps": 25
 }
data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs32+steps25.json
ADDED
@@ -0,0 +1,8 @@
+{
+    "Model": "prompthero/openjourney-v4",
+    "GPU": "NVIDIA A100-SXM4-40GB",
+    "Energy/image (J)": 154.23499999999768,
+    "Batch latency (s)": 12.850126147270203,
+    "Batch size": 32,
+    "Denoising steps": 25
+}
data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs4+steps25.json
CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "prompthero/openjourney-v4",
     "GPU": "NVIDIA A100-SXM4-40GB",
-    "Energy/image (J)":
-    "Batch latency (s)": 1.
+    "Energy/image (J)": 175.33082500000017,
+    "Batch latency (s)": 1.8664743423461914,
     "Batch size": 4,
     "Denoising steps": 25
 }
data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs64+steps25.json
ADDED
@@ -0,0 +1,8 @@
+{
+    "Model": "prompthero/openjourney-v4",
+    "GPU": "NVIDIA A100-SXM4-40GB",
+    "Energy/image (J)": 150.57691875000017,
+    "Batch latency (s)": 25.000647592544556,
+    "Batch size": 64,
+    "Denoising steps": 25
+}
data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs8+steps25.json
CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "prompthero/openjourney-v4",
     "GPU": "NVIDIA A100-SXM4-40GB",
-    "Energy/image (J)":
-    "Batch latency (s)": 3.
+    "Energy/image (J)": 163.7534500000067,
+    "Batch latency (s)": 3.423132634162903,
     "Batch size": 8,
     "Denoising steps": 25
 }
data/diffusion/text-to-image/A100-SXM4-40GB/segmind/SSD-1B/bs1+steps25.json
CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "segmind/SSD-1B",
     "GPU": "NVIDIA A100-SXM4-40GB",
-    "Energy/image (J)":
-    "Batch latency (s)": 1.
+    "Energy/image (J)": 745.7899999999441,
+    "Batch latency (s)": 1.9644724607467652,
     "Batch size": 1,
     "Denoising steps": 25
 }
data/diffusion/text-to-image/A100-SXM4-40GB/segmind/SSD-1B/bs2+steps25.json
CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "segmind/SSD-1B",
     "GPU": "NVIDIA A100-SXM4-40GB",
-    "Energy/image (J)":
-    "Batch latency (s)": 3.
+    "Energy/image (J)": 700.4580500000156,
+    "Batch latency (s)": 3.6897377252578734,
     "Batch size": 2,
     "Denoising steps": 25
 }
data/diffusion/text-to-image/A100-SXM4-40GB/segmind/SSD-1B/bs4+steps25.json
CHANGED
@@ -1,8 +1,8 @@
 {
     "Model": "segmind/SSD-1B",
     "GPU": "NVIDIA A100-SXM4-40GB",
-    "Energy/image (J)":
-    "Batch latency (s)": 7.
+    "Energy/image (J)": 688.6121250000084,
+    "Batch latency (s)": 7.168970584869385,
     "Batch size": 4,
     "Denoising steps": 25
 }