pack_to_cuda timings
Files changed: spaces/zero/torch/packing.py (+43 -0)

--- a/spaces/zero/torch/packing.py
+++ b/spaces/zero/torch/packing.py
@@ -141,30 +141,62 @@ def pack_to_cuda(pack: ZeroGPUTensorPack, callback: Callable[[int]] | None = Non
             read_bytes += os.readv(fd, [mv[read_bytes:]])
 
     def disk_to_pin(fd: int):
+        total_time_batches = 0
+        total_size_batches = 0
+        total_wait_batches = 0
+        total_time_tensors = 0
+        total_size_tensors = 0
+        total_wait_tensors = 0
         for batch in pack.batches:
+            t0 = time.perf_counter()
             buffer = free_buffers.get()
+            total_wait_batches += time.perf_counter() - t0
             batch_size = sum([aligned_size for *_, aligned_size in batch])
+            t0 = time.perf_counter()
             read(fd, buffer, batch_size)
+            total_time_batches += time.perf_counter() - t0
+            total_size_batches += batch_size
             read_buffers.put(buffer)
         for *_, aligned_size in pack.big_tensors:
             read_bytes = 0
             while read_bytes < aligned_size:
+                t0 = time.perf_counter()
                 buffer = free_buffers.get()
+                total_wait_tensors += time.perf_counter() - t0
                 read_size = min(BUFFER_SIZE, aligned_size - read_bytes)
+                t0 = time.perf_counter()
                 read(fd, buffer, read_size)
+                total_time_tensors += time.perf_counter() - t0
+                total_size_tensors += read_size
                 read_buffers.put(buffer)
                 read_bytes += read_size
+        print("disk_to_pin (batches)", f"{total_size_batches / total_time_batches / 2**30:.3f}GB/s")
+        print("disk_to_pin (batches)", f"{total_wait_batches:.3f}s buffer wait")
+        print("disk_to_pin (tensors)", f"{total_size_tensors / total_time_tensors / 2**30:.3f}GB/s")
+        print("disk_to_pin (tensors)", f"{total_wait_tensors:.3f}s buffer wait")
+
 
     def pin_to_cuda():
         total_duration_in_callback = 0
+        total_time_batches = 0
+        total_size_batches = 0
+        total_wait_batches = 0
+        total_time_tensors = 0
+        total_size_tensors = 0
+        total_wait_tensors = 0
         for batch in pack.batches:
+            t0 = time.perf_counter()
             buffer = read_buffers.get()
+            total_wait_batches += time.perf_counter() - t0
             offset = 0
             cuda_storages = []
+            t0 = time.perf_counter()
             for tensor, size, aligned_size in batch:
                 cuda_storages += [buffer[offset:offset+size].cuda(non_blocking=True)]
+                total_size_batches += size
                 offset += aligned_size
             torch.cuda.synchronize()
+            total_time_batches += time.perf_counter() - t0
             free_buffers.put(buffer)
             batch_total_size = 0
             for (tensor, size, _), cuda_storage in zip(batch, cuda_storages):
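The added lines above follow a plain accumulate-and-report pattern: each blocking call (queue get, disk read, device copy) is bracketed with time.perf_counter(), the deltas are summed into per-stage totals, and throughput is reported as bytes / seconds / 2**30. Below is a minimal standalone sketch of that pattern; the Stopwatch helper and the read_chunk stand-in are hypothetical and not part of the spaces package.

import time

class Stopwatch:
    """Accumulates wall-clock time across repeated timed sections (hypothetical helper)."""
    def __init__(self):
        self.total = 0.0
    def __enter__(self):
        self._t0 = time.perf_counter()
        return self
    def __exit__(self, *exc):
        self.total += time.perf_counter() - self._t0

def read_chunk(size: int) -> bytes:
    # Stand-in for a real os.readv into a pinned buffer.
    return b"\x00" * size

read_time, total_bytes = Stopwatch(), 0
for _ in range(8):
    with read_time:
        total_bytes += len(read_chunk(1 << 20))

print("reads", f"{total_bytes / read_time.total / 2**30:.3f}GB/s")

The same arithmetic produces the GB/s figures printed by disk_to_pin and pin_to_cuda in the diff.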
@@ -180,9 +212,14 @@ def pack_to_cuda(pack: ZeroGPUTensorPack, callback: Callable[[int]] | None = Non
             cuda_storage = torch.empty(size, dtype=torch.uint8, device='cuda')
             offset = 0
             while offset < size:
+                t0 = time.perf_counter()
                 buffer = read_buffers.get()
+                total_wait_tensors += time.perf_counter() - t0
                 read_size = min(BUFFER_SIZE, size - offset)
+                t0 = time.perf_counter()
                 cuda_storage[offset:offset+read_size] = buffer[:read_size]
+                total_time_tensors += time.perf_counter() - t0
+                total_size_tensors += read_size
                 offset += read_size
                 torch.cuda.synchronize() # Probably not needed
                 free_buffers.put(buffer)
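For orientation, the context lines in this hunk are the big-tensor path of pin_to_cuda: a tensor larger than one staging buffer is copied into a preallocated CUDA uint8 storage in BUFFER_SIZE-sized slices, and the new counters time each slice copy. A condensed, single-threaded sketch of that chunked copy follows; the chunked_to_cuda name and the 4 MiB buffer size are assumptions for illustration, and the real code pulls pinned buffers from read_buffers instead of allocating one locally.

import torch

BUFFER_SIZE = 4 << 20  # assumed staging-buffer size for this sketch

def chunked_to_cuda(src: torch.Tensor) -> torch.Tensor:
    """Copy a flat uint8 CPU tensor to the GPU through one bounded pinned buffer."""
    size = src.numel()
    staging = torch.empty(BUFFER_SIZE, dtype=torch.uint8, pin_memory=True)
    dst = torch.empty(size, dtype=torch.uint8, device='cuda')
    offset = 0
    while offset < size:
        n = min(BUFFER_SIZE, size - offset)
        staging[:n] = src[offset:offset + n]   # a CPU copy stands in for the disk read
        dst[offset:offset + n] = staging[:n]   # pinned host -> device copy
        offset += n
    return dst

if torch.cuda.is_available():
    data = torch.randint(0, 256, (10 << 20,), dtype=torch.uint8)
    assert torch.equal(chunked_to_cuda(data).cpu(), data)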
@@ -194,6 +231,12 @@ def pack_to_cuda(pack: ZeroGPUTensorPack, callback: Callable[[int]] | None = Non
             for fake in pack.fakes[tensor]:
                 fake.data = cuda_tensor
 
+        print("pin_to_cuda (batches)", f"{total_size_batches / total_time_batches / 2**30:.3f}GB/s")
+        print("pin_to_cuda (batches)", f"{total_wait_batches:.3f}s buffer wait")
+        print("pin_to_cuda (tensors)", f"{total_size_tensors / total_time_tensors / 2**30:.3f}GB/s")
+        print("pin_to_cuda (tensors)", f"{total_wait_tensors:.3f}s buffer wait")
+        print("pin_to_cuda", f"{total_duration_in_callback:.3f}s in callback")
+
         debug(f"{total_duration_in_callback=}")
 
     with ThreadPoolExecutor(2) as e:
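As for where the printed "buffer wait" comes from: disk_to_pin and pin_to_cuda run concurrently on the two-worker executor and pass a fixed pool of pinned staging buffers back and forth through free_buffers and read_buffers, so time spent blocked in Queue.get() on either side indicates which stage is the bottleneck. Below is a toy sketch of that ping-pong shape, with bytearray payloads and trivial workloads standing in for the disk reads and host-to-device copies; all names, sizes, and iteration counts here are illustrative only.

import time
from concurrent.futures import ThreadPoolExecutor
from queue import Queue

free_buffers: Queue = Queue()
read_buffers: Queue = Queue()
for _ in range(2):                       # small fixed pool of reusable staging buffers
    free_buffers.put(bytearray(1 << 16))

N_BATCHES = 16

def producer():                          # plays the role of disk_to_pin
    wait = 0.0
    for _ in range(N_BATCHES):
        t0 = time.perf_counter()
        buffer = free_buffers.get()      # blocks when the consumer is the bottleneck
        wait += time.perf_counter() - t0
        buffer[:] = bytes(len(buffer))   # stand-in for read(fd, buffer, size)
        read_buffers.put(buffer)
    print("producer", f"{wait:.3f}s buffer wait")

def consumer():                          # plays the role of pin_to_cuda
    wait = 0.0
    for _ in range(N_BATCHES):
        t0 = time.perf_counter()
        buffer = read_buffers.get()      # blocks when the producer is the bottleneck
        wait += time.perf_counter() - t0
        sum(buffer)                      # stand-in for the device copy + synchronize
        free_buffers.put(buffer)
    print("consumer", f"{wait:.3f}s buffer wait")

with ThreadPoolExecutor(2) as e:
    for future in [e.submit(producer), e.submit(consumer)]:
        future.result()                  # surface any exception from the workers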