cbensimon (HF staff) committed
Commit 8764fac (1 parent: c97ac6c)

pack_to_cuda timings

Files changed (1): spaces/zero/torch/packing.py (+43, -0)
spaces/zero/torch/packing.py CHANGED
@@ -141,30 +141,62 @@ def pack_to_cuda(pack: ZeroGPUTensorPack, callback: Callable[[int]] | None = Non
             read_bytes += os.readv(fd, [mv[read_bytes:]])
 
     def disk_to_pin(fd: int):
+        total_time_batches = 0
+        total_size_batches = 0
+        total_wait_batches = 0
+        total_time_tensors = 0
+        total_size_tensors = 0
+        total_wait_tensors = 0
         for batch in pack.batches:
+            t0 = time.perf_counter()
             buffer = free_buffers.get()
+            total_wait_batches += time.perf_counter() - t0
             batch_size = sum([aligned_size for *_, aligned_size in batch])
+            t0 = time.perf_counter()
             read(fd, buffer, batch_size)
+            total_time_batches += time.perf_counter() - t0
+            total_size_batches += batch_size
             read_buffers.put(buffer)
         for *_, aligned_size in pack.big_tensors:
             read_bytes = 0
             while read_bytes < aligned_size:
+                t0 = time.perf_counter()
                 buffer = free_buffers.get()
+                total_wait_tensors += time.perf_counter() - t0
                 read_size = min(BUFFER_SIZE, aligned_size - read_bytes)
+                t0 = time.perf_counter()
                 read(fd, buffer, read_size)
+                total_time_tensors += time.perf_counter() - t0
+                total_size_tensors += read_size
                 read_buffers.put(buffer)
                 read_bytes += read_size
+        print("disk_to_pin (batches)", f"{total_size_batches / total_time_batches / 2**30:.3f}GB/s")
+        print("disk_to_pin (batches)", f"{total_wait_batches:.3f}s buffer wait")
+        print("disk_to_pin (tensors)", f"{total_size_tensors / total_time_tensors / 2**30:.3f}GB/s")
+        print("disk_to_pin (tensors)", f"{total_wait_tensors:.3f}s buffer wait")
+
 
     def pin_to_cuda():
         total_duration_in_callback = 0
+        total_time_batches = 0
+        total_size_batches = 0
+        total_wait_batches = 0
+        total_time_tensors = 0
+        total_size_tensors = 0
+        total_wait_tensors = 0
         for batch in pack.batches:
+            t0 = time.perf_counter()
             buffer = read_buffers.get()
+            total_wait_batches += time.perf_counter() - t0
             offset = 0
             cuda_storages = []
+            t0 = time.perf_counter()
             for tensor, size, aligned_size in batch:
                 cuda_storages += [buffer[offset:offset+size].cuda(non_blocking=True)]
+                total_size_batches += size
                 offset += aligned_size
             torch.cuda.synchronize()
+            total_time_batches += time.perf_counter() - t0
             free_buffers.put(buffer)
             batch_total_size = 0
             for (tensor, size, _), cuda_storage in zip(batch, cuda_storages):
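The pattern this hunk repeats is pairwise time.perf_counter() reads accumulated into separate totals: wait time around the blocking queue get() calls versus work time around the actual read or copy, so a slow disk and a starved consumer show up as distinct numbers. A generic standalone sketch of that split, where fetch and process are hypothetical placeholders for the queue gets and the reads/copies:

import time

def timed_loop(items, fetch, process):
    """Accumulate blocking-wait time and work time into separate totals."""
    total_wait = 0.0
    total_work = 0.0
    for item in items:
        t0 = time.perf_counter()
        payload = fetch(item)            # stands in for free_buffers.get() / read_buffers.get()
        total_wait += time.perf_counter() - t0
        t0 = time.perf_counter()
        process(payload)                 # stands in for read(...) or the pinned-to-CUDA copy
        total_work += time.perf_counter() - t0
    return total_wait, total_work

# Example run: four items, ~0.05s of simulated work each, negligible wait
waited, worked = timed_loop(range(4), lambda i: i, lambda p: time.sleep(0.05))
print(f"{waited:.3f}s buffer wait, {worked:.3f}s in timed calls")

Note also the ordering the hunk leaves intact: torch.cuda.synchronize() runs before free_buffers.put(buffer), since the non_blocking copies must complete before the pinned buffer can safely be refilled.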
 
@@ -180,9 +212,14 @@ def pack_to_cuda(pack: ZeroGPUTensorPack, callback: Callable[[int]] | None = Non
             cuda_storage = torch.empty(size, dtype=torch.uint8, device='cuda')
             offset = 0
             while offset < size:
+                t0 = time.perf_counter()
                 buffer = read_buffers.get()
+                total_wait_tensors += time.perf_counter() - t0
                 read_size = min(BUFFER_SIZE, size - offset)
+                t0 = time.perf_counter()
                 cuda_storage[offset:offset+read_size] = buffer[:read_size]
+                total_time_tensors += time.perf_counter() - t0
+                total_size_tensors += read_size
                 offset += read_size
             torch.cuda.synchronize() # Probably not needed
             free_buffers.put(buffer)
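This hunk times the big-tensor path, which streams a tensor larger than one buffer into device memory through the fixed-size staging buffer, one BUFFER_SIZE chunk at a time. A condensed sketch of that loop in isolation (sizes are made up, the staging fill from disk is elided, and a CUDA device is assumed):

import torch

BUFFER_SIZE = 4 * 2**20   # illustrative; packing.py defines its own constant
size = 10 * 2**20         # a tensor bigger than one staging buffer

cuda_storage = torch.empty(size, dtype=torch.uint8, device='cuda')
staging = torch.empty(BUFFER_SIZE, dtype=torch.uint8).pin_memory()

offset = 0
while offset < size:
    read_size = min(BUFFER_SIZE, size - offset)
    # (fill staging[:read_size] from disk here)
    cuda_storage[offset:offset + read_size] = staging[:read_size]  # H2D copy of one chunk
    offset += read_size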
 
@@ -194,6 +231,12 @@ def pack_to_cuda(pack: ZeroGPUTensorPack, callback: Callable[[int]] | None = Non
             for fake in pack.fakes[tensor]:
                 fake.data = cuda_tensor
 
+        print("pin_to_cuda (batches)", f"{total_size_batches / total_time_batches / 2**30:.3f}GB/s")
+        print("pin_to_cuda (batches)", f"{total_wait_batches:.3f}s buffer wait")
+        print("pin_to_cuda (tensors)", f"{total_size_tensors / total_time_tensors / 2**30:.3f}GB/s")
+        print("pin_to_cuda (tensors)", f"{total_wait_tensors:.3f}s buffer wait")
+        print("pin_to_cuda", f"{total_duration_in_callback:.3f}s in callback")
+
         debug(f"{total_duration_in_callback=}")
 
     with ThreadPoolExecutor(2) as e:
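A note on the new prints: each one reports effective bandwidth as total bytes over the seconds accumulated inside the timed calls, scaled by 2**30, so the figure is strictly GiB/s despite the GB/s label. The same arithmetic with made-up totals:

total_size_batches = 6 * 2**30   # 6 GiB moved (illustrative)
total_time_batches = 2.5         # seconds accumulated around the timed calls (illustrative)
print("pin_to_cuda (batches)", f"{total_size_batches / total_time_batches / 2**30:.3f}GB/s")
# -> pin_to_cuda (batches) 2.400GB/s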