cbensimon (HF staff) committed
Commit 8764fac (1 parent: c97ac6c)

pack_to_cuda timings

Files changed (1): spaces/zero/torch/packing.py (+43, -0)
spaces/zero/torch/packing.py CHANGED
@@ -141,30 +141,62 @@ def pack_to_cuda(pack: ZeroGPUTensorPack, callback: Callable[[int]] | None = Non
             read_bytes += os.readv(fd, [mv[read_bytes:]])
 
     def disk_to_pin(fd: int):
+        total_time_batches = 0
+        total_size_batches = 0
+        total_wait_batches = 0
+        total_time_tensors = 0
+        total_size_tensors = 0
+        total_wait_tensors = 0
         for batch in pack.batches:
+            t0 = time.perf_counter()
             buffer = free_buffers.get()
+            total_wait_batches += time.perf_counter() - t0
             batch_size = sum([aligned_size for *_, aligned_size in batch])
+            t0 = time.perf_counter()
             read(fd, buffer, batch_size)
+            total_time_batches += time.perf_counter() - t0
+            total_size_batches += batch_size
             read_buffers.put(buffer)
         for *_, aligned_size in pack.big_tensors:
             read_bytes = 0
             while read_bytes < aligned_size:
+                t0 = time.perf_counter()
                 buffer = free_buffers.get()
+                total_wait_tensors += time.perf_counter() - t0
                 read_size = min(BUFFER_SIZE, aligned_size - read_bytes)
+                t0 = time.perf_counter()
                 read(fd, buffer, read_size)
+                total_time_tensors += time.perf_counter() - t0
+                total_size_tensors += read_size
                 read_buffers.put(buffer)
                 read_bytes += read_size
+        print("disk_to_pin (batches)", f"{total_size_batches / total_time_batches / 2**30:.3f}GB/s")
+        print("disk_to_pin (batches)", f"{total_wait_batches:.3f}s buffer wait")
+        print("disk_to_pin (tensors)", f"{total_size_tensors / total_time_tensors / 2**30:.3f}GB/s")
+        print("disk_to_pin (tensors)", f"{total_wait_tensors:.3f}s buffer wait")
+
 
     def pin_to_cuda():
         total_duration_in_callback = 0
+        total_time_batches = 0
+        total_size_batches = 0
+        total_wait_batches = 0
+        total_time_tensors = 0
+        total_size_tensors = 0
+        total_wait_tensors = 0
         for batch in pack.batches:
+            t0 = time.perf_counter()
             buffer = read_buffers.get()
+            total_wait_batches += time.perf_counter() - t0
             offset = 0
             cuda_storages = []
+            t0 = time.perf_counter()
             for tensor, size, aligned_size in batch:
                 cuda_storages += [buffer[offset:offset+size].cuda(non_blocking=True)]
+                total_size_batches += size
                 offset += aligned_size
             torch.cuda.synchronize()
+            total_time_batches += time.perf_counter() - t0
             free_buffers.put(buffer)
             batch_total_size = 0
             for (tensor, size, _), cuda_storage in zip(batch, cuda_storages):
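The pattern this hunk repeats is pairwise time.perf_counter() reads accumulated into separate totals: wait time around the blocking queue get() calls versus work time around the actual read or copy, so a slow disk and a starved consumer show up as distinct numbers. A generic standalone sketch of that split, where fetch and process are hypothetical placeholders for the queue gets and the reads/copies:

import time

def timed_loop(items, fetch, process):
    """Accumulate blocking-wait time and work time into separate totals."""
    total_wait = 0.0
    total_work = 0.0
    for item in items:
        t0 = time.perf_counter()
        payload = fetch(item)            # stands in for free_buffers.get() / read_buffers.get()
        total_wait += time.perf_counter() - t0
        t0 = time.perf_counter()
        process(payload)                 # stands in for read(...) or the pinned-to-CUDA copy
        total_work += time.perf_counter() - t0
    return total_wait, total_work

# Example run: four items, ~0.05s of simulated work each, negligible wait
waited, worked = timed_loop(range(4), lambda i: i, lambda p: time.sleep(0.05))
print(f"{waited:.3f}s buffer wait, {worked:.3f}s in timed calls")

Note also the ordering the hunk leaves intact: torch.cuda.synchronize() runs before free_buffers.put(buffer), since the non_blocking copies must complete before the pinned buffer can safely be refilled.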
 
@@ -180,9 +212,14 @@ def pack_to_cuda(pack: ZeroGPUTensorPack, callback: Callable[[int]] | None = Non
             cuda_storage = torch.empty(size, dtype=torch.uint8, device='cuda')
             offset = 0
             while offset < size:
+                t0 = time.perf_counter()
                 buffer = read_buffers.get()
+                total_wait_tensors += time.perf_counter() - t0
                 read_size = min(BUFFER_SIZE, size - offset)
+                t0 = time.perf_counter()
                 cuda_storage[offset:offset+read_size] = buffer[:read_size]
+                total_time_tensors += time.perf_counter() - t0
+                total_size_tensors += read_size
                 offset += read_size
             torch.cuda.synchronize() # Probably not needed
             free_buffers.put(buffer)
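This hunk times the big-tensor path, which streams a tensor larger than one buffer into device memory through the fixed-size staging buffer, one BUFFER_SIZE chunk at a time. A condensed sketch of that loop in isolation (sizes are made up, the staging fill from disk is elided, and a CUDA device is assumed):

import torch

BUFFER_SIZE = 4 * 2**20   # illustrative; packing.py defines its own constant
size = 10 * 2**20         # a tensor bigger than one staging buffer

cuda_storage = torch.empty(size, dtype=torch.uint8, device='cuda')
staging = torch.empty(BUFFER_SIZE, dtype=torch.uint8).pin_memory()

offset = 0
while offset < size:
    read_size = min(BUFFER_SIZE, size - offset)
    # (fill staging[:read_size] from disk here)
    cuda_storage[offset:offset + read_size] = staging[:read_size]  # H2D copy of one chunk
    offset += read_size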
 
@@ -194,6 +231,12 @@ def pack_to_cuda(pack: ZeroGPUTensorPack, callback: Callable[[int]] | None = Non
             for fake in pack.fakes[tensor]:
                 fake.data = cuda_tensor
 
+        print("pin_to_cuda (batches)", f"{total_size_batches / total_time_batches / 2**30:.3f}GB/s")
+        print("pin_to_cuda (batches)", f"{total_wait_batches:.3f}s buffer wait")
+        print("pin_to_cuda (tensors)", f"{total_size_tensors / total_time_tensors / 2**30:.3f}GB/s")
+        print("pin_to_cuda (tensors)", f"{total_wait_tensors:.3f}s buffer wait")
+        print("pin_to_cuda", f"{total_duration_in_callback:.3f}s in callback")
+
         debug(f"{total_duration_in_callback=}")
 
     with ThreadPoolExecutor(2) as e:
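A note on the new prints: each one reports effective bandwidth as total bytes over the seconds accumulated inside the timed calls, scaled by 2**30, so the figure is strictly GiB/s despite the GB/s label. The same arithmetic with made-up totals:

total_size_batches = 6 * 2**30   # 6 GiB moved (illustrative)
total_time_batches = 2.5         # seconds accumulated around the timed calls (illustrative)
print("pin_to_cuda (batches)", f"{total_size_batches / total_time_batches / 2**30:.3f}GB/s")
# -> pin_to_cuda (batches) 2.400GB/s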