/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 * Test evaluation for caching allocator of device memory
 ******************************************************************************/

// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR

#include <stdio.h>
#include <cub/util_allocator.cuh>

#include "test_util.h"

using namespace cub;


//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------

/**
 * Main
 */
int main(int argc, char** argv)
{
    // Initialize command line
    CommandLineArgs args(argc, argv);

    // Print usage
    if (args.CheckCmdLineFlag("help"))
    {
        printf("%s "
            "[--device=<device-id>]"
            "[--bytes=<timing bytes>]"
            "[--i=<timing iterations>]"
            "\n", argv[0]);
        exit(0);
    }

#if (CUB_PTX_ARCH == 0)

    // Initialize device
    CubDebugExit(args.DeviceInit());

    // Get number of GPUs and current GPU
    int num_gpus;
    int initial_gpu;
    int timing_iterations   = 10000;
    int timing_bytes        = 1024 * 1024;

    if (CubDebug(cudaGetDeviceCount(&num_gpus))) exit(1);
    if (CubDebug(cudaGetDevice(&initial_gpu))) exit(1);
    args.GetCmdLineArgument("i", timing_iterations);
    args.GetCmdLineArgument("bytes", timing_bytes);

    // Create default allocator (caches up to 6MB in device allocations per GPU)
    CachingDeviceAllocator allocator;
    allocator.debug = true;
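    // Illustrative sketch (added; not part of the original test). The parameter
    // values below assume CUB's documented defaults of bin_growth = 8,
    // min_bin = 3, and max_bin = 7: the default constructor above behaves like
    // an explicit configuration with geometrically sized bins from 8^3 = 512B
    // up to 8^7 = 2MB, retaining roughly 6MB of cached allocations per GPU.
    // Requests larger than the largest bin are serviced by cudaMalloc/cudaFree
    // directly and are not cached on free.
    //
    //     CachingDeviceAllocator allocator(
    //         8,                                      // bin_growth: geometric bin-size growth factor
    //         3,                                      // min_bin: smallest bin is 8^3 = 512 bytes
    //         7,                                      // max_bin: largest bin is 8^7 = 2MB
    //         size_t(8 * 8 * 8 * 8 * 8 * 8 * 8) * 3); // max_cached_bytes: ~6MB cap per GPU (assumed default)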
    printf("Running single-gpu tests...\n"); fflush(stdout);

    //
    // Test0
    //

    // Create a new stream
    cudaStream_t other_stream;
    CubDebugExit(cudaStreamCreate(&other_stream));

    // Allocate 999 bytes on the current gpu in stream 0
    char *d_999B_stream0_a;
    char *d_999B_stream0_b;
    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_a, 999, 0));

    // Run some big kernel in stream 0
    cub::EmptyKernel<void><<<32000, 512, 1024 * 8, 0>>>();

    // Free d_999B_stream0_a
    CubDebugExit(allocator.DeviceFree(d_999B_stream0_a));

    // Allocate another 999 bytes in stream 0
    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_b, 999, 0));

    // Check that we have 1 live block on the initial GPU
    AssertEquals(allocator.live_blocks.size(), 1);

    // Check that we have no cached blocks on the initial GPU
    AssertEquals(allocator.cached_blocks.size(), 0);

    // Run some big kernel in stream 0
    cub::EmptyKernel<void><<<32000, 512, 1024 * 8, 0>>>();

    // Free d_999B_stream0_b
    CubDebugExit(allocator.DeviceFree(d_999B_stream0_b));

    // Allocate 999 bytes on the current gpu in other_stream
    char *d_999B_stream_other_a;
    char *d_999B_stream_other_b;
    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream_other_a, 999, other_stream));

    // Check that we have 1 live block on the initial GPU (a new block was
    // allocated because the cached d_999B_stream0_b block is only reusable by
    // stream 0 until stream 0 becomes idle)
    AssertEquals(allocator.live_blocks.size(), 1);

    // Check that we have one cached block on the initial GPU
    AssertEquals(allocator.cached_blocks.size(), 1);

    // Run some big kernel in other_stream
    cub::EmptyKernel<void><<<32000, 512, 1024 * 8, other_stream>>>();

    // Free d_999B_stream_other_a
    CubDebugExit(allocator.DeviceFree(d_999B_stream_other_a));

    // Check that we can now use both allocations in stream 0 after synchronizing the device
    CubDebugExit(cudaDeviceSynchronize());
    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_a, 999, 0));
    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_b, 999, 0));

    // Check that we have 2 live blocks on the initial GPU
    AssertEquals(allocator.live_blocks.size(), 2);

    // Check that we have no cached blocks on the initial GPU
    AssertEquals(allocator.cached_blocks.size(), 0);

    // Free d_999B_stream0_a and d_999B_stream0_b
    CubDebugExit(allocator.DeviceFree(d_999B_stream0_a));
    CubDebugExit(allocator.DeviceFree(d_999B_stream0_b));

    // Check that we can now use both allocations in other_stream
    CubDebugExit(cudaDeviceSynchronize());
    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream_other_a, 999, other_stream));
    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream_other_b, 999, other_stream));

    // Check that we have 2 live blocks on the initial GPU
    AssertEquals(allocator.live_blocks.size(), 2);

    // Check that we have no cached blocks on the initial GPU
    AssertEquals(allocator.cached_blocks.size(), 0);

    // Run some big kernel in other_stream
    cub::EmptyKernel<void><<<32000, 512, 1024 * 8, other_stream>>>();

    // Free d_999B_stream_other_a and d_999B_stream_other_b
    CubDebugExit(allocator.DeviceFree(d_999B_stream_other_a));
    CubDebugExit(allocator.DeviceFree(d_999B_stream_other_b));

    // Check that we can now use both allocations in stream 0 after synchronizing the device and destroying the other stream
    CubDebugExit(cudaDeviceSynchronize());
    CubDebugExit(cudaStreamDestroy(other_stream));
    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_a, 999, 0));
    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_b, 999, 0));

    // Check that we have 2 live blocks on the initial GPU
    AssertEquals(allocator.live_blocks.size(), 2);

    // Check that we have no cached blocks on the initial GPU
    AssertEquals(allocator.cached_blocks.size(), 0);

    // Free d_999B_stream0_a and d_999B_stream0_b
    CubDebugExit(allocator.DeviceFree(d_999B_stream0_a));
    CubDebugExit(allocator.DeviceFree(d_999B_stream0_b));

    // Free all cached
    CubDebugExit(allocator.FreeAllCached());
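    // Note on the stream semantics exercised by Test0 (explanatory comment,
    // summarizing the CachingDeviceAllocator design): DeviceFree does not call
    // cudaFree. It returns the block to the cache and records an event on the
    // stream the block is associated with. Allocations on that same stream may
    // reuse the block immediately, but allocations on a different stream may
    // only reuse it once that event has completed (e.g. after the
    // cudaDeviceSynchronize calls above), which is why a second physical block
    // had to be created for other_stream.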
    //
    // Test1
    //

    // Allocate 5 bytes on the current gpu
    char *d_5B;
    CubDebugExit(allocator.DeviceAllocate((void **) &d_5B, 5));

    // Check that we have zero free bytes cached on the initial GPU
    AssertEquals(allocator.cached_bytes[initial_gpu].free, 0);

    // Check that we have 1 live block on the initial GPU
    AssertEquals(allocator.live_blocks.size(), 1);

    //
    // Test2
    //

    // Allocate 4096 bytes on the current gpu
    char *d_4096B;
    CubDebugExit(allocator.DeviceAllocate((void **) &d_4096B, 4096));

    // Check that we have 2 live blocks on the initial GPU
    AssertEquals(allocator.live_blocks.size(), 2);

    //
    // Test3
    //

    // DeviceFree d_5B
    CubDebugExit(allocator.DeviceFree(d_5B));

    // Check that we have min_bin_bytes free bytes cached on the initial gpu
    AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes);

    // Check that we have 1 live block on the initial GPU
    AssertEquals(allocator.live_blocks.size(), 1);

    // Check that we have 1 cached block on the initial GPU
    AssertEquals(allocator.cached_blocks.size(), 1);

    //
    // Test4
    //

    // DeviceFree d_4096B
    CubDebugExit(allocator.DeviceFree(d_4096B));

    // Check that we have 4096 + min_bin_bytes free bytes cached on the initial gpu
    AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes + 4096);

    // Check that we have 0 live blocks on the initial GPU
    AssertEquals(allocator.live_blocks.size(), 0);

    // Check that we have 2 cached blocks on the initial GPU
    AssertEquals(allocator.cached_blocks.size(), 2);

    //
    // Test5
    //

    // Allocate 768 bytes on the current gpu
    char *d_768B;
    CubDebugExit(allocator.DeviceAllocate((void **) &d_768B, 768));

    // Check that we have min_bin_bytes free bytes cached on the initial gpu (the 4096-byte block was reused)
    AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes);

    // Check that we have 1 live block on the initial GPU
    AssertEquals(allocator.live_blocks.size(), 1);

    // Check that we have 1 cached block on the initial GPU
    AssertEquals(allocator.cached_blocks.size(), 1);

    //
    // Test6
    //

    // Allocate max_cached_bytes on the current gpu
    char *d_max_cached;
    CubDebugExit(allocator.DeviceAllocate((void **) &d_max_cached, allocator.max_cached_bytes));

    // DeviceFree d_max_cached
    CubDebugExit(allocator.DeviceFree(d_max_cached));

    // Check that we have min_bin_bytes free bytes cached on the initial gpu
    // (the max-cached block was not retained because caching it would have exceeded max_cached_bytes)
    AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes);

    // Check that we have 1 live block on the initial GPU
    AssertEquals(allocator.live_blocks.size(), 1);

    // Check that we still have 1 cached block on the initial GPU
    AssertEquals(allocator.cached_blocks.size(), 1);

    //
    // Test7
    //

    // Free all cached blocks on all GPUs
    CubDebugExit(allocator.FreeAllCached());

    // Check that we have 0 bytes cached on the initial GPU
    AssertEquals(allocator.cached_bytes[initial_gpu].free, 0);

    // Check that we have 0 cached blocks across all GPUs
    AssertEquals(allocator.cached_blocks.size(), 0);

    // Check that we still have 1 live block across all GPUs
    AssertEquals(allocator.live_blocks.size(), 1);

    //
    // Test8
    //

    // Allocate max_cached_bytes + 1 on the current gpu
    char *d_max_cached_plus;
    CubDebugExit(allocator.DeviceAllocate((void **) &d_max_cached_plus, allocator.max_cached_bytes + 1));

    // DeviceFree d_max_cached_plus
    CubDebugExit(allocator.DeviceFree(d_max_cached_plus));

    // DeviceFree d_768B
    CubDebugExit(allocator.DeviceFree(d_768B));

    unsigned int power;
    size_t rounded_bytes;
    allocator.NearestPowerOf(power, rounded_bytes, allocator.bin_growth, 768);
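    // Worked example of the rounding above (explanatory comment): with the
    // default bin_growth of 8, NearestPowerOf rounds the 768-byte request up to
    // the next power of the growth factor. Since 8^3 = 512 < 768 <= 8^4 = 4096,
    // power == 4 and rounded_bytes == 4096, matching the assertion below.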
    // Check that we have 4096 free bytes cached on the initial gpu
    AssertEquals(allocator.cached_bytes[initial_gpu].free, rounded_bytes);

    // Check that we have 1 cached block across all GPUs
    AssertEquals(allocator.cached_blocks.size(), 1);

    // Check that we still have 0 live blocks across all GPUs
    AssertEquals(allocator.live_blocks.size(), 0);

#ifndef CUB_CDP
    // BUG: find out why these tests fail when one GPU is CDP compliant and the other is not

    if (num_gpus > 1)
    {
        printf("\nRunning multi-gpu tests...\n"); fflush(stdout);

        //
        // Test9
        //

        // Allocate 768 bytes on the next gpu
        int next_gpu = (initial_gpu + 1) % num_gpus;
        char *d_768B_2;
        CubDebugExit(allocator.DeviceAllocate(next_gpu, (void **) &d_768B_2, 768));

        // DeviceFree d_768B_2 on the next gpu
        CubDebugExit(allocator.DeviceFree(next_gpu, d_768B_2));

        // Re-allocate 768 bytes on the next gpu
        CubDebugExit(allocator.DeviceAllocate(next_gpu, (void **) &d_768B_2, 768));

        // Re-free d_768B_2 on the next gpu
        CubDebugExit(allocator.DeviceFree(next_gpu, d_768B_2));

        // Check that we have 4096 free bytes cached on the initial gpu
        AssertEquals(allocator.cached_bytes[initial_gpu].free, rounded_bytes);

        // Check that we have 4096 free bytes cached on the second gpu
        AssertEquals(allocator.cached_bytes[next_gpu].free, rounded_bytes);

        // Check that we have 2 cached blocks across all GPUs
        AssertEquals(allocator.cached_blocks.size(), 2);

        // Check that we still have 0 live blocks across all GPUs
        AssertEquals(allocator.live_blocks.size(), 0);
    }
#endif  // CUB_CDP

    //
    // Performance
    //

    printf("\nCPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes);
    fflush(stdout); fflush(stderr);

    // CPU performance comparisons vs cached: allocate and free timing_bytes bytes, timing_iterations times
    CpuTimer cpu_timer;
    char *d_1024MB = NULL;
    allocator.debug = false;

    // Prime the caching allocator and the kernel
    CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
    CubDebugExit(allocator.DeviceFree(d_1024MB));
    cub::EmptyKernel<void><<<1, 32>>>();

    // CUDA
    cpu_timer.Start();
    for (int i = 0; i < timing_iterations; ++i)
    {
        CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes));
        CubDebugExit(cudaFree(d_1024MB));
    }
    cpu_timer.Stop();
    float cuda_malloc_elapsed_millis = cpu_timer.ElapsedMillis();

    // CUB
    cpu_timer.Start();
    for (int i = 0; i < timing_iterations; ++i)
    {
        CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
        CubDebugExit(allocator.DeviceFree(d_1024MB));
    }
    cpu_timer.Stop();
    float cub_calloc_elapsed_millis = cpu_timer.ElapsedMillis();

    printf("\t CUB CachingDeviceAllocator allocation CPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate %.4f ms)\n",
        cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis,
        cuda_malloc_elapsed_millis / timing_iterations,
        cub_calloc_elapsed_millis / timing_iterations);
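    // Note on the GPU timings below (explanatory comment): each timed loop
    // launches an EmptyKernel between the allocation and the free, so the
    // kernel-only baseline measured first is subtracted from both the
    // cudaMalloc and the DeviceAllocate results to isolate allocation overhead.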
    // GPU performance comparisons: allocate and free timing_bytes bytes, timing_iterations times
    GpuTimer gpu_timer;

    printf("\nGPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes);
    fflush(stdout); fflush(stderr);

    // Kernel-only
    gpu_timer.Start();
    for (int i = 0; i < timing_iterations; ++i)
    {
        cub::EmptyKernel<void><<<1, 32>>>();
    }
    gpu_timer.Stop();
    float cuda_empty_elapsed_millis = gpu_timer.ElapsedMillis();

    // CUDA
    gpu_timer.Start();
    for (int i = 0; i < timing_iterations; ++i)
    {
        CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes));
        cub::EmptyKernel<void><<<1, 32>>>();
        CubDebugExit(cudaFree(d_1024MB));
    }
    gpu_timer.Stop();
    cuda_malloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis;

    // CUB
    gpu_timer.Start();
    for (int i = 0; i < timing_iterations; ++i)
    {
        CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
        cub::EmptyKernel<void><<<1, 32>>>();
        CubDebugExit(allocator.DeviceFree(d_1024MB));
    }
    gpu_timer.Stop();
    cub_calloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis;

    printf("\t CUB CachingDeviceAllocator allocation GPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate %.4f ms)\n",
        cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis,
        cuda_malloc_elapsed_millis / timing_iterations,
        cub_calloc_elapsed_millis / timing_iterations);

#endif  // CUB_PTX_ARCH == 0

    printf("Success\n");

    return 0;
}