/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
* Test evaluation for caching allocator of device memory
******************************************************************************/
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR
#include <stdio.h>
#include <cub/util_allocator.cuh>
#include "test_util.h"
using namespace cub;
//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------
/**
* Main
*/
int main(int argc, char** argv)
{
// Initialize command line
CommandLineArgs args(argc, argv);
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s "
"[--device=<device-id>]"
"[--bytes=<timing bytes>]"
"[--i=<timing iterations>]"
"\n", argv[0]);
exit(0);
}
#if (CUB_PTX_ARCH == 0)
// Initialize device
CubDebugExit(args.DeviceInit());
// Get number of GPUs and current GPU
int num_gpus;
int initial_gpu;
int timing_iterations = 10000;
int timing_bytes = 1024 * 1024;
if (CubDebug(cudaGetDeviceCount(&num_gpus))) exit(1);
if (CubDebug(cudaGetDevice(&initial_gpu))) exit(1);
args.GetCmdLineArgument("i", timing_iterations);
args.GetCmdLineArgument("bytes", timing_bytes);
// Create default allocator (caches up to 6MB in device allocations per GPU)
CachingDeviceAllocator allocator;
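// With the default constructor, bin_growth = 8, min_bin = 3, and max_bin = 7
// (the CachingDeviceAllocator defaults at the time of writing), giving bins from
// min_bin_bytes = 8^3 = 512 bytes up to max_bin_bytes = 8^7 = 2MB, and a cache
// cap of max_cached_bytes = (3 * max_bin_bytes) - 1, just under 6MB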
allocator.debug = true;
printf("Running single-gpu tests...\n"); fflush(stdout);
//
// Test0
//
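// Test0 exercises stream association: a block freed in a stream is immediately
// reusable by that same stream, but other streams may only reuse it once the
// freeing stream's pending work has completed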
// Create a new stream
cudaStream_t other_stream;
CubDebugExit(cudaStreamCreate(&other_stream));
// Allocate 999 bytes on the current gpu in stream 0
char *d_999B_stream0_a;
char *d_999B_stream0_b;
CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_a, 999, 0));
// Run some big kernel in stream 0
EmptyKernel<void><<<32000, 512, 1024 * 8, 0>>>();
// Free d_999B_stream0_a
CubDebugExit(allocator.DeviceFree(d_999B_stream0_a));
// Allocate another 999 bytes in stream 0
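// Because the free above was in the same stream, the cached block can be
// reused immediately, without any synchronization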
CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_b, 999, 0));
// Check that we have 1 live block on the initial GPU
AssertEquals(allocator.live_blocks.size(), 1);
// Check that we have no cached blocks on the initial GPU
AssertEquals(allocator.cached_blocks.size(), 0);
// Run some big kernel in stream 0
EmptyKernel<void><<<32000, 512, 1024 * 8, 0>>>();
// Free d_999B_stream0_b
CubDebugExit(allocator.DeviceFree(d_999B_stream0_b));
// Allocate 999 bytes on the current gpu in other_stream
char *d_999B_stream_other_a;
char *d_999B_stream_other_b;
CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream_other_a, 999, other_stream));
// Check that we have 1 live block on the initial GPU (a new block was allocated
// because the block freed in stream 0 is only available to other streams once
// stream 0 becomes idle)
AssertEquals(allocator.live_blocks.size(), 1);
// Check that we have 1 cached block on the initial GPU
AssertEquals(allocator.cached_blocks.size(), 1);
// Run some big kernel in other_stream
EmptyKernel<void><<<32000, 512, 1024 * 8, other_stream>>>();
// Free d_999B_stream_other_a
CubDebugExit(allocator.DeviceFree(d_999B_stream_other_a));
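// DeviceFree records an event in the block's associated stream; once the device
// is synchronized those events have completed, so freed blocks become reusable
// by any stream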
// Check that we can now use both allocations in stream 0 after synchronizing the device
CubDebugExit(cudaDeviceSynchronize());
CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_a, 999, 0));
CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_b, 999, 0));
// Check that we have 2 live blocks on the initial GPU
AssertEquals(allocator.live_blocks.size(), 2);
// Check that we have no cached blocks on the initial GPU
AssertEquals(allocator.cached_blocks.size(), 0);
// Free d_999B_stream0_a and d_999B_stream0_b
CubDebugExit(allocator.DeviceFree(d_999B_stream0_a));
CubDebugExit(allocator.DeviceFree(d_999B_stream0_b));
// Check that we can now use both allocations in other_stream
CubDebugExit(cudaDeviceSynchronize());
CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream_other_a, 999, other_stream));
CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream_other_b, 999, other_stream));
// Check that we have 2 live blocks on the initial GPU
AssertEquals(allocator.live_blocks.size(), 2);
// Check that we have no cached blocks on the initial GPU
AssertEquals(allocator.cached_blocks.size(), 0);
// Run some big kernel in other_stream
EmptyKernel<void><<<32000, 512, 1024 * 8, other_stream>>>();
// Free d_999B_stream_other_a and d_999B_stream_other_b
CubDebugExit(allocator.DeviceFree(d_999B_stream_other_a));
CubDebugExit(allocator.DeviceFree(d_999B_stream_other_b));
// Check that we can now use both allocations in stream 0 after synchronizing the device and destroying the other stream
CubDebugExit(cudaDeviceSynchronize());
CubDebugExit(cudaStreamDestroy(other_stream));
CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_a, 999, 0));
CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_b, 999, 0));
// Check that we have 2 live blocks on the initial GPU
AssertEquals(allocator.live_blocks.size(), 2);
// Check that we have no cached blocks on the initial GPU
AssertEquals(allocator.cached_blocks.size(), 0);
// Free d_999B_stream0_a and d_999B_stream0_b
CubDebugExit(allocator.DeviceFree(d_999B_stream0_a));
CubDebugExit(allocator.DeviceFree(d_999B_stream0_b));
// Free all cached
CubDebugExit(allocator.FreeAllCached());
//
// Test1
//
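// Tests 1-8 exercise the binning and caching bookkeeping on a single device,
// using the default (null) stream throughout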
// Allocate 5 bytes on the current gpu
char *d_5B;
CubDebugExit(allocator.DeviceAllocate((void **) &d_5B, 5));
// Check that we have zero free bytes cached on the initial GPU
AssertEquals(allocator.cached_bytes[initial_gpu].free, 0);
// Check that we have 1 live block on the initial GPU
AssertEquals(allocator.live_blocks.size(), 1);
//
// Test2
//
// Allocate 4096 bytes on the current gpu
char *d_4096B;
CubDebugExit(allocator.DeviceAllocate((void **) &d_4096B, 4096));
// Check that we have 2 live blocks on the initial GPU
AssertEquals(allocator.live_blocks.size(), 2);
//
// Test3
//
// DeviceFree d_5B
CubDebugExit(allocator.DeviceFree(d_5B));
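// The 5-byte request was rounded up to the smallest bin (min_bin_bytes, i.e.
// 512 bytes with the defaults noted above), so that is what returns to the cache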
// Check that we have min_bin_bytes free bytes cached on the initial gpu
AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes);
// Check that we have 1 live block on the initial GPU
AssertEquals(allocator.live_blocks.size(), 1);
// Check that we have 1 cached block on the initial GPU
AssertEquals(allocator.cached_blocks.size(), 1);
//
// Test4
//
// DeviceFree d_4096B
CubDebugExit(allocator.DeviceFree(d_4096B));
// Check that we have 4096 + min_bin_bytes free bytes cached on the initial gpu
AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes + 4096);
// Check that we have 0 live blocks on the initial GPU
AssertEquals(allocator.live_blocks.size(), 0);
// Check that we have 2 cached blocks on the initial GPU
AssertEquals(allocator.cached_blocks.size(), 2);
//
// Test5
//
// Allocate 768 bytes on the current gpu
char *d_768B;
CubDebugExit(allocator.DeviceAllocate((void **) &d_768B, 768));
// Check that we have min_bin_bytes free bytes cached on the initial gpu (the 4096-byte block was reused)
AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes);
// Check that we have 1 live block on the initial GPU
AssertEquals(allocator.live_blocks.size(), 1);
// Check that we have 1 cached block on the initial GPU
AssertEquals(allocator.cached_blocks.size(), 1);
//
// Test6
//
// Allocate max_cached_bytes on the current gpu
char *d_max_cached;
CubDebugExit(allocator.DeviceAllocate((void **) &d_max_cached, allocator.max_cached_bytes));
// DeviceFree d_max_cached
CubDebugExit(allocator.DeviceFree(d_max_cached));
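// Retaining this block would push the cached total past max_cached_bytes, so
// it is returned to the runtime via cudaFree rather than cached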
// Check that we still have min_bin_bytes free bytes cached on the initial gpu
AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes);
// Check that we have 1 live block on the initial GPU
AssertEquals(allocator.live_blocks.size(), 1);
// Check that we still have 1 cached block on the initial GPU
AssertEquals(allocator.cached_blocks.size(), 1);
//
// Test7
//
// Free all cached blocks on all GPUs
CubDebugExit(allocator.FreeAllCached());
// Check that we have 0 bytes cached on the initial GPU
AssertEquals(allocator.cached_bytes[initial_gpu].free, 0);
// Check that we have 0 cached blocks across all GPUs
AssertEquals(allocator.cached_blocks.size(), 0);
// Check that we still have 1 live block across all GPUs
AssertEquals(allocator.live_blocks.size(), 1);
//
// Test8
//
// Allocate max cached bytes + 1 on the current gpu
char *d_max_cached_plus;
CubDebugExit(allocator.DeviceAllocate((void **) &d_max_cached_plus, allocator.max_cached_bytes + 1));
// DeviceFree d_max_cached_plus
CubDebugExit(allocator.DeviceFree(d_max_cached_plus));
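// Requests larger than max_bin_bytes are not binned at all: they are served at
// exact size and handed straight back to the runtime on free, so nothing from
// this allocation enters the cache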
// DeviceFree d_768B
CubDebugExit(allocator.DeviceFree(d_768B));
unsigned int power;
size_t rounded_bytes;
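// NearestPowerOf finds the smallest power of bin_growth (8) that covers 768:
// 8^3 = 512 < 768 <= 8^4 = 4096, so rounded_bytes comes back as 4096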
allocator.NearestPowerOf(power, rounded_bytes, allocator.bin_growth, 768);
// Check that we have rounded_bytes (4096) free bytes cached on the initial gpu
AssertEquals(allocator.cached_bytes[initial_gpu].free, rounded_bytes);
// Check that we have 1 cached block across all GPUs
AssertEquals(allocator.cached_blocks.size(), 1);
// Check that we now have 0 live blocks across all GPUs
AssertEquals(allocator.live_blocks.size(), 0);
#ifndef CUB_CDP
// BUG: find out why these tests fail when one GPU is CDP compliant and the other is not
if (num_gpus > 1)
{
printf("\nRunning multi-gpu tests...\n"); fflush(stdout);
//
// Test9
//
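// Test9 repeats the allocate/free/reuse cycle on a second device, using the
// DeviceAllocate/DeviceFree overloads that take an explicit device ordinal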
// Allocate 768 bytes on the next gpu
int next_gpu = (initial_gpu + 1) % num_gpus;
char *d_768B_2;
CubDebugExit(allocator.DeviceAllocate(next_gpu, (void **) &d_768B_2, 768));
// DeviceFree d_768B_2 on the next gpu
CubDebugExit(allocator.DeviceFree(next_gpu, d_768B_2));
// Re-allocate 768 bytes on the next gpu
CubDebugExit(allocator.DeviceAllocate(next_gpu, (void **) &d_768B_2, 768));
// Re-free d_768B_2 on the next gpu
CubDebugExit(allocator.DeviceFree(next_gpu, d_768B_2));
// Check that we have 4096 free bytes cached on the initial gpu
AssertEquals(allocator.cached_bytes[initial_gpu].free, rounded_bytes);
// Check that we have 4096 free bytes cached on the second gpu
AssertEquals(allocator.cached_bytes[next_gpu].free, rounded_bytes);
// Check that we have 2 cached blocks across all GPUs
AssertEquals(allocator.cached_blocks.size(), 2);
// Check that we still have 0 live blocks across all GPUs
AssertEquals(allocator.live_blocks.size(), 0);
}
#endif // CUB_CDP
//
// Performance
//
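// The timing loops below compare cudaMalloc/cudaFree against the caching
// allocator. After priming, every DeviceAllocate is served from the cache, so
// the comparison isolates the allocator's bookkeeping overhead.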
printf("\nCPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes);
fflush(stdout); fflush(stderr);
// CPU performance comparison vs. the cache: allocate and free a timing_bytes block timing_iterations times
CpuTimer cpu_timer;
char *d_1024MB = NULL;  // NB: holds timing_bytes (1MB by default), despite the name
allocator.debug = false;
// Prime the caching allocator and the kernel
CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
CubDebugExit(allocator.DeviceFree(d_1024MB));
cub::EmptyKernel<void><<<1, 32>>>();
// CUDA
cpu_timer.Start();
for (int i = 0; i < timing_iterations; ++i)
{
CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes));
CubDebugExit(cudaFree(d_1024MB));
}
cpu_timer.Stop();
float cuda_malloc_elapsed_millis = cpu_timer.ElapsedMillis();
// CUB
cpu_timer.Start();
for (int i = 0; i < timing_iterations; ++i)
{
CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
CubDebugExit(allocator.DeviceFree(d_1024MB));
}
cpu_timer.Stop();
float cub_calloc_elapsed_millis = cpu_timer.ElapsedMillis();
printf("\t CUB CachingDeviceAllocator allocation CPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate %.4f ms)\n",
cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis,
cuda_malloc_elapsed_millis / timing_iterations,
cub_calloc_elapsed_millis / timing_iterations);
// GPU performance comparisons. Allocate and free a timing_bytes block timing_iterations times
GpuTimer gpu_timer;
printf("\nGPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes);
fflush(stdout); fflush(stderr);
// Kernel-only
gpu_timer.Start();
for (int i = 0; i < timing_iterations; ++i)
{
cub::EmptyKernel<void><<<1, 32>>>();
}
gpu_timer.Stop();
float cuda_empty_elapsed_millis = gpu_timer.ElapsedMillis();
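// This kernel-only baseline is subtracted from the timed loops below so that
// the reported figures attribute only the allocation/free overhead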
// CUDA
gpu_timer.Start();
for (int i = 0; i < timing_iterations; ++i)
{
CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes));
cub::EmptyKernel<void><<<1, 32>>>();
CubDebugExit(cudaFree(d_1024MB));
}
gpu_timer.Stop();
cuda_malloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis;
// CUB
gpu_timer.Start();
for (int i = 0; i < timing_iterations; ++i)
{
CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
cub::EmptyKernel<void><<<1, 32>>>();
CubDebugExit(allocator.DeviceFree(d_1024MB));
}
gpu_timer.Stop();
cub_calloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis;
printf("\t CUB CachingDeviceAllocator allocation GPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate %.4f ms)\n",
cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis,
cuda_malloc_elapsed_millis / timing_iterations,
cub_calloc_elapsed_millis / timing_iterations);
#endif // CUB_PTX_ARCH == 0
printf("Success\n");
return 0;
}