// File: thrust/dependencies/cub/test/test_block_reduce.cu
// (web-viewer metadata lines removed so the file compiles as C++/CUDA)
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
* Test of BlockReduce utilities
******************************************************************************/
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR
#include <stdio.h>
#include <cuda_runtime_api.h>
#include <typeinfo>
#include <cub/block/block_reduce.cuh>
#include <cub/block/block_load.cuh>
#include <cub/util_ptx.cuh>
#include <cub/util_allocator.cuh>
#include <cub/util_debug.cuh>
#include "test_util.h"
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
// Verbose output flag (--v): dump inputs and per-element comparison detail
bool g_verbose = false;
// Extra repetitions of the thorough test battery (--repeat=<n>)
int g_repeat = 0;
// Caching device-memory allocator shared by every test in this file
// (NOTE(review): ctor flag is presumably skip_cleanup -- confirm against cub/util_allocator.cuh)
CachingDeviceAllocator g_allocator(true);
//---------------------------------------------------------------------
// Test kernels
//---------------------------------------------------------------------
/// Generic reduction (full tile, 1 item per thread): forwards the scalar
/// element to BlockReduce::Reduce() with the caller-supplied operator.
template <typename BlockReduceT, typename T, typename ReductionOp>
__device__ __forceinline__ T DeviceTest(
BlockReduceT &block_reduce, T (&data)[1], ReductionOp &reduction_op)
{
    T aggregate = block_reduce.Reduce(data[0], reduction_op);
    return aggregate;
}
/// Generic reduction (full tile, ITEMS_PER_THREAD items per thread):
/// forwards the whole per-thread array to BlockReduce::Reduce().
template <typename BlockReduceT, typename T, int ITEMS_PER_THREAD, typename ReductionOp>
__device__ __forceinline__ T DeviceTest(
BlockReduceT &block_reduce, T (&data)[ITEMS_PER_THREAD], ReductionOp &reduction_op)
{
    T aggregate = block_reduce.Reduce(data, reduction_op);
    return aggregate;
}
/// Generic reduction (partial tile, 1 item per thread): forwards to the
/// BlockReduce::Reduce() overload that only considers valid_threads inputs.
template <typename BlockReduceT, typename T, typename ReductionOp>
__device__ __forceinline__ T DeviceTest(
BlockReduceT &block_reduce, T &data, ReductionOp &reduction_op, int valid_threads)
{
    T aggregate = block_reduce.Reduce(data, reduction_op, valid_threads);
    return aggregate;
}
/// Sum reduction (full tile, 1 item per thread): overload selected when the
/// operator is cub::Sum, exercising the dedicated Sum() entry point.
template <typename BlockReduceT, typename T>
__device__ __forceinline__ T DeviceTest(
BlockReduceT &block_reduce, T (&data)[1], Sum &reduction_op)
{
    T aggregate = block_reduce.Sum(data[0]);
    return aggregate;
}
/// Sum reduction (full tile, ITEMS_PER_THREAD items per thread): overload
/// selected for cub::Sum, exercising the array Sum() entry point.
template <typename BlockReduceT, typename T, int ITEMS_PER_THREAD>
__device__ __forceinline__ T DeviceTest(
BlockReduceT &block_reduce, T (&data)[ITEMS_PER_THREAD], Sum &reduction_op)
{
    T aggregate = block_reduce.Sum(data);
    return aggregate;
}
/// Sum reduction (partial tile, 1 item per thread): overload selected for
/// cub::Sum, exercising the valid_threads-guarded Sum() entry point.
template <typename BlockReduceT, typename T>
__device__ __forceinline__ T DeviceTest(
BlockReduceT &block_reduce, T &data, Sum &reduction_op, int valid_threads)
{
    T aggregate = block_reduce.Sum(data, valid_threads);
    return aggregate;
}
/**
 * Test full-tile reduction kernel (where num_items is an even
 * multiple of BLOCK_THREADS)
 *
 * Launched as a single thread block of (BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)
 * threads.  Reduces `tiles` consecutive tiles of BLOCK_THREADS*ITEMS_PER_THREAD
 * items from d_in, combining per-tile aggregates with reduction_op.  The thread
 * with row-major linear id 0 writes the final aggregate to d_out[0] and the
 * accumulated cycle count to *d_elapsed.
 */
template <
BlockReduceAlgorithm ALGORITHM,
int BLOCK_DIM_X,
int BLOCK_DIM_Y,
int BLOCK_DIM_Z,
int ITEMS_PER_THREAD,
typename T,
typename ReductionOp>
__launch_bounds__ (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z)
__global__ void FullTileReduceKernel(
T *d_in,
T *d_out,
ReductionOp reduction_op,
int tiles,
clock_t *d_elapsed)
{
const int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
// Cooperative thread block reduction utility type (returns aggregate in thread 0)
typedef BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockReduceT;
// Allocate temp storage in shared memory.  The same storage is reused for
// every tile; reuse is safe because __syncthreads() separates successive
// reductions in the loop below.
__shared__ typename BlockReduceT::TempStorage temp_storage;
int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);
// Per-thread tile data
T data[ITEMS_PER_THREAD];
// Load first tile of data
int block_offset = 0;
// Guard: if tiles == 0 nothing is reduced and no output is written
if (block_offset < TILE_SIZE * tiles)
{
LoadDirectBlocked(linear_tid, d_in + block_offset, data);
block_offset += TILE_SIZE;
// Start cycle timer
clock_t start = clock();
// Cooperative reduce first tile
BlockReduceT block_reduce(temp_storage) ;
T block_aggregate = DeviceTest(block_reduce, data, reduction_op);
// Stop cycle timer
#if CUB_PTX_ARCH == 100
// Bug: recording stop clock causes mis-write of running prefix value
clock_t stop = 0;
#else
clock_t stop = clock();
#endif // CUB_PTX_ARCH == 100
// Absolute difference guards against the cycle counter wrapping mid-measurement
clock_t elapsed = (start > stop) ? start - stop : stop - start;
// Loop over input tiles
while (block_offset < TILE_SIZE * tiles)
{
// TestBarrier between thread block reductions
__syncthreads();
// Load tile of data
LoadDirectBlocked(linear_tid, d_in + block_offset, data);
block_offset += TILE_SIZE;
// Start cycle timer (intentionally shadows the first-tile timer variables)
clock_t start = clock();
// Cooperatively reduce the tile's aggregate
BlockReduceT block_reduce(temp_storage) ;
T tile_aggregate = DeviceTest(block_reduce, data, reduction_op);
// Stop cycle timer
#if CUB_PTX_ARCH == 100
// Bug: recording stop clock causes mis-write of running prefix value
clock_t stop = 0;
#else
clock_t stop = clock();
#endif // CUB_PTX_ARCH == 100
elapsed += (start > stop) ? start - stop : stop - start;
// Reduce thread block aggregate.  Per the typedef comment above, only
// thread 0 holds a valid aggregate, and thread 0 is the sole writer below.
block_aggregate = reduction_op(block_aggregate, tile_aggregate);
}
// Store data
if (linear_tid == 0)
{
d_out[0] = block_aggregate;
*d_elapsed = elapsed;
}
}
}
/**
 * Test partial-tile reduction kernel (where num_items < BLOCK_THREADS)
 *
 * Launched as a single thread block; only the first num_items threads (by
 * row-major linear id) contribute an input element.  Thread 0 writes the
 * aggregate to d_out[0] and the elapsed cycle count to *d_elapsed.
 */
template <
BlockReduceAlgorithm ALGORITHM,
int BLOCK_DIM_X,
int BLOCK_DIM_Y,
int BLOCK_DIM_Z,
typename T,
typename ReductionOp>
__launch_bounds__ (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z)
__global__ void PartialTileReduceKernel(
T *d_in,
T *d_out,
int num_items,
ReductionOp reduction_op,
clock_t *d_elapsed)
{
// Cooperative thread block reduction utility type (returns aggregate only in thread-0)
typedef BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockReduceT;
// Allocate temp storage in shared memory
__shared__ typename BlockReduceT::TempStorage temp_storage;
int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);
// Per-thread tile data.  Left uninitialized in threads >= num_items; those
// lanes are excluded from the reduction via the valid-threads argument below.
T partial;
// Load partial tile data
if (linear_tid < num_items)
{
partial = d_in[linear_tid];
}
// Start cycle timer
clock_t start = clock();
// Cooperatively reduce the tile's aggregate
BlockReduceT block_reduce(temp_storage) ;
T tile_aggregate = DeviceTest(block_reduce, partial, reduction_op, num_items);
// Stop cycle timer
#if CUB_PTX_ARCH == 100
// Bug: recording stop clock causes mis-write of running prefix value
clock_t stop = 0;
#else
clock_t stop = clock();
#endif // CUB_PTX_ARCH == 100
// Absolute difference guards against the cycle counter wrapping mid-measurement
clock_t elapsed = (start > stop) ? start - stop : stop - start;
// Store data
if (linear_tid == 0)
{
d_out[0] = tile_aggregate;
*d_elapsed = elapsed;
}
}
//---------------------------------------------------------------------
// Host utility subroutines
//---------------------------------------------------------------------
/**
 * Initialize problem (and solution)
 *
 * Fills h_in[0..num_items) via InitValue() and folds the values into
 * h_reference[0] with reduction_op, mirroring the order the device
 * reference comparison expects.  Optionally echoes the input when verbose.
 */
template <
    typename T,
    typename ReductionOp>
void Initialize(
    GenMode gen_mode,
    T *h_in,
    T h_reference[1],
    ReductionOp reduction_op,
    int num_items)
{
    // Seed the running reference with the first element, then fold in the rest
    if (num_items > 0)
    {
        InitValue(gen_mode, h_in[0], 0);
        h_reference[0] = h_in[0];
    }
    for (int item = 1; item < num_items; ++item)
    {
        InitValue(gen_mode, h_in[item], item);
        h_reference[0] = reduction_op(h_reference[0], h_in[item]);
    }

    if (g_verbose)
    {
        printf("Input:\n");
        DisplayResults(h_in, num_items);
        printf("\n");
    }
}
//---------------------------------------------------------------------
// Full tile test generation
//---------------------------------------------------------------------
/**
 * Test full-tile reduction. (Specialized for sufficient resources)
 *
 * Generates num_items = TILE_SIZE * tiles inputs on the host, computes a
 * serial reference reduction, runs FullTileReduceKernel on one thread block,
 * and compares the device aggregate against the reference.  Frees all host
 * and device allocations on the success path.
 */
template <
    BlockReduceAlgorithm ALGORITHM,
    int BLOCK_DIM_X,
    int BLOCK_DIM_Y,
    int BLOCK_DIM_Z,
    int ITEMS_PER_THREAD,
    typename T,
    typename ReductionOp>
void TestFullTile(
    GenMode gen_mode,
    int tiles,
    ReductionOp reduction_op,
    Int2Type<true> /*sufficient_resources*/)
{
    const int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
    const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
    int num_items = TILE_SIZE * tiles;

    // Allocate host arrays
    T *h_in = new T[num_items];
    T h_reference[1];

    // Initialize problem (and compute the serial reference aggregate)
    Initialize(gen_mode, h_in, h_reference, reduction_op, num_items);

    // Initialize/clear device arrays
    T *d_in = NULL;
    T *d_out = NULL;
    clock_t *d_elapsed = NULL;
    // Fix: size the timer cell with sizeof(clock_t) -- the kernel writes a
    // clock_t through this pointer and the host reads it back as clock_t;
    // sizeof(unsigned long long) merely happened to match on common platforms.
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(clock_t)));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * num_items));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * 1));
    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice));
    CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * 1));

    // Test multi-tile (unguarded)
    printf("TestFullTile %s, %s, gen-mode %d, num_items(%d), BLOCK_THREADS(%d) (%d,%d,%d), ITEMS_PER_THREAD(%d), tiles(%d), %s (%d bytes) elements:\n",
        Equals<ReductionOp, Sum>::VALUE ? "Sum" : "Max",
        (ALGORITHM == BLOCK_REDUCE_RAKING) ? "BLOCK_REDUCE_RAKING" : (ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY) ? "BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY" : "BLOCK_REDUCE_WARP_REDUCTIONS",
        gen_mode,
        num_items,
        BLOCK_THREADS, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z,
        ITEMS_PER_THREAD,
        tiles,
        typeid(T).name(),
        (int) sizeof(T));
    fflush(stdout);

    // Single-block launch with the requested 3D block shape
    dim3 block_dims(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);
    FullTileReduceKernel<ALGORITHM, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, ITEMS_PER_THREAD><<<1, block_dims>>>(
        d_in,
        d_out,
        reduction_op,
        tiles,
        d_elapsed);
    CubDebugExit(cudaPeekAtLastError());
    CubDebugExit(cudaDeviceSynchronize());

    // Copy out and display results
    printf("\tReduction results: ");
    int compare = CompareDeviceResults(h_reference, d_out, 1, g_verbose, g_verbose);
    printf("%s\n", compare ? "FAIL" : "PASS");
    AssertEquals(0, compare);
    printf("\tElapsed clocks: ");
    DisplayDeviceResults(d_elapsed, 1);

    // Cleanup
    if (h_in) delete[] h_in;
    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
    if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed));
}
/**
 * Test full-tile reduction. (Specialized for insufficient resources)
 *
 * Intentionally a no-op: selected by the compile-time dispatcher when the
 * configuration's shared-memory footprint or thread count exceeds the
 * TEST_ARCH limits, so the oversized kernel is never instantiated/launched.
 */
template <
BlockReduceAlgorithm ALGORITHM,
int BLOCK_DIM_X,
int BLOCK_DIM_Y,
int BLOCK_DIM_Z,
int ITEMS_PER_THREAD,
typename T,
typename ReductionOp>
void TestFullTile(
GenMode gen_mode,
int tiles,
ReductionOp reduction_op,
Int2Type<false> sufficient_resources)
{}
/**
 * Test full-tile reduction.
 *
 * Compile-time dispatcher: checks that the configuration's shared-memory
 * footprint and thread count fit the TEST_ARCH target, then tag-dispatches
 * (via Int2Type) to either the real test or the no-op specialization.
 */
template <
BlockReduceAlgorithm ALGORITHM,
int BLOCK_DIM_X,
int BLOCK_DIM_Y,
int BLOCK_DIM_Z,
int ITEMS_PER_THREAD,
typename T,
typename ReductionOp>
void TestFullTile(
GenMode gen_mode,
int tiles,
ReductionOp reduction_op)
{
// Check size of smem storage for the target arch to make sure it will fit
typedef BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, TEST_ARCH> BlockReduceT;
enum
{
#if defined(SM100) || defined(SM110) || defined(SM130)
// sm_1x targets: 16KB shared memory, 512 threads per block
sufficient_smem = (sizeof(typename BlockReduceT::TempStorage) <= 16 * 1024),
sufficient_threads = ((BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z) <= 512),
#else
// sm_2x and newer: 48KB shared memory, 1024 threads per block
sufficient_smem = (sizeof(typename BlockReduceT::TempStorage) <= 48 * 1024),
sufficient_threads = ((BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z) <= 1024),
#endif
};
TestFullTile<ALGORITHM, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, ITEMS_PER_THREAD, T>(gen_mode, tiles, reduction_op, Int2Type<sufficient_smem && sufficient_threads>());
}
/**
 * Run battery of tests for different thread block dimensions
 *
 * Runs the configuration with a flat (BLOCK_THREADS, 1, 1) block and with a
 * 3D (BLOCK_THREADS, 2, 2) block to cover multidimensional thread indexing.
 */
template <
BlockReduceAlgorithm ALGORITHM,
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
typename T,
typename ReductionOp>
void TestFullTile(
GenMode gen_mode,
int tiles,
ReductionOp reduction_op)
{
TestFullTile<ALGORITHM, BLOCK_THREADS, 1, 1, ITEMS_PER_THREAD, T>(gen_mode, tiles, reduction_op);
TestFullTile<ALGORITHM, BLOCK_THREADS, 2, 2, ITEMS_PER_THREAD, T>(gen_mode, tiles, reduction_op);
}
/**
 * Run battery of tests for different thread items
 *
 * Covers the single-item (scalar Reduce/Sum) and 4-items-per-thread
 * (array Reduce/Sum) code paths.
 */
template <
BlockReduceAlgorithm ALGORITHM,
int BLOCK_THREADS,
typename T,
typename ReductionOp>
void TestFullTile(
GenMode gen_mode,
int tiles,
ReductionOp reduction_op)
{
TestFullTile<ALGORITHM, BLOCK_THREADS, 1, T>(gen_mode, tiles, reduction_op);
TestFullTile<ALGORITHM, BLOCK_THREADS, 4, T>(gen_mode, tiles, reduction_op);
}
/**
 * Run battery of full-tile tests for different numbers of tiles
 *
 * Exercises both the single-tile path and the multi-tile path that folds a
 * running block aggregate across tiles.
 */
template <
    BlockReduceAlgorithm ALGORITHM,
    int BLOCK_THREADS,
    typename T,
    typename ReductionOp>
void TestFullTile(
    GenMode gen_mode,
    ReductionOp reduction_op)
{
    for (int tile_count = 1; tile_count <= 2; ++tile_count)
    {
        TestFullTile<ALGORITHM, BLOCK_THREADS, T>(gen_mode, tile_count, reduction_op);
    }
}
//---------------------------------------------------------------------
// Partial-tile test generation
//---------------------------------------------------------------------
/**
 * Test partial-tile reduction. (Specialized for sufficient resources)
 *
 * Generates num_items (< BLOCK_THREADS) inputs on the host, computes a serial
 * reference reduction, runs PartialTileReduceKernel on one thread block, and
 * compares the device aggregate against the reference.  Frees all host and
 * device allocations on the success path.
 */
template <
    BlockReduceAlgorithm ALGORITHM,
    int BLOCK_DIM_X,
    int BLOCK_DIM_Y,
    int BLOCK_DIM_Z,
    typename T,
    typename ReductionOp>
void TestPartialTile(
    GenMode gen_mode,
    int num_items,
    ReductionOp reduction_op,
    Int2Type<true> /*sufficient_resources*/)
{
    const int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
    const int TILE_SIZE = BLOCK_THREADS;

    // Allocate host arrays
    T *h_in = new T[num_items];
    T h_reference[1];

    // Initialize problem (and compute the serial reference aggregate)
    Initialize(gen_mode, h_in, h_reference, reduction_op, num_items);

    // Initialize/clear device arrays
    T *d_in = NULL;
    T *d_out = NULL;
    clock_t *d_elapsed = NULL;
    // Fix: size the timer cell with sizeof(clock_t) -- the kernel stores a
    // clock_t through this pointer and the host reads it back as clock_t;
    // sizeof(unsigned long long) merely happened to match on common platforms.
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(clock_t)));
    // d_in is sized for a full tile; only the first num_items entries are
    // copied and read (threads >= num_items never touch d_in)
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * TILE_SIZE));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * 1));
    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice));
    CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * 1));

    printf("TestPartialTile %s, gen-mode %d, num_items(%d), BLOCK_THREADS(%d) (%d,%d,%d), %s (%d bytes) elements:\n",
        (ALGORITHM == BLOCK_REDUCE_RAKING) ? "BLOCK_REDUCE_RAKING" : (ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY) ? "BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY" : "BLOCK_REDUCE_WARP_REDUCTIONS",
        gen_mode,
        num_items,
        BLOCK_THREADS, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z,
        typeid(T).name(),
        (int) sizeof(T));
    fflush(stdout);

    // Single-block launch with the requested 3D block shape
    dim3 block_dims(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);
    PartialTileReduceKernel<ALGORITHM, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z><<<1, block_dims>>>(
        d_in,
        d_out,
        num_items,
        reduction_op,
        d_elapsed);
    CubDebugExit(cudaPeekAtLastError());
    CubDebugExit(cudaDeviceSynchronize());

    // Copy out and display results
    printf("\tReduction results: ");
    int compare = CompareDeviceResults(h_reference, d_out, 1, g_verbose, g_verbose);
    printf("%s\n", compare ? "FAIL" : "PASS");
    AssertEquals(0, compare);
    printf("\tElapsed clocks: ");
    DisplayDeviceResults(d_elapsed, 1);

    // Cleanup
    if (h_in) delete[] h_in;
    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
    if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed));
}
/**
 * Test partial-tile reduction (specialized for insufficient resources)
 *
 * Intentionally a no-op: selected by the compile-time dispatcher when the
 * configuration's shared-memory footprint or thread count exceeds the
 * TEST_ARCH limits, so the oversized kernel is never instantiated/launched.
 */
template <
BlockReduceAlgorithm ALGORITHM,
int BLOCK_DIM_X,
int BLOCK_DIM_Y,
int BLOCK_DIM_Z,
typename T,
typename ReductionOp>
void TestPartialTile(
GenMode gen_mode,
int num_items,
ReductionOp reduction_op,
Int2Type<false> sufficient_resources)
{}
/**
 * Run battery of partial-tile tests for different numbers of effective threads and thread dimensions
 *
 * Compile-time dispatcher: checks that the configuration's shared-memory
 * footprint and thread count fit the TEST_ARCH target, then tag-dispatches
 * (via Int2Type) to either the real test or the no-op specialization.
 */
template <
BlockReduceAlgorithm ALGORITHM,
int BLOCK_DIM_X,
int BLOCK_DIM_Y,
int BLOCK_DIM_Z,
typename T,
typename ReductionOp>
void TestPartialTile(
GenMode gen_mode,
int num_items,
ReductionOp reduction_op)
{
// Check size of smem storage for the target arch to make sure it will fit
typedef BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, TEST_ARCH> BlockReduceT;
enum
{
#if defined(SM100) || defined(SM110) || defined(SM130)
// sm_1x targets: 16KB shared memory, 512 threads per block
sufficient_smem = sizeof(typename BlockReduceT::TempStorage) <= 16 * 1024,
sufficient_threads = (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z) <= 512,
#else
// sm_2x and newer: 48KB shared memory, 1024 threads per block
sufficient_smem = sizeof(typename BlockReduceT::TempStorage) <= 48 * 1024,
sufficient_threads = (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z) <= 1024,
#endif
};
TestPartialTile<ALGORITHM, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, T>(gen_mode, num_items, reduction_op, Int2Type<sufficient_smem && sufficient_threads>());
}
/**
 * Run battery of partial-tile tests for different numbers of effective
 * threads and thread dimensions
 *
 * Steps num_items from 1 up to (but excluding) BLOCK_THREADS in roughly
 * one-fifth-of-a-block increments, testing both a flat 1D block and a
 * 3D (x, 2, 2) block at each size.
 */
template <
    BlockReduceAlgorithm ALGORITHM,
    int BLOCK_THREADS,
    typename T,
    typename ReductionOp>
void TestPartialTile(
    GenMode gen_mode,
    ReductionOp reduction_op)
{
    const int step = CUB_MAX(1, BLOCK_THREADS / 5);
    for (int num_items = 1; num_items < BLOCK_THREADS; num_items += step)
    {
        TestPartialTile<ALGORITHM, BLOCK_THREADS, 1, 1, T>(gen_mode, num_items, reduction_op);
        TestPartialTile<ALGORITHM, BLOCK_THREADS, 2, 2, T>(gen_mode, num_items, reduction_op);
    }
}
//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------
/**
 * Run battery of full-tile tests for different gen modes
 *
 * Covers UNIFORM and INTEGER_SEED inputs for every type; RANDOM inputs are
 * skipped for floating-point types (see inline comment) because a parallel
 * reduction's summation order affects rounded results.
 */
template <
BlockReduceAlgorithm ALGORITHM,
int BLOCK_THREADS,
typename T,
typename ReductionOp>
void Test(
ReductionOp reduction_op)
{
TestFullTile<ALGORITHM, BLOCK_THREADS, T>(UNIFORM, reduction_op);
TestPartialTile<ALGORITHM, BLOCK_THREADS, T>(UNIFORM, reduction_op);
TestFullTile<ALGORITHM, BLOCK_THREADS, T>(INTEGER_SEED, reduction_op);
TestPartialTile<ALGORITHM, BLOCK_THREADS, T>(INTEGER_SEED, reduction_op);
if (Traits<T>::CATEGORY != FLOATING_POINT)
{
// Don't test randomly-generated floats b/c of stability
TestFullTile<ALGORITHM, BLOCK_THREADS, T>(RANDOM, reduction_op);
TestPartialTile<ALGORITHM, BLOCK_THREADS, T>(RANDOM, reduction_op);
}
}
/**
 * Run battery of tests for different block-reduction algorithmic variants
 *
 * Variants are selected at compile time via the TEST_RAKING and
 * TEST_WARP_REDUCTIONS preprocessor definitions; with neither defined this
 * function is an intentional no-op.
 */
template <
int BLOCK_THREADS,
typename T,
typename ReductionOp>
void Test(
ReductionOp reduction_op)
{
// Suppresses unused-parameter warnings when neither variant macro is defined
(void)reduction_op;
#ifdef TEST_RAKING
Test<BLOCK_REDUCE_RAKING, BLOCK_THREADS, T>(reduction_op);
Test<BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY, BLOCK_THREADS, T>(reduction_op);
#endif
#ifdef TEST_WARP_REDUCTIONS
Test<BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_THREADS, T>(reduction_op);
#endif
}
/**
 * Run battery of tests for different block sizes
 *
 * Deliberately includes non-power-of-two and non-warp-multiple sizes
 * (7, 63, 97, 238) alongside conventional ones (32, 128) to exercise
 * partially-full warps.
 */
template <
typename T,
typename ReductionOp>
void Test(
ReductionOp reduction_op)
{
Test<7, T>(reduction_op);
Test<32, T>(reduction_op);
Test<63, T>(reduction_op);
Test<97, T>(reduction_op);
Test<128, T>(reduction_op);
Test<238, T>(reduction_op);
}
/**
 * Run battery of tests for both reduction operators (Sum and Max)
 */
template <typename T>
void Test()
{
    Sum sum_op;
    Max max_op;
    Test<T>(sum_op);
    Test<T>(max_op);
}
/**
 * Main
 *
 * Parses --device/--repeat/--v, initializes the device, then runs either a
 * small fixed QUICK_TEST set or the full battery repeated (1 + g_repeat) times.
 */
int main(int argc, char** argv)
{
// Initialize command line
CommandLineArgs args(argc, argv);
g_verbose = args.CheckCmdLineFlag("v");
args.GetCmdLineArgument("repeat", g_repeat);
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s "
"[--device=<device-id>] "
"[--repeat=<repetitions of entire test suite>]"
"[--v] "
"\n", argv[0]);
exit(0);
}
// Initialize device
CubDebugExit(args.DeviceInit());
// Get ptx version
int ptx_version = 0;
CubDebugExit(PtxVersion(ptx_version));
#ifdef QUICK_TEST
// Compile/run quick tests: one configuration per algorithm, with both
// 4 items/thread and 1 item/thread full tiles, plus a 7-item partial tile
printf("\n full tile ------------------------\n\n");
TestFullTile<BLOCK_REDUCE_RAKING, 128, 1, 1, 4, int>(RANDOM, 1, Sum());
TestFullTile<BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY, 128, 1, 1, 4, int>(RANDOM, 1, Sum());
TestFullTile<BLOCK_REDUCE_WARP_REDUCTIONS, 128, 1, 1, 4, int>(RANDOM, 1, Sum());
TestFullTile<BLOCK_REDUCE_RAKING, 128, 1, 1, 1, int>(RANDOM, 1, Sum());
TestFullTile<BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY, 128, 1, 1, 1, int>(RANDOM, 1, Sum());
TestFullTile<BLOCK_REDUCE_WARP_REDUCTIONS, 128, 1, 1, 1, int>(RANDOM, 1, Sum());
printf("\n partial tile ------------------------\n\n");
TestPartialTile<BLOCK_REDUCE_RAKING, 128, 1, 1, int>(RANDOM, 7, Sum());
TestPartialTile<BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY, 128, 1, 1, int>(RANDOM, 7, Sum());
TestPartialTile<BLOCK_REDUCE_WARP_REDUCTIONS, 128, 1, 1, int>(RANDOM, 7, Sum());
#else
// Compile/run thorough tests
for (int i = 0; i <= g_repeat; ++i)
{
// primitives
Test<char>();
Test<short>();
Test<int>();
Test<long long>();
if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted
Test<double>();
Test<float>();
// vector types
Test<char2>();
Test<short2>();
Test<int2>();
Test<longlong2>();
Test<char4>();
Test<short4>();
Test<int4>();
Test<longlong4>();
// Complex types
Test<TestFoo>();
Test<TestBar>();
}
#endif
return 0;
}