Spaces:

ma-xu
/

LIVE

Runtime error

App Files Files Community

LIVE / thrust /dependencies /cub /test /test_device_run_length_encode.cu

Xu Ma

update

1c3c0d9 over 2 years ago

raw

history blame

No virus

31.3 kB

	/******************************************************************************
	* Copyright (c) 2011, Duane Merrill. All rights reserved.
	* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* * Neither the name of the NVIDIA CORPORATION nor the
	* names of its contributors may be used to endorse or promote products
	* derived from this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*
	******************************************************************************/

	/******************************************************************************
	* Test of DeviceReduce::RunLengthEncode utilities
	******************************************************************************/

	// Ensure printing of CUDA runtime errors to console
	#define CUB_STDERR

	#include <stdio.h>
	#include <typeinfo>

	#include <thrust/device_ptr.h>
	#include <thrust/reduce.h>
	#include <thrust/iterator/constant_iterator.h>

	#include <cub/util_allocator.cuh>
	#include <cub/iterator/constant_input_iterator.cuh>
	#include <cub/device/device_reduce.cuh>
	#include <cub/device/device_run_length_encode.cuh>
	#include <cub/thread/thread_operators.cuh>

	#include "test_util.h"

	using namespace cub;


	//---------------------------------------------------------------------
	// Globals, constants and typedefs
	//---------------------------------------------------------------------

	bool g_verbose = false;
	int g_timing_iterations = 0;
	int g_repeat = 0;
	CachingDeviceAllocator g_allocator(true);

	// Dispatch types
	enum Backend
	{
	CUB, // CUB method
	THRUST, // Thrust method
	CDP, // GPU-based (dynamic parallelism) dispatch to CUB method
	};

	// Operation types
	enum RleMethod
	{
	RLE, // Run length encode
	NON_TRIVIAL,
	CSR,
	};


	//---------------------------------------------------------------------
	// Dispatch to different CUB entrypoints
	//---------------------------------------------------------------------


	/**
	* Dispatch to run-length encode entrypoint
	*/
	template <
	typename InputIteratorT,
	typename UniqueOutputIteratorT,
	typename OffsetsOutputIteratorT,
	typename LengthsOutputIteratorT,
	typename NumRunsIterator,
	typename OffsetT>
	CUB_RUNTIME_FUNCTION __forceinline__
	cudaError_t Dispatch(
	Int2Type<RLE> /method/,
	Int2Type<CUB> /dispatch_to/,
	int timing_timing_iterations,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t &temp_storage_bytes,
	InputIteratorT d_in,
	UniqueOutputIteratorT d_unique_out,
	OffsetsOutputIteratorT /d_offsets_out/,
	LengthsOutputIteratorT d_lengths_out,
	NumRunsIterator d_num_runs,
	cub::Equality /equality_op/,
	OffsetT num_items,
	cudaStream_t stream,
	bool debug_synchronous)
	{
	cudaError_t error = cudaSuccess;
	for (int i = 0; i < timing_timing_iterations; ++i)
	{
	error = DeviceRunLengthEncode::Encode(
	d_temp_storage,
	temp_storage_bytes,
	d_in,
	d_unique_out,
	d_lengths_out,
	d_num_runs,
	num_items,
	stream,
	debug_synchronous);
	}
	return error;
	}


	/**
	* Dispatch to non-trivial runs entrypoint
	*/
	template <
	typename InputIteratorT,
	typename UniqueOutputIteratorT,
	typename OffsetsOutputIteratorT,
	typename LengthsOutputIteratorT,
	typename NumRunsIterator,
	typename OffsetT>
	CUB_RUNTIME_FUNCTION __forceinline__
	cudaError_t Dispatch(
	Int2Type<NON_TRIVIAL> /method/,
	Int2Type<CUB> /dispatch_to/,
	int timing_timing_iterations,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t &temp_storage_bytes,
	InputIteratorT d_in,
	UniqueOutputIteratorT /d_unique_out/,
	OffsetsOutputIteratorT d_offsets_out,
	LengthsOutputIteratorT d_lengths_out,
	NumRunsIterator d_num_runs,
	cub::Equality /equality_op/,
	OffsetT num_items,
	cudaStream_t stream,
	bool debug_synchronous)
	{
	cudaError_t error = cudaSuccess;
	for (int i = 0; i < timing_timing_iterations; ++i)
	{
	error = DeviceRunLengthEncode::NonTrivialRuns(
	d_temp_storage,
	temp_storage_bytes,
	d_in,
	d_offsets_out,
	d_lengths_out,
	d_num_runs,
	num_items,
	stream,
	debug_synchronous);
	}
	return error;
	}



	//---------------------------------------------------------------------
	// Dispatch to different Thrust entrypoints
	//---------------------------------------------------------------------

	/**
	* Dispatch to run-length encode entrypoint
	*/
	template <
	typename InputIteratorT,
	typename UniqueOutputIteratorT,
	typename OffsetsOutputIteratorT,
	typename LengthsOutputIteratorT,
	typename NumRunsIterator,
	typename OffsetT>
	cudaError_t Dispatch(
	Int2Type<RLE> /method/,
	Int2Type<THRUST> /dispatch_to/,
	int timing_timing_iterations,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void *d_temp_storage,
	size_t &temp_storage_bytes,
	InputIteratorT d_in,
	UniqueOutputIteratorT d_unique_out,
	OffsetsOutputIteratorT /d_offsets_out/,
	LengthsOutputIteratorT d_lengths_out,
	NumRunsIterator d_num_runs,
	cub::Equality /equality_op/,
	OffsetT num_items,
	cudaStream_t /stream/,
	bool /debug_synchronous/)
	{
	// The input value type
	typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

	// The output value type
	typedef typename If<(Equals<typename std::iterator_traits<UniqueOutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
	typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
	typename std::iterator_traits<UniqueOutputIteratorT>::value_type>::Type UniqueT; // ... else the output iterator's value type

	// The lengths output value type
	typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE), // LengthT = (if output iterator's value type is void) ?
	OffsetT, // ... then the OffsetT type,
	typename std::iterator_traits<LengthsOutputIteratorT>::value_type>::Type LengthT; // ... else the output iterator's value type

	if (d_temp_storage == 0)
	{
	temp_storage_bytes = 1;
	}
	else
	{
	thrust::device_ptr<InputT> d_in_wrapper(d_in);
	thrust::device_ptr<UniqueT> d_unique_out_wrapper(d_unique_out);
	thrust::device_ptr<LengthT> d_lengths_out_wrapper(d_lengths_out);

	thrust::pair<thrust::device_ptr<UniqueT>, thrust::device_ptr<LengthT> > d_out_ends;

	LengthT one_val;
	InitValue(INTEGER_SEED, one_val, 1);
	thrust::constant_iterator<LengthT> constant_one(one_val);

	for (int i = 0; i < timing_timing_iterations; ++i)
	{
	d_out_ends = thrust::reduce_by_key(
	d_in_wrapper,
	d_in_wrapper + num_items,
	constant_one,
	d_unique_out_wrapper,
	d_lengths_out_wrapper);
	}

	OffsetT num_runs = OffsetT(d_out_ends.first - d_unique_out_wrapper);
	CubDebugExit(cudaMemcpy(d_num_runs, &num_runs, sizeof(OffsetT), cudaMemcpyHostToDevice));
	}

	return cudaSuccess;
	}



	//---------------------------------------------------------------------
	// CUDA Nested Parallelism Test Kernel
	//---------------------------------------------------------------------

	/**
	* Simple wrapper kernel to invoke DeviceRunLengthEncode
	*/
	template <
	int RLE_METHOD,
	typename InputIteratorT,
	typename UniqueOutputIteratorT,
	typename OffsetsOutputIteratorT,
	typename LengthsOutputIteratorT,
	typename NumRunsIterator,
	typename EqualityOp,
	typename OffsetT>
	__global__ void CnpDispatchKernel(
	Int2Type<RLE_METHOD> method,
	int timing_timing_iterations,
	size_t *d_temp_storage_bytes,
	cudaError_t *d_cdp_error,

	void* d_temp_storage,
	size_t temp_storage_bytes,
	InputIteratorT d_in,
	UniqueOutputIteratorT d_unique_out,
	OffsetsOutputIteratorT d_offsets_out,
	LengthsOutputIteratorT d_lengths_out,
	NumRunsIterator d_num_runs,
	cub::Equality equality_op,
	OffsetT num_items,
	cudaStream_t stream,
	bool debug_synchronous)
	{

	#ifndef CUB_CDP
	*d_cdp_error = cudaErrorNotSupported;
	#else
	*d_cdp_error = Dispatch(method, Int2Type<CUB>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
	d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, debug_synchronous);

	*d_temp_storage_bytes = temp_storage_bytes;
	#endif
	}


	/**
	* Dispatch to CDP kernel
	*/
	template <
	int RLE_METHOD,
	typename InputIteratorT,
	typename UniqueOutputIteratorT,
	typename OffsetsOutputIteratorT,
	typename LengthsOutputIteratorT,
	typename NumRunsIterator,
	typename EqualityOp,
	typename OffsetT>
	CUB_RUNTIME_FUNCTION __forceinline__
	cudaError_t Dispatch(
	Int2Type<RLE_METHOD> method,
	Int2Type<CDP> dispatch_to,
	int timing_timing_iterations,
	size_t *d_temp_storage_bytes,
	cudaError_t *d_cdp_error,

	void* d_temp_storage,
	size_t &temp_storage_bytes,
	InputIteratorT d_in,
	UniqueOutputIteratorT d_unique_out,
	OffsetsOutputIteratorT d_offsets_out,
	LengthsOutputIteratorT d_lengths_out,
	NumRunsIterator d_num_runs,
	EqualityOp equality_op,
	OffsetT num_items,
	cudaStream_t stream,
	bool debug_synchronous)
	{
	// Invoke kernel to invoke device-side dispatch
	CnpDispatchKernel<<<1,1>>>(method, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
	d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, debug_synchronous);

	// Copy out temp_storage_bytes
	CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));

	// Copy out error
	cudaError_t retval;
	CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
	return retval;
	}



	//---------------------------------------------------------------------
	// Test generation
	//---------------------------------------------------------------------


	/**
	* Initialize problem
	*/
	template <typename T>
	void Initialize(
	int entropy_reduction,
	T *h_in,
	int num_items,
	int max_segment)
	{
	unsigned int max_int = (unsigned int) -1;

	int key = 0;
	int i = 0;
	while (i < num_items)
	{
	// Select number of repeating occurrences for the current run
	int repeat;
	if (max_segment < 0)
	{
	repeat = num_items;
	}
	else if (max_segment < 2)
	{
	repeat = 1;
	}
	else
	{
	RandomBits(repeat, entropy_reduction);
	repeat = (int) ((double(repeat) * double(max_segment)) / double(max_int));
	repeat = CUB_MAX(1, repeat);
	}

	int j = i;
	while (j < CUB_MIN(i + repeat, num_items))
	{
	InitValue(INTEGER_SEED, h_in[j], key);
	j++;
	}

	i = j;
	key++;
	}

	if (g_verbose)
	{
	printf("Input:\n");
	DisplayResults(h_in, num_items);
	printf("\n\n");
	}
	}


	/**
	* Solve problem. Returns total number of segments identified
	*/
	template <
	RleMethod RLE_METHOD,
	typename InputIteratorT,
	typename T,
	typename OffsetT,
	typename LengthT,
	typename EqualityOp>
	int Solve(
	InputIteratorT h_in,
	T *h_unique_reference,
	OffsetT *h_offsets_reference,
	LengthT *h_lengths_reference,
	EqualityOp equality_op,
	int num_items)
	{
	if (num_items == 0)
	return 0;

	// First item
	T previous = h_in[0];
	LengthT length = 1;
	int num_runs = 0;
	int run_begin = 0;

	// Subsequent items
	for (int i = 1; i < num_items; ++i)
	{
	if (!equality_op(previous, h_in[i]))
	{
	if ((RLE_METHOD != NON_TRIVIAL) \|\| (length > 1))
	{
	h_unique_reference[num_runs] = previous;
	h_offsets_reference[num_runs] = run_begin;
	h_lengths_reference[num_runs] = length;
	num_runs++;
	}
	length = 1;
	run_begin = i;
	}
	else
	{
	length++;
	}
	previous = h_in[i];
	}

	if ((RLE_METHOD != NON_TRIVIAL) \|\| (length > 1))
	{
	h_unique_reference[num_runs] = previous;
	h_offsets_reference[num_runs] = run_begin;
	h_lengths_reference[num_runs] = length;
	num_runs++;
	}

	return num_runs;
	}



	/**
	* Test DeviceRunLengthEncode for a given problem input
	*/
	template <
	RleMethod RLE_METHOD,
	Backend BACKEND,
	typename DeviceInputIteratorT,
	typename T,
	typename OffsetT,
	typename LengthT,
	typename EqualityOp>
	void Test(
	DeviceInputIteratorT d_in,
	T *h_unique_reference,
	OffsetT *h_offsets_reference,
	LengthT *h_lengths_reference,
	EqualityOp equality_op,
	int num_runs,
	int num_items)
	{
	// Allocate device output arrays and number of segments
	T* d_unique_out = NULL;
	LengthT* d_offsets_out = NULL;
	OffsetT* d_lengths_out = NULL;
	int* d_num_runs = NULL;

	if (RLE_METHOD == RLE)
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_unique_out, sizeof(T) num_items));
	if (RLE_METHOD == NON_TRIVIAL)
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_offsets_out, sizeof(OffsetT) num_items));
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_lengths_out, sizeof(LengthT) num_items));
	CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_runs, sizeof(int)));

	// Allocate CDP device arrays
	size_t* d_temp_storage_bytes = NULL;
	cudaError_t* d_cdp_error = NULL;
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_temp_storage_bytes, sizeof(size_t) 1));
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_cdp_error, sizeof(cudaError_t) 1));

	// Allocate temporary storage
	void* d_temp_storage = NULL;
	size_t temp_storage_bytes = 0;
	CubDebugExit(Dispatch(Int2Type<RLE_METHOD>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, true));
	CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));

	// Clear device output arrays
	if (RLE_METHOD == RLE)
	CubDebugExit(cudaMemset(d_unique_out, 0, sizeof(T) * num_items));
	if (RLE_METHOD == NON_TRIVIAL)
	CubDebugExit(cudaMemset(d_offsets_out, 0, sizeof(OffsetT) * num_items));
	CubDebugExit(cudaMemset(d_lengths_out, 0, sizeof(LengthT) * num_items));
	CubDebugExit(cudaMemset(d_num_runs, 0, sizeof(int)));

	// Run warmup/correctness iteration
	CubDebugExit(Dispatch(Int2Type<RLE_METHOD>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, true));

	// Check for correctness (and display results, if specified)
	int compare0 = 0;
	int compare1 = 0;
	int compare2 = 0;
	int compare3 = 0;

	if (RLE_METHOD == RLE)
	{
	compare0 = CompareDeviceResults(h_unique_reference, d_unique_out, num_runs, true, g_verbose);
	printf("\t Keys %s\n", compare0 ? "FAIL" : "PASS");
	}

	if (RLE_METHOD != RLE)
	{
	compare1 = CompareDeviceResults(h_offsets_reference, d_offsets_out, num_runs, true, g_verbose);
	printf("\t Offsets %s\n", compare1 ? "FAIL" : "PASS");
	}

	if (RLE_METHOD != CSR)
	{
	compare2 = CompareDeviceResults(h_lengths_reference, d_lengths_out, num_runs, true, g_verbose);
	printf("\t Lengths %s\n", compare2 ? "FAIL" : "PASS");
	}

	compare3 = CompareDeviceResults(&num_runs, d_num_runs, 1, true, g_verbose);
	printf("\t Count %s\n", compare3 ? "FAIL" : "PASS");

	// Flush any stdout/stderr
	fflush(stdout);
	fflush(stderr);

	// Performance
	GpuTimer gpu_timer;
	gpu_timer.Start();
	CubDebugExit(Dispatch(Int2Type<RLE_METHOD>(), Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, false));
	gpu_timer.Stop();
	float elapsed_millis = gpu_timer.ElapsedMillis();

	// Display performance
	if (g_timing_iterations > 0)
	{
	float avg_millis = elapsed_millis / g_timing_iterations;
	float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
	int bytes_moved = (num_items * sizeof(T)) + (num_runs * (sizeof(OffsetT) + sizeof(LengthT)));
	float giga_bandwidth = float(bytes_moved) / avg_millis / 1000.0f / 1000.0f;
	printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
	}
	printf("\n\n");

	// Flush any stdout/stderr
	fflush(stdout);
	fflush(stderr);

	// Cleanup
	if (d_unique_out) CubDebugExit(g_allocator.DeviceFree(d_unique_out));
	if (d_offsets_out) CubDebugExit(g_allocator.DeviceFree(d_offsets_out));
	if (d_lengths_out) CubDebugExit(g_allocator.DeviceFree(d_lengths_out));
	if (d_num_runs) CubDebugExit(g_allocator.DeviceFree(d_num_runs));
	if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
	if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
	if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));

	// Correctness asserts
	AssertEquals(0, compare0 \| compare1 \| compare2 \| compare3);
	}


	/**
	* Test DeviceRunLengthEncode on pointer type
	*/
	template <
	RleMethod RLE_METHOD,
	Backend BACKEND,
	typename T,
	typename OffsetT,
	typename LengthT>
	void TestPointer(
	int num_items,
	int entropy_reduction,
	int max_segment)
	{
	// Allocate host arrays
	T* h_in = new T[num_items];
	T* h_unique_reference = new T[num_items];
	OffsetT* h_offsets_reference = new OffsetT[num_items];
	LengthT* h_lengths_reference = new LengthT[num_items];

	for (int i = 0; i < num_items; ++i)
	InitValue(INTEGER_SEED, h_offsets_reference[i], 1);

	// Initialize problem and solution
	Equality equality_op;
	Initialize(entropy_reduction, h_in, num_items, max_segment);

	int num_runs = Solve<RLE_METHOD>(h_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_items);

	printf("\nPointer %s cub::%s on %d items, %d segments (avg run length %.3f), {%s key, %s offset, %s length}, max_segment %d, entropy_reduction %d\n",
	(RLE_METHOD == RLE) ? "DeviceReduce::RunLengthEncode" : (RLE_METHOD == NON_TRIVIAL) ? "DeviceRunLengthEncode::NonTrivialRuns" : "Other",
	(BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
	num_items, num_runs, float(num_items) / num_runs,
	typeid(T).name(), typeid(OffsetT).name(), typeid(LengthT).name(),
	max_segment, entropy_reduction);
	fflush(stdout);

	// Allocate problem device arrays
	T* d_in = NULL;
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_in, sizeof(T) num_items));

	// Initialize device input
	CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice));

	// Run Test
	Test<RLE_METHOD, BACKEND>(d_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_runs, num_items);

	// Cleanup
	if (h_in) delete[] h_in;
	if (h_unique_reference) delete[] h_unique_reference;
	if (h_offsets_reference) delete[] h_offsets_reference;
	if (h_lengths_reference) delete[] h_lengths_reference;
	if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
	}


	/**
	* Test on iterator type
	*/
	template <
	RleMethod RLE_METHOD,
	Backend BACKEND,
	typename T,
	typename OffsetT,
	typename LengthT>
	void TestIterator(
	int num_items,
	Int2Type<true> /is_primitive/)
	{
	// Allocate host arrays
	T* h_unique_reference = new T[num_items];
	OffsetT* h_offsets_reference = new OffsetT[num_items];
	LengthT* h_lengths_reference = new LengthT[num_items];

	T one_val;
	InitValue(INTEGER_SEED, one_val, 1);
	ConstantInputIterator<T, int> h_in(one_val);

	// Initialize problem and solution
	Equality equality_op;
	int num_runs = Solve<RLE_METHOD>(h_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_items);

	printf("\nIterator %s cub::%s on %d items, %d segments (avg run length %.3f), {%s key, %s offset, %s length}\n",
	(RLE_METHOD == RLE) ? "DeviceReduce::RunLengthEncode" : (RLE_METHOD == NON_TRIVIAL) ? "DeviceRunLengthEncode::NonTrivialRuns" : "Other",
	(BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
	num_items, num_runs, float(num_items) / num_runs,
	typeid(T).name(), typeid(OffsetT).name(), typeid(LengthT).name());
	fflush(stdout);

	// Run Test
	Test<RLE_METHOD, BACKEND>(h_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_runs, num_items);

	// Cleanup
	if (h_unique_reference) delete[] h_unique_reference;
	if (h_offsets_reference) delete[] h_offsets_reference;
	if (h_lengths_reference) delete[] h_lengths_reference;
	}


	template <
	RleMethod RLE_METHOD,
	Backend BACKEND,
	typename T,
	typename OffsetT,
	typename LengthT>
	void TestIterator(
	int /num_items/,
	Int2Type<false> /is_primitive/)
	{}


	/**
	* Test different gen modes
	*/
	template <
	RleMethod RLE_METHOD,
	Backend BACKEND,
	typename T,
	typename OffsetT,
	typename LengthT>
	void Test(
	int num_items)
	{
	// Test iterator (one run)
	TestIterator<RLE_METHOD, BACKEND, T, OffsetT, LengthT>(num_items, Int2Type<Traits<T>::PRIMITIVE>());

	// num_items runs
	TestPointer<RLE_METHOD, BACKEND, T, OffsetT, LengthT>(num_items, 0, 1);

	// Evaluate different run lengths
	for (int max_segment = 3; max_segment < CUB_MIN(num_items, (unsigned short) -1); max_segment *= 3)
	{
	// Uniform selection run length
	TestPointer<RLE_METHOD, BACKEND, T, OffsetT, LengthT>(num_items, 0, max_segment);

	// Reduced-entropy run length
	TestPointer<RLE_METHOD, BACKEND, T, OffsetT, LengthT>(num_items, 4, max_segment);
	}
	}


	/**
	* Test different dispatch
	*/
	template <
	typename T,
	typename OffsetT,
	typename LengthT>
	void TestDispatch(
	int num_items)
	{
	Test<RLE, CUB, T, OffsetT, LengthT>(num_items);
	Test<NON_TRIVIAL, CUB, T, OffsetT, LengthT>(num_items);

	#ifdef CUB_CDP
	Test<RLE, CDP, T, OffsetT, LengthT>(num_items);
	Test<NON_TRIVIAL, CDP, T, OffsetT, LengthT>(num_items);
	#endif
	}


	/**
	* Test different input sizes
	*/
	template <
	typename T,
	typename OffsetT,
	typename LengthT>
	void TestSize(
	int num_items)
	{
	if (num_items < 0)
	{
	TestDispatch<T, OffsetT, LengthT>(0);
	TestDispatch<T, OffsetT, LengthT>(1);
	TestDispatch<T, OffsetT, LengthT>(100);
	TestDispatch<T, OffsetT, LengthT>(10000);
	TestDispatch<T, OffsetT, LengthT>(1000000);

	// Randomly select problem size between 1:10,000,000
	unsigned int max_int = (unsigned int) -1;
	for (int i = 0; i < 10; ++i)
	{
	unsigned int num_items;
	RandomBits(num_items);
	num_items = (unsigned int) ((double(num_items) * double(10000000)) / double(max_int));
	num_items = CUB_MAX(1, num_items);
	TestDispatch<T, OffsetT, LengthT>(num_items);
	}
	}
	else
	{
	TestDispatch<T, OffsetT, LengthT>(num_items);
	}

	}


	//---------------------------------------------------------------------
	// Main
	//---------------------------------------------------------------------

	/**
	* Main
	*/
	int main(int argc, char** argv)
	{
	int num_items = -1;
	int entropy_reduction = 0;
	int max_segment = 1000;

	// Initialize command line
	CommandLineArgs args(argc, argv);
	g_verbose = args.CheckCmdLineFlag("v");
	args.GetCmdLineArgument("n", num_items);
	args.GetCmdLineArgument("i", g_timing_iterations);
	args.GetCmdLineArgument("repeat", g_repeat);
	args.GetCmdLineArgument("maxseg", max_segment);
	args.GetCmdLineArgument("entropy", entropy_reduction);

	// Print usage
	if (args.CheckCmdLineFlag("help"))
	{
	printf("%s "
	"[--n=<input items> "
	"[--i=<timing iterations> "
	"[--device=<device-id>] "
	"[--maxseg=<max segment length>]"
	"[--entropy=<segment length bit entropy reduction rounds>]"
	"[--repeat=<repetitions of entire test suite>]"
	"[--v] "
	"[--cdp]"
	"\n", argv[0]);
	exit(0);
	}

	// Initialize device
	CubDebugExit(args.DeviceInit());
	printf("\n");

	// Get ptx version
	int ptx_version = 0;
	CubDebugExit(PtxVersion(ptx_version));

	#ifdef QUICKER_TEST

	// Compile/run basic CUB test
	if (num_items < 0) num_items = 32000000;

	TestPointer<RLE, CUB, int, int, int>( num_items, entropy_reduction, max_segment);
	TestPointer<NON_TRIVIAL, CUB, int, int, int>( num_items, entropy_reduction, max_segment);
	TestIterator<RLE, CUB, float, int, int>( num_items, Int2Type<Traits<float>::PRIMITIVE>());


	#elif defined(QUICK_TEST)

	// Compile/run quick tests
	if (num_items < 0) num_items = 32000000;

	TestPointer<RLE, CUB, int, int, int>( num_items, entropy_reduction, max_segment);
	TestPointer<RLE, THRUST, int, int, int>( num_items, entropy_reduction, max_segment);

	#else

	// Compile/run thorough tests
	for (int i = 0; i <= g_repeat; ++i)
	{
	// Test different input types
	TestSize<char, int, int>(num_items);
	TestSize<short, int, int>(num_items);
	TestSize<int, int, int>(num_items);
	TestSize<long, int, int>(num_items);
	TestSize<long long, int, int>(num_items);
	TestSize<float, int, int>(num_items);
	TestSize<double, int, int>(num_items);

	TestSize<uchar2, int, int>(num_items);
	TestSize<uint2, int, int>(num_items);
	TestSize<uint3, int, int>(num_items);
	TestSize<uint4, int, int>(num_items);
	TestSize<ulonglong4, int, int>(num_items);
	TestSize<TestFoo, int, int>(num_items);
	TestSize<TestBar, int, int>(num_items);
	}

	#endif

	return 0;
	}