Spaces:

ma-xu
/

LIVE

Runtime error

LIVE / thrust /dependencies /cub /test /test_device_reduce.cu

Xu Ma

update

1c3c0d9 over 2 years ago

50 kB

	/******************************************************************************
	* Copyright (c) 2011, Duane Merrill. All rights reserved.
	* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* * Neither the name of the NVIDIA CORPORATION nor the
	* names of its contributors may be used to endorse or promote products
	* derived from this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*
	******************************************************************************/

	/******************************************************************************
	* Test of DeviceReduce utilities
	******************************************************************************/

	// Ensure printing of CUDA runtime errors to console
	#define CUB_STDERR

	#include <stdio.h>
	#include <limits>
	#include <typeinfo>

	#include <thrust/device_ptr.h>
	#include <thrust/reduce.h>

	#include <cub/util_allocator.cuh>
	#include <cub/device/device_reduce.cuh>
	#include <cub/device/device_segmented_reduce.cuh>
	#include <cub/iterator/constant_input_iterator.cuh>
	#include <cub/iterator/discard_output_iterator.cuh>
	#include <cub/iterator/transform_input_iterator.cuh>

	#include "test_util.h"

	using namespace cub;


	//---------------------------------------------------------------------
	// Globals, constants and typedefs
	//---------------------------------------------------------------------

	int g_ptx_version;
	int g_sm_count;
	double g_device_giga_bandwidth;
	bool g_verbose = false;
	bool g_verbose_input = false;
	int g_timing_iterations = 0;
	int g_repeat = 0;
	CachingDeviceAllocator g_allocator(true);


	// Dispatch types
	enum Backend
	{
	CUB, // CUB method
	CUB_SEGMENTED, // CUB segmented method
	CUB_CDP, // GPU-based (dynamic parallelism) dispatch to CUB method
	THRUST, // Thrust method
	};


	// Custom max functor
	struct CustomMax
	{
	/// Boolean max operator, returns <tt>(a > b) ? a : b</tt>
	template <typename OutputT>
	__host__ __device__ __forceinline__ OutputT operator()(const OutputT &a, const OutputT &b)
	{
	return CUB_MAX(a, b);
	}
	};


	//---------------------------------------------------------------------
	// Dispatch to different CUB DeviceReduce entrypoints
	//---------------------------------------------------------------------

	/**
	* Dispatch to reduce entrypoint (custom-max)
	*/
	template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT, typename ReductionOpT>
	CUB_RUNTIME_FUNCTION __forceinline__
	cudaError_t Dispatch(
	Int2Type<CUB> /dispatch_to/,
	int timing_iterations,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int num_items,
	int /max_segments/,
	OffsetIteratorT /d_segment_offsets/,
	ReductionOpT reduction_op,
	cudaStream_t stream,
	bool debug_synchronous)
	{
	typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

	// The output value type
	typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
	typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
	typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type

	// Max-identity
	OutputT identity = Traits<InputT>::Lowest(); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent

	// Invoke kernel to device reduction directly
	cudaError_t error = cudaSuccess;
	for (int i = 0; i < timing_iterations; ++i)
	{
	error = DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
	d_in, d_out, num_items, reduction_op, identity,
	stream, debug_synchronous);
	}

	return error;
	}

	/**
	* Dispatch to sum entrypoint
	*/
	template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
	CUB_RUNTIME_FUNCTION __forceinline__
	cudaError_t Dispatch(
	Int2Type<CUB> /dispatch_to/,
	int timing_iterations,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int num_items,
	int /max_segments/,
	OffsetIteratorT /d_segment_offsets/,
	cub::Sum /reduction_op/,
	cudaStream_t stream,
	bool debug_synchronous)
	{
	// Invoke kernel to device reduction directly
	cudaError_t error = cudaSuccess;
	for (int i = 0; i < timing_iterations; ++i)
	{
	error = DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
	}

	return error;
	}

	/**
	* Dispatch to min entrypoint
	*/
	template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
	CUB_RUNTIME_FUNCTION __forceinline__
	cudaError_t Dispatch(
	Int2Type<CUB> /dispatch_to/,
	int timing_iterations,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int num_items,
	int /max_segments/,
	OffsetIteratorT /d_segment_offsets/,
	cub::Min /reduction_op/,
	cudaStream_t stream,
	bool debug_synchronous)
	{
	// Invoke kernel to device reduction directly
	cudaError_t error = cudaSuccess;
	for (int i = 0; i < timing_iterations; ++i)
	{
	error = DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
	}

	return error;
	}

	/**
	* Dispatch to max entrypoint
	*/
	template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
	CUB_RUNTIME_FUNCTION __forceinline__
	cudaError_t Dispatch(
	Int2Type<CUB> /dispatch_to/,
	int timing_iterations,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int num_items,
	int /max_segments/,
	OffsetIteratorT /d_segment_offsets/,
	cub::Max /reduction_op/,
	cudaStream_t stream,
	bool debug_synchronous)
	{
	// Invoke kernel to device reduction directly
	cudaError_t error = cudaSuccess;
	for (int i = 0; i < timing_iterations; ++i)
	{
	error = DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
	}

	return error;
	}

	/**
	* Dispatch to argmin entrypoint
	*/
	template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
	CUB_RUNTIME_FUNCTION __forceinline__
	cudaError_t Dispatch(
	Int2Type<CUB> /dispatch_to/,
	int timing_iterations,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int num_items,
	int /max_segments/,
	OffsetIteratorT /d_segment_offsets/,
	cub::ArgMin /reduction_op/,
	cudaStream_t stream,
	bool debug_synchronous)
	{
	// Invoke kernel to device reduction directly
	cudaError_t error = cudaSuccess;
	for (int i = 0; i < timing_iterations; ++i)
	{
	error = DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
	}

	return error;
	}

	/**
	* Dispatch to argmax entrypoint
	*/
	template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
	CUB_RUNTIME_FUNCTION __forceinline__
	cudaError_t Dispatch(
	Int2Type<CUB> /dispatch_to/,
	int timing_iterations,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int num_items,
	int /max_segments/,
	OffsetIteratorT /d_segment_offsets/,
	cub::ArgMax /reduction_op/,
	cudaStream_t stream,
	bool debug_synchronous)
	{
	// Invoke kernel to device reduction directly
	cudaError_t error = cudaSuccess;
	for (int i = 0; i < timing_iterations; ++i)
	{
	error = DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
	}

	return error;
	}


	//---------------------------------------------------------------------
	// Dispatch to different CUB DeviceSegmentedReduce entrypoints
	//---------------------------------------------------------------------

	/**
	* Dispatch to reduce entrypoint (custom-max)
	*/
	template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT, typename ReductionOpT>
	CUB_RUNTIME_FUNCTION __forceinline__
	cudaError_t Dispatch(
	Int2Type<CUB_SEGMENTED> /dispatch_to/,
	int timing_iterations,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int /num_items/,
	int max_segments,
	OffsetIteratorT d_segment_offsets,
	ReductionOpT reduction_op,
	cudaStream_t stream,
	bool debug_synchronous)
	{
	// The input value type
	typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

	// The output value type
	typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
	typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
	typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type

	// Max-identity
	OutputT identity = Traits<InputT>::Lowest(); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent

	// Invoke kernel to device reduction directly
	cudaError_t error = cudaSuccess;
	for (int i = 0; i < timing_iterations; ++i)
	{
	error = DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes,
	d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1, reduction_op, identity,
	stream, debug_synchronous);
	}
	return error;
	}

	/**
	* Dispatch to sum entrypoint
	*/
	template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
	CUB_RUNTIME_FUNCTION __forceinline__
	cudaError_t Dispatch(
	Int2Type<CUB_SEGMENTED> /dispatch_to/,
	int timing_iterations,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int /num_items/,
	int max_segments,
	OffsetIteratorT d_segment_offsets,
	cub::Sum /reduction_op/,
	cudaStream_t stream,
	bool debug_synchronous)
	{
	// Invoke kernel to device reduction directly
	cudaError_t error = cudaSuccess;
	for (int i = 0; i < timing_iterations; ++i)
	{
	error = DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes,
	d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
	stream, debug_synchronous);
	}
	return error;
	}

	/**
	* Dispatch to min entrypoint
	*/
	template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
	CUB_RUNTIME_FUNCTION __forceinline__
	cudaError_t Dispatch(
	Int2Type<CUB_SEGMENTED> /dispatch_to/,
	int timing_iterations,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int /num_items/,
	int max_segments,
	OffsetIteratorT d_segment_offsets,
	cub::Min /reduction_op/,
	cudaStream_t stream,
	bool debug_synchronous)
	{
	// Invoke kernel to device reduction directly
	cudaError_t error = cudaSuccess;
	for (int i = 0; i < timing_iterations; ++i)
	{
	error = DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes,
	d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
	stream, debug_synchronous);
	}
	return error;
	}

	/**
	* Dispatch to max entrypoint
	*/
	template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
	CUB_RUNTIME_FUNCTION __forceinline__
	cudaError_t Dispatch(
	Int2Type<CUB_SEGMENTED> /dispatch_to/,
	int timing_iterations,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int /num_items/,
	int max_segments,
	OffsetIteratorT d_segment_offsets,
	cub::Max /reduction_op/,
	cudaStream_t stream,
	bool debug_synchronous)
	{
	// Invoke kernel to device reduction directly
	cudaError_t error = cudaSuccess;
	for (int i = 0; i < timing_iterations; ++i)
	{
	error = DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes,
	d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
	stream, debug_synchronous);
	}
	return error;
	}

	/**
	* Dispatch to argmin entrypoint
	*/
	template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
	CUB_RUNTIME_FUNCTION __forceinline__
	cudaError_t Dispatch(
	Int2Type<CUB_SEGMENTED> /dispatch_to/,
	int timing_iterations,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int /num_items/,
	int max_segments,
	OffsetIteratorT d_segment_offsets,
	cub::ArgMin /reduction_op/,
	cudaStream_t stream,
	bool debug_synchronous)
	{
	// Invoke kernel to device reduction directly
	cudaError_t error = cudaSuccess;
	for (int i = 0; i < timing_iterations; ++i)
	{
	error = DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes,
	d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
	stream, debug_synchronous);
	}
	return error;
	}

	/**
	* Dispatch to argmax entrypoint
	*/
	template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
	CUB_RUNTIME_FUNCTION __forceinline__
	cudaError_t Dispatch(
	Int2Type<CUB_SEGMENTED> /dispatch_to/,
	int timing_iterations,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int /num_items/,
	int max_segments,
	OffsetIteratorT d_segment_offsets,
	cub::ArgMax /reduction_op/,
	cudaStream_t stream,
	bool debug_synchronous)
	{
	// Invoke kernel to device reduction directly
	cudaError_t error = cudaSuccess;
	for (int i = 0; i < timing_iterations; ++i)
	{
	error = DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes,
	d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
	stream, debug_synchronous);
	}
	return error;
	}


	//---------------------------------------------------------------------
	// Dispatch to different Thrust entrypoints
	//---------------------------------------------------------------------

	/**
	* Dispatch to reduction entrypoint (min or max specialization)
	*/
	template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT, typename ReductionOpT>
	cudaError_t Dispatch(
	Int2Type<THRUST> /dispatch_to/,
	int timing_iterations,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int num_items,
	int /max_segments/,
	OffsetIteratorT /d_segment_offsets/,
	ReductionOpT reduction_op,
	cudaStream_t /stream/,
	bool /debug_synchronous/)
	{
	// The output value type
	typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
	typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
	typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type

	if (d_temp_storage == 0)
	{
	temp_storage_bytes = 1;
	}
	else
	{
	OutputT init;
	CubDebugExit(cudaMemcpy(&init, d_in + 0, sizeof(OutputT), cudaMemcpyDeviceToHost));

	thrust::device_ptr<OutputT> d_in_wrapper(d_in);
	OutputT retval;
	for (int i = 0; i < timing_iterations; ++i)
	{
	retval = thrust::reduce(d_in_wrapper, d_in_wrapper + num_items, init, reduction_op);
	}

	if (!Equals<OutputIteratorT, DiscardOutputIterator<int> >::VALUE)
	CubDebugExit(cudaMemcpy(d_out, &retval, sizeof(OutputT), cudaMemcpyHostToDevice));
	}

	return cudaSuccess;
	}

	/**
	* Dispatch to reduction entrypoint (sum specialization)
	*/
	template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
	cudaError_t Dispatch(
	Int2Type<THRUST> /dispatch_to/,
	int timing_iterations,
	size_t /d_temp_storage_bytes*/,
	cudaError_t /d_cdp_error*/,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int num_items,
	int /max_segments/,
	OffsetIteratorT /d_segment_offsets/,
	Sum /reduction_op/,
	cudaStream_t /stream/,
	bool /debug_synchronous/)
	{
	// The output value type
	typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
	typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
	typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type

	if (d_temp_storage == 0)
	{
	temp_storage_bytes = 1;
	}
	else
	{
	thrust::device_ptr<OutputT> d_in_wrapper(d_in);
	OutputT retval;
	for (int i = 0; i < timing_iterations; ++i)
	{
	retval = thrust::reduce(d_in_wrapper, d_in_wrapper + num_items);
	}

	if (!Equals<OutputIteratorT, DiscardOutputIterator<int> >::VALUE)
	CubDebugExit(cudaMemcpy(d_out, &retval, sizeof(OutputT), cudaMemcpyHostToDevice));
	}

	return cudaSuccess;
	}


	//---------------------------------------------------------------------
	// CUDA nested-parallelism test kernel
	//---------------------------------------------------------------------

	/**
	* Simple wrapper kernel to invoke DeviceReduce
	*/
	template <
	typename InputIteratorT,
	typename OutputIteratorT,
	typename OffsetIteratorT,
	typename ReductionOpT>
	__global__ void CnpDispatchKernel(
	int timing_iterations,
	size_t *d_temp_storage_bytes,
	cudaError_t *d_cdp_error,

	void* d_temp_storage,
	size_t temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int num_items,
	int max_segments,
	OffsetIteratorT d_segment_offsets,
	ReductionOpT reduction_op,
	bool debug_synchronous)
	{
	#ifndef CUB_CDP
	(void)timing_iterations;
	(void)d_temp_storage_bytes;
	(void)d_cdp_error;
	(void)d_temp_storage;
	(void)temp_storage_bytes;
	(void)d_in;
	(void)d_out;
	(void)num_items;
	(void)max_segments;
	(void)d_segment_offsets;
	(void)reduction_op;
	(void)debug_synchronous;
	*d_cdp_error = cudaErrorNotSupported;
	#else
	*d_cdp_error = Dispatch(Int2Type<CUB>(), timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
	d_in, d_out, num_items, max_segments, d_segment_offsets, reduction_op, 0, debug_synchronous);
	*d_temp_storage_bytes = temp_storage_bytes;
	#endif
	}


	/**
	* Dispatch to CUB_CDP kernel
	*/
	template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT, typename ReductionOpT>
	CUB_RUNTIME_FUNCTION __forceinline__
	cudaError_t Dispatch(
	Int2Type<CUB_CDP> dispatch_to,
	int timing_iterations,
	size_t *d_temp_storage_bytes,
	cudaError_t *d_cdp_error,

	void* d_temp_storage,
	size_t& temp_storage_bytes,
	InputIteratorT d_in,
	OutputIteratorT d_out,
	int num_items,
	int max_segments,
	OffsetIteratorT d_segment_offsets,
	ReductionOpT reduction_op,
	cudaStream_t stream,
	bool debug_synchronous)
	{
	// Invoke kernel to invoke device-side dispatch
	CnpDispatchKernel<<<1,1>>>(timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
	d_in, d_out, num_items, max_segments, d_segment_offsets, reduction_op, debug_synchronous);

	// Copy out temp_storage_bytes
	CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));

	// Copy out error
	cudaError_t retval;
	CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
	return retval;
	}



	//---------------------------------------------------------------------
	// Problem generation
	//---------------------------------------------------------------------

	/// Initialize problem
	template <typename InputT>
	void Initialize(
	GenMode gen_mode,
	InputT *h_in,
	int num_items)
	{
	for (int i = 0; i < num_items; ++i)
	{
	InitValue(gen_mode, h_in[i], i);
	}

	if (g_verbose_input)
	{
	printf("Input:\n");
	DisplayResults(h_in, num_items);
	printf("\n\n");
	}
	}


	/// Solve problem (max/custom-max functor)
	template <typename ReductionOpT, typename InputT, typename _OutputT>
	struct Solution
	{
	typedef _OutputT OutputT;

	template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT>
	static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets,
	ReductionOpT reduction_op)
	{
	for (int i = 0; i < num_segments; ++i)
	{
	OutputT aggregate = Traits<InputT>::Lowest(); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent
	for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j)
	aggregate = reduction_op(aggregate, OutputT(h_in[j]));
	h_reference[i] = aggregate;
	}
	}
	};

	/// Solve problem (min functor)
	template <typename InputT, typename _OutputT>
	struct Solution<cub::Min, InputT, _OutputT>
	{
	typedef _OutputT OutputT;

	template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT>
	static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets,
	cub::Min reduction_op)
	{
	for (int i = 0; i < num_segments; ++i)
	{
	OutputT aggregate = Traits<InputT>::Max(); // replace with std::numeric_limits<OutputT>::max() when C++ support is more prevalent
	for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j)
	aggregate = reduction_op(aggregate, OutputT(h_in[j]));
	h_reference[i] = aggregate;
	}
	}
	};


	/// Solve problem (sum functor)
	template <typename InputT, typename _OutputT>
	struct Solution<cub::Sum, InputT, _OutputT>
	{
	typedef _OutputT OutputT;

	template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT>
	static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets,
	cub::Sum reduction_op)
	{
	for (int i = 0; i < num_segments; ++i)
	{
	OutputT aggregate;
	InitValue(INTEGER_SEED, aggregate, 0);
	for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j)
	aggregate = reduction_op(aggregate, OutputT(h_in[j]));
	h_reference[i] = aggregate;
	}
	}
	};

	/// Solve problem (argmin functor)
	template <typename InputValueT, typename OutputValueT>
	struct Solution<cub::ArgMin, InputValueT, OutputValueT>
	{
	typedef KeyValuePair<int, OutputValueT> OutputT;

	template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT>
	static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets,
	cub::ArgMin reduction_op)
	{
	for (int i = 0; i < num_segments; ++i)
	{
	OutputT aggregate(1, Traits<InputValueT>::Max()); // replace with std::numeric_limits<OutputT>::max() when C++ support is more prevalent
	for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j)
	{
	OutputT item(j - h_segment_offsets[i], OutputValueT(h_in[j]));
	aggregate = reduction_op(aggregate, item);
	}
	h_reference[i] = aggregate;
	}
	}
	};


	/// Solve problem (argmax functor)
	template <typename InputValueT, typename OutputValueT>
	struct Solution<cub::ArgMax, InputValueT, OutputValueT>
	{
	typedef KeyValuePair<int, OutputValueT> OutputT;

	template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT>
	static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets,
	cub::ArgMax reduction_op)
	{
	for (int i = 0; i < num_segments; ++i)
	{
	OutputT aggregate(1, Traits<InputValueT>::Lowest()); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent
	for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j)
	{
	OutputT item(j - h_segment_offsets[i], OutputValueT(h_in[j]));
	aggregate = reduction_op(aggregate, item);
	}
	h_reference[i] = aggregate;
	}
	}
	};


	//---------------------------------------------------------------------
	// Problem generation
	//---------------------------------------------------------------------

	/// Test DeviceReduce for a given problem input
	template <
	typename BackendT,
	typename DeviceInputIteratorT,
	typename DeviceOutputIteratorT,
	typename HostReferenceIteratorT,
	typename OffsetT,
	typename OffsetIteratorT,
	typename ReductionOpT>
	void Test(
	BackendT backend,
	DeviceInputIteratorT d_in,
	DeviceOutputIteratorT d_out,
	OffsetT num_items,
	OffsetT num_segments,
	OffsetIteratorT d_segment_offsets,
	ReductionOpT reduction_op,
	HostReferenceIteratorT h_reference)
	{
	// Input data types
	typedef typename std::iterator_traits<DeviceInputIteratorT>::value_type InputT;

	// Allocate CUB_CDP device arrays for temp storage size and error
	size_t *d_temp_storage_bytes = NULL;
	cudaError_t *d_cdp_error = NULL;
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_temp_storage_bytes, sizeof(size_t) 1));
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_cdp_error, sizeof(cudaError_t) 1));

	// Inquire temp device storage
	void *d_temp_storage = NULL;
	size_t temp_storage_bytes = 0;
	CubDebugExit(Dispatch(backend, 1,
	d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
	d_in, d_out, num_items, num_segments, d_segment_offsets,
	reduction_op, 0, true));

	// Allocate temp device storage
	CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));

	// Run warmup/correctness iteration
	CubDebugExit(Dispatch(backend, 1,
	d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
	d_in, d_out, num_items, num_segments, d_segment_offsets,
	reduction_op, 0, true));

	// Check for correctness (and display results, if specified)
	int compare = CompareDeviceResults(h_reference, d_out, num_segments, g_verbose, g_verbose);
	printf("\t%s", compare ? "FAIL" : "PASS");

	// Flush any stdout/stderr
	fflush(stdout);
	fflush(stderr);

	// Performance
	if (g_timing_iterations > 0)
	{
	GpuTimer gpu_timer;
	gpu_timer.Start();

	CubDebugExit(Dispatch(backend, g_timing_iterations,
	d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
	d_in, d_out, num_items, num_segments, d_segment_offsets,
	reduction_op, 0, false));

	gpu_timer.Stop();
	float elapsed_millis = gpu_timer.ElapsedMillis();

	// Display performance
	float avg_millis = elapsed_millis / g_timing_iterations;
	float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
	float giga_bandwidth = giga_rate * sizeof(InputT);
	printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak",
	avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);

	}

	if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
	if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
	if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));

	// Correctness asserts
	AssertEquals(0, compare);
	}


	/// Test DeviceReduce
	template <
	Backend BACKEND,
	typename OutputValueT,
	typename HostInputIteratorT,
	typename DeviceInputIteratorT,
	typename OffsetT,
	typename OffsetIteratorT,
	typename ReductionOpT>
	void SolveAndTest(
	HostInputIteratorT h_in,
	DeviceInputIteratorT d_in,
	OffsetT num_items,
	OffsetT num_segments,
	OffsetIteratorT h_segment_offsets,
	OffsetIteratorT d_segment_offsets,
	ReductionOpT reduction_op)
	{
	typedef typename std::iterator_traits<DeviceInputIteratorT>::value_type InputValueT;
	typedef Solution<ReductionOpT, InputValueT, OutputValueT> SolutionT;
	typedef typename SolutionT::OutputT OutputT;

	printf("\n\n%s cub::DeviceReduce<%s> %d items (%s), %d segments\n",
	(BACKEND == CUB_CDP) ? "CUB_CDP" : (BACKEND == THRUST) ? "Thrust" : (BACKEND == CUB_SEGMENTED) ? "CUB_SEGMENTED" : "CUB",
	typeid(ReductionOpT).name(), num_items, typeid(HostInputIteratorT).name(), num_segments);
	fflush(stdout);

	// Allocate and solve solution
	OutputT *h_reference = new OutputT[num_segments];
	SolutionT::Solve(h_in, h_reference, num_segments, h_segment_offsets, reduction_op);

	// // Run with discard iterator
	// DiscardOutputIterator<OffsetT> discard_itr;
	// Test(Int2Type<BACKEND>(), d_in, discard_itr, num_items, num_segments, d_segment_offsets, reduction_op, h_reference);

	// Run with output data (cleared for sanity-check)
	OutputT *d_out = NULL;
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_out, sizeof(OutputT) num_segments));
	CubDebugExit(cudaMemset(d_out, 0, sizeof(OutputT) * num_segments));
	Test(Int2Type<BACKEND>(), d_in, d_out, num_items, num_segments, d_segment_offsets, reduction_op, h_reference);

	// Cleanup
	if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
	if (h_reference) delete[] h_reference;
	}


	/// Test specific problem type
	template <
	Backend BACKEND,
	typename InputT,
	typename OutputT,
	typename OffsetT,
	typename ReductionOpT>
	void TestProblem(
	OffsetT num_items,
	OffsetT num_segments,
	GenMode gen_mode,
	ReductionOpT reduction_op)
	{
	printf("\n\nInitializing %d %s->%s (gen mode %d)... ", num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout);
	fflush(stdout);

	// Initialize value data
	InputT* h_in = new InputT[num_items];
	Initialize(gen_mode, h_in, num_items);

	// Initialize segment data
	OffsetT *h_segment_offsets = new OffsetT[num_segments + 1];
	InitializeSegments(num_items, num_segments, h_segment_offsets, g_verbose_input);

	// Initialize device data
	OffsetT *d_segment_offsets = NULL;
	InputT *d_in = NULL;
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_in, sizeof(InputT) num_items));
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_segment_offsets, sizeof(OffsetT) (num_segments + 1)));
	CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice));
	CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(OffsetT) * (num_segments + 1), cudaMemcpyHostToDevice));

	SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, reduction_op);

	if (h_segment_offsets) delete[] h_segment_offsets;
	if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
	if (h_in) delete[] h_in;
	if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
	}


	/// Test different operators
	template <
	Backend BACKEND,
	typename OutputT,
	typename HostInputIteratorT,
	typename DeviceInputIteratorT,
	typename OffsetT,
	typename OffsetIteratorT>
	void TestByOp(
	HostInputIteratorT h_in,
	DeviceInputIteratorT d_in,
	OffsetT num_items,
	OffsetT num_segments,
	OffsetIteratorT h_segment_offsets,
	OffsetIteratorT d_segment_offsets)
	{
	SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, CustomMax());
	SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, Sum());
	SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, Min());
	SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, ArgMin());
	SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, Max());
	SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, ArgMax());
	}


	/// Test different backends
	template <
	typename InputT,
	typename OutputT,
	typename OffsetT>
	void TestByBackend(
	OffsetT num_items,
	OffsetT max_segments,
	GenMode gen_mode)
	{
	// Initialize host data
	printf("\n\nInitializing %d %s -> %s (gen mode %d)... ",
	num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout);

	InputT *h_in = new InputT[num_items];
	OffsetT *h_segment_offsets = new OffsetT[max_segments + 1];
	Initialize(gen_mode, h_in, num_items);

	// Initialize device data
	InputT *d_in = NULL;
	OffsetT *d_segment_offsets = NULL;
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_in, sizeof(InputT) num_items));
	CubDebugExit(g_allocator.DeviceAllocate((void*)&d_segment_offsets, sizeof(OffsetT) (max_segments + 1)));
	CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice));

	//
	// Test single-segment implementations
	//

	InitializeSegments(num_items, 1, h_segment_offsets, g_verbose_input);

	// Page-aligned-input tests
	TestByOp<CUB, OutputT>(h_in, d_in, num_items, 1, h_segment_offsets, (OffsetT*) NULL); // Host-dispatch
	#ifdef CUB_CDP
	TestByOp<CUB_CDP, OutputT>(h_in, d_in, num_items, 1, h_segment_offsets, (OffsetT*) NULL); // Device-dispatch
	#endif

	// Non-page-aligned-input tests
	if (num_items > 1)
	{
	InitializeSegments(num_items - 1, 1, h_segment_offsets, g_verbose_input);
	TestByOp<CUB, OutputT>(h_in + 1, d_in + 1, num_items - 1, 1, h_segment_offsets, (OffsetT*) NULL);
	}

	//
	// Test segmented implementation
	//

	// Right now we assign a single thread block to each segment, so lets keep it to under 128K items per segment
	int max_items_per_segment = 128000;

	for (int num_segments = (num_items + max_items_per_segment - 1) / max_items_per_segment;
	num_segments < max_segments;
	num_segments = (num_segments * 32) + 1)
	{
	// Test with segment pointer
	InitializeSegments(num_items, num_segments, h_segment_offsets, g_verbose_input);
	CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(OffsetT) * (num_segments + 1), cudaMemcpyHostToDevice));
	TestByOp<CUB_SEGMENTED, OutputT>(
	h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets);

	// Test with segment iterator
	typedef CastOp<OffsetT> IdentityOpT;
	IdentityOpT identity_op;
	TransformInputIterator<OffsetT, IdentityOpT, OffsetT*, OffsetT> h_segment_offsets_itr(
	h_segment_offsets,
	identity_op);
	TransformInputIterator<OffsetT, IdentityOpT, OffsetT*, OffsetT> d_segment_offsets_itr(
	d_segment_offsets,
	identity_op);

	TestByOp<CUB_SEGMENTED, OutputT>(
	h_in, d_in, num_items, num_segments, h_segment_offsets_itr, d_segment_offsets_itr);
	}

	if (h_in) delete[] h_in;
	if (h_segment_offsets) delete[] h_segment_offsets;
	if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
	if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
	}


	/// Test different input-generation modes
	template <
	typename InputT,
	typename OutputT,
	typename OffsetT>
	void TestByGenMode(
	OffsetT num_items,
	OffsetT max_segments)
	{
	//
	// Test pointer support using different input-generation modes
	//

	TestByBackend<InputT, OutputT>(num_items, max_segments, UNIFORM);
	TestByBackend<InputT, OutputT>(num_items, max_segments, INTEGER_SEED);
	TestByBackend<InputT, OutputT>(num_items, max_segments, RANDOM);

	//
	// Test iterator support using a constant-iterator and SUM
	//

	InputT val;
	InitValue(UNIFORM, val, 0);
	ConstantInputIterator<InputT, OffsetT> h_in(val);

	OffsetT *h_segment_offsets = new OffsetT[1 + 1];
	InitializeSegments(num_items, 1, h_segment_offsets, g_verbose_input);

	SolveAndTest<CUB, OutputT>(h_in, h_in, num_items, 1, h_segment_offsets, (OffsetT*) NULL, Sum());
	#ifdef CUB_CDP
	SolveAndTest<CUB_CDP, OutputT>(h_in, h_in, num_items, 1, h_segment_offsets, (OffsetT*) NULL, Sum());
	#endif

	if (h_segment_offsets) delete[] h_segment_offsets;
	}


	/// Test different problem sizes
	template <
	typename InputT,
	typename OutputT,
	typename OffsetT>
	struct TestBySize
	{
	OffsetT max_items;
	OffsetT max_segments;

	TestBySize(OffsetT max_items, OffsetT max_segments) :
	max_items(max_items),
	max_segments(max_segments)
	{}

	template <typename ActivePolicyT>
	cudaError_t Invoke()
	{
	//
	// Black-box testing on all backends
	//

	// Test 0, 1, many
	TestByGenMode<InputT, OutputT>(0, max_segments);
	TestByGenMode<InputT, OutputT>(1, max_segments);
	TestByGenMode<InputT, OutputT>(max_items, max_segments);

	// Test random problem sizes from a log-distribution [8, max_items-ish)
	int num_iterations = 8;
	double max_exp = log(double(max_items)) / log(double(2.0));
	for (int i = 0; i < num_iterations; ++i)
	{
	OffsetT num_items = (OffsetT) pow(2.0, RandomValue(max_exp - 3.0) + 3.0);
	TestByGenMode<InputT, OutputT>(num_items, max_segments);
	}

	//
	// White-box testing of single-segment problems around specific sizes
	//

	// Tile-boundaries: multiple blocks, one tile per block
	OffsetT tile_size = ActivePolicyT::ReducePolicy::BLOCK_THREADS * ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD;
	TestProblem<CUB, InputT, OutputT>(tile_size * 4, 1, RANDOM, Sum());
	TestProblem<CUB, InputT, OutputT>(tile_size * 4 + 1, 1, RANDOM, Sum());
	TestProblem<CUB, InputT, OutputT>(tile_size * 4 - 1, 1, RANDOM, Sum());

	// Tile-boundaries: multiple blocks, multiple tiles per block
	OffsetT sm_occupancy = 32;
	OffsetT occupancy = tile_size * sm_occupancy * g_sm_count;
	TestProblem<CUB, InputT, OutputT>(occupancy, 1, RANDOM, Sum());
	TestProblem<CUB, InputT, OutputT>(occupancy + 1, 1, RANDOM, Sum());
	TestProblem<CUB, InputT, OutputT>(occupancy - 1, 1, RANDOM, Sum());

	return cudaSuccess;
	}
	};


	/// Test problem type
	template <
	typename InputT,
	typename OutputT,
	typename OffsetT>
	void TestType(
	OffsetT max_items,
	OffsetT max_segments)
	{
	typedef typename DeviceReducePolicy<InputT, OutputT, OffsetT, cub::Sum>::MaxPolicy MaxPolicyT;

	TestBySize<InputT, OutputT, OffsetT> dispatch(max_items, max_segments);

	MaxPolicyT::Invoke(g_ptx_version, dispatch);
	}


	//---------------------------------------------------------------------
	// Main
	//---------------------------------------------------------------------


	/**
	* Main
	*/
	int main(int argc, char** argv)
	{
	typedef int OffsetT;

	OffsetT max_items = 27000000;
	OffsetT max_segments = 34000;

	// Initialize command line
	CommandLineArgs args(argc, argv);
	g_verbose = args.CheckCmdLineFlag("v");
	g_verbose_input = args.CheckCmdLineFlag("v2");
	args.GetCmdLineArgument("n", max_items);
	args.GetCmdLineArgument("s", max_segments);
	args.GetCmdLineArgument("i", g_timing_iterations);
	args.GetCmdLineArgument("repeat", g_repeat);

	// Print usage
	if (args.CheckCmdLineFlag("help"))
	{
	printf("%s "
	"[--n=<input items> "
	"[--s=<num segments> "
	"[--i=<timing iterations> "
	"[--device=<device-id>] "
	"[--repeat=<repetitions of entire test suite>]"
	"[--v] "
	"[--cdp]"
	"\n", argv[0]);
	exit(0);
	}

	// Initialize device
	CubDebugExit(args.DeviceInit());
	g_device_giga_bandwidth = args.device_giga_bandwidth;

	// Get ptx version
	CubDebugExit(PtxVersion(g_ptx_version));

	// Get SM count
	g_sm_count = args.deviceProp.multiProcessorCount;

	#ifdef QUICKER_TEST

	// Compile/run basic test


	TestProblem<CUB, char, int>( max_items, 1, RANDOM_BIT, Sum());
	TestProblem<CUB, short, int>( max_items, 1, RANDOM_BIT, Sum());

	printf("\n-------------------------------\n");

	TestProblem<CUB, int, int>( max_items, 1, RANDOM_BIT, Sum());
	TestProblem<CUB, long long, long long>( max_items, 1, RANDOM_BIT, Sum());

	printf("\n-------------------------------\n");

	TestProblem<CUB, float, float>( max_items, 1, RANDOM_BIT, Sum());
	TestProblem<CUB, double, double>( max_items, 1, RANDOM_BIT, Sum());

	printf("\n-------------------------------\n");

	TestProblem<CUB_SEGMENTED, int, int>(max_items, max_segments, RANDOM_BIT, Sum());


	#elif defined(QUICK_TEST)

	// Compile/run quick comparison tests

	TestProblem<CUB, char, char>( max_items * 4, 1, UNIFORM, Sum());
	TestProblem<THRUST, char, char>( max_items * 4, 1, UNIFORM, Sum());

	printf("\n----------------------------\n");
	TestProblem<CUB, short, short>( max_items * 2, 1, UNIFORM, Sum());
	TestProblem<THRUST, short, short>( max_items * 2, 1, UNIFORM, Sum());

	printf("\n----------------------------\n");
	TestProblem<CUB, int, int>( max_items, 1, UNIFORM, Sum());
	TestProblem<THRUST, int, int>( max_items, 1, UNIFORM, Sum());

	printf("\n----------------------------\n");
	TestProblem<CUB, long long, long long>( max_items / 2, 1, UNIFORM, Sum());
	TestProblem<THRUST, long long, long long>( max_items / 2, 1, UNIFORM, Sum());

	printf("\n----------------------------\n");
	TestProblem<CUB, TestFoo, TestFoo>( max_items / 4, 1, UNIFORM, Max());
	TestProblem<THRUST, TestFoo, TestFoo>( max_items / 4, 1, UNIFORM, Max());

	#else

	// Compile/run thorough tests
	for (int i = 0; i <= g_repeat; ++i)
	{
	// Test different input types
	TestType<char, char>(max_items, max_segments);

	TestType<unsigned char, unsigned char>(max_items, max_segments);

	TestType<char, int>(max_items, max_segments);

	TestType<short, short>(max_items, max_segments);
	TestType<int, int>(max_items, max_segments);
	TestType<long, long>(max_items, max_segments);
	TestType<long long, long long>(max_items, max_segments);

	TestType<uchar2, uchar2>(max_items, max_segments);
	TestType<uint2, uint2>(max_items, max_segments);
	TestType<ulonglong2, ulonglong2>(max_items, max_segments);
	TestType<ulonglong4, ulonglong4>(max_items, max_segments);

	TestType<TestFoo, TestFoo>(max_items, max_segments);
	TestType<TestBar, TestBar>(max_items, max_segments);
	}

	#endif


	printf("\n");
	return 0;
	}