Spaces:
Runtime error
Runtime error
/****************************************************************************** | |
* Copyright (c) 2011, Duane Merrill. All rights reserved. | |
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. | |
* | |
* Redistribution and use in source and binary forms, with or without | |
* modification, are permitted provided that the following conditions are met: | |
* * Redistributions of source code must retain the above copyright | |
* notice, this list of conditions and the following disclaimer. | |
* * Redistributions in binary form must reproduce the above copyright | |
* notice, this list of conditions and the following disclaimer in the | |
* documentation and/or other materials provided with the distribution. | |
* * Neither the name of the NVIDIA CORPORATION nor the | |
* names of its contributors may be used to endorse or promote products | |
* derived from this software without specific prior written permission. | |
* | |
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | |
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY | |
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | |
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND | |
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
* | |
******************************************************************************/ | |
/****************************************************************************** | |
* Test of DeviceReduce utilities | |
******************************************************************************/ | |
// Ensure printing of CUDA runtime errors to console | |
#define CUB_STDERR | |
#include <stdio.h> | |
#include <limits> | |
#include <typeinfo> | |
#include <thrust/device_ptr.h> | |
#include <thrust/reduce.h> | |
#include <cub/util_allocator.cuh> | |
#include <cub/device/device_reduce.cuh> | |
#include <cub/device/device_segmented_reduce.cuh> | |
#include <cub/iterator/constant_input_iterator.cuh> | |
#include <cub/iterator/discard_output_iterator.cuh> | |
#include <cub/iterator/transform_input_iterator.cuh> | |
#include "test_util.h" | |
using namespace cub; | |
//--------------------------------------------------------------------- | |
// Globals, constants and typedefs | |
//--------------------------------------------------------------------- | |
int g_ptx_version; | |
int g_sm_count; | |
double g_device_giga_bandwidth; | |
bool g_verbose = false; | |
bool g_verbose_input = false; | |
int g_timing_iterations = 0; | |
int g_repeat = 0; | |
CachingDeviceAllocator g_allocator(true); | |
// Dispatch types | |
enum Backend | |
{ | |
CUB, // CUB method | |
CUB_SEGMENTED, // CUB segmented method | |
CUB_CDP, // GPU-based (dynamic parallelism) dispatch to CUB method | |
THRUST, // Thrust method | |
}; | |
// Custom max functor | |
struct CustomMax | |
{ | |
/// Boolean max operator, returns <tt>(a > b) ? a : b</tt> | |
template <typename OutputT> | |
__host__ __device__ __forceinline__ OutputT operator()(const OutputT &a, const OutputT &b) | |
{ | |
return CUB_MAX(a, b); | |
} | |
}; | |
//--------------------------------------------------------------------- | |
// Dispatch to different CUB DeviceReduce entrypoints | |
//--------------------------------------------------------------------- | |
/** | |
* Dispatch to reduce entrypoint (custom-max) | |
*/ | |
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT, typename ReductionOpT> | |
CUB_RUNTIME_FUNCTION __forceinline__ | |
cudaError_t Dispatch( | |
Int2Type<CUB> /*dispatch_to*/, | |
int timing_iterations, | |
size_t */*d_temp_storage_bytes*/, | |
cudaError_t */*d_cdp_error*/, | |
void* d_temp_storage, | |
size_t& temp_storage_bytes, | |
InputIteratorT d_in, | |
OutputIteratorT d_out, | |
int num_items, | |
int /*max_segments*/, | |
OffsetIteratorT /*d_segment_offsets*/, | |
ReductionOpT reduction_op, | |
cudaStream_t stream, | |
bool debug_synchronous) | |
{ | |
typedef typename std::iterator_traits<InputIteratorT>::value_type InputT; | |
// The output value type | |
typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? | |
typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type, | |
typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type | |
// Max-identity | |
OutputT identity = Traits<InputT>::Lowest(); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent | |
// Invoke kernel to device reduction directly | |
cudaError_t error = cudaSuccess; | |
for (int i = 0; i < timing_iterations; ++i) | |
{ | |
error = DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, | |
d_in, d_out, num_items, reduction_op, identity, | |
stream, debug_synchronous); | |
} | |
return error; | |
} | |
/** | |
* Dispatch to sum entrypoint | |
*/ | |
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT> | |
CUB_RUNTIME_FUNCTION __forceinline__ | |
cudaError_t Dispatch( | |
Int2Type<CUB> /*dispatch_to*/, | |
int timing_iterations, | |
size_t */*d_temp_storage_bytes*/, | |
cudaError_t */*d_cdp_error*/, | |
void* d_temp_storage, | |
size_t& temp_storage_bytes, | |
InputIteratorT d_in, | |
OutputIteratorT d_out, | |
int num_items, | |
int /*max_segments*/, | |
OffsetIteratorT /*d_segment_offsets*/, | |
cub::Sum /*reduction_op*/, | |
cudaStream_t stream, | |
bool debug_synchronous) | |
{ | |
// Invoke kernel to device reduction directly | |
cudaError_t error = cudaSuccess; | |
for (int i = 0; i < timing_iterations; ++i) | |
{ | |
error = DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous); | |
} | |
return error; | |
} | |
/** | |
* Dispatch to min entrypoint | |
*/ | |
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT> | |
CUB_RUNTIME_FUNCTION __forceinline__ | |
cudaError_t Dispatch( | |
Int2Type<CUB> /*dispatch_to*/, | |
int timing_iterations, | |
size_t */*d_temp_storage_bytes*/, | |
cudaError_t */*d_cdp_error*/, | |
void* d_temp_storage, | |
size_t& temp_storage_bytes, | |
InputIteratorT d_in, | |
OutputIteratorT d_out, | |
int num_items, | |
int /*max_segments*/, | |
OffsetIteratorT /*d_segment_offsets*/, | |
cub::Min /*reduction_op*/, | |
cudaStream_t stream, | |
bool debug_synchronous) | |
{ | |
// Invoke kernel to device reduction directly | |
cudaError_t error = cudaSuccess; | |
for (int i = 0; i < timing_iterations; ++i) | |
{ | |
error = DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous); | |
} | |
return error; | |
} | |
/** | |
* Dispatch to max entrypoint | |
*/ | |
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT> | |
CUB_RUNTIME_FUNCTION __forceinline__ | |
cudaError_t Dispatch( | |
Int2Type<CUB> /*dispatch_to*/, | |
int timing_iterations, | |
size_t */*d_temp_storage_bytes*/, | |
cudaError_t */*d_cdp_error*/, | |
void* d_temp_storage, | |
size_t& temp_storage_bytes, | |
InputIteratorT d_in, | |
OutputIteratorT d_out, | |
int num_items, | |
int /*max_segments*/, | |
OffsetIteratorT /*d_segment_offsets*/, | |
cub::Max /*reduction_op*/, | |
cudaStream_t stream, | |
bool debug_synchronous) | |
{ | |
// Invoke kernel to device reduction directly | |
cudaError_t error = cudaSuccess; | |
for (int i = 0; i < timing_iterations; ++i) | |
{ | |
error = DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous); | |
} | |
return error; | |
} | |
/** | |
* Dispatch to argmin entrypoint | |
*/ | |
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT> | |
CUB_RUNTIME_FUNCTION __forceinline__ | |
cudaError_t Dispatch( | |
Int2Type<CUB> /*dispatch_to*/, | |
int timing_iterations, | |
size_t */*d_temp_storage_bytes*/, | |
cudaError_t */*d_cdp_error*/, | |
void* d_temp_storage, | |
size_t& temp_storage_bytes, | |
InputIteratorT d_in, | |
OutputIteratorT d_out, | |
int num_items, | |
int /*max_segments*/, | |
OffsetIteratorT /*d_segment_offsets*/, | |
cub::ArgMin /*reduction_op*/, | |
cudaStream_t stream, | |
bool debug_synchronous) | |
{ | |
// Invoke kernel to device reduction directly | |
cudaError_t error = cudaSuccess; | |
for (int i = 0; i < timing_iterations; ++i) | |
{ | |
error = DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous); | |
} | |
return error; | |
} | |
/** | |
* Dispatch to argmax entrypoint | |
*/ | |
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT> | |
CUB_RUNTIME_FUNCTION __forceinline__ | |
cudaError_t Dispatch( | |
Int2Type<CUB> /*dispatch_to*/, | |
int timing_iterations, | |
size_t */*d_temp_storage_bytes*/, | |
cudaError_t */*d_cdp_error*/, | |
void* d_temp_storage, | |
size_t& temp_storage_bytes, | |
InputIteratorT d_in, | |
OutputIteratorT d_out, | |
int num_items, | |
int /*max_segments*/, | |
OffsetIteratorT /*d_segment_offsets*/, | |
cub::ArgMax /*reduction_op*/, | |
cudaStream_t stream, | |
bool debug_synchronous) | |
{ | |
// Invoke kernel to device reduction directly | |
cudaError_t error = cudaSuccess; | |
for (int i = 0; i < timing_iterations; ++i) | |
{ | |
error = DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous); | |
} | |
return error; | |
} | |
//--------------------------------------------------------------------- | |
// Dispatch to different CUB DeviceSegmentedReduce entrypoints | |
//--------------------------------------------------------------------- | |
/** | |
* Dispatch to reduce entrypoint (custom-max) | |
*/ | |
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT, typename ReductionOpT> | |
CUB_RUNTIME_FUNCTION __forceinline__ | |
cudaError_t Dispatch( | |
Int2Type<CUB_SEGMENTED> /*dispatch_to*/, | |
int timing_iterations, | |
size_t */*d_temp_storage_bytes*/, | |
cudaError_t */*d_cdp_error*/, | |
void* d_temp_storage, | |
size_t& temp_storage_bytes, | |
InputIteratorT d_in, | |
OutputIteratorT d_out, | |
int /*num_items*/, | |
int max_segments, | |
OffsetIteratorT d_segment_offsets, | |
ReductionOpT reduction_op, | |
cudaStream_t stream, | |
bool debug_synchronous) | |
{ | |
// The input value type | |
typedef typename std::iterator_traits<InputIteratorT>::value_type InputT; | |
// The output value type | |
typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? | |
typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type, | |
typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type | |
// Max-identity | |
OutputT identity = Traits<InputT>::Lowest(); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent | |
// Invoke kernel to device reduction directly | |
cudaError_t error = cudaSuccess; | |
for (int i = 0; i < timing_iterations; ++i) | |
{ | |
error = DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, | |
d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1, reduction_op, identity, | |
stream, debug_synchronous); | |
} | |
return error; | |
} | |
/** | |
* Dispatch to sum entrypoint | |
*/ | |
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT> | |
CUB_RUNTIME_FUNCTION __forceinline__ | |
cudaError_t Dispatch( | |
Int2Type<CUB_SEGMENTED> /*dispatch_to*/, | |
int timing_iterations, | |
size_t */*d_temp_storage_bytes*/, | |
cudaError_t */*d_cdp_error*/, | |
void* d_temp_storage, | |
size_t& temp_storage_bytes, | |
InputIteratorT d_in, | |
OutputIteratorT d_out, | |
int /*num_items*/, | |
int max_segments, | |
OffsetIteratorT d_segment_offsets, | |
cub::Sum /*reduction_op*/, | |
cudaStream_t stream, | |
bool debug_synchronous) | |
{ | |
// Invoke kernel to device reduction directly | |
cudaError_t error = cudaSuccess; | |
for (int i = 0; i < timing_iterations; ++i) | |
{ | |
error = DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, | |
d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1, | |
stream, debug_synchronous); | |
} | |
return error; | |
} | |
/** | |
* Dispatch to min entrypoint | |
*/ | |
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT> | |
CUB_RUNTIME_FUNCTION __forceinline__ | |
cudaError_t Dispatch( | |
Int2Type<CUB_SEGMENTED> /*dispatch_to*/, | |
int timing_iterations, | |
size_t */*d_temp_storage_bytes*/, | |
cudaError_t */*d_cdp_error*/, | |
void* d_temp_storage, | |
size_t& temp_storage_bytes, | |
InputIteratorT d_in, | |
OutputIteratorT d_out, | |
int /*num_items*/, | |
int max_segments, | |
OffsetIteratorT d_segment_offsets, | |
cub::Min /*reduction_op*/, | |
cudaStream_t stream, | |
bool debug_synchronous) | |
{ | |
// Invoke kernel to device reduction directly | |
cudaError_t error = cudaSuccess; | |
for (int i = 0; i < timing_iterations; ++i) | |
{ | |
error = DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, | |
d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1, | |
stream, debug_synchronous); | |
} | |
return error; | |
} | |
/** | |
* Dispatch to max entrypoint | |
*/ | |
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT> | |
CUB_RUNTIME_FUNCTION __forceinline__ | |
cudaError_t Dispatch( | |
Int2Type<CUB_SEGMENTED> /*dispatch_to*/, | |
int timing_iterations, | |
size_t */*d_temp_storage_bytes*/, | |
cudaError_t */*d_cdp_error*/, | |
void* d_temp_storage, | |
size_t& temp_storage_bytes, | |
InputIteratorT d_in, | |
OutputIteratorT d_out, | |
int /*num_items*/, | |
int max_segments, | |
OffsetIteratorT d_segment_offsets, | |
cub::Max /*reduction_op*/, | |
cudaStream_t stream, | |
bool debug_synchronous) | |
{ | |
// Invoke kernel to device reduction directly | |
cudaError_t error = cudaSuccess; | |
for (int i = 0; i < timing_iterations; ++i) | |
{ | |
error = DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, | |
d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1, | |
stream, debug_synchronous); | |
} | |
return error; | |
} | |
/** | |
* Dispatch to argmin entrypoint | |
*/ | |
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT> | |
CUB_RUNTIME_FUNCTION __forceinline__ | |
cudaError_t Dispatch( | |
Int2Type<CUB_SEGMENTED> /*dispatch_to*/, | |
int timing_iterations, | |
size_t */*d_temp_storage_bytes*/, | |
cudaError_t */*d_cdp_error*/, | |
void* d_temp_storage, | |
size_t& temp_storage_bytes, | |
InputIteratorT d_in, | |
OutputIteratorT d_out, | |
int /*num_items*/, | |
int max_segments, | |
OffsetIteratorT d_segment_offsets, | |
cub::ArgMin /*reduction_op*/, | |
cudaStream_t stream, | |
bool debug_synchronous) | |
{ | |
// Invoke kernel to device reduction directly | |
cudaError_t error = cudaSuccess; | |
for (int i = 0; i < timing_iterations; ++i) | |
{ | |
error = DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, | |
d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1, | |
stream, debug_synchronous); | |
} | |
return error; | |
} | |
/** | |
* Dispatch to argmax entrypoint | |
*/ | |
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT> | |
CUB_RUNTIME_FUNCTION __forceinline__ | |
cudaError_t Dispatch( | |
Int2Type<CUB_SEGMENTED> /*dispatch_to*/, | |
int timing_iterations, | |
size_t */*d_temp_storage_bytes*/, | |
cudaError_t */*d_cdp_error*/, | |
void* d_temp_storage, | |
size_t& temp_storage_bytes, | |
InputIteratorT d_in, | |
OutputIteratorT d_out, | |
int /*num_items*/, | |
int max_segments, | |
OffsetIteratorT d_segment_offsets, | |
cub::ArgMax /*reduction_op*/, | |
cudaStream_t stream, | |
bool debug_synchronous) | |
{ | |
// Invoke kernel to device reduction directly | |
cudaError_t error = cudaSuccess; | |
for (int i = 0; i < timing_iterations; ++i) | |
{ | |
error = DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, | |
d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1, | |
stream, debug_synchronous); | |
} | |
return error; | |
} | |
//--------------------------------------------------------------------- | |
// Dispatch to different Thrust entrypoints | |
//--------------------------------------------------------------------- | |
/** | |
* Dispatch to reduction entrypoint (min or max specialization) | |
*/ | |
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT, typename ReductionOpT> | |
cudaError_t Dispatch( | |
Int2Type<THRUST> /*dispatch_to*/, | |
int timing_iterations, | |
size_t */*d_temp_storage_bytes*/, | |
cudaError_t */*d_cdp_error*/, | |
void* d_temp_storage, | |
size_t& temp_storage_bytes, | |
InputIteratorT d_in, | |
OutputIteratorT d_out, | |
int num_items, | |
int /*max_segments*/, | |
OffsetIteratorT /*d_segment_offsets*/, | |
ReductionOpT reduction_op, | |
cudaStream_t /*stream*/, | |
bool /*debug_synchronous*/) | |
{ | |
// The output value type | |
typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? | |
typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type, | |
typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type | |
if (d_temp_storage == 0) | |
{ | |
temp_storage_bytes = 1; | |
} | |
else | |
{ | |
OutputT init; | |
CubDebugExit(cudaMemcpy(&init, d_in + 0, sizeof(OutputT), cudaMemcpyDeviceToHost)); | |
thrust::device_ptr<OutputT> d_in_wrapper(d_in); | |
OutputT retval; | |
for (int i = 0; i < timing_iterations; ++i) | |
{ | |
retval = thrust::reduce(d_in_wrapper, d_in_wrapper + num_items, init, reduction_op); | |
} | |
if (!Equals<OutputIteratorT, DiscardOutputIterator<int> >::VALUE) | |
CubDebugExit(cudaMemcpy(d_out, &retval, sizeof(OutputT), cudaMemcpyHostToDevice)); | |
} | |
return cudaSuccess; | |
} | |
/** | |
* Dispatch to reduction entrypoint (sum specialization) | |
*/ | |
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT> | |
cudaError_t Dispatch( | |
Int2Type<THRUST> /*dispatch_to*/, | |
int timing_iterations, | |
size_t */*d_temp_storage_bytes*/, | |
cudaError_t */*d_cdp_error*/, | |
void* d_temp_storage, | |
size_t& temp_storage_bytes, | |
InputIteratorT d_in, | |
OutputIteratorT d_out, | |
int num_items, | |
int /*max_segments*/, | |
OffsetIteratorT /*d_segment_offsets*/, | |
Sum /*reduction_op*/, | |
cudaStream_t /*stream*/, | |
bool /*debug_synchronous*/) | |
{ | |
// The output value type | |
typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? | |
typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type, | |
typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type | |
if (d_temp_storage == 0) | |
{ | |
temp_storage_bytes = 1; | |
} | |
else | |
{ | |
thrust::device_ptr<OutputT> d_in_wrapper(d_in); | |
OutputT retval; | |
for (int i = 0; i < timing_iterations; ++i) | |
{ | |
retval = thrust::reduce(d_in_wrapper, d_in_wrapper + num_items); | |
} | |
if (!Equals<OutputIteratorT, DiscardOutputIterator<int> >::VALUE) | |
CubDebugExit(cudaMemcpy(d_out, &retval, sizeof(OutputT), cudaMemcpyHostToDevice)); | |
} | |
return cudaSuccess; | |
} | |
//--------------------------------------------------------------------- | |
// CUDA nested-parallelism test kernel | |
//--------------------------------------------------------------------- | |
/** | |
* Simple wrapper kernel to invoke DeviceReduce | |
*/ | |
template < | |
typename InputIteratorT, | |
typename OutputIteratorT, | |
typename OffsetIteratorT, | |
typename ReductionOpT> | |
__global__ void CnpDispatchKernel( | |
int timing_iterations, | |
size_t *d_temp_storage_bytes, | |
cudaError_t *d_cdp_error, | |
void* d_temp_storage, | |
size_t temp_storage_bytes, | |
InputIteratorT d_in, | |
OutputIteratorT d_out, | |
int num_items, | |
int max_segments, | |
OffsetIteratorT d_segment_offsets, | |
ReductionOpT reduction_op, | |
bool debug_synchronous) | |
{ | |
#ifndef CUB_CDP | |
(void)timing_iterations; | |
(void)d_temp_storage_bytes; | |
(void)d_cdp_error; | |
(void)d_temp_storage; | |
(void)temp_storage_bytes; | |
(void)d_in; | |
(void)d_out; | |
(void)num_items; | |
(void)max_segments; | |
(void)d_segment_offsets; | |
(void)reduction_op; | |
(void)debug_synchronous; | |
*d_cdp_error = cudaErrorNotSupported; | |
#else | |
*d_cdp_error = Dispatch(Int2Type<CUB>(), timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, | |
d_in, d_out, num_items, max_segments, d_segment_offsets, reduction_op, 0, debug_synchronous); | |
*d_temp_storage_bytes = temp_storage_bytes; | |
#endif | |
} | |
/** | |
* Dispatch to CUB_CDP kernel | |
*/ | |
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT, typename ReductionOpT> | |
CUB_RUNTIME_FUNCTION __forceinline__ | |
cudaError_t Dispatch( | |
Int2Type<CUB_CDP> dispatch_to, | |
int timing_iterations, | |
size_t *d_temp_storage_bytes, | |
cudaError_t *d_cdp_error, | |
void* d_temp_storage, | |
size_t& temp_storage_bytes, | |
InputIteratorT d_in, | |
OutputIteratorT d_out, | |
int num_items, | |
int max_segments, | |
OffsetIteratorT d_segment_offsets, | |
ReductionOpT reduction_op, | |
cudaStream_t stream, | |
bool debug_synchronous) | |
{ | |
// Invoke kernel to invoke device-side dispatch | |
CnpDispatchKernel<<<1,1>>>(timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, | |
d_in, d_out, num_items, max_segments, d_segment_offsets, reduction_op, debug_synchronous); | |
// Copy out temp_storage_bytes | |
CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost)); | |
// Copy out error | |
cudaError_t retval; | |
CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost)); | |
return retval; | |
} | |
//--------------------------------------------------------------------- | |
// Problem generation | |
//--------------------------------------------------------------------- | |
/// Initialize problem | |
template <typename InputT> | |
void Initialize( | |
GenMode gen_mode, | |
InputT *h_in, | |
int num_items) | |
{ | |
for (int i = 0; i < num_items; ++i) | |
{ | |
InitValue(gen_mode, h_in[i], i); | |
} | |
if (g_verbose_input) | |
{ | |
printf("Input:\n"); | |
DisplayResults(h_in, num_items); | |
printf("\n\n"); | |
} | |
} | |
/// Solve problem (max/custom-max functor) | |
template <typename ReductionOpT, typename InputT, typename _OutputT> | |
struct Solution | |
{ | |
typedef _OutputT OutputT; | |
template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT> | |
static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets, | |
ReductionOpT reduction_op) | |
{ | |
for (int i = 0; i < num_segments; ++i) | |
{ | |
OutputT aggregate = Traits<InputT>::Lowest(); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent | |
for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j) | |
aggregate = reduction_op(aggregate, OutputT(h_in[j])); | |
h_reference[i] = aggregate; | |
} | |
} | |
}; | |
/// Solve problem (min functor) | |
template <typename InputT, typename _OutputT> | |
struct Solution<cub::Min, InputT, _OutputT> | |
{ | |
typedef _OutputT OutputT; | |
template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT> | |
static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets, | |
cub::Min reduction_op) | |
{ | |
for (int i = 0; i < num_segments; ++i) | |
{ | |
OutputT aggregate = Traits<InputT>::Max(); // replace with std::numeric_limits<OutputT>::max() when C++ support is more prevalent | |
for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j) | |
aggregate = reduction_op(aggregate, OutputT(h_in[j])); | |
h_reference[i] = aggregate; | |
} | |
} | |
}; | |
/// Solve problem (sum functor) | |
template <typename InputT, typename _OutputT> | |
struct Solution<cub::Sum, InputT, _OutputT> | |
{ | |
typedef _OutputT OutputT; | |
template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT> | |
static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets, | |
cub::Sum reduction_op) | |
{ | |
for (int i = 0; i < num_segments; ++i) | |
{ | |
OutputT aggregate; | |
InitValue(INTEGER_SEED, aggregate, 0); | |
for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j) | |
aggregate = reduction_op(aggregate, OutputT(h_in[j])); | |
h_reference[i] = aggregate; | |
} | |
} | |
}; | |
/// Solve problem (argmin functor) | |
template <typename InputValueT, typename OutputValueT> | |
struct Solution<cub::ArgMin, InputValueT, OutputValueT> | |
{ | |
typedef KeyValuePair<int, OutputValueT> OutputT; | |
template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT> | |
static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets, | |
cub::ArgMin reduction_op) | |
{ | |
for (int i = 0; i < num_segments; ++i) | |
{ | |
OutputT aggregate(1, Traits<InputValueT>::Max()); // replace with std::numeric_limits<OutputT>::max() when C++ support is more prevalent | |
for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j) | |
{ | |
OutputT item(j - h_segment_offsets[i], OutputValueT(h_in[j])); | |
aggregate = reduction_op(aggregate, item); | |
} | |
h_reference[i] = aggregate; | |
} | |
} | |
}; | |
/// Solve problem (argmax functor) | |
template <typename InputValueT, typename OutputValueT> | |
struct Solution<cub::ArgMax, InputValueT, OutputValueT> | |
{ | |
typedef KeyValuePair<int, OutputValueT> OutputT; | |
template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT> | |
static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets, | |
cub::ArgMax reduction_op) | |
{ | |
for (int i = 0; i < num_segments; ++i) | |
{ | |
OutputT aggregate(1, Traits<InputValueT>::Lowest()); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent | |
for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j) | |
{ | |
OutputT item(j - h_segment_offsets[i], OutputValueT(h_in[j])); | |
aggregate = reduction_op(aggregate, item); | |
} | |
h_reference[i] = aggregate; | |
} | |
} | |
}; | |
//--------------------------------------------------------------------- | |
// Problem generation | |
//--------------------------------------------------------------------- | |
/// Test DeviceReduce for a given problem input | |
template < | |
typename BackendT, | |
typename DeviceInputIteratorT, | |
typename DeviceOutputIteratorT, | |
typename HostReferenceIteratorT, | |
typename OffsetT, | |
typename OffsetIteratorT, | |
typename ReductionOpT> | |
void Test( | |
BackendT backend, | |
DeviceInputIteratorT d_in, | |
DeviceOutputIteratorT d_out, | |
OffsetT num_items, | |
OffsetT num_segments, | |
OffsetIteratorT d_segment_offsets, | |
ReductionOpT reduction_op, | |
HostReferenceIteratorT h_reference) | |
{ | |
// Input data types | |
typedef typename std::iterator_traits<DeviceInputIteratorT>::value_type InputT; | |
// Allocate CUB_CDP device arrays for temp storage size and error | |
size_t *d_temp_storage_bytes = NULL; | |
cudaError_t *d_cdp_error = NULL; | |
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1)); | |
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1)); | |
// Inquire temp device storage | |
void *d_temp_storage = NULL; | |
size_t temp_storage_bytes = 0; | |
CubDebugExit(Dispatch(backend, 1, | |
d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, | |
d_in, d_out, num_items, num_segments, d_segment_offsets, | |
reduction_op, 0, true)); | |
// Allocate temp device storage | |
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); | |
// Run warmup/correctness iteration | |
CubDebugExit(Dispatch(backend, 1, | |
d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, | |
d_in, d_out, num_items, num_segments, d_segment_offsets, | |
reduction_op, 0, true)); | |
// Check for correctness (and display results, if specified) | |
int compare = CompareDeviceResults(h_reference, d_out, num_segments, g_verbose, g_verbose); | |
printf("\t%s", compare ? "FAIL" : "PASS"); | |
// Flush any stdout/stderr | |
fflush(stdout); | |
fflush(stderr); | |
// Performance | |
if (g_timing_iterations > 0) | |
{ | |
GpuTimer gpu_timer; | |
gpu_timer.Start(); | |
CubDebugExit(Dispatch(backend, g_timing_iterations, | |
d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, | |
d_in, d_out, num_items, num_segments, d_segment_offsets, | |
reduction_op, 0, false)); | |
gpu_timer.Stop(); | |
float elapsed_millis = gpu_timer.ElapsedMillis(); | |
// Display performance | |
float avg_millis = elapsed_millis / g_timing_iterations; | |
float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f; | |
float giga_bandwidth = giga_rate * sizeof(InputT); | |
printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", | |
avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0); | |
} | |
if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes)); | |
if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error)); | |
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); | |
// Correctness asserts | |
AssertEquals(0, compare); | |
} | |
/// Test DeviceReduce | |
template < | |
Backend BACKEND, | |
typename OutputValueT, | |
typename HostInputIteratorT, | |
typename DeviceInputIteratorT, | |
typename OffsetT, | |
typename OffsetIteratorT, | |
typename ReductionOpT> | |
void SolveAndTest( | |
HostInputIteratorT h_in, | |
DeviceInputIteratorT d_in, | |
OffsetT num_items, | |
OffsetT num_segments, | |
OffsetIteratorT h_segment_offsets, | |
OffsetIteratorT d_segment_offsets, | |
ReductionOpT reduction_op) | |
{ | |
typedef typename std::iterator_traits<DeviceInputIteratorT>::value_type InputValueT; | |
typedef Solution<ReductionOpT, InputValueT, OutputValueT> SolutionT; | |
typedef typename SolutionT::OutputT OutputT; | |
printf("\n\n%s cub::DeviceReduce<%s> %d items (%s), %d segments\n", | |
(BACKEND == CUB_CDP) ? "CUB_CDP" : (BACKEND == THRUST) ? "Thrust" : (BACKEND == CUB_SEGMENTED) ? "CUB_SEGMENTED" : "CUB", | |
typeid(ReductionOpT).name(), num_items, typeid(HostInputIteratorT).name(), num_segments); | |
fflush(stdout); | |
// Allocate and solve solution | |
OutputT *h_reference = new OutputT[num_segments]; | |
SolutionT::Solve(h_in, h_reference, num_segments, h_segment_offsets, reduction_op); | |
// // Run with discard iterator | |
// DiscardOutputIterator<OffsetT> discard_itr; | |
// Test(Int2Type<BACKEND>(), d_in, discard_itr, num_items, num_segments, d_segment_offsets, reduction_op, h_reference); | |
// Run with output data (cleared for sanity-check) | |
OutputT *d_out = NULL; | |
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(OutputT) * num_segments)); | |
CubDebugExit(cudaMemset(d_out, 0, sizeof(OutputT) * num_segments)); | |
Test(Int2Type<BACKEND>(), d_in, d_out, num_items, num_segments, d_segment_offsets, reduction_op, h_reference); | |
// Cleanup | |
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out)); | |
if (h_reference) delete[] h_reference; | |
} | |
/// Test specific problem type | |
template < | |
Backend BACKEND, | |
typename InputT, | |
typename OutputT, | |
typename OffsetT, | |
typename ReductionOpT> | |
void TestProblem( | |
OffsetT num_items, | |
OffsetT num_segments, | |
GenMode gen_mode, | |
ReductionOpT reduction_op) | |
{ | |
printf("\n\nInitializing %d %s->%s (gen mode %d)... ", num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout); | |
fflush(stdout); | |
// Initialize value data | |
InputT* h_in = new InputT[num_items]; | |
Initialize(gen_mode, h_in, num_items); | |
// Initialize segment data | |
OffsetT *h_segment_offsets = new OffsetT[num_segments + 1]; | |
InitializeSegments(num_items, num_segments, h_segment_offsets, g_verbose_input); | |
// Initialize device data | |
OffsetT *d_segment_offsets = NULL; | |
InputT *d_in = NULL; | |
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(InputT) * num_items)); | |
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(OffsetT) * (num_segments + 1))); | |
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice)); | |
CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(OffsetT) * (num_segments + 1), cudaMemcpyHostToDevice)); | |
SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, reduction_op); | |
if (h_segment_offsets) delete[] h_segment_offsets; | |
if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets)); | |
if (h_in) delete[] h_in; | |
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); | |
} | |
/// Test different operators | |
template < | |
Backend BACKEND, | |
typename OutputT, | |
typename HostInputIteratorT, | |
typename DeviceInputIteratorT, | |
typename OffsetT, | |
typename OffsetIteratorT> | |
void TestByOp( | |
HostInputIteratorT h_in, | |
DeviceInputIteratorT d_in, | |
OffsetT num_items, | |
OffsetT num_segments, | |
OffsetIteratorT h_segment_offsets, | |
OffsetIteratorT d_segment_offsets) | |
{ | |
SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, CustomMax()); | |
SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, Sum()); | |
SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, Min()); | |
SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, ArgMin()); | |
SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, Max()); | |
SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, ArgMax()); | |
} | |
/// Test different backends | |
template < | |
typename InputT, | |
typename OutputT, | |
typename OffsetT> | |
void TestByBackend( | |
OffsetT num_items, | |
OffsetT max_segments, | |
GenMode gen_mode) | |
{ | |
// Initialize host data | |
printf("\n\nInitializing %d %s -> %s (gen mode %d)... ", | |
num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout); | |
InputT *h_in = new InputT[num_items]; | |
OffsetT *h_segment_offsets = new OffsetT[max_segments + 1]; | |
Initialize(gen_mode, h_in, num_items); | |
// Initialize device data | |
InputT *d_in = NULL; | |
OffsetT *d_segment_offsets = NULL; | |
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(InputT) * num_items)); | |
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(OffsetT) * (max_segments + 1))); | |
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice)); | |
// | |
// Test single-segment implementations | |
// | |
InitializeSegments(num_items, 1, h_segment_offsets, g_verbose_input); | |
// Page-aligned-input tests | |
TestByOp<CUB, OutputT>(h_in, d_in, num_items, 1, h_segment_offsets, (OffsetT*) NULL); // Host-dispatch | |
#ifdef CUB_CDP | |
TestByOp<CUB_CDP, OutputT>(h_in, d_in, num_items, 1, h_segment_offsets, (OffsetT*) NULL); // Device-dispatch | |
#endif | |
// Non-page-aligned-input tests | |
if (num_items > 1) | |
{ | |
InitializeSegments(num_items - 1, 1, h_segment_offsets, g_verbose_input); | |
TestByOp<CUB, OutputT>(h_in + 1, d_in + 1, num_items - 1, 1, h_segment_offsets, (OffsetT*) NULL); | |
} | |
// | |
// Test segmented implementation | |
// | |
// Right now we assign a single thread block to each segment, so lets keep it to under 128K items per segment | |
int max_items_per_segment = 128000; | |
for (int num_segments = (num_items + max_items_per_segment - 1) / max_items_per_segment; | |
num_segments < max_segments; | |
num_segments = (num_segments * 32) + 1) | |
{ | |
// Test with segment pointer | |
InitializeSegments(num_items, num_segments, h_segment_offsets, g_verbose_input); | |
CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(OffsetT) * (num_segments + 1), cudaMemcpyHostToDevice)); | |
TestByOp<CUB_SEGMENTED, OutputT>( | |
h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets); | |
// Test with segment iterator | |
typedef CastOp<OffsetT> IdentityOpT; | |
IdentityOpT identity_op; | |
TransformInputIterator<OffsetT, IdentityOpT, OffsetT*, OffsetT> h_segment_offsets_itr( | |
h_segment_offsets, | |
identity_op); | |
TransformInputIterator<OffsetT, IdentityOpT, OffsetT*, OffsetT> d_segment_offsets_itr( | |
d_segment_offsets, | |
identity_op); | |
TestByOp<CUB_SEGMENTED, OutputT>( | |
h_in, d_in, num_items, num_segments, h_segment_offsets_itr, d_segment_offsets_itr); | |
} | |
if (h_in) delete[] h_in; | |
if (h_segment_offsets) delete[] h_segment_offsets; | |
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); | |
if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets)); | |
} | |
/// Test different input-generation modes | |
template < | |
typename InputT, | |
typename OutputT, | |
typename OffsetT> | |
void TestByGenMode( | |
OffsetT num_items, | |
OffsetT max_segments) | |
{ | |
// | |
// Test pointer support using different input-generation modes | |
// | |
TestByBackend<InputT, OutputT>(num_items, max_segments, UNIFORM); | |
TestByBackend<InputT, OutputT>(num_items, max_segments, INTEGER_SEED); | |
TestByBackend<InputT, OutputT>(num_items, max_segments, RANDOM); | |
// | |
// Test iterator support using a constant-iterator and SUM | |
// | |
InputT val; | |
InitValue(UNIFORM, val, 0); | |
ConstantInputIterator<InputT, OffsetT> h_in(val); | |
OffsetT *h_segment_offsets = new OffsetT[1 + 1]; | |
InitializeSegments(num_items, 1, h_segment_offsets, g_verbose_input); | |
SolveAndTest<CUB, OutputT>(h_in, h_in, num_items, 1, h_segment_offsets, (OffsetT*) NULL, Sum()); | |
#ifdef CUB_CDP | |
SolveAndTest<CUB_CDP, OutputT>(h_in, h_in, num_items, 1, h_segment_offsets, (OffsetT*) NULL, Sum()); | |
#endif | |
if (h_segment_offsets) delete[] h_segment_offsets; | |
} | |
/// Test different problem sizes | |
template < | |
typename InputT, | |
typename OutputT, | |
typename OffsetT> | |
struct TestBySize | |
{ | |
OffsetT max_items; | |
OffsetT max_segments; | |
TestBySize(OffsetT max_items, OffsetT max_segments) : | |
max_items(max_items), | |
max_segments(max_segments) | |
{} | |
template <typename ActivePolicyT> | |
cudaError_t Invoke() | |
{ | |
// | |
// Black-box testing on all backends | |
// | |
// Test 0, 1, many | |
TestByGenMode<InputT, OutputT>(0, max_segments); | |
TestByGenMode<InputT, OutputT>(1, max_segments); | |
TestByGenMode<InputT, OutputT>(max_items, max_segments); | |
// Test random problem sizes from a log-distribution [8, max_items-ish) | |
int num_iterations = 8; | |
double max_exp = log(double(max_items)) / log(double(2.0)); | |
for (int i = 0; i < num_iterations; ++i) | |
{ | |
OffsetT num_items = (OffsetT) pow(2.0, RandomValue(max_exp - 3.0) + 3.0); | |
TestByGenMode<InputT, OutputT>(num_items, max_segments); | |
} | |
// | |
// White-box testing of single-segment problems around specific sizes | |
// | |
// Tile-boundaries: multiple blocks, one tile per block | |
OffsetT tile_size = ActivePolicyT::ReducePolicy::BLOCK_THREADS * ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD; | |
TestProblem<CUB, InputT, OutputT>(tile_size * 4, 1, RANDOM, Sum()); | |
TestProblem<CUB, InputT, OutputT>(tile_size * 4 + 1, 1, RANDOM, Sum()); | |
TestProblem<CUB, InputT, OutputT>(tile_size * 4 - 1, 1, RANDOM, Sum()); | |
// Tile-boundaries: multiple blocks, multiple tiles per block | |
OffsetT sm_occupancy = 32; | |
OffsetT occupancy = tile_size * sm_occupancy * g_sm_count; | |
TestProblem<CUB, InputT, OutputT>(occupancy, 1, RANDOM, Sum()); | |
TestProblem<CUB, InputT, OutputT>(occupancy + 1, 1, RANDOM, Sum()); | |
TestProblem<CUB, InputT, OutputT>(occupancy - 1, 1, RANDOM, Sum()); | |
return cudaSuccess; | |
} | |
}; | |
/// Test problem type | |
template < | |
typename InputT, | |
typename OutputT, | |
typename OffsetT> | |
void TestType( | |
OffsetT max_items, | |
OffsetT max_segments) | |
{ | |
typedef typename DeviceReducePolicy<InputT, OutputT, OffsetT, cub::Sum>::MaxPolicy MaxPolicyT; | |
TestBySize<InputT, OutputT, OffsetT> dispatch(max_items, max_segments); | |
MaxPolicyT::Invoke(g_ptx_version, dispatch); | |
} | |
//--------------------------------------------------------------------- | |
// Main | |
//--------------------------------------------------------------------- | |
/** | |
* Main | |
*/ | |
int main(int argc, char** argv) | |
{ | |
typedef int OffsetT; | |
OffsetT max_items = 27000000; | |
OffsetT max_segments = 34000; | |
// Initialize command line | |
CommandLineArgs args(argc, argv); | |
g_verbose = args.CheckCmdLineFlag("v"); | |
g_verbose_input = args.CheckCmdLineFlag("v2"); | |
args.GetCmdLineArgument("n", max_items); | |
args.GetCmdLineArgument("s", max_segments); | |
args.GetCmdLineArgument("i", g_timing_iterations); | |
args.GetCmdLineArgument("repeat", g_repeat); | |
// Print usage | |
if (args.CheckCmdLineFlag("help")) | |
{ | |
printf("%s " | |
"[--n=<input items> " | |
"[--s=<num segments> " | |
"[--i=<timing iterations> " | |
"[--device=<device-id>] " | |
"[--repeat=<repetitions of entire test suite>]" | |
"[--v] " | |
"[--cdp]" | |
"\n", argv[0]); | |
exit(0); | |
} | |
// Initialize device | |
CubDebugExit(args.DeviceInit()); | |
g_device_giga_bandwidth = args.device_giga_bandwidth; | |
// Get ptx version | |
CubDebugExit(PtxVersion(g_ptx_version)); | |
// Get SM count | |
g_sm_count = args.deviceProp.multiProcessorCount; | |
#ifdef QUICKER_TEST | |
// Compile/run basic test | |
TestProblem<CUB, char, int>( max_items, 1, RANDOM_BIT, Sum()); | |
TestProblem<CUB, short, int>( max_items, 1, RANDOM_BIT, Sum()); | |
printf("\n-------------------------------\n"); | |
TestProblem<CUB, int, int>( max_items, 1, RANDOM_BIT, Sum()); | |
TestProblem<CUB, long long, long long>( max_items, 1, RANDOM_BIT, Sum()); | |
printf("\n-------------------------------\n"); | |
TestProblem<CUB, float, float>( max_items, 1, RANDOM_BIT, Sum()); | |
TestProblem<CUB, double, double>( max_items, 1, RANDOM_BIT, Sum()); | |
printf("\n-------------------------------\n"); | |
TestProblem<CUB_SEGMENTED, int, int>(max_items, max_segments, RANDOM_BIT, Sum()); | |
#elif defined(QUICK_TEST) | |
// Compile/run quick comparison tests | |
TestProblem<CUB, char, char>( max_items * 4, 1, UNIFORM, Sum()); | |
TestProblem<THRUST, char, char>( max_items * 4, 1, UNIFORM, Sum()); | |
printf("\n----------------------------\n"); | |
TestProblem<CUB, short, short>( max_items * 2, 1, UNIFORM, Sum()); | |
TestProblem<THRUST, short, short>( max_items * 2, 1, UNIFORM, Sum()); | |
printf("\n----------------------------\n"); | |
TestProblem<CUB, int, int>( max_items, 1, UNIFORM, Sum()); | |
TestProblem<THRUST, int, int>( max_items, 1, UNIFORM, Sum()); | |
printf("\n----------------------------\n"); | |
TestProblem<CUB, long long, long long>( max_items / 2, 1, UNIFORM, Sum()); | |
TestProblem<THRUST, long long, long long>( max_items / 2, 1, UNIFORM, Sum()); | |
printf("\n----------------------------\n"); | |
TestProblem<CUB, TestFoo, TestFoo>( max_items / 4, 1, UNIFORM, Max()); | |
TestProblem<THRUST, TestFoo, TestFoo>( max_items / 4, 1, UNIFORM, Max()); | |
#else | |
// Compile/run thorough tests | |
for (int i = 0; i <= g_repeat; ++i) | |
{ | |
// Test different input types | |
TestType<char, char>(max_items, max_segments); | |
TestType<unsigned char, unsigned char>(max_items, max_segments); | |
TestType<char, int>(max_items, max_segments); | |
TestType<short, short>(max_items, max_segments); | |
TestType<int, int>(max_items, max_segments); | |
TestType<long, long>(max_items, max_segments); | |
TestType<long long, long long>(max_items, max_segments); | |
TestType<uchar2, uchar2>(max_items, max_segments); | |
TestType<uint2, uint2>(max_items, max_segments); | |
TestType<ulonglong2, ulonglong2>(max_items, max_segments); | |
TestType<ulonglong4, ulonglong4>(max_items, max_segments); | |
TestType<TestFoo, TestFoo>(max_items, max_segments); | |
TestType<TestBar, TestBar>(max_items, max_segments); | |
} | |
#endif | |
printf("\n"); | |
return 0; | |
} | |