LIVE / thrust /dependencies /cub /test /test_warp_reduce.cu
Xu Ma
update
1c3c0d9
raw
history blame
27.2 kB
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
* Test of WarpReduce utilities
******************************************************************************/
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR
#include <stdio.h>
#include <typeinfo>
#include <cub/warp/warp_reduce.cuh>
#include <cub/util_allocator.cuh>
#include "test_util.h"
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
bool g_verbose = false;
int g_repeat = 0;
CachingDeviceAllocator g_allocator(true);
/**
* \brief WrapperFunctor (for precluding test-specialized dispatch to *Sum variants)
*/
template<
typename OpT,
int LOGICAL_WARP_THREADS>
struct WrapperFunctor
{
OpT op;
int num_valid;
inline __host__ __device__ WrapperFunctor(OpT op, int num_valid) : op(op), num_valid(num_valid) {}
template <typename T>
inline __host__ __device__ T operator()(const T &a, const T &b) const
{
#if CUB_PTX_ARCH != 0
if ((cub::LaneId() % LOGICAL_WARP_THREADS) >= num_valid)
cub::ThreadTrap();
#endif
return op(a, b);
}
};
//---------------------------------------------------------------------
// Test kernels
//---------------------------------------------------------------------
/**
* Generic reduction
*/
template <
typename T,
typename ReductionOp,
typename WarpReduce,
bool PRIMITIVE = Traits<T>::PRIMITIVE>
struct DeviceTest
{
static __device__ __forceinline__ T Reduce(
typename WarpReduce::TempStorage &temp_storage,
T &data,
ReductionOp &reduction_op)
{
return WarpReduce(temp_storage).Reduce(data, reduction_op);
}
static __device__ __forceinline__ T Reduce(
typename WarpReduce::TempStorage &temp_storage,
T &data,
ReductionOp &reduction_op,
const int &valid_warp_threads)
{
return WarpReduce(temp_storage).Reduce(data, reduction_op, valid_warp_threads);
}
template <typename FlagT>
static __device__ __forceinline__ T HeadSegmentedReduce(
typename WarpReduce::TempStorage &temp_storage,
T &data,
FlagT &flag,
ReductionOp &reduction_op)
{
return WarpReduce(temp_storage).HeadSegmentedReduce(data, flag, reduction_op);
}
template <typename FlagT>
static __device__ __forceinline__ T TailSegmentedReduce(
typename WarpReduce::TempStorage &temp_storage,
T &data,
FlagT &flag,
ReductionOp &reduction_op)
{
return WarpReduce(temp_storage).TailSegmentedReduce(data, flag, reduction_op);
}
};
/**
* Summation
*/
template <
typename T,
typename WarpReduce>
struct DeviceTest<T, Sum, WarpReduce, true>
{
static __device__ __forceinline__ T Reduce(
typename WarpReduce::TempStorage &temp_storage,
T &data,
Sum &reduction_op)
{
return WarpReduce(temp_storage).Sum(data);
}
static __device__ __forceinline__ T Reduce(
typename WarpReduce::TempStorage &temp_storage,
T &data,
Sum &reduction_op,
const int &valid_warp_threads)
{
return WarpReduce(temp_storage).Sum(data, valid_warp_threads);
}
template <typename FlagT>
static __device__ __forceinline__ T HeadSegmentedReduce(
typename WarpReduce::TempStorage &temp_storage,
T &data,
FlagT &flag,
Sum &reduction_op)
{
return WarpReduce(temp_storage).HeadSegmentedSum(data, flag);
}
template <typename FlagT>
static __device__ __forceinline__ T TailSegmentedReduce(
typename WarpReduce::TempStorage &temp_storage,
T &data,
FlagT &flag,
Sum &reduction_op)
{
return WarpReduce(temp_storage).TailSegmentedSum(data, flag);
}
};
/**
* Full-tile warp reduction kernel
*/
template <
int WARPS,
int LOGICAL_WARP_THREADS,
typename T,
typename ReductionOp>
__global__ void FullWarpReduceKernel(
T *d_in,
T *d_out,
ReductionOp reduction_op,
clock_t *d_elapsed)
{
// Cooperative warp-reduce utility type (1 warp)
typedef WarpReduce<T, LOGICAL_WARP_THREADS> WarpReduce;
// Allocate temp storage in shared memory
__shared__ typename WarpReduce::TempStorage temp_storage[WARPS];
// Per-thread tile data
T input = d_in[threadIdx.x];
// Record elapsed clocks
__threadfence_block(); // workaround to prevent clock hoisting
clock_t start = clock();
__threadfence_block(); // workaround to prevent clock hoisting
// Test warp reduce
int warp_id = threadIdx.x / LOGICAL_WARP_THREADS;
T output = DeviceTest<T, ReductionOp, WarpReduce>::Reduce(
temp_storage[warp_id], input, reduction_op);
// Record elapsed clocks
__threadfence_block(); // workaround to prevent clock hoisting
clock_t stop = clock();
__threadfence_block(); // workaround to prevent clock hoisting
*d_elapsed = stop - start;
// Store aggregate
d_out[threadIdx.x] = (threadIdx.x % LOGICAL_WARP_THREADS == 0) ?
output :
input;
}
/**
* Partially-full warp reduction kernel
*/
template <
int WARPS,
int LOGICAL_WARP_THREADS,
typename T,
typename ReductionOp>
__global__ void PartialWarpReduceKernel(
T *d_in,
T *d_out,
ReductionOp reduction_op,
clock_t *d_elapsed,
int valid_warp_threads)
{
// Cooperative warp-reduce utility type
typedef WarpReduce<T, LOGICAL_WARP_THREADS> WarpReduce;
// Allocate temp storage in shared memory
__shared__ typename WarpReduce::TempStorage temp_storage[WARPS];
// Per-thread tile data
T input = d_in[threadIdx.x];
// Record elapsed clocks
__threadfence_block(); // workaround to prevent clock hoisting
clock_t start = clock();
__threadfence_block(); // workaround to prevent clock hoisting
// Test partial-warp reduce
int warp_id = threadIdx.x / LOGICAL_WARP_THREADS;
T output = DeviceTest<T, ReductionOp, WarpReduce>::Reduce(
temp_storage[warp_id], input, reduction_op, valid_warp_threads);
// Record elapsed clocks
__threadfence_block(); // workaround to prevent clock hoisting
clock_t stop = clock();
__threadfence_block(); // workaround to prevent clock hoisting
*d_elapsed = stop - start;
// Store aggregate
d_out[threadIdx.x] = (threadIdx.x % LOGICAL_WARP_THREADS == 0) ?
output :
input;
}
/**
* Head-based segmented warp reduction test kernel
*/
template <
int WARPS,
int LOGICAL_WARP_THREADS,
typename T,
typename FlagT,
typename ReductionOp>
__global__ void WarpHeadSegmentedReduceKernel(
T *d_in,
FlagT *d_head_flags,
T *d_out,
ReductionOp reduction_op,
clock_t *d_elapsed)
{
// Cooperative warp-reduce utility type
typedef WarpReduce<T, LOGICAL_WARP_THREADS> WarpReduce;
// Allocate temp storage in shared memory
__shared__ typename WarpReduce::TempStorage temp_storage[WARPS];
// Per-thread tile data
T input = d_in[threadIdx.x];
FlagT head_flag = d_head_flags[threadIdx.x];
// Record elapsed clocks
__threadfence_block(); // workaround to prevent clock hoisting
clock_t start = clock();
__threadfence_block(); // workaround to prevent clock hoisting
// Test segmented warp reduce
int warp_id = threadIdx.x / LOGICAL_WARP_THREADS;
T output = DeviceTest<T, ReductionOp, WarpReduce>::HeadSegmentedReduce(
temp_storage[warp_id], input, head_flag, reduction_op);
// Record elapsed clocks
__threadfence_block(); // workaround to prevent clock hoisting
clock_t stop = clock();
__threadfence_block(); // workaround to prevent clock hoisting
*d_elapsed = stop - start;
// Store aggregate
d_out[threadIdx.x] = ((threadIdx.x % LOGICAL_WARP_THREADS == 0) || head_flag) ?
output :
input;
}
/**
* Tail-based segmented warp reduction test kernel
*/
template <
int WARPS,
int LOGICAL_WARP_THREADS,
typename T,
typename FlagT,
typename ReductionOp>
__global__ void WarpTailSegmentedReduceKernel(
T *d_in,
FlagT *d_tail_flags,
T *d_out,
ReductionOp reduction_op,
clock_t *d_elapsed)
{
// Cooperative warp-reduce utility type
typedef WarpReduce<T, LOGICAL_WARP_THREADS> WarpReduce;
// Allocate temp storage in shared memory
__shared__ typename WarpReduce::TempStorage temp_storage[WARPS];
// Per-thread tile data
T input = d_in[threadIdx.x];
FlagT tail_flag = d_tail_flags[threadIdx.x];
FlagT head_flag = (threadIdx.x == 0) ?
0 :
d_tail_flags[threadIdx.x - 1];
// Record elapsed clocks
__threadfence_block(); // workaround to prevent clock hoisting
clock_t start = clock();
__threadfence_block(); // workaround to prevent clock hoisting
// Test segmented warp reduce
int warp_id = threadIdx.x / LOGICAL_WARP_THREADS;
T output = DeviceTest<T, ReductionOp, WarpReduce>::TailSegmentedReduce(
temp_storage[warp_id], input, tail_flag, reduction_op);
// Record elapsed clocks
__threadfence_block(); // workaround to prevent clock hoisting
clock_t stop = clock();
__threadfence_block(); // workaround to prevent clock hoisting
*d_elapsed = stop - start;
// Store aggregate
d_out[threadIdx.x] = ((threadIdx.x % LOGICAL_WARP_THREADS == 0) || head_flag) ?
output :
input;
}
//---------------------------------------------------------------------
// Host utility subroutines
//---------------------------------------------------------------------
/**
* Initialize reduction problem (and solution)
*/
template <
typename T,
typename ReductionOp>
void Initialize(
GenMode gen_mode,
int flag_entropy,
T *h_in,
int *h_flags,
int warps,
int warp_threads,
int valid_warp_threads,
ReductionOp reduction_op,
T *h_head_out,
T *h_tail_out)
{
for (int i = 0; i < warps * warp_threads; ++i)
{
// Sample a value for this item
InitValue(gen_mode, h_in[i], i);
h_head_out[i] = h_in[i];
h_tail_out[i] = h_in[i];
// Sample whether or not this item will be a segment head
char bits;
RandomBits(bits, flag_entropy);
h_flags[i] = bits & 0x1;
}
// Accumulate segments (lane 0 of each warp is implicitly a segment head)
for (int warp = 0; warp < warps; ++warp)
{
int warp_offset = warp * warp_threads;
int item_offset = warp_offset + valid_warp_threads - 1;
// Last item in warp
T head_aggregate = h_in[item_offset];
T tail_aggregate = h_in[item_offset];
if (h_flags[item_offset])
h_head_out[item_offset] = head_aggregate;
item_offset--;
// Work backwards
while (item_offset >= warp_offset)
{
if (h_flags[item_offset + 1])
{
head_aggregate = h_in[item_offset];
}
else
{
head_aggregate = reduction_op(head_aggregate, h_in[item_offset]);
}
if (h_flags[item_offset])
{
h_head_out[item_offset] = head_aggregate;
h_tail_out[item_offset + 1] = tail_aggregate;
tail_aggregate = h_in[item_offset];
}
else
{
tail_aggregate = reduction_op(tail_aggregate, h_in[item_offset]);
}
item_offset--;
}
// Record last segment head_aggregate to head offset
h_head_out[warp_offset] = head_aggregate;
h_tail_out[warp_offset] = tail_aggregate;
}
}
/**
* Test warp reduction
*/
template <
int WARPS,
int LOGICAL_WARP_THREADS,
typename T,
typename ReductionOp>
void TestReduce(
GenMode gen_mode,
ReductionOp reduction_op,
int valid_warp_threads = LOGICAL_WARP_THREADS)
{
const int BLOCK_THREADS = LOGICAL_WARP_THREADS * WARPS;
// Allocate host arrays
T *h_in = new T[BLOCK_THREADS];
int *h_flags = new int[BLOCK_THREADS];
T *h_out = new T[BLOCK_THREADS];
T *h_tail_out = new T[BLOCK_THREADS];
// Initialize problem
Initialize(gen_mode, -1, h_in, h_flags, WARPS, LOGICAL_WARP_THREADS, valid_warp_threads, reduction_op, h_out, h_tail_out);
// Initialize/clear device arrays
T *d_in = NULL;
T *d_out = NULL;
clock_t *d_elapsed = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * BLOCK_THREADS));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * BLOCK_THREADS));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(clock_t)));
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * BLOCK_THREADS, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * BLOCK_THREADS));
if (g_verbose)
{
printf("Data:\n");
for (int i = 0; i < WARPS; ++i)
DisplayResults(h_in + (i * LOGICAL_WARP_THREADS), valid_warp_threads);
}
// Run kernel
printf("\nGen-mode %d, %d warps, %d warp threads, %d valid lanes, %s (%d bytes) elements:\n",
gen_mode,
WARPS,
LOGICAL_WARP_THREADS,
valid_warp_threads,
typeid(T).name(),
(int) sizeof(T));
fflush(stdout);
if (valid_warp_threads == LOGICAL_WARP_THREADS)
{
// Run full-warp kernel
FullWarpReduceKernel<WARPS, LOGICAL_WARP_THREADS><<<1, BLOCK_THREADS>>>(
d_in,
d_out,
reduction_op,
d_elapsed);
}
else
{
// Run partial-warp kernel
PartialWarpReduceKernel<WARPS, LOGICAL_WARP_THREADS><<<1, BLOCK_THREADS>>>(
d_in,
d_out,
reduction_op,
d_elapsed,
valid_warp_threads);
}
CubDebugExit(cudaPeekAtLastError());
CubDebugExit(cudaDeviceSynchronize());
// Copy out and display results
printf("\tReduction results: ");
int compare = CompareDeviceResults(h_out, d_out, BLOCK_THREADS, g_verbose, g_verbose);
printf("%s\n", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
printf("\tElapsed clocks: ");
DisplayDeviceResults(d_elapsed, 1);
// Cleanup
if (h_in) delete[] h_in;
if (h_flags) delete[] h_flags;
if (h_out) delete[] h_out;
if (h_tail_out) delete[] h_tail_out;
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed));
}
/**
* Test warp segmented reduction
*/
template <
int WARPS,
int LOGICAL_WARP_THREADS,
typename T,
typename ReductionOp>
void TestSegmentedReduce(
GenMode gen_mode,
int flag_entropy,
ReductionOp reduction_op)
{
const int BLOCK_THREADS = LOGICAL_WARP_THREADS * WARPS;
// Allocate host arrays
int compare;
T *h_in = new T[BLOCK_THREADS];
int *h_flags = new int[BLOCK_THREADS];
T *h_head_out = new T[BLOCK_THREADS];
T *h_tail_out = new T[BLOCK_THREADS];
// Initialize problem
Initialize(gen_mode, flag_entropy, h_in, h_flags, WARPS, LOGICAL_WARP_THREADS, LOGICAL_WARP_THREADS, reduction_op, h_head_out, h_tail_out);
// Initialize/clear device arrays
T *d_in = NULL;
int *d_flags = NULL;
T *d_head_out = NULL;
T *d_tail_out = NULL;
clock_t *d_elapsed = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * BLOCK_THREADS));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_flags, sizeof(int) * BLOCK_THREADS));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_head_out, sizeof(T) * BLOCK_THREADS));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_tail_out, sizeof(T) * BLOCK_THREADS));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(clock_t)));
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * BLOCK_THREADS, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemcpy(d_flags, h_flags, sizeof(int) * BLOCK_THREADS, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemset(d_head_out, 0, sizeof(T) * BLOCK_THREADS));
CubDebugExit(cudaMemset(d_tail_out, 0, sizeof(T) * BLOCK_THREADS));
if (g_verbose)
{
printf("Data:\n");
for (int i = 0; i < WARPS; ++i)
DisplayResults(h_in + (i * LOGICAL_WARP_THREADS), LOGICAL_WARP_THREADS);
printf("\nFlags:\n");
for (int i = 0; i < WARPS; ++i)
DisplayResults(h_flags + (i * LOGICAL_WARP_THREADS), LOGICAL_WARP_THREADS);
}
printf("\nGen-mode %d, head flag entropy reduction %d, %d warps, %d warp threads, %s (%d bytes) elements:\n",
gen_mode,
flag_entropy,
WARPS,
LOGICAL_WARP_THREADS,
typeid(T).name(),
(int) sizeof(T));
fflush(stdout);
// Run head-based kernel
WarpHeadSegmentedReduceKernel<WARPS, LOGICAL_WARP_THREADS><<<1, BLOCK_THREADS>>>(
d_in,
d_flags,
d_head_out,
reduction_op,
d_elapsed);
CubDebugExit(cudaPeekAtLastError());
CubDebugExit(cudaDeviceSynchronize());
// Copy out and display results
printf("\tHead-based segmented reduction results: ");
compare = CompareDeviceResults(h_head_out, d_head_out, BLOCK_THREADS, g_verbose, g_verbose);
printf("%s\n", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
printf("\tElapsed clocks: ");
DisplayDeviceResults(d_elapsed, 1);
// Run tail-based kernel
WarpTailSegmentedReduceKernel<WARPS, LOGICAL_WARP_THREADS><<<1, BLOCK_THREADS>>>(
d_in,
d_flags,
d_tail_out,
reduction_op,
d_elapsed);
CubDebugExit(cudaPeekAtLastError());
CubDebugExit(cudaDeviceSynchronize());
// Copy out and display results
printf("\tTail-based segmented reduction results: ");
compare = CompareDeviceResults(h_tail_out, d_tail_out, BLOCK_THREADS, g_verbose, g_verbose);
printf("%s\n", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
printf("\tElapsed clocks: ");
DisplayDeviceResults(d_elapsed, 1);
// Cleanup
if (h_in) delete[] h_in;
if (h_flags) delete[] h_flags;
if (h_head_out) delete[] h_head_out;
if (h_tail_out) delete[] h_tail_out;
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_flags) CubDebugExit(g_allocator.DeviceFree(d_flags));
if (d_head_out) CubDebugExit(g_allocator.DeviceFree(d_head_out));
if (d_tail_out) CubDebugExit(g_allocator.DeviceFree(d_tail_out));
if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed));
}
/**
* Run battery of tests for different full and partial tile sizes
*/
template <
int WARPS,
int LOGICAL_WARP_THREADS,
typename T,
typename ReductionOp>
void Test(
GenMode gen_mode,
ReductionOp reduction_op)
{
// Partial tiles
for (
int valid_warp_threads = 1;
valid_warp_threads < LOGICAL_WARP_THREADS;
valid_warp_threads += CUB_MAX(1, LOGICAL_WARP_THREADS / 5))
{
// Without wrapper (to test non-excepting PTX POD-op specializations)
TestReduce<WARPS, LOGICAL_WARP_THREADS, T>(gen_mode, reduction_op, valid_warp_threads);
// With wrapper to ensure no ops called on OOB lanes
WrapperFunctor<ReductionOp, LOGICAL_WARP_THREADS> wrapped_op(reduction_op, valid_warp_threads);
TestReduce<WARPS, LOGICAL_WARP_THREADS, T>(gen_mode, wrapped_op, valid_warp_threads);
}
// Full tile
TestReduce<WARPS, LOGICAL_WARP_THREADS, T>(gen_mode, reduction_op, LOGICAL_WARP_THREADS);
// Segmented reduction with different head flags
for (int flag_entropy = 0; flag_entropy < 10; ++flag_entropy)
{
TestSegmentedReduce<WARPS, LOGICAL_WARP_THREADS, T>(gen_mode, flag_entropy, reduction_op);
}
}
/**
* Run battery of tests for different data types and reduce ops
*/
template <
int WARPS,
int LOGICAL_WARP_THREADS>
void Test(GenMode gen_mode)
{
// primitive
Test<WARPS, LOGICAL_WARP_THREADS, char>( gen_mode, Sum());
Test<WARPS, LOGICAL_WARP_THREADS, short>( gen_mode, Sum());
Test<WARPS, LOGICAL_WARP_THREADS, int>( gen_mode, Sum());
Test<WARPS, LOGICAL_WARP_THREADS, long long>( gen_mode, Sum());
Test<WARPS, LOGICAL_WARP_THREADS, unsigned char>( gen_mode, Sum());
Test<WARPS, LOGICAL_WARP_THREADS, unsigned short>( gen_mode, Sum());
Test<WARPS, LOGICAL_WARP_THREADS, unsigned int>( gen_mode, Sum());
Test<WARPS, LOGICAL_WARP_THREADS, unsigned long long>( gen_mode, Sum());
if (gen_mode != RANDOM)
{
Test<WARPS, LOGICAL_WARP_THREADS, float>( gen_mode, Sum());
Test<WARPS, LOGICAL_WARP_THREADS, double>( gen_mode, Sum());
}
// primitive (alternative reduce op)
Test<WARPS, LOGICAL_WARP_THREADS, unsigned char>( gen_mode, Max());
Test<WARPS, LOGICAL_WARP_THREADS, unsigned short>( gen_mode, Max());
Test<WARPS, LOGICAL_WARP_THREADS, unsigned int>( gen_mode, Max());
Test<WARPS, LOGICAL_WARP_THREADS, unsigned long long>( gen_mode, Max());
// vec-1
Test<WARPS, LOGICAL_WARP_THREADS, uchar1>( gen_mode, Sum());
// vec-2
Test<WARPS, LOGICAL_WARP_THREADS, uchar2>( gen_mode, Sum());
Test<WARPS, LOGICAL_WARP_THREADS, ushort2>( gen_mode, Sum());
Test<WARPS, LOGICAL_WARP_THREADS, uint2>( gen_mode, Sum());
Test<WARPS, LOGICAL_WARP_THREADS, ulonglong2>( gen_mode, Sum());
// vec-4
Test<WARPS, LOGICAL_WARP_THREADS, uchar4>( gen_mode, Sum());
Test<WARPS, LOGICAL_WARP_THREADS, ushort4>( gen_mode, Sum());
Test<WARPS, LOGICAL_WARP_THREADS, uint4>( gen_mode, Sum());
Test<WARPS, LOGICAL_WARP_THREADS, ulonglong4>( gen_mode, Sum());
// complex
Test<WARPS, LOGICAL_WARP_THREADS, TestFoo>( gen_mode, Sum());
Test<WARPS, LOGICAL_WARP_THREADS, TestBar>( gen_mode, Sum());
}
/**
* Run battery of tests for different problem generation options
*/
template <
int WARPS,
int LOGICAL_WARP_THREADS>
void Test()
{
Test<WARPS, LOGICAL_WARP_THREADS>(UNIFORM);
Test<WARPS, LOGICAL_WARP_THREADS>(INTEGER_SEED);
Test<WARPS, LOGICAL_WARP_THREADS>(RANDOM);
}
/**
* Run battery of tests for different number of active warps
*/
template <int LOGICAL_WARP_THREADS>
void Test()
{
Test<1, LOGICAL_WARP_THREADS>();
// Only power-of-two subwarps can be tiled
if ((LOGICAL_WARP_THREADS == 32) || PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE)
Test<2, LOGICAL_WARP_THREADS>();
}
/**
* Main
*/
int main(int argc, char** argv)
{
// Initialize command line
CommandLineArgs args(argc, argv);
g_verbose = args.CheckCmdLineFlag("v");
args.GetCmdLineArgument("repeat", g_repeat);
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s "
"[--device=<device-id>] "
"[--repeat=<repetitions of entire test suite>]"
"[--v] "
"\n", argv[0]);
exit(0);
}
// Initialize device
CubDebugExit(args.DeviceInit());
#ifdef QUICK_TEST
// Compile/run quick tests
TestReduce<1, 32, int>(UNIFORM, Sum());
TestReduce<1, 32, double>(UNIFORM, Sum());
TestReduce<2, 16, TestBar>(UNIFORM, Sum());
TestSegmentedReduce<1, 32, int>(UNIFORM, 1, Sum());
#else
// Compile/run thorough tests
for (int i = 0; i <= g_repeat; ++i)
{
// Test logical warp sizes
Test<32>();
Test<16>();
Test<9>();
Test<7>();
}
#endif
return 0;
}