/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
* Evaluates different tuning configurations of DeviceReduce.
*
* The best way to use this program:
* (1) Find the best all-around single-block tune for a given arch.
* For example, 100 samples [1 .. 512], 100 timing iterations per config per sample:
* ./bin/tune_device_reduce_sm200_nvvm_5.0_abi_i386 --i=100 --s=100 --n=512 --single --device=0
* (2) Update the single tune in device_reduce.cuh
* (3) Find the best all-around multi-block tune for a given arch.
* For example, 100 samples [single-block tile-size .. 50,331,648], 100 timing iterations per config per sample:
* ./bin/tune_device_reduce_sm200_nvvm_5.0_abi_i386 --i=100 --s=100 --device=0
* (4) Update the multi-block tune in device_reduce.cuh
*
******************************************************************************/
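// Example build invocation (a sketch only; exact include paths, output names, and
// Makefile targets in the CUB distribution may differ). TUNE_ARCH selects the SM
// version used for kernel enumeration and TUNE_SIZE selects sizeof(T) below:
//
//   nvcc -O3 -arch=sm_20 -DTUNE_ARCH=200 -DTUNE_SIZE=4 -I.. \
//       -o bin/tune_device_reduce_sm200 tune_device_reduce.cu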
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR
#include <vector>
#include <algorithm>
#include <stdio.h>
#include <math.h>
#include <cub/cub.cuh>
#include "../test/test_util.h"
using namespace cub;
using namespace std;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
#ifndef TUNE_ARCH
#define TUNE_ARCH 100
#endif
int g_max_items = 48 * 1024 * 1024;
int g_samples = 100;
int g_timing_iterations = 2;
bool g_verbose = false;
bool g_single = false;
bool g_verify = true;
CachingDeviceAllocator g_allocator;
//---------------------------------------------------------------------
// Host utility subroutines
//---------------------------------------------------------------------
/**
* Initialize problem
*/
template <typename T>
void Initialize(
GenMode gen_mode,
T *h_in,
int num_items)
{
for (int i = 0; i < num_items; ++i)
{
InitValue(gen_mode, h_in[i], i);
}
}
/**
* Sequential reduction
*/
template <typename T, typename ReductionOp>
T Reduce(
T *h_in,
ReductionOp reduction_op,
int num_items)
{
T retval = h_in[0];
for (int i = 1; i < num_items; ++i)
retval = reduction_op(retval, h_in[i]);
return retval;
}
//---------------------------------------------------------------------
// Full tile test generation
//---------------------------------------------------------------------
/**
* Wrapper structure for generating and running different tuning configurations
*/
template <
typename T,
typename OffsetT,
typename ReductionOp>
struct Schmoo
{
//---------------------------------------------------------------------
// Types
//---------------------------------------------------------------------
/// Pairing of kernel function pointer and corresponding dispatch params
template <typename KernelPtr>
struct DispatchTuple
{
KernelPtr kernel_ptr;
DeviceReduce::KernelDispachParams params;
float avg_throughput;
float best_avg_throughput;
OffsetT best_size;
float hmean_speedup;
DispatchTuple() :
kernel_ptr(0),
params(DeviceReduce::KernelDispachParams()),
avg_throughput(0.0),
best_avg_throughput(0.0),
hmean_speedup(0.0),
best_size(0)
{}
};
/**
* Comparison operator for sorting DispatchTuples by harmonic-mean speedup
* (speedups within 2% of each other are treated as ties and defer to best throughput)
*/
template <typename Tuple>
static bool MinSpeedup(const Tuple &a, const Tuple &b)
{
float delta = a.hmean_speedup - b.hmean_speedup;
return ((delta < 0.02) && (delta > -0.02)) ?
(a.best_avg_throughput < b.best_avg_throughput) : // Negligible average performance differences: defer to best performance
(a.hmean_speedup < b.hmean_speedup);
}
/// Multi-block reduction kernel type and dispatch tuple type
typedef void (*MultiBlockDeviceReduceKernelPtr)(T*, T*, OffsetT, GridEvenShare<OffsetT>, GridQueue<OffsetT>, ReductionOp);
typedef DispatchTuple<MultiBlockDeviceReduceKernelPtr> MultiDispatchTuple;
/// Single-block reduction kernel type and dispatch tuple type
typedef void (*SingleBlockDeviceReduceKernelPtr)(T*, T*, OffsetT, ReductionOp);
typedef DispatchTuple<SingleBlockDeviceReduceKernelPtr> SingleDispatchTuple;
//---------------------------------------------------------------------
// Fields
//---------------------------------------------------------------------
vector<MultiDispatchTuple> multi_kernels; // List of generated multi-block kernels
vector<SingleDispatchTuple> single_kernels; // List of generated single-block kernels
//---------------------------------------------------------------------
// Kernel enumeration methods
//---------------------------------------------------------------------
/**
* Must have smem that fits in the SM
* Must have vector load length that divides items per thread
*/
template <typename TilesReducePolicy, typename ReductionOpT>
struct SmemSize
{
enum
{
BYTES = sizeof(typename BlockReduceTiles<TilesReducePolicy, T*, OffsetT, ReductionOpT>::TempStorage),
IS_OK = ((BYTES < ArchProps<TUNE_ARCH>::SMEM_BYTES) &&
(TilesReducePolicy::ITEMS_PER_THREAD % TilesReducePolicy::VECTOR_LOAD_LENGTH == 0))
};
};
/**
* Specialization that allows kernel generation with the specified TilesReducePolicy
*/
template <
typename TilesReducePolicy,
bool IsOk = SmemSize<TilesReducePolicy, ReductionOp>::IS_OK>
struct Ok
{
/// Enumerate multi-block kernel and add to the list
template <typename KernelsVector>
static void GenerateMulti(
KernelsVector &multi_kernels,
int subscription_factor)
{
MultiDispatchTuple tuple;
tuple.params.template Init<TilesReducePolicy>(subscription_factor);
tuple.kernel_ptr = ReducePrivatizedKernel<TilesReducePolicy, T*, T*, OffsetT, ReductionOp>;
multi_kernels.push_back(tuple);
}
/// Enumerate single-block kernel and add to the list
template <typename KernelsVector>
static void GenerateSingle(KernelsVector &single_kernels)
{
SingleDispatchTuple tuple;
tuple.params.template Init<TilesReducePolicy>();
tuple.kernel_ptr = ReduceSingleKernel<TilesReducePolicy, T*, T*, OffsetT, ReductionOp>;
single_kernels.push_back(tuple);
}
};
/**
* Specialization that rejects kernel generation with the specified TilesReducePolicy
*/
template <typename TilesReducePolicy>
struct Ok<TilesReducePolicy, false>
{
template <typename KernelsVector>
static void GenerateMulti(KernelsVector &multi_kernels, int subscription_factor) {}
template <typename KernelsVector>
static void GenerateSingle(KernelsVector &single_kernels) {}
};
/// Enumerate block-scheduling variations
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
int VECTOR_LOAD_LENGTH,
BlockReduceAlgorithm BLOCK_ALGORITHM,
CacheLoadModifier LOAD_MODIFIER>
void Enumerate()
{
// Multi-block kernels
Ok<BlockReduceTilesPolicy<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_MODIFIER, GRID_MAPPING_RAKE> >::GenerateMulti(multi_kernels, 1);
Ok<BlockReduceTilesPolicy<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_MODIFIER, GRID_MAPPING_RAKE> >::GenerateMulti(multi_kernels, 2);
Ok<BlockReduceTilesPolicy<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_MODIFIER, GRID_MAPPING_RAKE> >::GenerateMulti(multi_kernels, 4);
Ok<BlockReduceTilesPolicy<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_MODIFIER, GRID_MAPPING_RAKE> >::GenerateMulti(multi_kernels, 8);
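// The dynamic (queue-based) grid mapping is only enumerated for sm_20 and newer;
// it relies on GridQueue, which presumably requires global-memory atomics
// unavailable on earlier architectures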
#if TUNE_ARCH >= 200
Ok<BlockReduceTilesPolicy<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_MODIFIER, GRID_MAPPING_DYNAMIC> >::GenerateMulti(multi_kernels, 1);
#endif
// Single-block kernels
Ok<BlockReduceTilesPolicy<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_MODIFIER, GRID_MAPPING_RAKE> >::GenerateSingle(single_kernels);
}
/// Enumerate load modifier variations
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
int VECTOR_LOAD_LENGTH,
BlockReduceAlgorithm BLOCK_ALGORITHM>
void Enumerate()
{
Enumerate<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_DEFAULT>();
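// LDG (read-only data-cache) loads are only available on sm_35 and newer, hence the guard below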
#if TUNE_ARCH >= 350
Enumerate<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_LDG>();
#endif
}
/// Enumerate block algorithms
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
int VECTOR_LOAD_LENGTH>
void Enumerate()
{
Enumerate<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_REDUCE_RAKING>();
Enumerate<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_REDUCE_WARP_REDUCTIONS>();
}
/// Enumerate vectorization variations
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD>
void Enumerate()
{
Enumerate<BLOCK_THREADS, ITEMS_PER_THREAD, 1>();
Enumerate<BLOCK_THREADS, ITEMS_PER_THREAD, 2>();
Enumerate<BLOCK_THREADS, ITEMS_PER_THREAD, 4>();
}
/// Enumerate thread-granularity variations
template <int BLOCK_THREADS>
void Enumerate()
{
Enumerate<BLOCK_THREADS, 7>();
Enumerate<BLOCK_THREADS, 8>();
Enumerate<BLOCK_THREADS, 9>();
Enumerate<BLOCK_THREADS, 11>();
Enumerate<BLOCK_THREADS, 12>();
Enumerate<BLOCK_THREADS, 13>();
Enumerate<BLOCK_THREADS, 15>();
Enumerate<BLOCK_THREADS, 16>();
Enumerate<BLOCK_THREADS, 17>();
Enumerate<BLOCK_THREADS, 19>();
Enumerate<BLOCK_THREADS, 20>();
Enumerate<BLOCK_THREADS, 21>();
Enumerate<BLOCK_THREADS, 23>();
Enumerate<BLOCK_THREADS, 24>();
Enumerate<BLOCK_THREADS, 25>();
}
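// The parameterless Enumerate() below is the entry point of the cascade above:
// 8 block sizes x 15 items-per-thread values x 3 vector lengths x 2 block
// algorithms x 1-2 load modifiers (arch-dependent) yields on the order of a
// thousand single-block and several thousand multi-block candidate
// configurations (before the SmemSize filter rejects infeasible ones)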
/// Enumerate block size variations
void Enumerate()
{
printf("\nEnumerating kernels\n"); fflush(stdout);
Enumerate<32>();
Enumerate<64>();
Enumerate<96>();
Enumerate<128>();
Enumerate<160>();
Enumerate<192>();
Enumerate<256>();
Enumerate<512>();
}
//---------------------------------------------------------------------
// Test methods
//---------------------------------------------------------------------
/**
* Test a configuration
*/
void TestConfiguration(
MultiDispatchTuple &multi_dispatch,
SingleDispatchTuple &single_dispatch,
T* d_in,
T* d_out,
T* h_reference,
OffsetT num_items,
ReductionOp reduction_op)
{
// Clear output
if (g_verify) CubDebugExit(cudaMemset(d_out, 0, sizeof(T)));
// Query temporary storage requirements (the first dispatch with a NULL
// d_temp_storage only computes temp_storage_bytes), then allocate
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
CubDebugExit(DeviceReduce::Dispatch(
d_temp_storage,
temp_storage_bytes,
multi_dispatch.kernel_ptr,
single_dispatch.kernel_ptr,
FillAndResetDrainKernel<OffsetT>,
multi_dispatch.params,
single_dispatch.params,
d_in,
d_out,
num_items,
reduction_op));
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
// Warmup/correctness iteration
CubDebugExit(DeviceReduce::Dispatch(
d_temp_storage,
temp_storage_bytes,
multi_dispatch.kernel_ptr,
single_dispatch.kernel_ptr,
FillAndResetDrainKernel<OffsetT>,
multi_dispatch.params,
single_dispatch.params,
d_in,
d_out,
num_items,
reduction_op));
if (g_verify) CubDebugExit(cudaDeviceSynchronize());
// Copy out and display results
int compare = (g_verify) ?
CompareDeviceResults(h_reference, d_out, 1, true, false) :
0;
// Performance
GpuTimer gpu_timer;
float elapsed_millis = 0.0;
for (int i = 0; i < g_timing_iterations; i++)
{
gpu_timer.Start();
CubDebugExit(DeviceReduce::Dispatch(
d_temp_storage,
temp_storage_bytes,
multi_dispatch.kernel_ptr,
single_dispatch.kernel_ptr,
FillAndResetDrainKernel<OffsetT>,
multi_dispatch.params,
single_dispatch.params,
d_in,
d_out,
num_items,
reduction_op));
gpu_timer.Stop();
elapsed_millis += gpu_timer.ElapsedMillis();
}
// Synchronize to surface any deferred runtime errors before reporting results
CubDebugExit(cudaDeviceSynchronize());
float avg_elapsed = elapsed_millis / g_timing_iterations;
float avg_throughput = float(num_items) / avg_elapsed / 1000.0 / 1000.0;
float avg_bandwidth = avg_throughput * sizeof(T);
multi_dispatch.avg_throughput = CUB_MAX(avg_throughput, multi_dispatch.avg_throughput);
if (avg_throughput > multi_dispatch.best_avg_throughput)
{
multi_dispatch.best_avg_throughput = avg_throughput;
multi_dispatch.best_size = num_items;
}
single_dispatch.avg_throughput = CUB_MAX(avg_throughput, single_dispatch.avg_throughput);
if (avg_throughput > single_dispatch.best_avg_throughput)
{
single_dispatch.best_avg_throughput = avg_throughput;
single_dispatch.best_size = num_items;
}
if (g_verbose)
{
printf("\t%.2f GB/s, multi_dispatch( ", avg_bandwidth);
multi_dispatch.params.Print();
printf(" ), single_dispatch( ");
single_dispatch.params.Print();
printf(" )\n");
fflush(stdout);
}
AssertEquals(0, compare);
// Cleanup temporaries
if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
}
/**
* Evaluate multi-block configurations
*/
void TestMulti(
T* h_in,
T* d_in,
T* d_out,
ReductionOp reduction_op)
{
// Simple single kernel tuple for use with multi kernel sweep
typedef typename DeviceReduce::TunedPolicies<T, OffsetT, TUNE_ARCH>::SinglePolicy SimpleSinglePolicy;
SingleDispatchTuple simple_single_tuple;
simple_single_tuple.params.template Init<SimpleSinglePolicy>();
simple_single_tuple.kernel_ptr = ReduceSingleKernel<SimpleSinglePolicy, T*, T*, OffsetT, ReductionOp>;
double max_exponent = log2(double(g_max_items));
double min_exponent = log2(double(simple_single_tuple.params.tile_size));
unsigned int max_int = (unsigned int) -1;
for (int sample = 0; sample < g_samples; ++sample)
{
printf("\nMulti-block sample %d, ", sample);
int num_items;
if (sample == 0)
{
// First sample: use max items
num_items = g_max_items;
printf("num_items: %d", num_items); fflush(stdout);
}
else
{
// Sample a problem size from [2^min_exponent, g_max_items]. The first half of the samples are log-distributed, the remaining half are uniformly distributed.
unsigned int bits;
RandomBits(bits);
double scale = double(bits) / max_int;
if (sample < g_samples / 2)
{
// log bias
double exponent = ((max_exponent - min_exponent) * scale) + min_exponent;
num_items = pow(2.0, exponent);
num_items = CUB_MIN(num_items, g_max_items);
printf("num_items: %d (2^%.2f)", num_items, exponent); fflush(stdout);
}
else
{
// uniform bias
num_items = CUB_MAX(pow(2.0, min_exponent), scale * g_max_items);
num_items = CUB_MIN(num_items, g_max_items);
printf("num_items: %d (%.2f * %d)", num_items, scale, g_max_items); fflush(stdout);
}
}
if (g_verbose)
printf("\n");
else
printf(", ");
// Compute reference
T h_reference = Reduce(h_in, reduction_op, num_items);
// Run test on each multi-kernel configuration
float best_avg_throughput = 0.0;
for (int j = 0; j < multi_kernels.size(); ++j)
{
multi_kernels[j].avg_throughput = 0.0;
TestConfiguration(multi_kernels[j], simple_single_tuple, d_in, d_out, &h_reference, num_items, reduction_op);
best_avg_throughput = CUB_MAX(best_avg_throughput, multi_kernels[j].avg_throughput);
}
// Print best throughput for this problem size
printf("Best: %.2fe9 items/s (%.2f GB/s)\n", best_avg_throughput, best_avg_throughput * sizeof(T));
// Accumulate the inverse of each kernel's per-sample speedup (best / avg); the harmonic mean is computed after the sampling loop
for (int j = 0; j < multi_kernels.size(); ++j)
multi_kernels[j].hmean_speedup += best_avg_throughput / multi_kernels[j].avg_throughput;
}
// Find max overall throughput and compute hmean speedups
float overall_max_throughput = 0.0;
for (int j = 0; j < multi_kernels.size(); ++j)
{
overall_max_throughput = CUB_MAX(overall_max_throughput, multi_kernels[j].best_avg_throughput);
multi_kernels[j].hmean_speedup = float(g_samples) / multi_kernels[j].hmean_speedup;
}
// Sort ascending by harmonic-mean speedup (the best configuration prints last, ranked (1))
sort(multi_kernels.begin(), multi_kernels.end(), MinSpeedup<MultiDispatchTuple>);
// Print ranked multi configurations
printf("\nRanked multi_kernels:\n");
for (int j = 0; j < multi_kernels.size(); ++j)
{
printf("\t (%d) params( ", multi_kernels.size() - j);
multi_kernels[j].params.Print();
printf(" ) hmean speedup: %.3f, best throughput %.2f @ %d elements (%.2f GB/s, %.2f%%)\n",
multi_kernels[j].hmean_speedup,
multi_kernels[j].best_avg_throughput,
(int) multi_kernels[j].best_size,
multi_kernels[j].best_avg_throughput * sizeof(T),
multi_kernels[j].best_avg_throughput / overall_max_throughput);
}
printf("\nMax multi-block throughput %.2f (%.2f GB/s)\n", overall_max_throughput, overall_max_throughput * sizeof(T));
}
/**
* Evaluate single-block configurations
*/
void TestSingle(
T* h_in,
T* d_in,
T* d_out,
ReductionOp reduction_op)
{
// Construct a NULL-ptr multi-kernel tuple that forces a single-kernel pass
MultiDispatchTuple multi_tuple;
double max_exponent = log2(double(g_max_items));
unsigned int max_int = (unsigned int) -1;
for (int sample = 0; sample < g_samples; ++sample)
{
printf("\nSingle-block sample %d, ", sample);
int num_items;
if (sample == 0)
{
// First sample: use max items
num_items = g_max_items;
printf("num_items: %d", num_items); fflush(stdout);
}
else
{
// Sample a problem size from [2, g_max_items], log-distributed
unsigned int bits;
RandomBits(bits);
double scale = double(bits) / max_int;
double exponent = ((max_exponent - 1) * scale) + 1;
num_items = pow(2.0, exponent);
printf("num_items: %d (2^%.2f)", num_items, exponent); fflush(stdout);
}
if (g_verbose)
printf("\n");
else
printf(", ");
// Compute reference
T h_reference = Reduce(h_in, reduction_op, num_items);
// Run test on each single-kernel configuration (the NULL multi-kernel tuple above forces a single-kernel-only pass)
float best_avg_throughput = 0.0;
for (int j = 0; j < single_kernels.size(); ++j)
{
single_kernels[j].avg_throughput = 0.0;
TestConfiguration(multi_tuple, single_kernels[j], d_in, d_out, &h_reference, num_items, reduction_op);
best_avg_throughput = CUB_MAX(best_avg_throughput, single_kernels[j].avg_throughput);
}
// Print best throughput for this problem size
printf("Best: %.2fe9 items/s (%.2f GB/s)\n", best_avg_throughput, best_avg_throughput * sizeof(T));
// Accumulate the inverse of each kernel's per-sample speedup (best / avg); the harmonic mean is computed after the sampling loop
for (int j = 0; j < single_kernels.size(); ++j)
single_kernels[j].hmean_speedup += best_avg_throughput / single_kernels[j].avg_throughput;
}
// Find max overall throughput and compute hmean speedups
float overall_max_throughput = 0.0;
for (int j = 0; j < single_kernels.size(); ++j)
{
overall_max_throughput = CUB_MAX(overall_max_throughput, single_kernels[j].best_avg_throughput);
single_kernels[j].hmean_speedup = float(g_samples) / single_kernels[j].hmean_speedup;
}
// Sort ascending by harmonic-mean speedup (the best configuration prints last, ranked (1))
sort(single_kernels.begin(), single_kernels.end(), MinSpeedup<SingleDispatchTuple>);
// Print ranked single configurations
printf("\nRanked single_kernels:\n");
for (int j = 0; j < single_kernels.size(); ++j)
{
printf("\t (%d) params( ", single_kernels.size() - j);
single_kernels[j].params.Print();
printf(" ) hmean speedup: %.3f, best throughput %.2f @ %d elements (%.2f GB/s, %.2f%%)\n",
single_kernels[j].hmean_speedup,
single_kernels[j].best_avg_throughput,
(int) single_kernels[j].best_size,
single_kernels[j].best_avg_throughput * sizeof(T),
single_kernels[j].best_avg_throughput / overall_max_throughput);
}
printf("\nMax single-block throughput %.2f (%.2f GB/s)\n", overall_max_throughput, overall_max_throughput * sizeof(T));
}
};
//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------
/**
* Main
*/
int main(int argc, char** argv)
{
// Initialize command line
CommandLineArgs args(argc, argv);
args.GetCmdLineArgument("n", g_max_items);
args.GetCmdLineArgument("s", g_samples);
args.GetCmdLineArgument("i", g_timing_iterations);
g_verbose = args.CheckCmdLineFlag("v");
g_single = args.CheckCmdLineFlag("single");
g_verify = !args.CheckCmdLineFlag("noverify");
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s "
"[--device=<device-id>] "
"[--n=<max items>]"
"[--s=<samples>]"
"[--i=<timing iterations>]"
"[--single]"
"[--v]"
"[--noverify]"
"\n", argv[0]);
exit(0);
}
// Initialize device
CubDebugExit(args.DeviceInit());
#if (TUNE_SIZE == 1)
typedef unsigned char T;
#elif (TUNE_SIZE == 2)
typedef unsigned short T;
#elif (TUNE_SIZE == 4)
typedef unsigned int T;
#elif (TUNE_SIZE == 8)
typedef unsigned long long T;
#else
// Default
typedef unsigned int T;
#endif
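// T (the value type being reduced) is selected at compile time via -DTUNE_SIZE={1,2,4,8},
// defaulting to unsigned int; OffsetT below is a 32-bit offset type, so problem
// sizes are limited to what fits in 32 bits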
typedef unsigned int OffsetT;
Sum reduction_op;
// Enumerate kernels
Schmoo<T, OffsetT, Sum > schmoo;
schmoo.Enumerate();
// Allocate host arrays
T *h_in = new T[g_max_items];
// Initialize problem
Initialize(UNIFORM, h_in, g_max_items);
// Initialize device arrays
T *d_in = NULL;
T *d_out = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * g_max_items));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * 1));
CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * g_max_items, cudaMemcpyHostToDevice));
// Test kernels
if (g_single)
schmoo.TestSingle(h_in, d_in, d_out, reduction_op);
else
schmoo.TestMulti(h_in, d_in, d_out, reduction_op);
// Cleanup
if (h_in) delete[] h_in;
if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
return 0;
}