Spaces:
Runtime error
Runtime error
/****************************************************************************** | |
* Copyright (c) 2011, Duane Merrill. All rights reserved. | |
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. | |
* | |
* Redistribution and use in source and binary forms, with or without | |
* modification, are permitted provided that the following conditions are met: | |
* * Redistributions of source code must retain the above copyright | |
* notice, this list of conditions and the following disclaimer. | |
* * Redistributions in binary form must reproduce the above copyright | |
* notice, this list of conditions and the following disclaimer in the | |
* documentation and/or other materials provided with the distribution. | |
* * Neither the name of the NVIDIA CORPORATION nor the | |
* names of its contributors may be used to endorse or promote products | |
* derived from this software without specific prior written permission. | |
* | |
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | |
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY | |
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | |
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND | |
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
* | |
******************************************************************************/ | |
/****************************************************************************** | |
* Type conversion macros | |
******************************************************************************/ | |
/** | |
* Return a value of type `T` with the same bitwise representation of `in`. | |
* Types `T` and `U` must be the same size. | |
*/ | |
template <typename T, typename U> | |
T SafeBitCast(const U& in) | |
{ | |
static_assert(sizeof(T) == sizeof(U), "Types must be same size."); | |
T out; | |
memcpy(&out, &in, sizeof(T)); | |
return out; | |
} | |
/****************************************************************************** | |
* Assertion macros | |
******************************************************************************/ | |
/** | |
* Assert equals | |
*/ | |
/****************************************************************************** | |
* Command-line parsing functionality | |
******************************************************************************/ | |
/** | |
* Utility for parsing command line arguments | |
*/ | |
struct CommandLineArgs | |
{ | |
std::vector<std::string> keys; | |
std::vector<std::string> values; | |
std::vector<std::string> args; | |
cudaDeviceProp deviceProp; | |
float device_giga_bandwidth; | |
size_t device_free_physmem; | |
size_t device_total_physmem; | |
/** | |
* Constructor | |
*/ | |
CommandLineArgs(int argc, char **argv) : | |
keys(10), | |
values(10) | |
{ | |
using namespace std; | |
// Initialize mersenne generator | |
unsigned int mersenne_init[4]= {0x123, 0x234, 0x345, 0x456}; | |
mersenne::init_by_array(mersenne_init, 4); | |
for (int i = 1; i < argc; i++) | |
{ | |
string arg = argv[i]; | |
if ((arg[0] != '-') || (arg[1] != '-')) | |
{ | |
args.push_back(arg); | |
continue; | |
} | |
string::size_type pos; | |
string key, val; | |
if ((pos = arg.find('=')) == string::npos) { | |
key = string(arg, 2, arg.length() - 2); | |
val = ""; | |
} else { | |
key = string(arg, 2, pos - 2); | |
val = string(arg, pos + 1, arg.length() - 1); | |
} | |
keys.push_back(key); | |
values.push_back(val); | |
} | |
} | |
/** | |
* Checks whether a flag "--<flag>" is present in the commandline | |
*/ | |
bool CheckCmdLineFlag(const char* arg_name) | |
{ | |
using namespace std; | |
for (int i = 0; i < int(keys.size()); ++i) | |
{ | |
if (keys[i] == string(arg_name)) | |
return true; | |
} | |
return false; | |
} | |
/** | |
* Returns number of naked (non-flag and non-key-value) commandline parameters | |
*/ | |
template <typename T> | |
int NumNakedArgs() | |
{ | |
return args.size(); | |
} | |
/** | |
* Returns the commandline parameter for a given index (not including flags) | |
*/ | |
template <typename T> | |
void GetCmdLineArgument(int index, T &val) | |
{ | |
using namespace std; | |
if (index < args.size()) { | |
istringstream str_stream(args[index]); | |
str_stream >> val; | |
} | |
} | |
/** | |
* Returns the value specified for a given commandline parameter --<flag>=<value> | |
*/ | |
template <typename T> | |
void GetCmdLineArgument(const char *arg_name, T &val) | |
{ | |
using namespace std; | |
for (int i = 0; i < int(keys.size()); ++i) | |
{ | |
if (keys[i] == string(arg_name)) | |
{ | |
istringstream str_stream(values[i]); | |
str_stream >> val; | |
} | |
} | |
} | |
/** | |
* Returns the values specified for a given commandline parameter --<flag>=<value>,<value>* | |
*/ | |
template <typename T> | |
void GetCmdLineArguments(const char *arg_name, std::vector<T> &vals) | |
{ | |
using namespace std; | |
if (CheckCmdLineFlag(arg_name)) | |
{ | |
// Clear any default values | |
vals.clear(); | |
// Recover from multi-value string | |
for (int i = 0; i < keys.size(); ++i) | |
{ | |
if (keys[i] == string(arg_name)) | |
{ | |
string val_string(values[i]); | |
istringstream str_stream(val_string); | |
string::size_type old_pos = 0; | |
string::size_type new_pos = 0; | |
// Iterate comma-separated values | |
T val; | |
while ((new_pos = val_string.find(',', old_pos)) != string::npos) | |
{ | |
if (new_pos != old_pos) | |
{ | |
str_stream.width(new_pos - old_pos); | |
str_stream >> val; | |
vals.push_back(val); | |
} | |
// skip over comma | |
str_stream.ignore(1); | |
old_pos = new_pos + 1; | |
} | |
// Read last value | |
str_stream >> val; | |
vals.push_back(val); | |
} | |
} | |
} | |
} | |
/** | |
* The number of pairs parsed | |
*/ | |
int ParsedArgc() | |
{ | |
return (int) keys.size(); | |
} | |
/** | |
* Initialize device | |
*/ | |
cudaError_t DeviceInit(int dev = -1) | |
{ | |
cudaError_t error = cudaSuccess; | |
do | |
{ | |
int deviceCount; | |
error = CubDebug(cudaGetDeviceCount(&deviceCount)); | |
if (error) break; | |
if (deviceCount == 0) { | |
fprintf(stderr, "No devices supporting CUDA.\n"); | |
exit(1); | |
} | |
if (dev < 0) | |
{ | |
GetCmdLineArgument("device", dev); | |
} | |
if ((dev > deviceCount - 1) || (dev < 0)) | |
{ | |
dev = 0; | |
} | |
error = CubDebug(cudaSetDevice(dev)); | |
if (error) break; | |
CubDebugExit(cudaMemGetInfo(&device_free_physmem, &device_total_physmem)); | |
int ptx_version = 0; | |
error = CubDebug(cub::PtxVersion(ptx_version)); | |
if (error) break; | |
error = CubDebug(cudaGetDeviceProperties(&deviceProp, dev)); | |
if (error) break; | |
if (deviceProp.major < 1) { | |
fprintf(stderr, "Device does not support CUDA.\n"); | |
exit(1); | |
} | |
device_giga_bandwidth = float(deviceProp.memoryBusWidth) * deviceProp.memoryClockRate * 2 / 8 / 1000 / 1000; | |
if (!CheckCmdLineFlag("quiet")) | |
{ | |
printf( | |
"Using device %d: %s (PTX version %d, SM%d, %d SMs, " | |
"%lld free / %lld total MB physmem, " | |
"%.3f GB/s @ %d kHz mem clock, ECC %s)\n", | |
dev, | |
deviceProp.name, | |
ptx_version, | |
deviceProp.major * 100 + deviceProp.minor * 10, | |
deviceProp.multiProcessorCount, | |
(unsigned long long) device_free_physmem / 1024 / 1024, | |
(unsigned long long) device_total_physmem / 1024 / 1024, | |
device_giga_bandwidth, | |
deviceProp.memoryClockRate, | |
(deviceProp.ECCEnabled) ? "on" : "off"); | |
fflush(stdout); | |
} | |
} while (0); | |
return error; | |
} | |
}; | |
/****************************************************************************** | |
* Random bits generator | |
******************************************************************************/ | |
int g_num_rand_samples = 0; | |
template <typename T> | |
bool IsNaN(T /* val */) { return false; } | |
template<> | |
__noinline__ bool IsNaN<float>(float val) | |
{ | |
return std::isnan(val); | |
} | |
template<> | |
__noinline__ bool IsNaN<float1>(float1 val) | |
{ | |
return (IsNaN(val.x)); | |
} | |
template<> | |
__noinline__ bool IsNaN<float2>(float2 val) | |
{ | |
return (IsNaN(val.y) || IsNaN(val.x)); | |
} | |
template<> | |
__noinline__ bool IsNaN<float3>(float3 val) | |
{ | |
return (IsNaN(val.z) || IsNaN(val.y) || IsNaN(val.x)); | |
} | |
template<> | |
__noinline__ bool IsNaN<float4>(float4 val) | |
{ | |
return (IsNaN(val.y) || IsNaN(val.x) || IsNaN(val.w) || IsNaN(val.z)); | |
} | |
template<> | |
__noinline__ bool IsNaN<double>(double val) | |
{ | |
return std::isnan(val); | |
} | |
template<> | |
__noinline__ bool IsNaN<double1>(double1 val) | |
{ | |
return (IsNaN(val.x)); | |
} | |
template<> | |
__noinline__ bool IsNaN<double2>(double2 val) | |
{ | |
return (IsNaN(val.y) || IsNaN(val.x)); | |
} | |
template<> | |
__noinline__ bool IsNaN<double3>(double3 val) | |
{ | |
return (IsNaN(val.z) || IsNaN(val.y) || IsNaN(val.x)); | |
} | |
template<> | |
__noinline__ bool IsNaN<double4>(double4 val) | |
{ | |
return (IsNaN(val.y) || IsNaN(val.x) || IsNaN(val.w) || IsNaN(val.z)); | |
} | |
template<> | |
__noinline__ bool IsNaN<half_t>(half_t val) | |
{ | |
const auto bits = SafeBitCast<unsigned short>(val); | |
// commented bit is always true, leaving for documentation: | |
return (((bits >= 0x7C01) && (bits <= 0x7FFF)) || | |
((bits >= 0xFC01) /*&& (bits <= 0xFFFFFFFF)*/)); | |
} | |
/** | |
* Generates random keys. | |
* | |
* We always take the second-order byte from rand() because the higher-order | |
* bits returned by rand() are commonly considered more uniformly distributed | |
* than the lower-order bits. | |
* | |
* We can decrease the entropy level of keys by adopting the technique | |
* of Thearling and Smith in which keys are computed from the bitwise AND of | |
* multiple random samples: | |
* | |
* entropy_reduction | Effectively-unique bits per key | |
* ----------------------------------------------------- | |
* -1 | 0 | |
* 0 | 32 | |
* 1 | 25.95 (81%) | |
* 2 | 17.41 (54%) | |
* 3 | 10.78 (34%) | |
* 4 | 6.42 (20%) | |
* ... | ... | |
* | |
*/ | |
template <typename K> | |
void RandomBits( | |
K &key, | |
int entropy_reduction = 0, | |
int begin_bit = 0, | |
int end_bit = sizeof(K) * 8) | |
{ | |
const int NUM_BYTES = sizeof(K); | |
const int WORD_BYTES = sizeof(unsigned int); | |
const int NUM_WORDS = (NUM_BYTES + WORD_BYTES - 1) / WORD_BYTES; | |
unsigned int word_buff[NUM_WORDS]; | |
if (entropy_reduction == -1) | |
{ | |
memset((void *) &key, 0, sizeof(key)); | |
return; | |
} | |
if (end_bit < 0) | |
end_bit = sizeof(K) * 8; | |
while (true) | |
{ | |
// Generate random word_buff | |
for (int j = 0; j < NUM_WORDS; j++) | |
{ | |
int current_bit = j * WORD_BYTES * 8; | |
unsigned int word = 0xffffffff; | |
word &= 0xffffffff << CUB_MAX(0, begin_bit - current_bit); | |
word &= 0xffffffff >> CUB_MAX(0, (current_bit + (WORD_BYTES * 8)) - end_bit); | |
for (int i = 0; i <= entropy_reduction; i++) | |
{ | |
// Grab some of the higher bits from rand (better entropy, supposedly) | |
word &= mersenne::genrand_int32(); | |
g_num_rand_samples++; | |
} | |
word_buff[j] = word; | |
} | |
memcpy(&key, word_buff, sizeof(K)); | |
K copy = key; | |
if (!IsNaN(copy)) | |
break; // avoids NaNs when generating random floating point numbers | |
} | |
} | |
/// Randomly select number between [0:max) | |
template <typename T> | |
T RandomValue(T max) | |
{ | |
unsigned int bits; | |
unsigned int max_int = (unsigned int) -1; | |
do { | |
RandomBits(bits); | |
} while (bits == max_int); | |
return (T) ((double(bits) / double(max_int)) * double(max)); | |
} | |
/****************************************************************************** | |
* Console printing utilities | |
******************************************************************************/ | |
/** | |
* Helper for casting character types to integers for cout printing | |
*/ | |
template <typename T> | |
T CoutCast(T val) { return val; } | |
int CoutCast(char val) { return val; } | |
int CoutCast(unsigned char val) { return val; } | |
int CoutCast(signed char val) { return val; } | |
/****************************************************************************** | |
* Test value initialization utilities | |
******************************************************************************/ | |
/** | |
* Test problem generation options | |
*/ | |
enum GenMode | |
{ | |
UNIFORM, // Assign to '2', regardless of integer seed | |
INTEGER_SEED, // Assign to integer seed | |
RANDOM, // Assign to random, regardless of integer seed | |
RANDOM_BIT, // Assign to randomly chosen 0 or 1, regardless of integer seed | |
}; | |
/** | |
* Initialize value | |
*/ | |
template <typename T> | |
__host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0) | |
{ | |
switch (gen_mode) | |
{ | |
case RANDOM: | |
RandomBits(value); | |
break; | |
case RANDOM_BIT: | |
char c; | |
RandomBits(c, 0, 0, 1); | |
value = (c > 0) ? (T) 1 : (T) -1; | |
break; | |
case UNIFORM: | |
value = 2; | |
break; | |
case INTEGER_SEED: | |
default: | |
value = (T) index; | |
break; | |
} | |
} | |
/** | |
* Initialize value (bool) | |
*/ | |
__host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, bool &value, int index = 0) | |
{ | |
switch (gen_mode) | |
{ | |
case RANDOM: | |
case RANDOM_BIT: | |
char c; | |
RandomBits(c, 0, 0, 1); | |
value = (c > 0); | |
break; | |
case UNIFORM: | |
value = true; | |
break; | |
case INTEGER_SEED: | |
default: | |
value = (index > 0); | |
break; | |
} | |
} | |
/** | |
* cub::NullType test initialization | |
*/ | |
__host__ __device__ __forceinline__ void InitValue(GenMode /* gen_mode */, | |
cub::NullType &/* value */, | |
int /* index */ = 0) | |
{} | |
/** | |
* cub::KeyValuePair<OffsetT, ValueT>test initialization | |
*/ | |
template <typename KeyT, typename ValueT> | |
__host__ __device__ __forceinline__ void InitValue( | |
GenMode gen_mode, | |
cub::KeyValuePair<KeyT, ValueT>& value, | |
int index = 0) | |
{ | |
InitValue(gen_mode, value.value, index); | |
// Assign corresponding flag with a likelihood of the last bit being set with entropy-reduction level 3 | |
RandomBits(value.key, 3); | |
value.key = (value.key & 0x1); | |
} | |
/****************************************************************************** | |
* Comparison and ostream operators | |
******************************************************************************/ | |
/** | |
* KeyValuePair ostream operator | |
*/ | |
template <typename Key, typename Value> | |
std::ostream& operator<<(std::ostream& os, const cub::KeyValuePair<Key, Value> &val) | |
{ | |
os << '(' << CoutCast(val.key) << ',' << CoutCast(val.value) << ')'; | |
return os; | |
} | |
/****************************************************************************** | |
* Comparison and ostream operators for CUDA vector types | |
******************************************************************************/ | |
/** | |
* Vector1 overloads | |
*/ | |
/** | |
* Vector2 overloads | |
*/ | |
/** | |
* Vector3 overloads | |
*/ | |
/** | |
* Vector4 overloads | |
*/ | |
/** | |
* All vector overloads | |
*/ | |
/** | |
* Define for types | |
*/ | |
CUB_VEC_OVERLOAD(char, char) | |
CUB_VEC_OVERLOAD(short, short) | |
CUB_VEC_OVERLOAD(int, int) | |
CUB_VEC_OVERLOAD(long, long) | |
CUB_VEC_OVERLOAD(longlong, long long) | |
CUB_VEC_OVERLOAD(uchar, unsigned char) | |
CUB_VEC_OVERLOAD(ushort, unsigned short) | |
CUB_VEC_OVERLOAD(uint, unsigned int) | |
CUB_VEC_OVERLOAD(ulong, unsigned long) | |
CUB_VEC_OVERLOAD(ulonglong, unsigned long long) | |
CUB_VEC_OVERLOAD(float, float) | |
CUB_VEC_OVERLOAD(double, double) | |
//--------------------------------------------------------------------- | |
// Complex data type TestFoo | |
//--------------------------------------------------------------------- | |
/** | |
* TestFoo complex data type | |
*/ | |
struct TestFoo | |
{ | |
long long x; | |
int y; | |
short z; | |
char w; | |
// Factory | |
static __host__ __device__ __forceinline__ TestFoo MakeTestFoo(long long x, int y, short z, char w) | |
{ | |
TestFoo retval = {x, y, z, w}; | |
return retval; | |
} | |
// Assignment from int operator | |
__host__ __device__ __forceinline__ TestFoo& operator =(int b) | |
{ | |
x = b; | |
y = b; | |
z = b; | |
w = b; | |
return *this; | |
} | |
// Summation operator | |
__host__ __device__ __forceinline__ TestFoo operator+(const TestFoo &b) const | |
{ | |
return MakeTestFoo(x + b.x, y + b.y, z + b.z, w + b.w); | |
} | |
// Inequality operator | |
__host__ __device__ __forceinline__ bool operator !=(const TestFoo &b) const | |
{ | |
return (x != b.x) || (y != b.y) || (z != b.z) || (w != b.w); | |
} | |
// Equality operator | |
__host__ __device__ __forceinline__ bool operator ==(const TestFoo &b) const | |
{ | |
return (x == b.x) && (y == b.y) && (z == b.z) && (w == b.w); | |
} | |
// Less than operator | |
__host__ __device__ __forceinline__ bool operator <(const TestFoo &b) const | |
{ | |
if (x < b.x) return true; else if (b.x < x) return false; | |
if (y < b.y) return true; else if (b.y < y) return false; | |
if (z < b.z) return true; else if (b.z < z) return false; | |
return w < b.w; | |
} | |
// Greater than operator | |
__host__ __device__ __forceinline__ bool operator >(const TestFoo &b) const | |
{ | |
if (x > b.x) return true; else if (b.x > x) return false; | |
if (y > b.y) return true; else if (b.y > y) return false; | |
if (z > b.z) return true; else if (b.z > z) return false; | |
return w > b.w; | |
} | |
}; | |
/** | |
* TestFoo ostream operator | |
*/ | |
std::ostream& operator<<(std::ostream& os, const TestFoo& val) | |
{ | |
os << '(' << val.x << ',' << val.y << ',' << val.z << ',' << CoutCast(val.w) << ')'; | |
return os; | |
} | |
/** | |
* TestFoo test initialization | |
*/ | |
__host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, TestFoo &value, int index = 0) | |
{ | |
InitValue(gen_mode, value.x, index); | |
InitValue(gen_mode, value.y, index); | |
InitValue(gen_mode, value.z, index); | |
InitValue(gen_mode, value.w, index); | |
} | |
/// numeric_limits<TestFoo> specialization | |
namespace cub { | |
template<> | |
struct NumericTraits<TestFoo> | |
{ | |
static const Category CATEGORY = NOT_A_NUMBER; | |
enum { | |
PRIMITIVE = false, | |
NULL_TYPE = false, | |
}; | |
static TestFoo Max() | |
{ | |
return TestFoo::MakeTestFoo( | |
NumericTraits<long long>::Max(), | |
NumericTraits<int>::Max(), | |
NumericTraits<short>::Max(), | |
NumericTraits<char>::Max()); | |
} | |
static TestFoo Lowest() | |
{ | |
return TestFoo::MakeTestFoo( | |
NumericTraits<long long>::Lowest(), | |
NumericTraits<int>::Lowest(), | |
NumericTraits<short>::Lowest(), | |
NumericTraits<char>::Lowest()); | |
} | |
}; | |
} // namespace cub | |
//--------------------------------------------------------------------- | |
// Complex data type TestBar (with optimizations for fence-free warp-synchrony) | |
//--------------------------------------------------------------------- | |
/** | |
* TestBar complex data type | |
*/ | |
struct TestBar | |
{ | |
long long x; | |
int y; | |
// Constructor | |
__host__ __device__ __forceinline__ TestBar() : x(0), y(0) | |
{} | |
// Constructor | |
__host__ __device__ __forceinline__ TestBar(int b) : x(b), y(b) | |
{} | |
// Constructor | |
__host__ __device__ __forceinline__ TestBar(long long x, int y) : x(x), y(y) | |
{} | |
// Assignment from int operator | |
__host__ __device__ __forceinline__ TestBar& operator =(int b) | |
{ | |
x = b; | |
y = b; | |
return *this; | |
} | |
// Summation operator | |
__host__ __device__ __forceinline__ TestBar operator+(const TestBar &b) const | |
{ | |
return TestBar(x + b.x, y + b.y); | |
} | |
// Inequality operator | |
__host__ __device__ __forceinline__ bool operator !=(const TestBar &b) const | |
{ | |
return (x != b.x) || (y != b.y); | |
} | |
// Equality operator | |
__host__ __device__ __forceinline__ bool operator ==(const TestBar &b) const | |
{ | |
return (x == b.x) && (y == b.y); | |
} | |
// Less than operator | |
__host__ __device__ __forceinline__ bool operator <(const TestBar &b) const | |
{ | |
if (x < b.x) return true; else if (b.x < x) return false; | |
return y < b.y; | |
} | |
// Greater than operator | |
__host__ __device__ __forceinline__ bool operator >(const TestBar &b) const | |
{ | |
if (x > b.x) return true; else if (b.x > x) return false; | |
return y > b.y; | |
} | |
}; | |
/** | |
* TestBar ostream operator | |
*/ | |
std::ostream& operator<<(std::ostream& os, const TestBar& val) | |
{ | |
os << '(' << val.x << ',' << val.y << ')'; | |
return os; | |
} | |
/** | |
* TestBar test initialization | |
*/ | |
__host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, TestBar &value, int index = 0) | |
{ | |
InitValue(gen_mode, value.x, index); | |
InitValue(gen_mode, value.y, index); | |
} | |
/// numeric_limits<TestBar> specialization | |
namespace cub { | |
template<> | |
struct NumericTraits<TestBar> | |
{ | |
static const Category CATEGORY = NOT_A_NUMBER; | |
enum { | |
PRIMITIVE = false, | |
NULL_TYPE = false, | |
}; | |
static TestBar Max() | |
{ | |
return TestBar( | |
NumericTraits<long long>::Max(), | |
NumericTraits<int>::Max()); | |
} | |
static TestBar Lowest() | |
{ | |
return TestBar( | |
NumericTraits<long long>::Lowest(), | |
NumericTraits<int>::Lowest()); | |
} | |
}; | |
} // namespace cub | |
/****************************************************************************** | |
* Helper routines for list comparison and display | |
******************************************************************************/ | |
/** | |
* Compares the equivalence of two arrays | |
*/ | |
template <typename S, typename T, typename OffsetT> | |
int CompareResults(T* computed, S* reference, OffsetT len, bool verbose = true) | |
{ | |
for (OffsetT i = 0; i < len; i++) | |
{ | |
if (computed[i] != reference[i]) | |
{ | |
if (verbose) std::cout << "INCORRECT: [" << i << "]: " | |
<< CoutCast(computed[i]) << " != " | |
<< CoutCast(reference[i]); | |
return 1; | |
} | |
} | |
return 0; | |
} | |
/** | |
* Compares the equivalence of two arrays | |
*/ | |
template <typename OffsetT> | |
int CompareResults(float* computed, float* reference, OffsetT len, bool verbose = true) | |
{ | |
for (OffsetT i = 0; i < len; i++) | |
{ | |
if (computed[i] != reference[i]) | |
{ | |
float difference = std::abs(computed[i]-reference[i]); | |
float fraction = difference / std::abs(reference[i]); | |
if (fraction > 0.0001) | |
{ | |
if (verbose) std::cout << "INCORRECT: [" << i << "]: " | |
<< "(computed) " << CoutCast(computed[i]) << " != " | |
<< CoutCast(reference[i]) << " (difference:" << difference << ", fraction: " << fraction << ")"; | |
return 1; | |
} | |
} | |
} | |
return 0; | |
} | |
/** | |
* Compares the equivalence of two arrays | |
*/ | |
template <typename OffsetT> | |
int CompareResults(cub::NullType* computed, cub::NullType* reference, OffsetT len, bool verbose = true) | |
{ | |
return 0; | |
} | |
/** | |
* Compares the equivalence of two arrays | |
*/ | |
template <typename OffsetT> | |
int CompareResults(double* computed, double* reference, OffsetT len, bool verbose = true) | |
{ | |
for (OffsetT i = 0; i < len; i++) | |
{ | |
if (computed[i] != reference[i]) | |
{ | |
double difference = std::abs(computed[i]-reference[i]); | |
double fraction = difference / std::abs(reference[i]); | |
if (fraction > 0.0001) | |
{ | |
if (verbose) std::cout << "INCORRECT: [" << i << "]: " | |
<< CoutCast(computed[i]) << " != " | |
<< CoutCast(reference[i]) << " (difference:" << difference << ", fraction: " << fraction << ")"; | |
return 1; | |
} | |
} | |
} | |
return 0; | |
} | |
/** | |
* Verify the contents of a device array match those | |
* of a host array | |
*/ | |
int CompareDeviceResults( | |
cub::NullType */* h_reference */, | |
cub::NullType */* d_data */, | |
size_t /* num_items */, | |
bool /* verbose */ = true, | |
bool /* display_data */ = false) | |
{ | |
return 0; | |
} | |
/** | |
* Verify the contents of a device array match those | |
* of a host array | |
*/ | |
template <typename S, typename OffsetT> | |
int CompareDeviceResults( | |
S *h_reference, | |
cub::DiscardOutputIterator<OffsetT> d_data, | |
size_t num_items, | |
bool verbose = true, | |
bool display_data = false) | |
{ | |
return 0; | |
} | |
/** | |
* Verify the contents of a device array match those | |
* of a host array | |
*/ | |
template <typename S, typename T> | |
int CompareDeviceResults( | |
S *h_reference, | |
T *d_data, | |
size_t num_items, | |
bool verbose = true, | |
bool display_data = false) | |
{ | |
// Allocate array on host | |
T *h_data = (T*) malloc(num_items * sizeof(T)); | |
// Copy data back | |
cudaMemcpy(h_data, d_data, sizeof(T) * num_items, cudaMemcpyDeviceToHost); | |
// Display data | |
if (display_data) | |
{ | |
printf("Reference:\n"); | |
for (int i = 0; i < int(num_items); i++) | |
{ | |
std::cout << CoutCast(h_reference[i]) << ", "; | |
} | |
printf("\n\nComputed:\n"); | |
for (int i = 0; i < int(num_items); i++) | |
{ | |
std::cout << CoutCast(h_data[i]) << ", "; | |
} | |
printf("\n\n"); | |
} | |
// Check | |
int retval = CompareResults(h_data, h_reference, num_items, verbose); | |
// Cleanup | |
if (h_data) free(h_data); | |
return retval; | |
} | |
/** | |
* Verify the contents of a device array match those | |
* of a device array | |
*/ | |
template <typename T> | |
int CompareDeviceDeviceResults( | |
T *d_reference, | |
T *d_data, | |
size_t num_items, | |
bool verbose = true, | |
bool display_data = false) | |
{ | |
// Allocate array on host | |
T *h_reference = (T*) malloc(num_items * sizeof(T)); | |
T *h_data = (T*) malloc(num_items * sizeof(T)); | |
// Copy data back | |
cudaMemcpy(h_reference, d_reference, sizeof(T) * num_items, cudaMemcpyDeviceToHost); | |
cudaMemcpy(h_data, d_data, sizeof(T) * num_items, cudaMemcpyDeviceToHost); | |
// Display data | |
if (display_data) { | |
printf("Reference:\n"); | |
for (int i = 0; i < num_items; i++) | |
{ | |
std::cout << CoutCast(h_reference[i]) << ", "; | |
} | |
printf("\n\nComputed:\n"); | |
for (int i = 0; i < num_items; i++) | |
{ | |
std::cout << CoutCast(h_data[i]) << ", "; | |
} | |
printf("\n\n"); | |
} | |
// Check | |
int retval = CompareResults(h_data, h_reference, num_items, verbose); | |
// Cleanup | |
if (h_reference) free(h_reference); | |
if (h_data) free(h_data); | |
return retval; | |
} | |
/** | |
* Print the contents of a host array | |
*/ | |
void DisplayResults( | |
cub::NullType */* h_data */, | |
size_t /* num_items */) | |
{} | |
/** | |
* Print the contents of a host array | |
*/ | |
template <typename InputIteratorT> | |
void DisplayResults( | |
InputIteratorT h_data, | |
size_t num_items) | |
{ | |
// Display data | |
for (int i = 0; i < int(num_items); i++) | |
{ | |
std::cout << CoutCast(h_data[i]) << ", "; | |
} | |
printf("\n"); | |
} | |
/** | |
* Print the contents of a device array | |
*/ | |
template <typename T> | |
void DisplayDeviceResults( | |
T *d_data, | |
size_t num_items) | |
{ | |
// Allocate array on host | |
T *h_data = (T*) malloc(num_items * sizeof(T)); | |
// Copy data back | |
cudaMemcpy(h_data, d_data, sizeof(T) * num_items, cudaMemcpyDeviceToHost); | |
DisplayResults(h_data, num_items); | |
// Cleanup | |
if (h_data) free(h_data); | |
} | |
/****************************************************************************** | |
* Segment descriptor generation | |
******************************************************************************/ | |
/** | |
* Initialize segments | |
*/ | |
void InitializeSegments( | |
int num_items, | |
int num_segments, | |
int *h_segment_offsets, | |
bool verbose = false) | |
{ | |
if (num_segments <= 0) | |
return; | |
unsigned int expected_segment_length = (num_items + num_segments - 1) / num_segments; | |
int offset = 0; | |
for (int i = 0; i < num_segments; ++i) | |
{ | |
h_segment_offsets[i] = offset; | |
unsigned int segment_length = RandomValue((expected_segment_length * 2) + 1); | |
offset += segment_length; | |
offset = CUB_MIN(offset, num_items); | |
} | |
h_segment_offsets[num_segments] = num_items; | |
if (verbose) | |
{ | |
printf("Segment offsets: "); | |
DisplayResults(h_segment_offsets, num_segments + 1); | |
} | |
} | |
/****************************************************************************** | |
* Timing | |
******************************************************************************/ | |
struct CpuTimer | |
{ | |
LARGE_INTEGER ll_freq; | |
LARGE_INTEGER ll_start; | |
LARGE_INTEGER ll_stop; | |
CpuTimer() | |
{ | |
QueryPerformanceFrequency(&ll_freq); | |
} | |
void Start() | |
{ | |
QueryPerformanceCounter(&ll_start); | |
} | |
void Stop() | |
{ | |
QueryPerformanceCounter(&ll_stop); | |
} | |
float ElapsedMillis() | |
{ | |
double start = double(ll_start.QuadPart) / double(ll_freq.QuadPart); | |
double stop = double(ll_stop.QuadPart) / double(ll_freq.QuadPart); | |
return float((stop - start) * 1000); | |
} | |
rusage start; | |
rusage stop; | |
void Start() | |
{ | |
getrusage(RUSAGE_SELF, &start); | |
} | |
void Stop() | |
{ | |
getrusage(RUSAGE_SELF, &stop); | |
} | |
float ElapsedMillis() | |
{ | |
float sec = stop.ru_utime.tv_sec - start.ru_utime.tv_sec; | |
float usec = stop.ru_utime.tv_usec - start.ru_utime.tv_usec; | |
return (sec * 1000) + (usec / 1000); | |
} | |
}; | |
struct GpuTimer | |
{ | |
cudaEvent_t start; | |
cudaEvent_t stop; | |
GpuTimer() | |
{ | |
cudaEventCreate(&start); | |
cudaEventCreate(&stop); | |
} | |
~GpuTimer() | |
{ | |
cudaEventDestroy(start); | |
cudaEventDestroy(stop); | |
} | |
void Start() | |
{ | |
cudaEventRecord(start, 0); | |
} | |
void Stop() | |
{ | |
cudaEventRecord(stop, 0); | |
} | |
float ElapsedMillis() | |
{ | |
float elapsed; | |
cudaEventSynchronize(stop); | |
cudaEventElapsedTime(&elapsed, start, stop); | |
return elapsed; | |
} | |
}; | |