/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Test of iterator utilities ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include #include #include #include #include #include #include #include #include "test_util.h" #include #include using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; CachingDeviceAllocator g_allocator(true); // Dispatch types enum Backend { CUB, // CUB method THRUST, // Thrust method CDP, // GPU-based (dynamic parallelism) dispatch to CUB method }; template struct TransformOp { // Increment transform __host__ __device__ __forceinline__ T operator()(T input) const { T addend; InitValue(INTEGER_SEED, addend, 1); return input + addend; } }; struct SelectOp { template __host__ __device__ __forceinline__ bool operator()(T input) { return true; } }; //--------------------------------------------------------------------- // Test kernels //--------------------------------------------------------------------- /** * Test random access input iterator */ template < typename InputIteratorT, typename T> __global__ void Kernel( InputIteratorT d_in, T *d_out, InputIteratorT *d_itrs) { d_out[0] = *d_in; // Value at offset 0 d_out[1] = d_in[100]; // Value at offset 100 d_out[2] = *(d_in + 1000); // Value at offset 1000 d_out[3] = *(d_in + 10000); // Value at offset 10000 d_in++; d_out[4] = d_in[0]; // Value at offset 1 d_in += 20; d_out[5] = d_in[0]; // Value at offset 21 d_itrs[0] = d_in; // Iterator at offset 21 d_in -= 10; d_out[6] = d_in[0]; // Value at offset 11; d_in -= 11; d_out[7] = d_in[0]; // Value at offset 0 d_itrs[1] = d_in; // Iterator at offset 0 } //--------------------------------------------------------------------- // Host testing subroutines //--------------------------------------------------------------------- /** * Run iterator test on device */ template < typename InputIteratorT, typename T, int TEST_VALUES> void Test( InputIteratorT d_in, T (&h_reference)[TEST_VALUES]) { // Allocate device arrays T *d_out = NULL; InputIteratorT *d_itrs = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * TEST_VALUES)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_itrs, sizeof(InputIteratorT) * 2)); int compare; // Run unguarded kernel Kernel<<<1, 1>>>(d_in, d_out, d_itrs); CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); // Check results compare = CompareDeviceResults(h_reference, d_out, TEST_VALUES, g_verbose, g_verbose); printf("\tValues: %s\n", (compare) ? "FAIL" : "PASS"); AssertEquals(0, compare); // Check iterator at offset 21 InputIteratorT h_itr = d_in + 21; compare = CompareDeviceResults(&h_itr, d_itrs, 1, g_verbose, g_verbose); printf("\tIterators: %s\n", (compare) ? "FAIL" : "PASS"); AssertEquals(0, compare); // Check iterator at offset 0 compare = CompareDeviceResults(&d_in, d_itrs + 1, 1, g_verbose, g_verbose); printf("\tIterators: %s\n", (compare) ? "FAIL" : "PASS"); AssertEquals(0, compare); // Cleanup if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out)); if (d_itrs) CubDebugExit(g_allocator.DeviceFree(d_itrs)); } /** * Test constant iterator */ template void TestConstant(T base) { printf("\nTesting constant iterator on type %s (base: %lld)\n", typeid(T).name(), (unsigned long long) (base)); fflush(stdout); // // Test iterator manipulation in kernel // T h_reference[8] = {base, base, base, base, base, base, base, base}; ConstantInputIterator d_itr(base); Test(d_itr, h_reference); #if (THRUST_VERSION >= 100700) // Thrust 1.7 or newer // // Test with thrust::copy_if() // int copy_items = 100; T *h_copy = new T[copy_items]; T *d_copy = NULL; for (int i = 0; i < copy_items; ++i) h_copy[i] = d_itr[i]; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * copy_items)); thrust::device_ptr d_copy_wrapper(d_copy); thrust::copy_if(d_itr, d_itr + copy_items, d_copy_wrapper, SelectOp()); int compare = CompareDeviceResults(h_copy, d_copy, copy_items, g_verbose, g_verbose); printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS"); AssertEquals(0, compare); if (h_copy) delete[] h_copy; if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy)); #endif // THRUST_VERSION } /** * Test counting iterator */ template void TestCounting(T base) { printf("\nTesting counting iterator on type %s (base: %d) \n", typeid(T).name(), int(base)); fflush(stdout); // // Test iterator manipulation in kernel // // Initialize reference data T h_reference[8]; h_reference[0] = base + 0; // Value at offset 0 h_reference[1] = base + 100; // Value at offset 100 h_reference[2] = base + 1000; // Value at offset 1000 h_reference[3] = base + 10000; // Value at offset 10000 h_reference[4] = base + 1; // Value at offset 1 h_reference[5] = base + 21; // Value at offset 21 h_reference[6] = base + 11; // Value at offset 11 h_reference[7] = base + 0; // Value at offset 0; CountingInputIterator d_itr(base); Test(d_itr, h_reference); #if (THRUST_VERSION >= 100700) // Thrust 1.7 or newer // // Test with thrust::copy_if() // unsigned long long max_items = ((1ull << ((sizeof(T) * 8) - 1)) - 1); size_t copy_items = (size_t) CUB_MIN(max_items - base, 100); // potential issue with differencing overflows when T is a smaller type than can handle the offset T *h_copy = new T[copy_items]; T *d_copy = NULL; for (unsigned long long i = 0; i < copy_items; ++i) h_copy[i] = d_itr[i]; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * copy_items)); thrust::device_ptr d_copy_wrapper(d_copy); thrust::copy_if(d_itr, d_itr + copy_items, d_copy_wrapper, SelectOp()); int compare = CompareDeviceResults(h_copy, d_copy, copy_items, g_verbose, g_verbose); printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS"); AssertEquals(0, compare); if (h_copy) delete[] h_copy; if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy)); #endif // THRUST_VERSION } /** * Test modified iterator */ template void TestModified() { printf("\nTesting cache-modified iterator on type %s\n", typeid(T).name()); fflush(stdout); // // Test iterator manipulation in kernel // constexpr int TEST_VALUES = 11000; T *h_data = new T[TEST_VALUES]; for (int i = 0; i < TEST_VALUES; ++i) { RandomBits(h_data[i]); } // Allocate device arrays T *d_data = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES)); CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice)); // Initialize reference data T h_reference[8]; h_reference[0] = h_data[0]; // Value at offset 0 h_reference[1] = h_data[100]; // Value at offset 100 h_reference[2] = h_data[1000]; // Value at offset 1000 h_reference[3] = h_data[10000]; // Value at offset 10000 h_reference[4] = h_data[1]; // Value at offset 1 h_reference[5] = h_data[21]; // Value at offset 21 h_reference[6] = h_data[11]; // Value at offset 11 h_reference[7] = h_data[0]; // Value at offset 0; Test(CacheModifiedInputIterator((CastT*) d_data), h_reference); Test(CacheModifiedInputIterator((CastT*) d_data), h_reference); Test(CacheModifiedInputIterator((CastT*) d_data), h_reference); Test(CacheModifiedInputIterator((CastT*) d_data), h_reference); Test(CacheModifiedInputIterator((CastT*) d_data), h_reference); Test(CacheModifiedInputIterator((CastT*) d_data), h_reference); Test(CacheModifiedInputIterator((CastT*) d_data), h_reference); #if (THRUST_VERSION >= 100700) // Thrust 1.7 or newer // // Test with thrust::copy_if() // T *d_copy = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * TEST_VALUES)); CacheModifiedInputIterator d_in_itr((CastT*) d_data); CacheModifiedOutputIterator d_out_itr((CastT*) d_copy); thrust::copy_if(d_in_itr, d_in_itr + TEST_VALUES, d_out_itr, SelectOp()); int compare = CompareDeviceResults(h_data, d_copy, TEST_VALUES, g_verbose, g_verbose); printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS"); AssertEquals(0, compare); // Cleanup if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy)); #endif // THRUST_VERSION if (h_data) delete[] h_data; if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data)); } /** * Test transform iterator */ template void TestTransform() { printf("\nTesting transform iterator on type %s\n", typeid(T).name()); fflush(stdout); // // Test iterator manipulation in kernel // constexpr int TEST_VALUES = 11000; T *h_data = new T[TEST_VALUES]; for (int i = 0; i < TEST_VALUES; ++i) { InitValue(INTEGER_SEED, h_data[i], i); } // Allocate device arrays T *d_data = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES)); CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice)); TransformOp op; // Initialize reference data T h_reference[8]; h_reference[0] = op(h_data[0]); // Value at offset 0 h_reference[1] = op(h_data[100]); // Value at offset 100 h_reference[2] = op(h_data[1000]); // Value at offset 1000 h_reference[3] = op(h_data[10000]); // Value at offset 10000 h_reference[4] = op(h_data[1]); // Value at offset 1 h_reference[5] = op(h_data[21]); // Value at offset 21 h_reference[6] = op(h_data[11]); // Value at offset 11 h_reference[7] = op(h_data[0]); // Value at offset 0; TransformInputIterator, CastT*> d_itr((CastT*) d_data, op); Test(d_itr, h_reference); #if (THRUST_VERSION >= 100700) // Thrust 1.7 or newer // // Test with thrust::copy_if() // T *h_copy = new T[TEST_VALUES]; for (int i = 0; i < TEST_VALUES; ++i) h_copy[i] = op(h_data[i]); T *d_copy = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * TEST_VALUES)); thrust::device_ptr d_copy_wrapper(d_copy); thrust::copy_if(d_itr, d_itr + TEST_VALUES, d_copy_wrapper, SelectOp()); int compare = CompareDeviceResults(h_copy, d_copy, TEST_VALUES, g_verbose, g_verbose); printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS"); AssertEquals(0, compare); // Cleanup if (h_copy) delete[] h_copy; if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy)); #endif // THRUST_VERSION if (h_data) delete[] h_data; if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data)); } /** * Test tex-obj texture iterator */ template void TestTexObj() { printf("\nTesting tex-obj iterator on type %s\n", typeid(T).name()); fflush(stdout); // // Test iterator manipulation in kernel // const unsigned int TEST_VALUES = 11000; const unsigned int DUMMY_OFFSET = 500; const unsigned int DUMMY_TEST_VALUES = TEST_VALUES - DUMMY_OFFSET; T *h_data = new T[TEST_VALUES]; for (int i = 0; i < TEST_VALUES; ++i) { RandomBits(h_data[i]); } // Allocate device arrays T *d_data = NULL; T *d_dummy = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES)); CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_dummy, sizeof(T) * DUMMY_TEST_VALUES)); CubDebugExit(cudaMemcpy(d_dummy, h_data + DUMMY_OFFSET, sizeof(T) * DUMMY_TEST_VALUES, cudaMemcpyHostToDevice)); // Initialize reference data T h_reference[8]; h_reference[0] = h_data[0]; // Value at offset 0 h_reference[1] = h_data[100]; // Value at offset 100 h_reference[2] = h_data[1000]; // Value at offset 1000 h_reference[3] = h_data[10000]; // Value at offset 10000 h_reference[4] = h_data[1]; // Value at offset 1 h_reference[5] = h_data[21]; // Value at offset 21 h_reference[6] = h_data[11]; // Value at offset 11 h_reference[7] = h_data[0]; // Value at offset 0; // Create and bind obj-based test iterator TexObjInputIterator d_obj_itr; CubDebugExit(d_obj_itr.BindTexture((CastT*) d_data, sizeof(T) * TEST_VALUES)); Test(d_obj_itr, h_reference); #if (THRUST_VERSION >= 100700) // Thrust 1.7 or newer // // Test with thrust::copy_if() // T *d_copy = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * TEST_VALUES)); thrust::device_ptr d_copy_wrapper(d_copy); CubDebugExit(cudaMemset(d_copy, 0, sizeof(T) * TEST_VALUES)); thrust::copy_if(d_obj_itr, d_obj_itr + TEST_VALUES, d_copy_wrapper, SelectOp()); int compare = CompareDeviceResults(h_data, d_copy, TEST_VALUES, g_verbose, g_verbose); printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS"); AssertEquals(0, compare); // Cleanup CubDebugExit(d_obj_itr.UnbindTexture()); if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy)); #endif // THRUST_VERSION if (h_data) delete[] h_data; if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data)); if (d_dummy) CubDebugExit(g_allocator.DeviceFree(d_dummy)); } #if CUDART_VERSION >= 5050 /** * Test tex-ref texture iterator */ template void TestTexRef() { printf("\nTesting tex-ref iterator on type %s\n", typeid(T).name()); fflush(stdout); // // Test iterator manipulation in kernel // constexpr int TEST_VALUES = 11000; constexpr unsigned int DUMMY_OFFSET = 500; constexpr unsigned int DUMMY_TEST_VALUES = TEST_VALUES - DUMMY_OFFSET; T *h_data = new T[TEST_VALUES]; for (int i = 0; i < TEST_VALUES; ++i) { RandomBits(h_data[i]); } // Allocate device arrays T *d_data = NULL; T *d_dummy = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES)); CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_dummy, sizeof(T) * DUMMY_TEST_VALUES)); CubDebugExit(cudaMemcpy(d_dummy, h_data + DUMMY_OFFSET, sizeof(T) * DUMMY_TEST_VALUES, cudaMemcpyHostToDevice)); // Initialize reference data T h_reference[8]; h_reference[0] = h_data[0]; // Value at offset 0 h_reference[1] = h_data[100]; // Value at offset 100 h_reference[2] = h_data[1000]; // Value at offset 1000 h_reference[3] = h_data[10000]; // Value at offset 10000 h_reference[4] = h_data[1]; // Value at offset 1 h_reference[5] = h_data[21]; // Value at offset 21 h_reference[6] = h_data[11]; // Value at offset 11 h_reference[7] = h_data[0]; // Value at offset 0; // Create and bind ref-based test iterator TexRefInputIterator d_ref_itr; CubDebugExit(d_ref_itr.BindTexture((CastT*) d_data, sizeof(T) * TEST_VALUES)); // Create and bind dummy iterator of same type to check with interferance TexRefInputIterator d_ref_itr2; CubDebugExit(d_ref_itr2.BindTexture((CastT*) d_dummy, sizeof(T) * DUMMY_TEST_VALUES)); Test(d_ref_itr, h_reference); #if (THRUST_VERSION >= 100700) // Thrust 1.7 or newer // // Test with thrust::copy_if() // T *d_copy = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * TEST_VALUES)); thrust::device_ptr d_copy_wrapper(d_copy); CubDebugExit(cudaMemset(d_copy, 0, sizeof(T) * TEST_VALUES)); thrust::copy_if(d_ref_itr, d_ref_itr + TEST_VALUES, d_copy_wrapper, SelectOp()); int compare = CompareDeviceResults(h_data, d_copy, TEST_VALUES, g_verbose, g_verbose); printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS"); AssertEquals(0, compare); if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy)); #endif // THRUST_VERSION CubDebugExit(d_ref_itr.UnbindTexture()); CubDebugExit(d_ref_itr2.UnbindTexture()); if (h_data) delete[] h_data; if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data)); if (d_dummy) CubDebugExit(g_allocator.DeviceFree(d_dummy)); } /** * Test texture transform iterator */ template void TestTexTransform() { printf("\nTesting tex-transform iterator on type %s\n", typeid(T).name()); fflush(stdout); // // Test iterator manipulation in kernel // constexpr int TEST_VALUES = 11000; T *h_data = new T[TEST_VALUES]; for (int i = 0; i < TEST_VALUES; ++i) { InitValue(INTEGER_SEED, h_data[i], i); } // Allocate device arrays T *d_data = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES)); CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice)); TransformOp op; // Initialize reference data T h_reference[8]; h_reference[0] = op(h_data[0]); // Value at offset 0 h_reference[1] = op(h_data[100]); // Value at offset 100 h_reference[2] = op(h_data[1000]); // Value at offset 1000 h_reference[3] = op(h_data[10000]); // Value at offset 10000 h_reference[4] = op(h_data[1]); // Value at offset 1 h_reference[5] = op(h_data[21]); // Value at offset 21 h_reference[6] = op(h_data[11]); // Value at offset 11 h_reference[7] = op(h_data[0]); // Value at offset 0; // Create and bind texture iterator typedef TexRefInputIterator TextureIterator; TextureIterator d_tex_itr; CubDebugExit(d_tex_itr.BindTexture((CastT*) d_data, sizeof(T) * TEST_VALUES)); // Create transform iterator TransformInputIterator, TextureIterator> xform_itr(d_tex_itr, op); Test(xform_itr, h_reference); #if (THRUST_VERSION >= 100700) // Thrust 1.7 or newer // // Test with thrust::copy_if() // T *h_copy = new T[TEST_VALUES]; for (int i = 0; i < TEST_VALUES; ++i) h_copy[i] = op(h_data[i]); T *d_copy = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * TEST_VALUES)); thrust::device_ptr d_copy_wrapper(d_copy); thrust::copy_if(xform_itr, xform_itr + TEST_VALUES, d_copy_wrapper, SelectOp()); int compare = CompareDeviceResults(h_copy, d_copy, TEST_VALUES, g_verbose, g_verbose); printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS"); AssertEquals(0, compare); // Cleanup if (h_copy) delete[] h_copy; if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy)); #endif // THRUST_VERSION CubDebugExit(d_tex_itr.UnbindTexture()); if (h_data) delete[] h_data; if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data)); } #endif // CUDART_VERSION /** * Run non-integer tests */ template void Test(Int2Type /* is_integer */) { TestModified(); TestTransform(); #if CUB_CDP // Test tex-obj iterators if CUDA dynamic parallelism enabled TestTexObj(type_string); #endif // CUB_CDP #if CUDART_VERSION >= 5050 // Test tex-ref iterators for CUDA 5.5 TestTexRef(); TestTexTransform(); #endif // CUDART_VERSION } /** * Run integer tests */ template void Test(Int2Type /* is_integer */) { TestConstant(0); TestConstant(99); TestCounting(0); TestCounting(99); // Run non-integer tests Test(Int2Type()); } /** * Run tests */ template void Test() { enum { IS_INTEGER = (Traits::CATEGORY == SIGNED_INTEGER) || (Traits::CATEGORY == UNSIGNED_INTEGER) }; // Test non-const type Test(Int2Type()); // Test non-const type Test(Int2Type()); } /** * Main */ int main(int argc, char** argv) { // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--device=] " "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); // Get ptx version int ptx_version = 0; CubDebugExit(PtxVersion(ptx_version)); // Evaluate different data types Test(); Test(); Test(); Test(); Test(); Test(); if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted Test(); Test(); Test(); Test(); Test(); Test(); Test(); if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted Test(); Test(); Test(); Test(); Test(); Test(); Test(); if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted Test(); Test(); Test(); Test(); Test(); Test(); Test(); if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted Test(); Test(); Test(); printf("\nTest complete\n"); fflush(stdout); return 0; }