#include #include #include #include #include #if THRUST_CPP_DIALECT >= 2011 #include #endif // This example demonstrates two ways to achieve algorithm invocations that are asynchronous with // the calling thread. // // The first method wraps a call to thrust::reduce inside a __global__ function. Since __global__ function // launches are asynchronous with the launching thread, this achieves asynchrony. The result of the reduction // is stored to a pointer to CUDA global memory. The calling thread waits for the result of the reduction to // be ready by synchronizing with the CUDA stream on which the __global__ function is launched. // // The second method uses the C++11 library function, std::async, to create concurrency. The lambda function // given to std::async returns the result of thrust::reduce to a std::future. The calling thread can use the // std::future to wait for the result of the reduction. This method requires a compiler which supports // C++11-capable language and library constructs. template __global__ void reduce_kernel(Iterator first, Iterator last, T init, BinaryOperation binary_op, Pointer result) { *result = thrust::reduce(thrust::cuda::par, first, last, init, binary_op); } int main() { size_t n = 1 << 20; thrust::device_vector data(n, 1); thrust::device_vector result(1, 0); // method 1: call thrust::reduce from an asynchronous CUDA kernel launch // create a CUDA stream cudaStream_t s; cudaStreamCreate(&s); // launch a CUDA kernel with only 1 thread on our stream reduce_kernel<<<1,1,0,s>>>(data.begin(), data.end(), 0, thrust::plus(), result.data()); // wait for the stream to finish cudaStreamSynchronize(s); // our result should be ready assert(result[0] == n); cudaStreamDestroy(s); // reset the result result[0] = 0; #if THRUST_CPP_DIALECT >= 2011 // method 2: use std::async to create asynchrony // copy all the algorithm parameters auto begin = data.begin(); auto end = data.end(); unsigned int init = 0; auto binary_op = thrust::plus(); // std::async captures the algorithm parameters by value // use std::launch::async to ensure the creation of a new thread std::future future_result = std::async(std::launch::async, [=] { return thrust::reduce(begin, end, init, binary_op); }); // wait on the result and check that it is correct assert(future_result.get() == n); #endif return 0; }