/******************************************************************************
 * Copyright (c) 2011, Duane Merrill. All rights reserved.
 * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/
/**
 * \file
 * Properties of a given CUDA device and the corresponding PTX bundle
 */

#pragma once

#include "util_type.cuh"
#include "util_arch.cuh"
#include "util_debug.cuh"
#include "util_cpp_dialect.cuh"
#include "util_namespace.cuh"
#include "util_macro.cuh"

#if CUB_CPP_DIALECT >= 2011 // C++11 and later.
#include <atomic>
#include <array>
#include <cassert>
#endif

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/**
 * \addtogroup UtilMgmt
 * @{
 */

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/**
 * \brief Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed).
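 *
 * \par Snippet
 * A minimal usage sketch of the usual two-pass pattern (the allocation
 * counts and sizes below are illustrative): call once with a NULL
 * \p d_temp_storage to obtain the required size, then call again to carve
 * the user-provided allocation into aligned sub-allocations.
 * \code
 * void*  allocations[2]      = {NULL, NULL};
 * size_t allocation_sizes[2] = {1024 * sizeof(int), 2048 * sizeof(float)};
 *
 * // First pass: determine how many bytes of temporary storage are needed
 * void*  d_temp_storage     = NULL;
 * size_t temp_storage_bytes = 0;
 * AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
 *
 * // Allocate, then alias the sub-allocations into the single allocation
 * cudaMalloc(&d_temp_storage, temp_storage_bytes);
 * AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
 * // allocations[0] and allocations[1] now point to aligned regions within d_temp_storage
 * \endcode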
 */
template <int ALLOCATIONS>
__host__ __device__ __forceinline__
cudaError_t AliasTemporaries(
    void*   d_temp_storage,                     ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
    size_t& temp_storage_bytes,                 ///< [in,out] Size in bytes of \p d_temp_storage allocation
    void*   (&allocations)[ALLOCATIONS],        ///< [in,out] Pointers to device allocations needed
    size_t  (&allocation_sizes)[ALLOCATIONS])   ///< [in] Sizes in bytes of device allocations needed
{
    const int ALIGN_BYTES = 256;
    const int ALIGN_MASK  = ~(ALIGN_BYTES - 1);

    // Compute exclusive prefix sum over allocation requests
    size_t allocation_offsets[ALLOCATIONS];
    size_t bytes_needed = 0;
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
        allocation_offsets[i] = bytes_needed;
        bytes_needed += allocation_bytes;
    }
    bytes_needed += ALIGN_BYTES - 1;

    // Check if the caller is simply requesting the size of the storage allocation
    if (!d_temp_storage)
    {
        temp_storage_bytes = bytes_needed;
        return cudaSuccess;
    }

    // Check if enough storage provided
    if (temp_storage_bytes < bytes_needed)
    {
        return CubDebug(cudaErrorInvalidValue);
    }

    // Alias
    d_temp_storage = (void *) ((size_t(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK);
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        allocations[i] = static_cast<char*>(d_temp_storage) + allocation_offsets[i];
    }

    return cudaSuccess;
}
/**
 * \brief Empty kernel for querying PTX manifest metadata (e.g., version) for the current device
 */
template <typename T>
__global__ void EmptyKernel(void) { }

#endif  // DOXYGEN_SHOULD_SKIP_THIS
/**
 * \brief Returns the current device or -1 if an error occurred.
 */
CUB_RUNTIME_FUNCTION __forceinline__ int CurrentDevice()
{
#if defined(CUB_RUNTIME_ENABLED) // Host code or device code with the CUDA runtime.
    int device = -1;
    if (CubDebug(cudaGetDevice(&device))) return -1;
    return device;
#else // Device code without the CUDA runtime.
    return -1;
#endif
}

/**
 * \brief RAII helper which saves the current device and switches to the
 *        specified device on construction and switches to the saved device on
 *        destruction.
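 *
 * \par Snippet
 * A minimal usage sketch (the device ordinal below is illustrative):
 * \code
 * {
 *     SwitchDevice guard(1);  // switches to device 1 if it is not already current
 *     // ... work on device 1 ...
 * }                           // destructor restores the previously current device
 * \endcode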
 */
struct SwitchDevice
{
private:
    int const  old_device;
    bool const needs_reset;
public:
    __host__ __forceinline__ SwitchDevice(int new_device)
      : old_device(CurrentDevice()), needs_reset(old_device != new_device)
    {
        if (needs_reset)
            CubDebug(cudaSetDevice(new_device));
    }

    __host__ __forceinline__ ~SwitchDevice()
    {
        if (needs_reset)
            CubDebug(cudaSetDevice(old_device));
    }
};

/**
 * \brief Returns the number of CUDA devices available or -1 if an error
 *        occurred.
 */
CUB_RUNTIME_FUNCTION __forceinline__ int DeviceCountUncached()
{
#if defined(CUB_RUNTIME_ENABLED) // Host code or device code with the CUDA runtime.
    int count = -1;
    if (CubDebug(cudaGetDeviceCount(&count)))
        // CUDA makes no guarantees about the state of the output parameter if
        // `cudaGetDeviceCount` fails; in practice it is left unmodified, but
        // out of paranoia we reset `count` to `-1` anyway.
        count = -1;
    return count;
#else // Device code without the CUDA runtime.
    return -1;
#endif
}
#if CUB_CPP_DIALECT >= 2011 // C++11 and later.

/**
 * \brief Cache for an arbitrary value produced by a nullary function.
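 *
 * \par Snippet
 * A minimal sketch; this is exactly the pattern used by
 * \p DeviceCountCachedValue further down in this header:
 * \code
 * // The wrapped function runs once, when the cache object is constructed.
 * static ValueCache<int, DeviceCountUncached> cache;
 * int device_count = cache.value;
 * \endcode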
 */
template <typename T, T(*Function)()>
struct ValueCache
{
    T const value;

    /**
     * \brief Call the nullary function to produce the value and construct the
     *        cache.
     */
    __host__ __forceinline__ ValueCache() : value(Function()) {}
};

#endif

#if CUB_CPP_DIALECT >= 2011
// Host code, only safely usable in C++11 or newer, where thread-safe
// initialization of static locals is guaranteed. This is a separate function
// to avoid defining a local static in a host/device function.
__host__ __forceinline__ int DeviceCountCachedValue()
{
    static ValueCache<int, DeviceCountUncached> cache;
    return cache.value;
}
#endif

/**
 * \brief Returns the number of CUDA devices available.
 *
 * \note This function may cache the result internally.
 *
 * \note This function is thread safe.
 */
CUB_RUNTIME_FUNCTION __forceinline__ int DeviceCount()
{
    int result = -1;
    if (CUB_IS_HOST_CODE) {
        #if CUB_INCLUDE_HOST_CODE
            #if CUB_CPP_DIALECT >= 2011
                // Host code and C++11.
                result = DeviceCountCachedValue();
            #else
                // Host code and C++98.
                result = DeviceCountUncached();
            #endif
        #endif
    } else {
        #if CUB_INCLUDE_DEVICE_CODE
            // Device code.
            result = DeviceCountUncached();
        #endif
    }
    return result;
}

#if CUB_CPP_DIALECT >= 2011 // C++11 and later.

/**
 * \brief Per-device cache for a CUDA attribute value; each device's attribute
 *        is queried once and cached on first access.
 */
struct PerDeviceAttributeCache
{
    struct DevicePayload
    {
        int         attribute;
        cudaError_t error;
    };
    // Each entry starts in the `DeviceEntryEmpty` state, then proceeds to the
    // `DeviceEntryInitializing` state, and then proceeds to the
    // `DeviceEntryReady` state. These are the only state transitions allowed;
    // i.e. it is a linear sequence of transitions.
    enum DeviceEntryStatus
    {
        DeviceEntryEmpty = 0,
        DeviceEntryInitializing,
        DeviceEntryReady
    };

    struct DeviceEntry
    {
        std::atomic<DeviceEntryStatus> flag;
        DevicePayload                  payload;
    };

private:
    std::array<DeviceEntry, CUB_MAX_DEVICES> entries_;

public:
    /**
     * \brief Construct the cache.
     */
    __host__ __forceinline__ PerDeviceAttributeCache() : entries_()
    {
        assert(DeviceCount() <= CUB_MAX_DEVICES);
    }

    /**
     * \brief Retrieves the payload of the cached function \p f for \p device.
     *
     * \note You must pass a morally equivalent function in to every call or
     *       this function has undefined behavior.
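     *
     * \par Snippet
     * A minimal sketch of how this cache is driven; it mirrors the way
     * \p PtxVersion uses it further down in this header (the device ordinal
     * below is illustrative):
     * \code
     * PerDeviceAttributeCache cache;
     * int device      = 0;
     * int ptx_version = 0;
     * auto const payload = cache(
     *     [=] (int& pv) { return PtxVersionUncached(pv, device); },
     *     device);
     * if (payload.error == cudaSuccess)
     *     ptx_version = payload.attribute;
     * \endcode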
     */
    template <typename Invocable>
    __host__ DevicePayload operator()(Invocable&& f, int device)
    {
        if (device >= DeviceCount())
            return DevicePayload{0, cudaErrorInvalidDevice};

        auto& entry   = entries_[device];
        auto& flag    = entry.flag;
        auto& payload = entry.payload;

        DeviceEntryStatus old_status = DeviceEntryEmpty;

        // First, check for the common case of the entry being ready.
        if (flag.load(std::memory_order_acquire) != DeviceEntryReady)
        {
            // Assume the entry is empty and attempt to lock it so we can fill
            // it by trying to set the state from `DeviceEntryEmpty` to
            // `DeviceEntryInitializing`.
            if (flag.compare_exchange_strong(old_status, DeviceEntryInitializing,
                                             std::memory_order_acq_rel,
                                             std::memory_order_acquire))
            {
                // We successfully set the state to `DeviceEntryInitializing`;
                // we have the lock and it's our job to initialize this entry
                // and then release it.

                // We don't use `CubDebug` here because we let the user code
                // decide whether or not errors are hard errors.
                if (payload.error = std::forward<Invocable>(f)(payload.attribute))
                    // Clear the global CUDA error state which may have been
                    // set by the last call. Otherwise, errors may "leak" to
                    // unrelated kernel launches.
                    cudaGetLastError();

                // Release the lock by setting the state to `DeviceEntryReady`.
                flag.store(DeviceEntryReady, std::memory_order_release);
            }
            // If the `compare_exchange_strong` failed, then `old_status` has
            // been updated with the value of `flag` that it observed.
            else if (old_status == DeviceEntryInitializing)
            {
                // Another execution agent is initializing this entry; we need
                // to wait for them to finish; we'll know they're done when we
                // observe the entry status as `DeviceEntryReady`.
                do { old_status = flag.load(std::memory_order_acquire); }
                while (old_status != DeviceEntryReady);
                // FIXME: Use `atomic::wait` instead when we have access to
                // host-side C++20 atomics. We could use libcu++, but it only
                // supports atomics for SM60 and up, even if you're only using
                // them in host code.
            }
        }

        // We now know that the state of our entry is `DeviceEntryReady`, so
        // just return the entry's payload.
        return entry.payload;
    }
};

#endif

/**
 * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10).
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersionUncached(int& ptx_version)
{
    // Instantiate `EmptyKernel<void>` in both host and device code to ensure
    // it can be called.
    typedef void (*EmptyKernelPtr)();
    EmptyKernelPtr empty_kernel = EmptyKernel<void>;
    // This is necessary to suppress unused-variable warnings in host compilers.
    // The usual syntax of (void)empty_kernel; was not sufficient on MSVC2015.
    (void)reinterpret_cast<void*>(empty_kernel);

    cudaError_t result = cudaSuccess;
    if (CUB_IS_HOST_CODE) {
        #if CUB_INCLUDE_HOST_CODE
            cudaFuncAttributes empty_kernel_attrs;
            do {
                if (CubDebug(result = cudaFuncGetAttributes(&empty_kernel_attrs, empty_kernel)))
                    break;
            }
            while (0);

            ptx_version = empty_kernel_attrs.ptxVersion * 10;
        #endif
    } else {
        #if CUB_INCLUDE_DEVICE_CODE
            // This is necessary to ensure instantiation of EmptyKernel in device code.
            // The `reinterpret_cast` is necessary to suppress a set-but-unused warning.
            // This is a meme now: https://twitter.com/blelbach/status/1222391615576100864
            (void)reinterpret_cast<EmptyKernelPtr>(empty_kernel);
            ptx_version = CUB_PTX_ARCH;
        #endif
    }
    return result;
}

/**
 * \brief Retrieves the PTX version that will be used on \p device (major * 100 + minor * 10).
 */
__host__ __forceinline__ cudaError_t PtxVersionUncached(int& ptx_version, int device)
{
    SwitchDevice sd(device);
    return PtxVersionUncached(ptx_version);
}

#if CUB_CPP_DIALECT >= 2011 // C++11 and later.
template <typename Tag>
__host__ __forceinline__ PerDeviceAttributeCache& GetPerDeviceAttributeCache()
{
    // C++11 guarantees that initialization of static locals is thread safe.
    static PerDeviceAttributeCache cache;
    return cache;
}

struct PtxVersionCacheTag {};
struct SmVersionCacheTag {};
#endif

/**
 * \brief Retrieves the PTX version that will be used on \p device (major * 100 + minor * 10).
 *
 * \note This function may cache the result internally.
 *
 * \note This function is thread safe.
 */
__host__ __forceinline__ cudaError_t PtxVersion(int& ptx_version, int device)
{
#if CUB_CPP_DIALECT >= 2011 // C++11 and later.
    auto const payload = GetPerDeviceAttributeCache<PtxVersionCacheTag>()(
        // If this call fails, then we get the error code back in the payload,
        // which we check with `CubDebug` below.
        [=] (int& pv) { return PtxVersionUncached(pv, device); },
        device);

    if (!CubDebug(payload.error))
        ptx_version = payload.attribute;

    return payload.error;
#else // Pre C++11.
    return PtxVersionUncached(ptx_version, device);
#endif
}

/**
 * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10).
 *
 * \note This function may cache the result internally.
 *
 * \note This function is thread safe.
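 *
 * \par Snippet
 * A minimal sketch of typical dispatch-time usage (the version threshold
 * below is illustrative):
 * \code
 * int ptx_version = 0;
 * if (PtxVersion(ptx_version) == cudaSuccess)
 * {
 *     if (ptx_version >= 600)
 *     {
 *         // ... select tunings compiled for SM60+ PTX ...
 *     }
 * }
 * \endcode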
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int& ptx_version)
{
    cudaError_t result = cudaErrorUnknown;
    if (CUB_IS_HOST_CODE) {
        #if CUB_INCLUDE_HOST_CODE
            #if CUB_CPP_DIALECT >= 2011
                // Host code and C++11.
                auto const device  = CurrentDevice();
                auto const payload = GetPerDeviceAttributeCache<PtxVersionCacheTag>()(
                    // If this call fails, then we get the error code back in the payload,
                    // which we check with `CubDebug` below.
                    [=] (int& pv) { return PtxVersionUncached(pv, device); },
                    device);

                if (!CubDebug(payload.error))
                    ptx_version = payload.attribute;

                result = payload.error;
            #else
                // Host code and C++98.
                result = PtxVersionUncached(ptx_version);
            #endif
        #endif
    } else {
        #if CUB_INCLUDE_DEVICE_CODE
            // Device code.
            result = PtxVersionUncached(ptx_version);
        #endif
    }
    return result;
}
/**
 * \brief Retrieves the SM version of \p device (major * 100 + minor * 10)
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersionUncached(int& sm_version, int device = CurrentDevice())
{
#if defined(CUB_RUNTIME_ENABLED) // Host code or device code with the CUDA runtime.
    cudaError_t error = cudaSuccess;
    do
    {
        int major = 0, minor = 0;
        if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device))) break;
        if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device))) break;
        sm_version = major * 100 + minor * 10;
    }
    while (0);

    return error;
#else // Device code without the CUDA runtime.
    (void)sm_version;
    (void)device;

    // CUDA API calls are not supported from this device.
    return CubDebug(cudaErrorInvalidConfiguration);
#endif
}
/**
 * \brief Retrieves the SM version of \p device (major * 100 + minor * 10)
 *
 * \note This function may cache the result internally.
 *
 * \note This function is thread safe.
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int& sm_version, int device = CurrentDevice())
{
    cudaError_t result = cudaErrorUnknown;
    if (CUB_IS_HOST_CODE) {
        #if CUB_INCLUDE_HOST_CODE
            #if CUB_CPP_DIALECT >= 2011
                // Host code and C++11
                auto const payload = GetPerDeviceAttributeCache<SmVersionCacheTag>()(
                    // If this call fails, then we get the error code back in the payload,
                    // which we check with `CubDebug` below.
                    [=] (int& pv) { return SmVersionUncached(pv, device); },
                    device);

                if (!CubDebug(payload.error))
                    sm_version = payload.attribute;

                result = payload.error;
            #else
                // Host code and C++98
                result = SmVersionUncached(sm_version, device);
            #endif
        #endif
    } else {
        #if CUB_INCLUDE_DEVICE_CODE
            result = SmVersionUncached(sm_version, device);
        #endif
    }
    return result;
}
/**
 * Synchronize the specified \p stream.
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SyncStream(cudaStream_t stream)
{
    cudaError_t result = cudaErrorUnknown;
    if (CUB_IS_HOST_CODE) {
        #if CUB_INCLUDE_HOST_CODE
            result = CubDebug(cudaStreamSynchronize(stream));
        #endif
    } else {
        #if CUB_INCLUDE_DEVICE_CODE
            #if defined(CUB_RUNTIME_ENABLED) // Device code with the CUDA runtime.
                (void)stream;
                // Device can't yet sync on a specific stream
                result = CubDebug(cudaDeviceSynchronize());
            #else // Device code without the CUDA runtime.
                (void)stream;
                // CUDA API calls are not supported from this device.
                result = CubDebug(cudaErrorInvalidConfiguration);
            #endif
        #endif
    }
    return result;
}
/**
 * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr on the current device with \p block_threads per thread block.
 *
 * \par Snippet
 * The code snippet below illustrates the use of the MaxSmOccupancy function.
 * \par
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/util_device.cuh>
 *
 * template <typename T>
 * __global__ void ExampleKernel()
 * {
 *     // Allocate shared memory for BlockScan
 *     __shared__ volatile T buffer[4096];
 *
 *     ...
 * }
 *
 * ...
 *
 * // Determine SM occupancy for ExampleKernel specialized for unsigned char
 * int max_sm_occupancy;
 * MaxSmOccupancy(max_sm_occupancy, ExampleKernel<unsigned char>, 64);
 *
 * // max_sm_occupancy  <-- 4 on SM10
 * // max_sm_occupancy  <-- 8 on SM20
 * // max_sm_occupancy  <-- 12 on SM35
 *
 * \endcode
 *
 */
template <typename KernelPtr>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t MaxSmOccupancy(
    int&        max_sm_occupancy,       ///< [out] maximum number of thread blocks that can reside on a single SM
    KernelPtr   kernel_ptr,             ///< [in] Kernel pointer for which to compute SM occupancy
    int         block_threads,          ///< [in] Number of threads per thread block
    int         dynamic_smem_bytes = 0) ///< [in] Dynamically allocated shared memory per thread block, in bytes (default: 0)
{
#ifndef CUB_RUNTIME_ENABLED
    (void)dynamic_smem_bytes;
    (void)block_threads;
    (void)kernel_ptr;
    (void)max_sm_occupancy;

    // CUDA API calls not supported from this device
    return CubDebug(cudaErrorInvalidConfiguration);
#else
    return CubDebug(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_sm_occupancy,
        kernel_ptr,
        block_threads,
        dynamic_smem_bytes));
#endif  // CUB_RUNTIME_ENABLED
}
/******************************************************************************
 * Policy management
 ******************************************************************************/

/**
 * Kernel dispatch configuration
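 *
 * \par Snippet
 * A minimal sketch; \p AgentPolicy and \p ExampleKernel below are hypothetical
 * and only need to expose \p BLOCK_THREADS and \p ITEMS_PER_THREAD:
 * \code
 * struct AgentPolicy
 * {
 *     enum { BLOCK_THREADS = 128, ITEMS_PER_THREAD = 4 };
 * };
 *
 * KernelConfig config;
 * cudaError_t error = config.Init<AgentPolicy>(ExampleKernel<int>);
 * // config.tile_size == 512; config.sm_occupancy is filled in via MaxSmOccupancy
 * \endcode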
 */
struct KernelConfig
{
    int block_threads;
    int items_per_thread;
    int tile_size;
    int sm_occupancy;

    CUB_RUNTIME_FUNCTION __forceinline__
    KernelConfig() : block_threads(0), items_per_thread(0), tile_size(0), sm_occupancy(0) {}

    template <typename AgentPolicyT, typename KernelPtrT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t Init(KernelPtrT kernel_ptr)
    {
        block_threads    = AgentPolicyT::BLOCK_THREADS;
        items_per_thread = AgentPolicyT::ITEMS_PER_THREAD;
        tile_size        = block_threads * items_per_thread;
        cudaError_t retval = MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads);
        return retval;
    }
};
/// Helper for dispatching into a policy chain
template <int PTX_VERSION, typename PolicyT, typename PrevPolicyT>
struct ChainedPolicy
{
    /// The policy for the active compiler pass
    typedef typename If<(CUB_PTX_ARCH < PTX_VERSION), typename PrevPolicyT::ActivePolicy, PolicyT>::Type ActivePolicy;
    /// Specializes and dispatches \p op according to the first policy in the chain whose PTX version is adequate
    template <typename FunctorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Invoke(int ptx_version, FunctorT& op)
    {
        if (ptx_version < PTX_VERSION) {
            return PrevPolicyT::Invoke(ptx_version, op);
        }
        return op.template Invoke<PolicyT>();
    }
};

/// Helper for dispatching into a policy chain (end-of-chain specialization)
template <int PTX_VERSION, typename PolicyT>
struct ChainedPolicy<PTX_VERSION, PolicyT, PolicyT>
{
    /// The policy for the active compiler pass
    typedef PolicyT ActivePolicy;
    /// Specializes and dispatches \p op according to the first policy in the chain whose PTX version is adequate
    template <typename FunctorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Invoke(int /*ptx_version*/, FunctorT& op) {
        return op.template Invoke<PolicyT>();
    }
};
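
// Example (illustrative sketch): a two-policy chain in which PTX 3.5 and newer
// selects `Policy350` and anything older falls back to `Policy200`. The policy
// names and the `DispatchOp` functor below are hypothetical.
//
//     struct Policy200 : ChainedPolicy<200, Policy200, Policy200> { /* SM20 tunings */ };
//     struct Policy350 : ChainedPolicy<350, Policy350, Policy200> { /* SM35 tunings */ };
//     typedef Policy350 MaxPolicy;
//
//     struct DispatchOp
//     {
//         template <typename ActivePolicyT>
//         CUB_RUNTIME_FUNCTION cudaError_t Invoke()
//         {
//             // ... launch kernels tuned by ActivePolicyT ...
//             return cudaSuccess;
//         }
//     };
//
//     int ptx_version = 0;
//     if (PtxVersion(ptx_version) == cudaSuccess)
//     {
//         DispatchOp op;
//         MaxPolicy::Invoke(ptx_version, op);   // dispatches to Invoke<Policy350> or Invoke<Policy200>
//     }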

/** @} */       // end group UtilMgmt

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)