Spaces:
Runtime error
Runtime error
/****************************************************************************** | |
* Copyright (c) 2011, Duane Merrill. All rights reserved. | |
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. | |
* | |
* Redistribution and use in source and binary forms, with or without | |
* modification, are permitted provided that the following conditions are met: | |
* * Redistributions of source code must retain the above copyright | |
* notice, this list of conditions and the following disclaimer. | |
* * Redistributions in binary form must reproduce the above copyright | |
* notice, this list of conditions and the following disclaimer in the | |
* documentation and/or other materials provided with the distribution. | |
* * Neither the name of the NVIDIA CORPORATION nor the | |
* names of its contributors may be used to endorse or promote products | |
* derived from this software without specific prior written permission. | |
* | |
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | |
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY | |
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | |
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND | |
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
* | |
******************************************************************************/ | |
/** | |
* \file | |
* PTX intrinsics | |
*/ | |
#pragma once | |
#include "util_type.cuh" | |
#include "util_arch.cuh" | |
#include "util_namespace.cuh" | |
#include "util_debug.cuh" | |
/// Optional outer namespace(s) | |
CUB_NS_PREFIX | |
/// CUB namespace | |
namespace cub { | |
/** | |
* \addtogroup UtilPtx | |
* @{ | |
*/ | |
/****************************************************************************** | |
* PTX helper macros | |
******************************************************************************/ | |
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document | |
/** | |
* Register modifier for pointer-types (for inlining PTX assembly) | |
*/ | |
#if defined(_WIN64) || defined(__LP64__) | |
#define __CUB_LP64__ 1 | |
// 64-bit register modifier for inlined asm | |
#define _CUB_ASM_PTR_ "l" | |
#define _CUB_ASM_PTR_SIZE_ "u64" | |
#else | |
#define __CUB_LP64__ 0 | |
// 32-bit register modifier for inlined asm | |
#define _CUB_ASM_PTR_ "r" | |
#define _CUB_ASM_PTR_SIZE_ "u32" | |
#endif | |
#endif // DOXYGEN_SHOULD_SKIP_THIS | |
/****************************************************************************** | |
* Inlined PTX intrinsics | |
******************************************************************************/ | |
/** | |
* \brief Shift-right then add. Returns (\p x >> \p shift) + \p addend. | |
*/ | |
__device__ __forceinline__ unsigned int SHR_ADD( | |
unsigned int x, | |
unsigned int shift, | |
unsigned int addend) | |
{ | |
unsigned int ret; | |
asm ("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" : | |
"=r"(ret) : "r"(x), "r"(shift), "r"(addend)); | |
return ret; | |
} | |
/** | |
* \brief Shift-left then add. Returns (\p x << \p shift) + \p addend. | |
*/ | |
__device__ __forceinline__ unsigned int SHL_ADD( | |
unsigned int x, | |
unsigned int shift, | |
unsigned int addend) | |
{ | |
unsigned int ret; | |
asm ("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" : | |
"=r"(ret) : "r"(x), "r"(shift), "r"(addend)); | |
return ret; | |
} | |
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document | |
/** | |
* Bitfield-extract. | |
*/ | |
template <typename UnsignedBits, int BYTE_LEN> | |
__device__ __forceinline__ unsigned int BFE( | |
UnsignedBits source, | |
unsigned int bit_start, | |
unsigned int num_bits, | |
Int2Type<BYTE_LEN> /*byte_len*/) | |
{ | |
unsigned int bits; | |
asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits)); | |
return bits; | |
} | |
/** | |
* Bitfield-extract for 64-bit types. | |
*/ | |
template <typename UnsignedBits> | |
__device__ __forceinline__ unsigned int BFE( | |
UnsignedBits source, | |
unsigned int bit_start, | |
unsigned int num_bits, | |
Int2Type<8> /*byte_len*/) | |
{ | |
const unsigned long long MASK = (1ull << num_bits) - 1; | |
return (source >> bit_start) & MASK; | |
} | |
#endif // DOXYGEN_SHOULD_SKIP_THIS | |
/** | |
* \brief Bitfield-extract. Extracts \p num_bits from \p source starting at bit-offset \p bit_start. The input \p source may be an 8b, 16b, 32b, or 64b unsigned integer type. | |
*/ | |
template <typename UnsignedBits> | |
__device__ __forceinline__ unsigned int BFE( | |
UnsignedBits source, | |
unsigned int bit_start, | |
unsigned int num_bits) | |
{ | |
return BFE(source, bit_start, num_bits, Int2Type<sizeof(UnsignedBits)>()); | |
} | |
/** | |
* \brief Bitfield insert. Inserts the \p num_bits least significant bits of \p y into \p x at bit-offset \p bit_start. | |
*/ | |
__device__ __forceinline__ void BFI( | |
unsigned int &ret, | |
unsigned int x, | |
unsigned int y, | |
unsigned int bit_start, | |
unsigned int num_bits) | |
{ | |
asm ("bfi.b32 %0, %1, %2, %3, %4;" : | |
"=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits)); | |
} | |
/** | |
* \brief Three-operand add. Returns \p x + \p y + \p z. | |
*/ | |
__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z) | |
{ | |
asm ("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z)); | |
return x; | |
} | |
/** | |
* \brief Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register. For SM2.0 or later. | |
* | |
* \par | |
* The bytes in the two source registers \p a and \p b are numbered from 0 to 7: | |
* {\p b, \p a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}. For each of the four bytes | |
* {b3, b2, b1, b0} selected in the return value, a 4-bit selector is defined within | |
* the four lower "nibbles" of \p index: {\p index } = {n7, n6, n5, n4, n3, n2, n1, n0} | |
* | |
* \par Snippet | |
* The code snippet below illustrates byte-permute. | |
* \par | |
* \code | |
* #include <cub/cub.cuh> | |
* | |
* __global__ void ExampleKernel(...) | |
* { | |
* int a = 0x03020100; | |
* int b = 0x07060504; | |
* int index = 0x00007531; | |
* | |
* int selected = PRMT(a, b, index); // 0x07050301 | |
* | |
* \endcode | |
* | |
*/ | |
__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index) | |
{ | |
int ret; | |
asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index)); | |
return ret; | |
} | |
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document | |
/** | |
* Sync-threads barrier. | |
*/ | |
__device__ __forceinline__ void BAR(int count) | |
{ | |
asm volatile("bar.sync 1, %0;" : : "r"(count)); | |
} | |
/** | |
* CTA barrier | |
*/ | |
__device__ __forceinline__ void CTA_SYNC() | |
{ | |
__syncthreads(); | |
} | |
/** | |
* CTA barrier with predicate | |
*/ | |
__device__ __forceinline__ int CTA_SYNC_AND(int p) | |
{ | |
return __syncthreads_and(p); | |
} | |
/** | |
* Warp barrier | |
*/ | |
__device__ __forceinline__ void WARP_SYNC(unsigned int member_mask) | |
{ | |
#ifdef CUB_USE_COOPERATIVE_GROUPS | |
__syncwarp(member_mask); | |
#endif | |
} | |
/** | |
* Warp any | |
*/ | |
__device__ __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask) | |
{ | |
#ifdef CUB_USE_COOPERATIVE_GROUPS | |
return __any_sync(member_mask, predicate); | |
#else | |
return ::__any(predicate); | |
#endif | |
} | |
/** | |
* Warp any | |
*/ | |
__device__ __forceinline__ int WARP_ALL(int predicate, unsigned int member_mask) | |
{ | |
#ifdef CUB_USE_COOPERATIVE_GROUPS | |
return __all_sync(member_mask, predicate); | |
#else | |
return ::__all(predicate); | |
#endif | |
} | |
/** | |
* Warp ballot | |
*/ | |
__device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask) | |
{ | |
#ifdef CUB_USE_COOPERATIVE_GROUPS | |
return __ballot_sync(member_mask, predicate); | |
#else | |
return __ballot(predicate); | |
#endif | |
} | |
/** | |
* Warp synchronous shfl_up | |
*/ | |
__device__ __forceinline__ | |
unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask) | |
{ | |
#ifdef CUB_USE_COOPERATIVE_GROUPS | |
asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;" | |
: "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask)); | |
#else | |
asm volatile("shfl.up.b32 %0, %1, %2, %3;" | |
: "=r"(word) : "r"(word), "r"(src_offset), "r"(flags)); | |
#endif | |
return word; | |
} | |
/** | |
* Warp synchronous shfl_down | |
*/ | |
__device__ __forceinline__ | |
unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask) | |
{ | |
#ifdef CUB_USE_COOPERATIVE_GROUPS | |
asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;" | |
: "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask)); | |
#else | |
asm volatile("shfl.down.b32 %0, %1, %2, %3;" | |
: "=r"(word) : "r"(word), "r"(src_offset), "r"(flags)); | |
#endif | |
return word; | |
} | |
/** | |
* Warp synchronous shfl_idx | |
*/ | |
__device__ __forceinline__ | |
unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_mask) | |
{ | |
#ifdef CUB_USE_COOPERATIVE_GROUPS | |
asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" | |
: "=r"(word) : "r"(word), "r"(src_lane), "r"(flags), "r"(member_mask)); | |
#else | |
asm volatile("shfl.idx.b32 %0, %1, %2, %3;" | |
: "=r"(word) : "r"(word), "r"(src_lane), "r"(flags)); | |
#endif | |
return word; | |
} | |
/** | |
* Floating point multiply. (Mantissa LSB rounds towards zero.) | |
*/ | |
__device__ __forceinline__ float FMUL_RZ(float a, float b) | |
{ | |
float d; | |
asm ("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b)); | |
return d; | |
} | |
/** | |
* Floating point multiply-add. (Mantissa LSB rounds towards zero.) | |
*/ | |
__device__ __forceinline__ float FFMA_RZ(float a, float b, float c) | |
{ | |
float d; | |
asm ("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c)); | |
return d; | |
} | |
#endif // DOXYGEN_SHOULD_SKIP_THIS | |
/** | |
* \brief Terminates the calling thread | |
*/ | |
__device__ __forceinline__ void ThreadExit() { | |
asm volatile("exit;"); | |
} | |
/** | |
* \brief Abort execution and generate an interrupt to the host CPU | |
*/ | |
__device__ __forceinline__ void ThreadTrap() { | |
asm volatile("trap;"); | |
} | |
/** | |
* \brief Returns the row-major linear thread identifier for a multidimensional thread block | |
*/ | |
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z) | |
{ | |
return ((block_dim_z == 1) ? 0 : (threadIdx.z * block_dim_x * block_dim_y)) + | |
((block_dim_y == 1) ? 0 : (threadIdx.y * block_dim_x)) + | |
threadIdx.x; | |
} | |
/** | |
* \brief Returns the warp lane ID of the calling thread | |
*/ | |
__device__ __forceinline__ unsigned int LaneId() | |
{ | |
unsigned int ret; | |
asm ("mov.u32 %0, %%laneid;" : "=r"(ret) ); | |
return ret; | |
} | |
/** | |
* \brief Returns the warp ID of the calling thread. Warp ID is guaranteed to be unique among warps, but may not correspond to a zero-based ranking within the thread block. | |
*/ | |
__device__ __forceinline__ unsigned int WarpId() | |
{ | |
unsigned int ret; | |
asm ("mov.u32 %0, %%warpid;" : "=r"(ret) ); | |
return ret; | |
} | |
/** | |
* \brief Returns the warp lane mask of all lanes less than the calling thread | |
*/ | |
__device__ __forceinline__ unsigned int LaneMaskLt() | |
{ | |
unsigned int ret; | |
asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(ret) ); | |
return ret; | |
} | |
/** | |
* \brief Returns the warp lane mask of all lanes less than or equal to the calling thread | |
*/ | |
__device__ __forceinline__ unsigned int LaneMaskLe() | |
{ | |
unsigned int ret; | |
asm ("mov.u32 %0, %%lanemask_le;" : "=r"(ret) ); | |
return ret; | |
} | |
/** | |
* \brief Returns the warp lane mask of all lanes greater than the calling thread | |
*/ | |
__device__ __forceinline__ unsigned int LaneMaskGt() | |
{ | |
unsigned int ret; | |
asm ("mov.u32 %0, %%lanemask_gt;" : "=r"(ret) ); | |
return ret; | |
} | |
/** | |
* \brief Returns the warp lane mask of all lanes greater than or equal to the calling thread | |
*/ | |
__device__ __forceinline__ unsigned int LaneMaskGe() | |
{ | |
unsigned int ret; | |
asm ("mov.u32 %0, %%lanemask_ge;" : "=r"(ret) ); | |
return ret; | |
} | |
/** @} */ // end group UtilPtx | |
/** | |
* \brief Shuffle-up for any data type. Each <em>warp-lane<sub>i</sub></em> obtains the value \p input contributed by <em>warp-lane</em><sub><em>i</em>-<tt>src_offset</tt></sub>. For thread lanes \e i < src_offset, the thread's own \p input is returned to the thread. ![](shfl_up_logo.png) | |
* \ingroup WarpModule | |
* | |
* \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32. | |
* \tparam T <b>[inferred]</b> The input/output element type | |
* | |
* \par | |
* - Available only for SM3.0 or newer | |
* | |
* \par Snippet | |
* The code snippet below illustrates each thread obtaining a \p double value from the | |
* predecessor of its predecessor. | |
* \par | |
* \code | |
* #include <cub/cub.cuh> // or equivalently <cub/util_ptx.cuh> | |
* | |
* __global__ void ExampleKernel(...) | |
* { | |
* // Obtain one input item per thread | |
* double thread_data = ... | |
* | |
* // Obtain item from two ranks below | |
* double peer_data = ShuffleUp<32>(thread_data, 2, 0, 0xffffffff); | |
* | |
* \endcode | |
* \par | |
* Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>. | |
* The corresponding output \p peer_data will be <tt>{1.0, 2.0, 1.0, 2.0, 3.0, ..., 30.0}</tt>. | |
* | |
*/ | |
template < | |
int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp | |
typename T> | |
__device__ __forceinline__ T ShuffleUp( | |
T input, ///< [in] The value to broadcast | |
int src_offset, ///< [in] The relative down-offset of the peer to read from | |
int first_thread, ///< [in] Index of first lane in logical warp (typically 0) | |
unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes | |
{ | |
/// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up | |
enum { | |
SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8 | |
}; | |
typedef typename UnitWord<T>::ShuffleWord ShuffleWord; | |
const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); | |
T output; | |
ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output); | |
ShuffleWord *input_alias = reinterpret_cast<ShuffleWord *>(&input); | |
unsigned int shuffle_word; | |
shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[0], src_offset, first_thread | SHFL_C, member_mask); | |
output_alias[0] = shuffle_word; | |
#pragma unroll | |
for (int WORD = 1; WORD < WORDS; ++WORD) | |
{ | |
shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[WORD], src_offset, first_thread | SHFL_C, member_mask); | |
output_alias[WORD] = shuffle_word; | |
} | |
return output; | |
} | |
/** | |
* \brief Shuffle-down for any data type. Each <em>warp-lane<sub>i</sub></em> obtains the value \p input contributed by <em>warp-lane</em><sub><em>i</em>+<tt>src_offset</tt></sub>. For thread lanes \e i >= WARP_THREADS, the thread's own \p input is returned to the thread. ![](shfl_down_logo.png) | |
* \ingroup WarpModule | |
* | |
* \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32. | |
* \tparam T <b>[inferred]</b> The input/output element type | |
* | |
* \par | |
* - Available only for SM3.0 or newer | |
* | |
* \par Snippet | |
* The code snippet below illustrates each thread obtaining a \p double value from the | |
* successor of its successor. | |
* \par | |
* \code | |
* #include <cub/cub.cuh> // or equivalently <cub/util_ptx.cuh> | |
* | |
* __global__ void ExampleKernel(...) | |
* { | |
* // Obtain one input item per thread | |
* double thread_data = ... | |
* | |
* // Obtain item from two ranks below | |
* double peer_data = ShuffleDown<32>(thread_data, 2, 31, 0xffffffff); | |
* | |
* \endcode | |
* \par | |
* Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>. | |
* The corresponding output \p peer_data will be <tt>{3.0, 4.0, 5.0, 6.0, 7.0, ..., 32.0}</tt>. | |
* | |
*/ | |
template < | |
int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp | |
typename T> | |
__device__ __forceinline__ T ShuffleDown( | |
T input, ///< [in] The value to broadcast | |
int src_offset, ///< [in] The relative up-offset of the peer to read from | |
int last_thread, ///< [in] Index of last thread in logical warp (typically 31 for a 32-thread warp) | |
unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes | |
{ | |
/// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up | |
enum { | |
SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8 | |
}; | |
typedef typename UnitWord<T>::ShuffleWord ShuffleWord; | |
const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); | |
T output; | |
ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output); | |
ShuffleWord *input_alias = reinterpret_cast<ShuffleWord *>(&input); | |
unsigned int shuffle_word; | |
shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[0], src_offset, last_thread | SHFL_C, member_mask); | |
output_alias[0] = shuffle_word; | |
#pragma unroll | |
for (int WORD = 1; WORD < WORDS; ++WORD) | |
{ | |
shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[WORD], src_offset, last_thread | SHFL_C, member_mask); | |
output_alias[WORD] = shuffle_word; | |
} | |
return output; | |
} | |
/** | |
* \brief Shuffle-broadcast for any data type. Each <em>warp-lane<sub>i</sub></em> obtains the value \p input | |
* contributed by <em>warp-lane</em><sub><tt>src_lane</tt></sub>. For \p src_lane < 0 or \p src_lane >= WARP_THREADS, | |
* then the thread's own \p input is returned to the thread. ![](shfl_broadcast_logo.png) | |
* | |
* \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32. | |
* \tparam T <b>[inferred]</b> The input/output element type | |
* | |
* \ingroup WarpModule | |
* | |
* \par | |
* - Available only for SM3.0 or newer | |
* | |
* \par Snippet | |
* The code snippet below illustrates each thread obtaining a \p double value from <em>warp-lane</em><sub>0</sub>. | |
* | |
* \par | |
* \code | |
* #include <cub/cub.cuh> // or equivalently <cub/util_ptx.cuh> | |
* | |
* __global__ void ExampleKernel(...) | |
* { | |
* // Obtain one input item per thread | |
* double thread_data = ... | |
* | |
* // Obtain item from thread 0 | |
* double peer_data = ShuffleIndex<32>(thread_data, 0, 0xffffffff); | |
* | |
* \endcode | |
* \par | |
* Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>. | |
* The corresponding output \p peer_data will be <tt>{1.0, 1.0, 1.0, 1.0, 1.0, ..., 1.0}</tt>. | |
* | |
*/ | |
template < | |
int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp | |
typename T> | |
__device__ __forceinline__ T ShuffleIndex( | |
T input, ///< [in] The value to broadcast | |
int src_lane, ///< [in] Which warp lane is to do the broadcasting | |
unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes | |
{ | |
/// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up | |
enum { | |
SHFL_C = ((32 - LOGICAL_WARP_THREADS) << 8) | (LOGICAL_WARP_THREADS - 1) | |
}; | |
typedef typename UnitWord<T>::ShuffleWord ShuffleWord; | |
const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); | |
T output; | |
ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output); | |
ShuffleWord *input_alias = reinterpret_cast<ShuffleWord *>(&input); | |
unsigned int shuffle_word; | |
shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[0], | |
src_lane, | |
SHFL_C, | |
member_mask); | |
output_alias[0] = shuffle_word; | |
#pragma unroll | |
for (int WORD = 1; WORD < WORDS; ++WORD) | |
{ | |
shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[WORD], | |
src_lane, | |
SHFL_C, | |
member_mask); | |
output_alias[WORD] = shuffle_word; | |
} | |
return output; | |
} | |
/** | |
* Compute a 32b mask of threads having the same least-significant | |
* LABEL_BITS of \p label as the calling thread. | |
*/ | |
template <int LABEL_BITS> | |
inline __device__ unsigned int MatchAny(unsigned int label) | |
{ | |
unsigned int retval; | |
// Extract masks of common threads for each bit | |
#pragma unroll | |
for (int BIT = 0; BIT < LABEL_BITS; ++BIT) | |
{ | |
unsigned int mask; | |
unsigned int current_bit = 1 << BIT; | |
asm ("{\n" | |
" .reg .pred p;\n" | |
" and.b32 %0, %1, %2;" | |
" setp.eq.u32 p, %0, %2;\n" | |
#ifdef CUB_USE_COOPERATIVE_GROUPS | |
" vote.ballot.sync.b32 %0, p, 0xffffffff;\n" | |
#else | |
" vote.ballot.b32 %0, p;\n" | |
#endif | |
" @!p not.b32 %0, %0;\n" | |
"}\n" : "=r"(mask) : "r"(label), "r"(current_bit)); | |
// Remove peers who differ | |
retval = (BIT == 0) ? mask : retval & mask; | |
} | |
return retval; | |
// // VOLTA match | |
// unsigned int retval; | |
// asm ("{\n" | |
// " match.any.sync.b32 %0, %1, 0xffffffff;\n" | |
// "}\n" : "=r"(retval) : "r"(label)); | |
// return retval; | |
} | |
} // CUB namespace | |
CUB_NS_POSTFIX // Optional outer namespace(s) | |