LIVE / thrust /cub /util_ptx.cuh
Xu Ma
update
1c3c0d9
raw
history blame
21.8 kB
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* PTX intrinsics
*/
#pragma once
#include "util_type.cuh"
#include "util_arch.cuh"
#include "util_namespace.cuh"
#include "util_debug.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \addtogroup UtilPtx
* @{
*/
/******************************************************************************
* PTX helper macros
******************************************************************************/
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/**
* Register modifier for pointer-types (for inlining PTX assembly)
*/
#if defined(_WIN64) || defined(__LP64__)
#define __CUB_LP64__ 1
// 64-bit register modifier for inlined asm
#define _CUB_ASM_PTR_ "l"
#define _CUB_ASM_PTR_SIZE_ "u64"
#else
#define __CUB_LP64__ 0
// 32-bit register modifier for inlined asm
#define _CUB_ASM_PTR_ "r"
#define _CUB_ASM_PTR_SIZE_ "u32"
#endif
#endif // DOXYGEN_SHOULD_SKIP_THIS
/******************************************************************************
* Inlined PTX intrinsics
******************************************************************************/
/**
* \brief Shift-right then add. Returns (\p x >> \p shift) + \p addend.
*/
__device__ __forceinline__ unsigned int SHR_ADD(
unsigned int x,
unsigned int shift,
unsigned int addend)
{
unsigned int ret;
asm ("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
"=r"(ret) : "r"(x), "r"(shift), "r"(addend));
return ret;
}
/**
* \brief Shift-left then add. Returns (\p x << \p shift) + \p addend.
*/
__device__ __forceinline__ unsigned int SHL_ADD(
unsigned int x,
unsigned int shift,
unsigned int addend)
{
unsigned int ret;
asm ("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
"=r"(ret) : "r"(x), "r"(shift), "r"(addend));
return ret;
}
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/**
* Bitfield-extract.
*/
template <typename UnsignedBits, int BYTE_LEN>
__device__ __forceinline__ unsigned int BFE(
UnsignedBits source,
unsigned int bit_start,
unsigned int num_bits,
Int2Type<BYTE_LEN> /*byte_len*/)
{
unsigned int bits;
asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits));
return bits;
}
/**
* Bitfield-extract for 64-bit types.
*/
template <typename UnsignedBits>
__device__ __forceinline__ unsigned int BFE(
UnsignedBits source,
unsigned int bit_start,
unsigned int num_bits,
Int2Type<8> /*byte_len*/)
{
const unsigned long long MASK = (1ull << num_bits) - 1;
return (source >> bit_start) & MASK;
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
/**
* \brief Bitfield-extract. Extracts \p num_bits from \p source starting at bit-offset \p bit_start. The input \p source may be an 8b, 16b, 32b, or 64b unsigned integer type.
*/
template <typename UnsignedBits>
__device__ __forceinline__ unsigned int BFE(
UnsignedBits source,
unsigned int bit_start,
unsigned int num_bits)
{
return BFE(source, bit_start, num_bits, Int2Type<sizeof(UnsignedBits)>());
}
/**
* \brief Bitfield insert. Inserts the \p num_bits least significant bits of \p y into \p x at bit-offset \p bit_start.
*/
__device__ __forceinline__ void BFI(
unsigned int &ret,
unsigned int x,
unsigned int y,
unsigned int bit_start,
unsigned int num_bits)
{
asm ("bfi.b32 %0, %1, %2, %3, %4;" :
"=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits));
}
/**
* \brief Three-operand add. Returns \p x + \p y + \p z.
*/
__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z)
{
asm ("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z));
return x;
}
/**
* \brief Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register. For SM2.0 or later.
*
* \par
* The bytes in the two source registers \p a and \p b are numbered from 0 to 7:
* {\p b, \p a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}. For each of the four bytes
* {b3, b2, b1, b0} selected in the return value, a 4-bit selector is defined within
* the four lower "nibbles" of \p index: {\p index } = {n7, n6, n5, n4, n3, n2, n1, n0}
*
* \par Snippet
* The code snippet below illustrates byte-permute.
* \par
* \code
* #include <cub/cub.cuh>
*
* __global__ void ExampleKernel(...)
* {
* int a = 0x03020100;
* int b = 0x07060504;
* int index = 0x00007531;
*
* int selected = PRMT(a, b, index); // 0x07050301
*
* \endcode
*
*/
__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index)
{
int ret;
asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
return ret;
}
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/**
* Sync-threads barrier.
*/
__device__ __forceinline__ void BAR(int count)
{
asm volatile("bar.sync 1, %0;" : : "r"(count));
}
/**
* CTA barrier
*/
__device__ __forceinline__ void CTA_SYNC()
{
__syncthreads();
}
/**
* CTA barrier with predicate
*/
__device__ __forceinline__ int CTA_SYNC_AND(int p)
{
return __syncthreads_and(p);
}
/**
* Warp barrier
*/
__device__ __forceinline__ void WARP_SYNC(unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
__syncwarp(member_mask);
#endif
}
/**
* Warp any
*/
__device__ __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
return __any_sync(member_mask, predicate);
#else
return ::__any(predicate);
#endif
}
/**
* Warp any
*/
__device__ __forceinline__ int WARP_ALL(int predicate, unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
return __all_sync(member_mask, predicate);
#else
return ::__all(predicate);
#endif
}
/**
* Warp ballot
*/
__device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
return __ballot_sync(member_mask, predicate);
#else
return __ballot(predicate);
#endif
}
/**
* Warp synchronous shfl_up
*/
__device__ __forceinline__
unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
: "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask));
#else
asm volatile("shfl.up.b32 %0, %1, %2, %3;"
: "=r"(word) : "r"(word), "r"(src_offset), "r"(flags));
#endif
return word;
}
/**
* Warp synchronous shfl_down
*/
__device__ __forceinline__
unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
: "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask));
#else
asm volatile("shfl.down.b32 %0, %1, %2, %3;"
: "=r"(word) : "r"(word), "r"(src_offset), "r"(flags));
#endif
return word;
}
/**
* Warp synchronous shfl_idx
*/
__device__ __forceinline__
unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_mask)
{
#ifdef CUB_USE_COOPERATIVE_GROUPS
asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
: "=r"(word) : "r"(word), "r"(src_lane), "r"(flags), "r"(member_mask));
#else
asm volatile("shfl.idx.b32 %0, %1, %2, %3;"
: "=r"(word) : "r"(word), "r"(src_lane), "r"(flags));
#endif
return word;
}
/**
* Floating point multiply. (Mantissa LSB rounds towards zero.)
*/
__device__ __forceinline__ float FMUL_RZ(float a, float b)
{
float d;
asm ("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b));
return d;
}
/**
* Floating point multiply-add. (Mantissa LSB rounds towards zero.)
*/
__device__ __forceinline__ float FFMA_RZ(float a, float b, float c)
{
float d;
asm ("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c));
return d;
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
/**
* \brief Terminates the calling thread
*/
__device__ __forceinline__ void ThreadExit() {
asm volatile("exit;");
}
/**
* \brief Abort execution and generate an interrupt to the host CPU
*/
__device__ __forceinline__ void ThreadTrap() {
asm volatile("trap;");
}
/**
* \brief Returns the row-major linear thread identifier for a multidimensional thread block
*/
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
{
return ((block_dim_z == 1) ? 0 : (threadIdx.z * block_dim_x * block_dim_y)) +
((block_dim_y == 1) ? 0 : (threadIdx.y * block_dim_x)) +
threadIdx.x;
}
/**
* \brief Returns the warp lane ID of the calling thread
*/
__device__ __forceinline__ unsigned int LaneId()
{
unsigned int ret;
asm ("mov.u32 %0, %%laneid;" : "=r"(ret) );
return ret;
}
/**
* \brief Returns the warp ID of the calling thread. Warp ID is guaranteed to be unique among warps, but may not correspond to a zero-based ranking within the thread block.
*/
__device__ __forceinline__ unsigned int WarpId()
{
unsigned int ret;
asm ("mov.u32 %0, %%warpid;" : "=r"(ret) );
return ret;
}
/**
* \brief Returns the warp lane mask of all lanes less than the calling thread
*/
__device__ __forceinline__ unsigned int LaneMaskLt()
{
unsigned int ret;
asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(ret) );
return ret;
}
/**
* \brief Returns the warp lane mask of all lanes less than or equal to the calling thread
*/
__device__ __forceinline__ unsigned int LaneMaskLe()
{
unsigned int ret;
asm ("mov.u32 %0, %%lanemask_le;" : "=r"(ret) );
return ret;
}
/**
* \brief Returns the warp lane mask of all lanes greater than the calling thread
*/
__device__ __forceinline__ unsigned int LaneMaskGt()
{
unsigned int ret;
asm ("mov.u32 %0, %%lanemask_gt;" : "=r"(ret) );
return ret;
}
/**
* \brief Returns the warp lane mask of all lanes greater than or equal to the calling thread
*/
__device__ __forceinline__ unsigned int LaneMaskGe()
{
unsigned int ret;
asm ("mov.u32 %0, %%lanemask_ge;" : "=r"(ret) );
return ret;
}
/** @} */ // end group UtilPtx
/**
* \brief Shuffle-up for any data type. Each <em>warp-lane<sub>i</sub></em> obtains the value \p input contributed by <em>warp-lane</em><sub><em>i</em>-<tt>src_offset</tt></sub>. For thread lanes \e i < src_offset, the thread's own \p input is returned to the thread. ![](shfl_up_logo.png)
* \ingroup WarpModule
*
* \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32.
* \tparam T <b>[inferred]</b> The input/output element type
*
* \par
* - Available only for SM3.0 or newer
*
* \par Snippet
* The code snippet below illustrates each thread obtaining a \p double value from the
* predecessor of its predecessor.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/util_ptx.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Obtain one input item per thread
* double thread_data = ...
*
* // Obtain item from two ranks below
* double peer_data = ShuffleUp<32>(thread_data, 2, 0, 0xffffffff);
*
* \endcode
* \par
* Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>.
* The corresponding output \p peer_data will be <tt>{1.0, 2.0, 1.0, 2.0, 3.0, ..., 30.0}</tt>.
*
*/
template <
int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp
typename T>
__device__ __forceinline__ T ShuffleUp(
T input, ///< [in] The value to broadcast
int src_offset, ///< [in] The relative down-offset of the peer to read from
int first_thread, ///< [in] Index of first lane in logical warp (typically 0)
unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes
{
/// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up
enum {
SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8
};
typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
T output;
ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output);
ShuffleWord *input_alias = reinterpret_cast<ShuffleWord *>(&input);
unsigned int shuffle_word;
shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[0], src_offset, first_thread | SHFL_C, member_mask);
output_alias[0] = shuffle_word;
#pragma unroll
for (int WORD = 1; WORD < WORDS; ++WORD)
{
shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[WORD], src_offset, first_thread | SHFL_C, member_mask);
output_alias[WORD] = shuffle_word;
}
return output;
}
/**
* \brief Shuffle-down for any data type. Each <em>warp-lane<sub>i</sub></em> obtains the value \p input contributed by <em>warp-lane</em><sub><em>i</em>+<tt>src_offset</tt></sub>. For thread lanes \e i >= WARP_THREADS, the thread's own \p input is returned to the thread. ![](shfl_down_logo.png)
* \ingroup WarpModule
*
* \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32.
* \tparam T <b>[inferred]</b> The input/output element type
*
* \par
* - Available only for SM3.0 or newer
*
* \par Snippet
* The code snippet below illustrates each thread obtaining a \p double value from the
* successor of its successor.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/util_ptx.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Obtain one input item per thread
* double thread_data = ...
*
* // Obtain item from two ranks below
* double peer_data = ShuffleDown<32>(thread_data, 2, 31, 0xffffffff);
*
* \endcode
* \par
* Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>.
* The corresponding output \p peer_data will be <tt>{3.0, 4.0, 5.0, 6.0, 7.0, ..., 32.0}</tt>.
*
*/
template <
int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp
typename T>
__device__ __forceinline__ T ShuffleDown(
T input, ///< [in] The value to broadcast
int src_offset, ///< [in] The relative up-offset of the peer to read from
int last_thread, ///< [in] Index of last thread in logical warp (typically 31 for a 32-thread warp)
unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes
{
/// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up
enum {
SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8
};
typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
T output;
ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output);
ShuffleWord *input_alias = reinterpret_cast<ShuffleWord *>(&input);
unsigned int shuffle_word;
shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[0], src_offset, last_thread | SHFL_C, member_mask);
output_alias[0] = shuffle_word;
#pragma unroll
for (int WORD = 1; WORD < WORDS; ++WORD)
{
shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[WORD], src_offset, last_thread | SHFL_C, member_mask);
output_alias[WORD] = shuffle_word;
}
return output;
}
/**
* \brief Shuffle-broadcast for any data type. Each <em>warp-lane<sub>i</sub></em> obtains the value \p input
* contributed by <em>warp-lane</em><sub><tt>src_lane</tt></sub>. For \p src_lane < 0 or \p src_lane >= WARP_THREADS,
* then the thread's own \p input is returned to the thread. ![](shfl_broadcast_logo.png)
*
* \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32.
* \tparam T <b>[inferred]</b> The input/output element type
*
* \ingroup WarpModule
*
* \par
* - Available only for SM3.0 or newer
*
* \par Snippet
* The code snippet below illustrates each thread obtaining a \p double value from <em>warp-lane</em><sub>0</sub>.
*
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/util_ptx.cuh>
*
* __global__ void ExampleKernel(...)
* {
* // Obtain one input item per thread
* double thread_data = ...
*
* // Obtain item from thread 0
* double peer_data = ShuffleIndex<32>(thread_data, 0, 0xffffffff);
*
* \endcode
* \par
* Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>.
* The corresponding output \p peer_data will be <tt>{1.0, 1.0, 1.0, 1.0, 1.0, ..., 1.0}</tt>.
*
*/
template <
int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp
typename T>
__device__ __forceinline__ T ShuffleIndex(
T input, ///< [in] The value to broadcast
int src_lane, ///< [in] Which warp lane is to do the broadcasting
unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes
{
/// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up
enum {
SHFL_C = ((32 - LOGICAL_WARP_THREADS) << 8) | (LOGICAL_WARP_THREADS - 1)
};
typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
T output;
ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output);
ShuffleWord *input_alias = reinterpret_cast<ShuffleWord *>(&input);
unsigned int shuffle_word;
shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[0],
src_lane,
SHFL_C,
member_mask);
output_alias[0] = shuffle_word;
#pragma unroll
for (int WORD = 1; WORD < WORDS; ++WORD)
{
shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[WORD],
src_lane,
SHFL_C,
member_mask);
output_alias[WORD] = shuffle_word;
}
return output;
}
/**
* Compute a 32b mask of threads having the same least-significant
* LABEL_BITS of \p label as the calling thread.
*/
template <int LABEL_BITS>
inline __device__ unsigned int MatchAny(unsigned int label)
{
unsigned int retval;
// Extract masks of common threads for each bit
#pragma unroll
for (int BIT = 0; BIT < LABEL_BITS; ++BIT)
{
unsigned int mask;
unsigned int current_bit = 1 << BIT;
asm ("{\n"
" .reg .pred p;\n"
" and.b32 %0, %1, %2;"
" setp.eq.u32 p, %0, %2;\n"
#ifdef CUB_USE_COOPERATIVE_GROUPS
" vote.ballot.sync.b32 %0, p, 0xffffffff;\n"
#else
" vote.ballot.b32 %0, p;\n"
#endif
" @!p not.b32 %0, %0;\n"
"}\n" : "=r"(mask) : "r"(label), "r"(current_bit));
// Remove peers who differ
retval = (BIT == 0) ? mask : retval & mask;
}
return retval;
// // VOLTA match
// unsigned int retval;
// asm ("{\n"
// " match.any.sync.b32 %0, %1, 0xffffffff;\n"
// "}\n" : "=r"(retval) : "r"(label));
// return retval;
}
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)