/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* Thread utilities for writing memory using PTX cache modifiers.
*/
#pragma once
#include "../config.cuh"
#include "../util_ptx.cuh"
#include "../util_type.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \addtogroup UtilIo
* @{
*/
//-----------------------------------------------------------------------------
// Tags and constants
//-----------------------------------------------------------------------------
/**
* \brief Enumeration of cache modifiers for memory store operations.
*/
enum CacheStoreModifier
{
STORE_DEFAULT, ///< Default (no modifier)
STORE_WB, ///< Cache write-back all coherent levels
STORE_CG, ///< Cache at global level
STORE_CS, ///< Cache streaming (likely to be accessed once)
STORE_WT, ///< Cache write-through (to system memory)
STORE_VOLATILE, ///< Volatile shared (any memory space)
};
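// Illustrative mapping from each modifier to the PTX store instruction emitted
// for a 4-byte value on SM 2.0+ (a sketch for orientation; the authoritative
// mapping is the _CUB_STORE_ALL expansion further below):
//
//   STORE_DEFAULT  -> plain C++ assignment (*ptr = val)
//   STORE_WB       -> st.wb.u32
//   STORE_CG       -> st.cg.u32
//   STORE_CS       -> st.cs.u32
//   STORE_WT       -> st.wt.u32
//   STORE_VOLATILE -> volatile assignment (*(volatile int*)ptr = val)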
/**
* \name Thread I/O (cache modified)
* @{
*/
/**
* \brief Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. Can be used to store any data type.
*
* \par Example
* \code
* #include <cub/cub.cuh> // or equivalently <cub/thread/thread_store.cuh>
*
* // 32-bit store using cache-global modifier:
* int *d_out;
* int val;
* cub::ThreadStore<cub::STORE_CG>(d_out + threadIdx.x, val);
*
* // 16-bit store using default modifier
* short *d_out;
* short val;
* cub::ThreadStore<cub::STORE_DEFAULT>(d_out + threadIdx.x, val);
*
* // 256-bit store using write-through modifier
* double4 *d_out;
* double4 val;
* cub::ThreadStore<cub::STORE_WT>(d_out + threadIdx.x, val);
*
* // 32-bit struct store using cache-streaming modifier
* struct TestFoo { bool a; short b; };
* TestFoo *d_struct;
* TestFoo val;
* cub::ThreadStore<cub::STORE_CS>(d_struct + threadIdx.x, val);
* \endcode
*
* \tparam MODIFIER CacheStoreModifier enumeration
* \tparam OutputIteratorT <b>[inferred]</b> Output iterator type \iterator
* \tparam T <b>[inferred]</b> Data type of output value
*/
template <
CacheStoreModifier MODIFIER,
typename OutputIteratorT,
typename T>
__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val);
//@} end member group
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/// Helper structure for templated store iteration (inductive case)
template <int COUNT, int MAX>
struct IterateThreadStore
{
template <CacheStoreModifier MODIFIER, typename T>
static __device__ __forceinline__ void Store(T *ptr, T *vals)
{
ThreadStore<MODIFIER>(ptr + COUNT, vals[COUNT]);
IterateThreadStore<COUNT + 1, MAX>::template Store<MODIFIER>(ptr, vals);
}
template <typename OutputIteratorT, typename T>
static __device__ __forceinline__ void Dereference(OutputIteratorT ptr, T *vals)
{
ptr[COUNT] = vals[COUNT];
IterateThreadStore<COUNT + 1, MAX>::Dereference(ptr, vals);
}
};
/// Helper structure for templated store iteration (termination case)
template <int MAX>
struct IterateThreadStore<MAX, MAX>
{
template <CacheStoreModifier MODIFIER, typename T>
static __device__ __forceinline__ void Store(T * /*ptr*/, T * /*vals*/) {}
template <typename OutputIteratorT, typename T>
static __device__ __forceinline__ void Dereference(OutputIteratorT /*ptr*/, T * /*vals*/) {}
};
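// As a concrete illustration, IterateThreadStore<0, 4>::Store<STORE_CG>(ptr, vals)
// unrolls at compile time into four cache-modified stores:
//
//   ThreadStore<STORE_CG>(ptr + 0, vals[0]);
//   ThreadStore<STORE_CG>(ptr + 1, vals[1]);
//   ThreadStore<STORE_CG>(ptr + 2, vals[2]);
//   ThreadStore<STORE_CG>(ptr + 3, vals[3]);
//
// with the <MAX, MAX> specialization above terminating the recursion.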
/**
* Define 16B ThreadStore specializations (uint4, ulonglong2) for the given cache store modifier
*/
#define _CUB_STORE_16(cub_modifier, ptx_modifier) \
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, uint4*, uint4>(uint4* ptr, uint4 val) \
{ \
asm volatile ("st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" : : \
_CUB_ASM_PTR_(ptr), \
"r"(val.x), \
"r"(val.y), \
"r"(val.z), \
"r"(val.w)); \
} \
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, ulonglong2*, ulonglong2>(ulonglong2* ptr, ulonglong2 val) \
{ \
asm volatile ("st."#ptx_modifier".v2.u64 [%0], {%1, %2};" : : \
_CUB_ASM_PTR_(ptr), \
"l"(val.x), \
"l"(val.y)); \
}
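// For example, _CUB_STORE_16(STORE_CG, cg) expands to a specialization that
// stores a uint4 with a single vectorized, cache-modified instruction:
//
//   template<>
//   __device__ __forceinline__ void ThreadStore<STORE_CG, uint4*, uint4>(uint4* ptr, uint4 val)
//   {
//       asm volatile ("st.cg.v4.u32 [%0], {%1, %2, %3, %4};" : :
//           _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w));
//   }
//
// (plus the analogous ulonglong2 specialization).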
/**
* Define 8B ThreadStore specializations (ushort4, uint2, unsigned long long) for the given cache store modifier
*/
#define _CUB_STORE_8(cub_modifier, ptx_modifier) \
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, ushort4*, ushort4>(ushort4* ptr, ushort4 val) \
{ \
asm volatile ("st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" : : \
_CUB_ASM_PTR_(ptr), \
"h"(val.x), \
"h"(val.y), \
"h"(val.z), \
"h"(val.w)); \
} \
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, uint2*, uint2>(uint2* ptr, uint2 val) \
{ \
asm volatile ("st."#ptx_modifier".v2.u32 [%0], {%1, %2};" : : \
_CUB_ASM_PTR_(ptr), \
"r"(val.x), \
"r"(val.y)); \
} \
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, unsigned long long*, unsigned long long>(unsigned long long* ptr, unsigned long long val) \
{ \
asm volatile ("st."#ptx_modifier".u64 [%0], %1;" : : \
_CUB_ASM_PTR_(ptr), \
"l"(val)); \
}
/**
* Define an unsigned int (4B) ThreadStore specialization for the given cache store modifier
*/
#define _CUB_STORE_4(cub_modifier, ptx_modifier) \
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, unsigned int*, unsigned int>(unsigned int* ptr, unsigned int val) \
{ \
asm volatile ("st."#ptx_modifier".u32 [%0], %1;" : : \
_CUB_ASM_PTR_(ptr), \
"r"(val)); \
}
/**
* Define an unsigned short (2B) ThreadStore specialization for the given cache store modifier
*/
#define _CUB_STORE_2(cub_modifier, ptx_modifier) \
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, unsigned short*, unsigned short>(unsigned short* ptr, unsigned short val) \
{ \
asm volatile ("st."#ptx_modifier".u16 [%0], %1;" : : \
_CUB_ASM_PTR_(ptr), \
"h"(val)); \
}
/**
* Define an unsigned char (1B) ThreadStore specialization for the given cache store modifier
*/
#define _CUB_STORE_1(cub_modifier, ptx_modifier) \
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, unsigned char*, unsigned char>(unsigned char* ptr, unsigned char val) \
{ \
asm volatile ( \
"{" \
" .reg .u8 datum;" \
" cvt.u8.u16 datum, %1;" \
" st."#ptx_modifier".u8 [%0], datum;" \
"}" : : \
_CUB_ASM_PTR_(ptr), \
"h"((unsigned short) val)); \
}
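// Note on the 1B case above: CUDA inline PTX has no 8-bit register constraint
// (the narrowest is "h", a 16-bit register), so the value is passed in as an
// unsigned short and narrowed into a .u8 register via cvt.u8.u16 before the
// sub-word store is issued.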
/**
* Define powers-of-two ThreadStore specializations for the given cache store modifier
*/
#define _CUB_STORE_ALL(cub_modifier, ptx_modifier) \
_CUB_STORE_16(cub_modifier, ptx_modifier) \
_CUB_STORE_8(cub_modifier, ptx_modifier) \
_CUB_STORE_4(cub_modifier, ptx_modifier) \
_CUB_STORE_2(cub_modifier, ptx_modifier) \
_CUB_STORE_1(cub_modifier, ptx_modifier)
/**
* Define ThreadStore specializations for the various cache store modifiers
*/
#if CUB_PTX_ARCH >= 200
_CUB_STORE_ALL(STORE_WB, wb)
_CUB_STORE_ALL(STORE_CG, cg)
_CUB_STORE_ALL(STORE_CS, cs)
_CUB_STORE_ALL(STORE_WT, wt)
#else
_CUB_STORE_ALL(STORE_WB, global)
_CUB_STORE_ALL(STORE_CG, global)
_CUB_STORE_ALL(STORE_CS, global)
_CUB_STORE_ALL(STORE_WT, volatile.global)
#endif
// Macro cleanup
#undef _CUB_STORE_ALL
#undef _CUB_STORE_1
#undef _CUB_STORE_2
#undef _CUB_STORE_4
#undef _CUB_STORE_8
#undef _CUB_STORE_16
/**
* ThreadStore definition for STORE_DEFAULT modifier on iterator types
*/
template <typename OutputIteratorT, typename T>
__device__ __forceinline__ void ThreadStore(
OutputIteratorT itr,
T val,
Int2Type<STORE_DEFAULT> /*modifier*/,
Int2Type<false> /*is_pointer*/)
{
*itr = val;
}
/**
* ThreadStore definition for STORE_DEFAULT modifier on pointer types
*/
template <typename T>
__device__ __forceinline__ void ThreadStore(
T *ptr,
T val,
Int2Type<STORE_DEFAULT> /*modifier*/,
Int2Type<true> /*is_pointer*/)
{
*ptr = val;
}
/**
* ThreadStore definition for STORE_VOLATILE modifier on primitive pointer types
*/
template <typename T>
__device__ __forceinline__ void ThreadStoreVolatilePtr(
T *ptr,
T val,
Int2Type<true> /*is_primitive*/)
{
*reinterpret_cast<volatile T*>(ptr) = val;
}
/**
* ThreadStore definition for STORE_VOLATILE modifier on non-primitive pointer types
*/
template <typename T>
__device__ __forceinline__ void ThreadStoreVolatilePtr(
T *ptr,
T val,
Int2Type<false> /*is_primitive*/)
{
// Create a temporary using shuffle-words, then store using volatile-words
typedef typename UnitWord<T>::VolatileWord VolatileWord;
typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord);
const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord);
VolatileWord words[VOLATILE_MULTIPLE];
#pragma unroll
for (int i = 0; i < SHUFFLE_MULTIPLE; ++i)
reinterpret_cast<ShuffleWord*>(words)[i] = reinterpret_cast<ShuffleWord*>(&val)[i];
IterateThreadStore<0, VOLATILE_MULTIPLE>::template Dereference(
reinterpret_cast<volatile VolatileWord*>(ptr),
words);
}
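// Worked example (assumes UnitWord's internals, which live in util_type.cuh):
// for a 12B struct with 4B alignment, ShuffleWord and VolatileWord are both
// unsigned int, so the loop above copies the value into words[0..2] one 4B
// word at a time, and Dereference then writes the three words through a
// volatile unsigned int*.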
/**
* ThreadStore definition for STORE_VOLATILE modifier on pointer types
*/
template <typename T>
__device__ __forceinline__ void ThreadStore(
T *ptr,
T val,
Int2Type<STORE_VOLATILE> /*modifier*/,
Int2Type<true> /*is_pointer*/)
{
ThreadStoreVolatilePtr(ptr, val, Int2Type<Traits<T>::PRIMITIVE>());
}
/**
* ThreadStore definition for generic modifiers on pointer types
*/
template <typename T, int MODIFIER>
__device__ __forceinline__ void ThreadStore(
T *ptr,
T val,
Int2Type<MODIFIER> /*modifier*/,
Int2Type<true> /*is_pointer*/)
{
// Create a temporary using shuffle-words, then store using device-words
typedef typename UnitWord<T>::DeviceWord DeviceWord;
typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord);
const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord);
DeviceWord words[DEVICE_MULTIPLE];
#pragma unroll
for (int i = 0; i < SHUFFLE_MULTIPLE; ++i)
reinterpret_cast<ShuffleWord*>(words)[i] = reinterpret_cast<ShuffleWord*>(&val)[i];
IterateThreadStore<0, DEVICE_MULTIPLE>::template Store<CacheStoreModifier(MODIFIER)>(
reinterpret_cast<DeviceWord*>(ptr),
words);
}
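// This is the same word-splitting pattern as the volatile path above, but it
// copies into DeviceWord-sized words and dispatches each one back through the
// cache-modified ThreadStore specializations. For the hypothetical 12B struct
// from the previous example, DEVICE_MULTIPLE would be 3 and the store would
// issue three st.<ptx_modifier>.u32 instructions.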
/**
* ThreadStore definition for generic modifiers
*/
template <CacheStoreModifier MODIFIER, typename OutputIteratorT, typename T>
__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val)
{
ThreadStore(
itr,
val,
Int2Type<MODIFIER>(),
Int2Type<IsPointer<OutputIteratorT>::VALUE>());
}
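// Dispatch note: only the Int2Type<STORE_DEFAULT>/Int2Type<false> overload
// above accepts non-pointer iterators, so requesting a cache-modified store
// (e.g. STORE_CG) through a generic output iterator will not compile; modifiers
// other than STORE_DEFAULT require raw pointer destinations.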
#endif // DOXYGEN_SHOULD_SKIP_THIS
/** @} */ // end group UtilIo
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)