/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/
/**
 * \file
 * Thread utilities for writing memory using PTX cache modifiers.
 */

#pragma once

#include "../config.cuh"
#include "../util_ptx.cuh"
#include "../util_type.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/**
 * \addtogroup UtilIo
 * @{
 */


//-----------------------------------------------------------------------------
// Tags and constants
//-----------------------------------------------------------------------------
/**
 * \brief Enumeration of cache modifiers for memory store operations.
 */
enum CacheStoreModifier
{
    STORE_DEFAULT,              ///< Default (no modifier)
    STORE_WB,                   ///< Cache write-back all coherent levels
    STORE_CG,                   ///< Cache at global level
    STORE_CS,                   ///< Cache streaming (likely to be accessed once)
    STORE_WT,                   ///< Cache write-through (to system memory)
    STORE_VOLATILE,             ///< Volatile shared (any memory space)
};
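
// Note: on sm_20+ devices, STORE_WB / STORE_CG / STORE_CS / STORE_WT map to
// the PTX store qualifiers st.wb / st.cg / st.cs / st.wt (see the
// specializations below), while STORE_DEFAULT lowers to an ordinary store
// and STORE_VOLATILE to a volatile store.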

/**
 * \name Thread I/O (cache modified)
 * @{
 */
/**
 * \brief Thread utility for writing memory using cub::CacheStoreModifier cache modifiers.  Can be used to store any data type.
 *
 * \par Example
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/thread/thread_store.cuh>
 *
 * // 32-bit store using cache-global modifier:
 * int *d_out;
 * int val;
 * cub::ThreadStore<cub::STORE_CG>(d_out + threadIdx.x, val);
 *
 * // 16-bit store using default modifier
 * short *d_out;
 * short val;
 * cub::ThreadStore<cub::STORE_DEFAULT>(d_out + threadIdx.x, val);
 *
 * // 256-bit store using write-through modifier
 * double4 *d_out;
 * double4 val;
 * cub::ThreadStore<cub::STORE_WT>(d_out + threadIdx.x, val);
 *
 * // 96-bit store using cache-streaming modifier
 * struct TestFoo { bool a; short b; };
 * TestFoo *d_struct;
 * TestFoo val;
 * cub::ThreadStore<cub::STORE_CS>(d_struct + threadIdx.x, val);
 * \endcode
 *
 * \tparam MODIFIER             <b>[inferred]</b> CacheStoreModifier enumeration
 * \tparam OutputIteratorT      <b>[inferred]</b> Output iterator type \iterator
 * \tparam T                    <b>[inferred]</b> Data type of output value
 */
template <
    CacheStoreModifier  MODIFIER,
    typename            OutputIteratorT,
    typename            T>
__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val);

//@}  end member group

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/// Helper structure for templated store iteration (inductive case)
template <int COUNT, int MAX>
struct IterateThreadStore
{
    template <CacheStoreModifier MODIFIER, typename T>
    static __device__ __forceinline__ void Store(T *ptr, T *vals)
    {
        ThreadStore<MODIFIER>(ptr + COUNT, vals[COUNT]);
        IterateThreadStore<COUNT + 1, MAX>::template Store<MODIFIER>(ptr, vals);
    }

    template <typename OutputIteratorT, typename T>
    static __device__ __forceinline__ void Dereference(OutputIteratorT ptr, T *vals)
    {
        ptr[COUNT] = vals[COUNT];
        IterateThreadStore<COUNT + 1, MAX>::Dereference(ptr, vals);
    }
};

/// Helper structure for templated store iteration (termination case)
template <int MAX>
struct IterateThreadStore<MAX, MAX>
{
    template <CacheStoreModifier MODIFIER, typename T>
    static __device__ __forceinline__ void Store(T * /*ptr*/, T * /*vals*/) {}

    template <typename OutputIteratorT, typename T>
    static __device__ __forceinline__ void Dereference(OutputIteratorT /*ptr*/, T * /*vals*/) {}
};
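
// Illustration: IterateThreadStore<0, 2>::Store<STORE_CG>(ptr, vals) unrolls
// at compile time into the equivalent of
//
//     ThreadStore<STORE_CG>(ptr + 0, vals[0]);
//     ThreadStore<STORE_CG>(ptr + 1, vals[1]);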

/**
 * Define a uint4 (16B) ThreadStore specialization for the given cache store modifier
 */
#define _CUB_STORE_16(cub_modifier, ptx_modifier)                                            \
    template<>                                                                               \
    __device__ __forceinline__ void ThreadStore<cub_modifier, uint4*, uint4>(uint4* ptr, uint4 val) \
    {                                                                                        \
        asm volatile ("st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" : :                \
            _CUB_ASM_PTR_(ptr),                                                              \
            "r"(val.x),                                                                      \
            "r"(val.y),                                                                      \
            "r"(val.z),                                                                      \
            "r"(val.w));                                                                     \
    }                                                                                        \
    template<>                                                                               \
    __device__ __forceinline__ void ThreadStore<cub_modifier, ulonglong2*, ulonglong2>(ulonglong2* ptr, ulonglong2 val) \
    {                                                                                        \
        asm volatile ("st."#ptx_modifier".v2.u64 [%0], {%1, %2};" : :                        \
            _CUB_ASM_PTR_(ptr),                                                              \
            "l"(val.x),                                                                      \
            "l"(val.y));                                                                     \
    }
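
// For example, _CUB_STORE_16(STORE_CG, cg) emits a uint4 specialization whose
// inline assembly is "st.cg.v4.u32 [%0], {%1, %2, %3, %4};", i.e. a single
// 16-byte vector store with the cache-global qualifier.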

/**
 * Define a uint2 (8B) ThreadStore specialization for the given cache store modifier
 */
#define _CUB_STORE_8(cub_modifier, ptx_modifier)                                             \
    template<>                                                                               \
    __device__ __forceinline__ void ThreadStore<cub_modifier, ushort4*, ushort4>(ushort4* ptr, ushort4 val) \
    {                                                                                        \
        asm volatile ("st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" : :                \
            _CUB_ASM_PTR_(ptr),                                                              \
            "h"(val.x),                                                                      \
            "h"(val.y),                                                                      \
            "h"(val.z),                                                                      \
            "h"(val.w));                                                                     \
    }                                                                                        \
    template<>                                                                               \
    __device__ __forceinline__ void ThreadStore<cub_modifier, uint2*, uint2>(uint2* ptr, uint2 val) \
    {                                                                                        \
        asm volatile ("st."#ptx_modifier".v2.u32 [%0], {%1, %2};" : :                        \
            _CUB_ASM_PTR_(ptr),                                                              \
            "r"(val.x),                                                                      \
            "r"(val.y));                                                                     \
    }                                                                                        \
    template<>                                                                               \
    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned long long*, unsigned long long>(unsigned long long* ptr, unsigned long long val) \
    {                                                                                        \
        asm volatile ("st."#ptx_modifier".u64 [%0], %1;" : :                                 \
            _CUB_ASM_PTR_(ptr),                                                              \
            "l"(val));                                                                       \
    }

/**
 * Define an unsigned int (4B) ThreadStore specialization for the given cache store modifier
 */
#define _CUB_STORE_4(cub_modifier, ptx_modifier)                                             \
    template<>                                                                               \
    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned int*, unsigned int>(unsigned int* ptr, unsigned int val) \
    {                                                                                        \
        asm volatile ("st."#ptx_modifier".u32 [%0], %1;" : :                                 \
            _CUB_ASM_PTR_(ptr),                                                              \
            "r"(val));                                                                       \
    }


/**
 * Define an unsigned short (2B) ThreadStore specialization for the given cache store modifier
 */
#define _CUB_STORE_2(cub_modifier, ptx_modifier)                                             \
    template<>                                                                               \
    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned short*, unsigned short>(unsigned short* ptr, unsigned short val) \
    {                                                                                        \
        asm volatile ("st."#ptx_modifier".u16 [%0], %1;" : :                                 \
            _CUB_ASM_PTR_(ptr),                                                              \
            "h"(val));                                                                       \
    }

/**
 * Define an unsigned char (1B) ThreadStore specialization for the given cache store modifier
 */
#define _CUB_STORE_1(cub_modifier, ptx_modifier)                                             \
    template<>                                                                               \
    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned char*, unsigned char>(unsigned char* ptr, unsigned char val) \
    {                                                                                        \
        asm volatile (                                                                       \
        "{"                                                                                  \
        "   .reg .u8 datum;"                                                                 \
        "   cvt.u8.u16 datum, %1;"                                                           \
        "   st."#ptx_modifier".u8 [%0], datum;"                                              \
        "}" : :                                                                              \
            _CUB_ASM_PTR_(ptr),                                                              \
            "h"((unsigned short) val));                                                      \
    }
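
// The 1B specialization stages the byte through a 16-bit register (the "h"
// constraint) and narrows it with cvt.u8.u16, since inline PTX provides no
// register constraint for 8-bit operands.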

/**
 * Define powers-of-two ThreadStore specializations for the given cache store modifier
 */
#define _CUB_STORE_ALL(cub_modifier, ptx_modifier)    \
    _CUB_STORE_16(cub_modifier, ptx_modifier)         \
    _CUB_STORE_8(cub_modifier, ptx_modifier)          \
    _CUB_STORE_4(cub_modifier, ptx_modifier)          \
    _CUB_STORE_2(cub_modifier, ptx_modifier)          \
    _CUB_STORE_1(cub_modifier, ptx_modifier)


/**
 * Define ThreadStore specializations for the various cache store modifiers
 */
#if CUB_PTX_ARCH >= 200
    _CUB_STORE_ALL(STORE_WB, wb)
    _CUB_STORE_ALL(STORE_CG, cg)
    _CUB_STORE_ALL(STORE_CS, cs)
    _CUB_STORE_ALL(STORE_WT, wt)
#else
    _CUB_STORE_ALL(STORE_WB, global)
    _CUB_STORE_ALL(STORE_CG, global)
    _CUB_STORE_ALL(STORE_CS, global)
    _CUB_STORE_ALL(STORE_WT, volatile.global)
#endif

// Macro cleanup
#undef _CUB_STORE_ALL
#undef _CUB_STORE_1
#undef _CUB_STORE_2
#undef _CUB_STORE_4
#undef _CUB_STORE_8
#undef _CUB_STORE_16


/**
 * ThreadStore definition for STORE_DEFAULT modifier on iterator types
 */
template <typename OutputIteratorT, typename T>
__device__ __forceinline__ void ThreadStore(
    OutputIteratorT             itr,
    T                           val,
    Int2Type<STORE_DEFAULT>     /*modifier*/,
    Int2Type<false>             /*is_pointer*/)
{
    *itr = val;
}

/**
 * ThreadStore definition for STORE_DEFAULT modifier on pointer types
 */
template <typename T>
__device__ __forceinline__ void ThreadStore(
    T                           *ptr,
    T                           val,
    Int2Type<STORE_DEFAULT>     /*modifier*/,
    Int2Type<true>              /*is_pointer*/)
{
    *ptr = val;
}


/**
 * ThreadStore definition for STORE_VOLATILE modifier on primitive pointer types
 */
template <typename T>
__device__ __forceinline__ void ThreadStoreVolatilePtr(
    T                           *ptr,
    T                           val,
    Int2Type<true>              /*is_primitive*/)
{
    *reinterpret_cast<volatile T*>(ptr) = val;
}


/**
 * ThreadStore definition for STORE_VOLATILE modifier on non-primitive pointer types
 */
template <typename T>
__device__ __forceinline__ void ThreadStoreVolatilePtr(
    T                           *ptr,
    T                           val,
    Int2Type<false>             /*is_primitive*/)
{
    // Create a temporary using shuffle-words, then store using volatile-words
    typedef typename UnitWord<T>::VolatileWord  VolatileWord;
    typedef typename UnitWord<T>::ShuffleWord   ShuffleWord;

    const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord);
    const int SHUFFLE_MULTIPLE  = sizeof(T) / sizeof(ShuffleWord);

    VolatileWord words[VOLATILE_MULTIPLE];

    #pragma unroll
    for (int i = 0; i < SHUFFLE_MULTIPLE; ++i)
        reinterpret_cast<ShuffleWord*>(words)[i] = reinterpret_cast<ShuffleWord*>(&val)[i];

    IterateThreadStore<0, VOLATILE_MULTIPLE>::template Dereference(
        reinterpret_cast<volatile VolatileWord*>(ptr),
        words);
}
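
// Illustration (assuming a 4-byte VolatileWord for a 12-byte struct):
// VOLATILE_MULTIPLE == 3, so the value is staged into three words and then
// written back as three volatile 4-byte stores.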

/**
 * ThreadStore definition for STORE_VOLATILE modifier on pointer types
 */
template <typename T>
__device__ __forceinline__ void ThreadStore(
    T                           *ptr,
    T                           val,
    Int2Type<STORE_VOLATILE>    /*modifier*/,
    Int2Type<true>              /*is_pointer*/)
{
    ThreadStoreVolatilePtr(ptr, val, Int2Type<Traits<T>::PRIMITIVE>());
}


/**
 * ThreadStore definition for generic modifiers on pointer types
 */
template <typename T, int MODIFIER>
__device__ __forceinline__ void ThreadStore(
    T                           *ptr,
    T                           val,
    Int2Type<MODIFIER>          /*modifier*/,
    Int2Type<true>              /*is_pointer*/)
{
    // Create a temporary using shuffle-words, then store using device-words
    typedef typename UnitWord<T>::DeviceWord    DeviceWord;
    typedef typename UnitWord<T>::ShuffleWord   ShuffleWord;

    const int DEVICE_MULTIPLE   = sizeof(T) / sizeof(DeviceWord);
    const int SHUFFLE_MULTIPLE  = sizeof(T) / sizeof(ShuffleWord);

    DeviceWord words[DEVICE_MULTIPLE];

    #pragma unroll
    for (int i = 0; i < SHUFFLE_MULTIPLE; ++i)
        reinterpret_cast<ShuffleWord*>(words)[i] = reinterpret_cast<ShuffleWord*>(&val)[i];

    IterateThreadStore<0, DEVICE_MULTIPLE>::template Store<CacheStoreModifier(MODIFIER)>(
        reinterpret_cast<DeviceWord*>(ptr),
        words);
}
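
// Illustration (word sizes are assumptions): if T is 16 bytes and its
// DeviceWord is ulonglong2, then DEVICE_MULTIPLE == 1 and the call above
// resolves to the single st.<modifier>.v2.u64 specialization; a smaller
// DeviceWord simply yields more iterations of the unrolled store.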

/**
 * ThreadStore definition for generic modifiers
 */
template <CacheStoreModifier MODIFIER, typename OutputIteratorT, typename T>
__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val)
{
    ThreadStore(
        itr,
        val,
        Int2Type<MODIFIER>(),
        Int2Type<IsPointer<OutputIteratorT>::VALUE>());
}
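
/*
 * Minimal usage sketch (illustrative only; the kernel name is hypothetical).
 * The public entry point above dispatches on the requested modifier and on
 * whether the output iterator is a raw pointer:
 *
 *     __global__ void StoreKernel(int *d_out, int val)
 *     {
 *         // Raw pointer + STORE_CG resolves to the st.cg.u32 specialization
 *         cub::ThreadStore<cub::STORE_CG>(d_out + threadIdx.x, val);
 *     }
 */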

#endif // DOXYGEN_SHOULD_SKIP_THIS


/** @} */       // end group UtilIo


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)