/****************************************************************************** | |
* Copyright (c) 2011, Duane Merrill. All rights reserved. | |
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. | |
* | |
* Redistribution and use in source and binary forms, with or without | |
* modification, are permitted provided that the following conditions are met: | |
* * Redistributions of source code must retain the above copyright | |
* notice, this list of conditions and the following disclaimer. | |
* * Redistributions in binary form must reproduce the above copyright | |
* notice, this list of conditions and the following disclaimer in the | |
* documentation and/or other materials provided with the distribution. | |
* * Neither the name of the NVIDIA CORPORATION nor the | |
* names of its contributors may be used to endorse or promote products | |
* derived from this software without specific prior written permission. | |
* | |
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | |
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY | |
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | |
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND | |
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
* | |
******************************************************************************/ | |
/** | |
* \file | |
* Thread utilities for reading memory using PTX cache modifiers. | |
*/ | |
#pragma once | |
#include <iterator> | |
#include "../config.cuh" | |
#include "../util_ptx.cuh" | |
#include "../util_type.cuh" | |
/// Optional outer namespace(s) | |
CUB_NS_PREFIX | |
/// CUB namespace | |
namespace cub { | |
/** | |
* \addtogroup UtilIo | |
* @{ | |
*/ | |
//----------------------------------------------------------------------------- | |
// Tags and constants | |
//----------------------------------------------------------------------------- | |
/**
 * \brief Enumeration of cache modifiers for memory load operations.
 *
 * Each enumerator selects the PTX \p ld cache qualifier emitted by the
 * ThreadLoad specializations below.  Enumerator order is part of the ABI of
 * this header (values are used as non-type template arguments); do not reorder.
 */
enum CacheLoadModifier
{
    LOAD_DEFAULT,       ///< Default (no modifier)
    LOAD_CA,            ///< Cache at all levels
    LOAD_CG,            ///< Cache at global level (ld.volatile.global fallback below SM20)
    LOAD_CS,            ///< Cache streaming (likely to be accessed once)
    LOAD_CV,            ///< Cache as volatile (including cached system lines)
    LOAD_LDG,           ///< Cache as texture (ld.global.nc; plain ld.global fallback below SM35)
    LOAD_VOLATILE,      ///< Volatile (any memory space)
};
/**
 * \name Thread I/O (cache modified)
 * @{
 */

/**
 * \brief Thread utility for reading memory using cub::CacheLoadModifier cache modifiers.  Can be used to load any data type.
 *
 * \par Example
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/thread/thread_load.cuh>
 *
 * // 32-bit load using cache-all modifier:
 * int *d_in;
 * int val = cub::ThreadLoad<cub::LOAD_CA>(d_in + threadIdx.x);
 *
 * // 16-bit load using default modifier
 * short *d_in;
 * short val = cub::ThreadLoad<cub::LOAD_DEFAULT>(d_in + threadIdx.x);
 *
 * // 256-bit load using cache-volatile modifier
 * double4 *d_in;
 * double4 val = cub::ThreadLoad<cub::LOAD_CV>(d_in + threadIdx.x);
 *
 * // 96-bit load using cache-streaming modifier
 * struct TestFoo { bool a; short b; };
 * TestFoo *d_struct;
 * TestFoo val = cub::ThreadLoad<cub::LOAD_CS>(d_struct + threadIdx.x);
 * \endcode
 *
 * \tparam MODIFIER             <b>[inferred]</b> CacheLoadModifier enumeration
 * \tparam InputIteratorT       <b>[inferred]</b> Input iterator type \iterator
 */
// Forward declaration only; the definition (tag-dispatching on modifier and
// pointer-ness) appears at the end of this header, after the helpers it uses.
template <
    CacheLoadModifier MODIFIER,
    typename InputIteratorT>
__device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(InputIteratorT itr);
//@} end member group | |
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document | |
/// Helper structure for templated load iteration (inductive case).
/// Compile-time recursion from COUNT up to (but not including) MAX.
template <int COUNT, int MAX>
struct IterateThreadLoad
{
    /// Load element COUNT via a cache-modified ThreadLoad, then recurse on COUNT + 1
    template <CacheLoadModifier MODIFIER, typename T>
    static __device__ __forceinline__ void Load(T const *in, T *out)
    {
        out[COUNT] = ThreadLoad<MODIFIER>(in + COUNT);
        IterateThreadLoad<COUNT + 1, MAX>::template Load<MODIFIER>(in, out);
    }

    /// Copy element COUNT through an ordinary iterator dereference, then recurse
    template <typename InputIteratorT, typename T>
    static __device__ __forceinline__ void Dereference(InputIteratorT in, T *out)
    {
        out[COUNT] = in[COUNT];
        IterateThreadLoad<COUNT + 1, MAX>::Dereference(in, out);
    }
};
/// Helper structure for templated load iteration (termination case).
/// Selected when COUNT reaches MAX; both operations become no-ops.
template <int MAX>
struct IterateThreadLoad<MAX, MAX>
{
    /// No-op: all MAX elements have been loaded
    template <CacheLoadModifier MODIFIER, typename T>
    static __device__ __forceinline__ void Load(T const * /*in*/, T * /*out*/) {}

    /// No-op: all MAX elements have been dereferenced
    template <typename InputIteratorT, typename T>
    static __device__ __forceinline__ void Dereference(InputIteratorT /*in*/, T * /*out*/) {}
};
/**
 * Define 16B ThreadLoad specializations (uint4 and ulonglong2) for the given
 * Cache load modifier.  \p cub_modifier is the CacheLoadModifier enumerator,
 * \p ptx_modifier the matching PTX \p ld qualifier spliced into the asm string.
 */
#define _CUB_LOAD_16(cub_modifier, ptx_modifier)                                            \
    template<>                                                                              \
    __device__ __forceinline__ uint4 ThreadLoad<cub_modifier, uint4 const *>(uint4 const *ptr)                  \
    {                                                                                       \
        uint4 retval;                                                                       \
        asm volatile ("ld."#ptx_modifier".v4.u32 {%0, %1, %2, %3}, [%4];" :                 \
            "=r"(retval.x),                                                                 \
            "=r"(retval.y),                                                                 \
            "=r"(retval.z),                                                                 \
            "=r"(retval.w) :                                                                \
            _CUB_ASM_PTR_(ptr));                                                            \
        return retval;                                                                      \
    }                                                                                       \
    template<>                                                                              \
    __device__ __forceinline__ ulonglong2 ThreadLoad<cub_modifier, ulonglong2 const *>(ulonglong2 const *ptr)   \
    {                                                                                       \
        ulonglong2 retval;                                                                  \
        asm volatile ("ld."#ptx_modifier".v2.u64 {%0, %1}, [%2];" :                         \
            "=l"(retval.x),                                                                 \
            "=l"(retval.y) :                                                                \
            _CUB_ASM_PTR_(ptr));                                                            \
        return retval;                                                                      \
    }
/**
 * Define 8B ThreadLoad specializations (ushort4, uint2, and unsigned long long)
 * for the given Cache load modifier.
 */
#define _CUB_LOAD_8(cub_modifier, ptx_modifier)                                             \
    template<>                                                                              \
    __device__ __forceinline__ ushort4 ThreadLoad<cub_modifier, ushort4 const *>(ushort4 const *ptr)            \
    {                                                                                       \
        ushort4 retval;                                                                     \
        asm volatile ("ld."#ptx_modifier".v4.u16 {%0, %1, %2, %3}, [%4];" :                 \
            "=h"(retval.x),                                                                 \
            "=h"(retval.y),                                                                 \
            "=h"(retval.z),                                                                 \
            "=h"(retval.w) :                                                                \
            _CUB_ASM_PTR_(ptr));                                                            \
        return retval;                                                                      \
    }                                                                                       \
    template<>                                                                              \
    __device__ __forceinline__ uint2 ThreadLoad<cub_modifier, uint2 const *>(uint2 const *ptr)                  \
    {                                                                                       \
        uint2 retval;                                                                       \
        asm volatile ("ld."#ptx_modifier".v2.u32 {%0, %1}, [%2];" :                         \
            "=r"(retval.x),                                                                 \
            "=r"(retval.y) :                                                                \
            _CUB_ASM_PTR_(ptr));                                                            \
        return retval;                                                                      \
    }                                                                                       \
    template<>                                                                              \
    __device__ __forceinline__ unsigned long long ThreadLoad<cub_modifier, unsigned long long const *>(unsigned long long const *ptr)   \
    {                                                                                       \
        unsigned long long retval;                                                          \
        asm volatile ("ld."#ptx_modifier".u64 %0, [%1];" :                                  \
            "=l"(retval) :                                                                  \
            _CUB_ASM_PTR_(ptr));                                                            \
        return retval;                                                                      \
    }
/**
 * Define a 4B ThreadLoad specialization (unsigned int) for the given Cache
 * load modifier.
 */
#define _CUB_LOAD_4(cub_modifier, ptx_modifier)                                             \
    template<>                                                                              \
    __device__ __forceinline__ unsigned int ThreadLoad<cub_modifier, unsigned int const *>(unsigned int const *ptr)     \
    {                                                                                       \
        unsigned int retval;                                                                \
        asm volatile ("ld."#ptx_modifier".u32 %0, [%1];" :                                  \
            "=r"(retval) :                                                                  \
            _CUB_ASM_PTR_(ptr));                                                            \
        return retval;                                                                      \
    }
/**
 * Define a 2B ThreadLoad specialization (unsigned short) for the given Cache
 * load modifier.
 */
#define _CUB_LOAD_2(cub_modifier, ptx_modifier)                                             \
    template<>                                                                              \
    __device__ __forceinline__ unsigned short ThreadLoad<cub_modifier, unsigned short const *>(unsigned short const *ptr)   \
    {                                                                                       \
        unsigned short retval;                                                              \
        asm volatile ("ld."#ptx_modifier".u16 %0, [%1];" :                                  \
            "=h"(retval) :                                                                  \
            _CUB_ASM_PTR_(ptr));                                                            \
        return retval;                                                                      \
    }
/**
 * Define a 1B ThreadLoad specialization (unsigned char) for the given Cache
 * load modifier.  PTX has no 8-bit register constraint, so the byte is loaded
 * into a .u8 scratch register and widened to u16 ("=h") before being narrowed
 * back to unsigned char on return.
 */
#define _CUB_LOAD_1(cub_modifier, ptx_modifier)                                             \
    template<>                                                                              \
    __device__ __forceinline__ unsigned char ThreadLoad<cub_modifier, unsigned char const *>(unsigned char const *ptr)  \
    {                                                                                       \
        unsigned short retval;                                                              \
        asm volatile (                                                                      \
        "{"                                                                                 \
        "   .reg .u8 datum;"                                                                \
        "    ld."#ptx_modifier".u8 datum, [%1];"                                            \
        "    cvt.u16.u8 %0, datum;"                                                         \
        "}" :                                                                               \
            "=h"(retval) :                                                                  \
            _CUB_ASM_PTR_(ptr));                                                            \
        return (unsigned char) retval;                                                      \
    }
/**
 * Define powers-of-two ThreadLoad specializations for the given Cache load
 * modifier.
 *
 * Note: the final expansion deliberately has no trailing line-continuation —
 * a dangling backslash after the last line would splice the next source line
 * into the macro definition.
 */
#define _CUB_LOAD_ALL(cub_modifier, ptx_modifier)                                           \
    _CUB_LOAD_16(cub_modifier, ptx_modifier)                                                \
    _CUB_LOAD_8(cub_modifier, ptx_modifier)                                                 \
    _CUB_LOAD_4(cub_modifier, ptx_modifier)                                                 \
    _CUB_LOAD_2(cub_modifier, ptx_modifier)                                                 \
    _CUB_LOAD_1(cub_modifier, ptx_modifier)
/**
 * Define powers-of-two ThreadLoad specializations for the various Cache load modifiers
 */
#if CUB_PTX_ARCH >= 200
    // SM20+ PTX supports the ld cache operators directly
    _CUB_LOAD_ALL(LOAD_CA, ca)
    _CUB_LOAD_ALL(LOAD_CG, cg)
    _CUB_LOAD_ALL(LOAD_CS, cs)
    _CUB_LOAD_ALL(LOAD_CV, cv)
#else
    _CUB_LOAD_ALL(LOAD_CA, global)
    // Use volatile to ensure coherent reads when this PTX is JIT'd to run on newer architectures with L1
    _CUB_LOAD_ALL(LOAD_CG, volatile.global)
    _CUB_LOAD_ALL(LOAD_CS, global)
    _CUB_LOAD_ALL(LOAD_CV, volatile.global)
#endif

// ld.global.nc (non-coherent, read-only cache path) requires SM35+
#if CUB_PTX_ARCH >= 350
    _CUB_LOAD_ALL(LOAD_LDG, global.nc)
#else
    _CUB_LOAD_ALL(LOAD_LDG, global)
#endif

// Macro cleanup: the generator macros are implementation details of this header
#undef _CUB_LOAD_ALL
#undef _CUB_LOAD_1
#undef _CUB_LOAD_2
#undef _CUB_LOAD_4
#undef _CUB_LOAD_8
#undef _CUB_LOAD_16
/**
 * ThreadLoad definition for LOAD_DEFAULT modifier on iterator types.
 * A plain dereference: the iterator performs whatever load it implements.
 */
template <typename InputIteratorT>
__device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(
    InputIteratorT itr,
    Int2Type<LOAD_DEFAULT> /*modifier*/,
    Int2Type<false> /*is_pointer*/)
{
    typename std::iterator_traits<InputIteratorT>::value_type val = *itr;
    return val;
}
/**
 * ThreadLoad definition for LOAD_DEFAULT modifier on pointer types.
 * An ordinary load with no PTX cache directive.
 */
template <typename T>
__device__ __forceinline__ T ThreadLoad(
    T *ptr,
    Int2Type<LOAD_DEFAULT> /*modifier*/,
    Int2Type<true> /*is_pointer*/)
{
    T val = *ptr;
    return val;
}
/**
 * ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types.
 * Primitive types can be read directly through a volatile-qualified pointer.
 */
template <typename T>
__device__ __forceinline__ T ThreadLoadVolatilePointer(
    T *ptr,
    Int2Type<true> /*is_primitive*/)
{
    return *reinterpret_cast<volatile T*>(ptr);
}
/**
 * ThreadLoad definition for LOAD_VOLATILE modifier on non-primitive pointer
 * types.  The value is assembled word-by-word through volatile loads of its
 * UnitWord::VolatileWord representation.
 */
template <typename T>
__device__ __forceinline__ T ThreadLoadVolatilePointer(
    T *ptr,
    Int2Type<false> /*is_primitive*/)
{
    // Word type for memcopying
    typedef typename UnitWord<T>::VolatileWord VolatileWord;
    const int NUM_WORDS = sizeof(T) / sizeof(VolatileWord);

    T retval;
    VolatileWord *alias = reinterpret_cast<VolatileWord*>(&retval);
    IterateThreadLoad<0, NUM_WORDS>::Dereference(
        reinterpret_cast<volatile VolatileWord*>(ptr),
        alias);

    return retval;
}
/**
 * ThreadLoad definition for LOAD_VOLATILE modifier on pointer types.
 * Tag-dispatches on whether T is a primitive type (direct volatile read)
 * or not (word-wise volatile copy).
 */
template <typename T>
__device__ __forceinline__ T ThreadLoad(
    T *ptr,
    Int2Type<LOAD_VOLATILE> /*modifier*/,
    Int2Type<true> /*is_pointer*/)
{
    // Apply tags for partial-specialization
    Int2Type<Traits<T>::PRIMITIVE> primitive_tag;
    return ThreadLoadVolatilePointer(ptr, primitive_tag);
}
/**
 * ThreadLoad definition for generic modifiers on pointer types.
 * Stages the value word-by-word through the cache-modified ThreadLoad
 * specializations for T's UnitWord::DeviceWord representation.
 */
template <typename T, int MODIFIER>
__device__ __forceinline__ T ThreadLoad(
    T const *ptr,
    Int2Type<MODIFIER> /*modifier*/,
    Int2Type<true> /*is_pointer*/)
{
    typedef typename UnitWord<T>::DeviceWord DeviceWord;
    const int NUM_WORDS = sizeof(T) / sizeof(DeviceWord);

    DeviceWord words[NUM_WORDS];
    IterateThreadLoad<0, NUM_WORDS>::template Load<CacheLoadModifier(MODIFIER)>(
        reinterpret_cast<DeviceWord*>(const_cast<T*>(ptr)),
        words);

    return *reinterpret_cast<T*>(words);
}
/**
 * ThreadLoad definition for generic modifiers.
 * Entry point: tag-dispatches on the cache modifier and on whether
 * InputIteratorT is a raw pointer.
 */
template <
    CacheLoadModifier MODIFIER,
    typename InputIteratorT>
__device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(InputIteratorT itr)
{
    // Apply tags for partial-specialization
    Int2Type<MODIFIER> modifier_tag;
    Int2Type<IsPointer<InputIteratorT>::VALUE> pointer_tag;
    return ThreadLoad(itr, modifier_tag, pointer_tag);
}
#endif // DOXYGEN_SHOULD_SKIP_THIS | |
/** @} */ // end group UtilIo | |
} // CUB namespace | |
CUB_NS_POSTFIX // Optional outer namespace(s) | |