LIVE / thrust /cub /thread /thread_load.cuh
Xu Ma
update
1c3c0d9
raw
history blame
18.6 kB
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* Thread utilities for reading memory using PTX cache modifiers.
*/
#pragma once
#include <iterator>
#include "../config.cuh"
#include "../util_ptx.cuh"
#include "../util_type.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \addtogroup UtilIo
* @{
*/
//-----------------------------------------------------------------------------
// Tags and constants
//-----------------------------------------------------------------------------
/**
* \brief Enumeration of cache modifiers for memory load operations.
*/
enum CacheLoadModifier
{
LOAD_DEFAULT, ///< Default (no modifier)
LOAD_CA, ///< Cache at all levels
LOAD_CG, ///< Cache at global level
LOAD_CS, ///< Cache streaming (likely to be accessed once)
LOAD_CV, ///< Cache as volatile (including cached system lines)
LOAD_LDG, ///< Cache as texture
LOAD_VOLATILE, ///< Volatile (any memory space)
};
/**
* \name Thread I/O (cache modified)
* @{
*/
/**
* \brief Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. Can be used to load any data type.
*
* \par Example
* \code
* #include <cub/cub.cuh> // or equivalently <cub/thread/thread_load.cuh>
*
* // 32-bit load using cache-global modifier:
* int *d_in;
* int val = cub::ThreadLoad<cub::LOAD_CA>(d_in + threadIdx.x);
*
* // 16-bit load using default modifier
* short *d_in;
* short val = cub::ThreadLoad<cub::LOAD_DEFAULT>(d_in + threadIdx.x);
*
* // 256-bit load using cache-volatile modifier
* double4 *d_in;
* double4 val = cub::ThreadLoad<cub::LOAD_CV>(d_in + threadIdx.x);
*
* // 96-bit load using cache-streaming modifier
* struct TestFoo { bool a; short b; };
* TestFoo *d_struct;
* TestFoo val = cub::ThreadLoad<cub::LOAD_CS>(d_in + threadIdx.x);
* \endcode
*
* \tparam MODIFIER <b>[inferred]</b> CacheLoadModifier enumeration
* \tparam InputIteratorT <b>[inferred]</b> Input iterator type \iterator
*/
template <
CacheLoadModifier MODIFIER,
typename InputIteratorT>
__device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(InputIteratorT itr);
//@} end member group
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/// Helper structure for templated load iteration (inductive case)
template <int COUNT, int MAX>
struct IterateThreadLoad
{
template <CacheLoadModifier MODIFIER, typename T>
static __device__ __forceinline__ void Load(T const *ptr, T *vals)
{
vals[COUNT] = ThreadLoad<MODIFIER>(ptr + COUNT);
IterateThreadLoad<COUNT + 1, MAX>::template Load<MODIFIER>(ptr, vals);
}
template <typename InputIteratorT, typename T>
static __device__ __forceinline__ void Dereference(InputIteratorT itr, T *vals)
{
vals[COUNT] = itr[COUNT];
IterateThreadLoad<COUNT + 1, MAX>::Dereference(itr, vals);
}
};
/// Helper structure for templated load iteration (termination case)
template <int MAX>
struct IterateThreadLoad<MAX, MAX>
{
template <CacheLoadModifier MODIFIER, typename T>
static __device__ __forceinline__ void Load(T const * /*ptr*/, T * /*vals*/) {}
template <typename InputIteratorT, typename T>
static __device__ __forceinline__ void Dereference(InputIteratorT /*itr*/, T * /*vals*/) {}
};
/**
* Define a uint4 (16B) ThreadLoad specialization for the given Cache load modifier
*/
#define _CUB_LOAD_16(cub_modifier, ptx_modifier) \
template<> \
__device__ __forceinline__ uint4 ThreadLoad<cub_modifier, uint4 const *>(uint4 const *ptr) \
{ \
uint4 retval; \
asm volatile ("ld."#ptx_modifier".v4.u32 {%0, %1, %2, %3}, [%4];" : \
"=r"(retval.x), \
"=r"(retval.y), \
"=r"(retval.z), \
"=r"(retval.w) : \
_CUB_ASM_PTR_(ptr)); \
return retval; \
} \
template<> \
__device__ __forceinline__ ulonglong2 ThreadLoad<cub_modifier, ulonglong2 const *>(ulonglong2 const *ptr) \
{ \
ulonglong2 retval; \
asm volatile ("ld."#ptx_modifier".v2.u64 {%0, %1}, [%2];" : \
"=l"(retval.x), \
"=l"(retval.y) : \
_CUB_ASM_PTR_(ptr)); \
return retval; \
}
/**
* Define a uint2 (8B) ThreadLoad specialization for the given Cache load modifier
*/
#define _CUB_LOAD_8(cub_modifier, ptx_modifier) \
template<> \
__device__ __forceinline__ ushort4 ThreadLoad<cub_modifier, ushort4 const *>(ushort4 const *ptr) \
{ \
ushort4 retval; \
asm volatile ("ld."#ptx_modifier".v4.u16 {%0, %1, %2, %3}, [%4];" : \
"=h"(retval.x), \
"=h"(retval.y), \
"=h"(retval.z), \
"=h"(retval.w) : \
_CUB_ASM_PTR_(ptr)); \
return retval; \
} \
template<> \
__device__ __forceinline__ uint2 ThreadLoad<cub_modifier, uint2 const *>(uint2 const *ptr) \
{ \
uint2 retval; \
asm volatile ("ld."#ptx_modifier".v2.u32 {%0, %1}, [%2];" : \
"=r"(retval.x), \
"=r"(retval.y) : \
_CUB_ASM_PTR_(ptr)); \
return retval; \
} \
template<> \
__device__ __forceinline__ unsigned long long ThreadLoad<cub_modifier, unsigned long long const *>(unsigned long long const *ptr) \
{ \
unsigned long long retval; \
asm volatile ("ld."#ptx_modifier".u64 %0, [%1];" : \
"=l"(retval) : \
_CUB_ASM_PTR_(ptr)); \
return retval; \
}
/**
* Define a uint (4B) ThreadLoad specialization for the given Cache load modifier
*/
#define _CUB_LOAD_4(cub_modifier, ptx_modifier) \
template<> \
__device__ __forceinline__ unsigned int ThreadLoad<cub_modifier, unsigned int const *>(unsigned int const *ptr) \
{ \
unsigned int retval; \
asm volatile ("ld."#ptx_modifier".u32 %0, [%1];" : \
"=r"(retval) : \
_CUB_ASM_PTR_(ptr)); \
return retval; \
}
/**
* Define a unsigned short (2B) ThreadLoad specialization for the given Cache load modifier
*/
#define _CUB_LOAD_2(cub_modifier, ptx_modifier) \
template<> \
__device__ __forceinline__ unsigned short ThreadLoad<cub_modifier, unsigned short const *>(unsigned short const *ptr) \
{ \
unsigned short retval; \
asm volatile ("ld."#ptx_modifier".u16 %0, [%1];" : \
"=h"(retval) : \
_CUB_ASM_PTR_(ptr)); \
return retval; \
}
/**
* Define an unsigned char (1B) ThreadLoad specialization for the given Cache load modifier
*/
#define _CUB_LOAD_1(cub_modifier, ptx_modifier) \
template<> \
__device__ __forceinline__ unsigned char ThreadLoad<cub_modifier, unsigned char const *>(unsigned char const *ptr) \
{ \
unsigned short retval; \
asm volatile ( \
"{" \
" .reg .u8 datum;" \
" ld."#ptx_modifier".u8 datum, [%1];" \
" cvt.u16.u8 %0, datum;" \
"}" : \
"=h"(retval) : \
_CUB_ASM_PTR_(ptr)); \
return (unsigned char) retval; \
}
/**
* Define powers-of-two ThreadLoad specializations for the given Cache load modifier
*/
#define _CUB_LOAD_ALL(cub_modifier, ptx_modifier) \
_CUB_LOAD_16(cub_modifier, ptx_modifier) \
_CUB_LOAD_8(cub_modifier, ptx_modifier) \
_CUB_LOAD_4(cub_modifier, ptx_modifier) \
_CUB_LOAD_2(cub_modifier, ptx_modifier) \
_CUB_LOAD_1(cub_modifier, ptx_modifier) \
/**
* Define powers-of-two ThreadLoad specializations for the various Cache load modifiers
*/
#if CUB_PTX_ARCH >= 200
_CUB_LOAD_ALL(LOAD_CA, ca)
_CUB_LOAD_ALL(LOAD_CG, cg)
_CUB_LOAD_ALL(LOAD_CS, cs)
_CUB_LOAD_ALL(LOAD_CV, cv)
#else
_CUB_LOAD_ALL(LOAD_CA, global)
// Use volatile to ensure coherent reads when this PTX is JIT'd to run on newer architectures with L1
_CUB_LOAD_ALL(LOAD_CG, volatile.global)
_CUB_LOAD_ALL(LOAD_CS, global)
_CUB_LOAD_ALL(LOAD_CV, volatile.global)
#endif
#if CUB_PTX_ARCH >= 350
_CUB_LOAD_ALL(LOAD_LDG, global.nc)
#else
_CUB_LOAD_ALL(LOAD_LDG, global)
#endif
// Macro cleanup
#undef _CUB_LOAD_ALL
#undef _CUB_LOAD_1
#undef _CUB_LOAD_2
#undef _CUB_LOAD_4
#undef _CUB_LOAD_8
#undef _CUB_LOAD_16
/**
* ThreadLoad definition for LOAD_DEFAULT modifier on iterator types
*/
template <typename InputIteratorT>
__device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(
InputIteratorT itr,
Int2Type<LOAD_DEFAULT> /*modifier*/,
Int2Type<false> /*is_pointer*/)
{
return *itr;
}
/**
* ThreadLoad definition for LOAD_DEFAULT modifier on pointer types
*/
template <typename T>
__device__ __forceinline__ T ThreadLoad(
T *ptr,
Int2Type<LOAD_DEFAULT> /*modifier*/,
Int2Type<true> /*is_pointer*/)
{
return *ptr;
}
/**
* ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types
*/
template <typename T>
__device__ __forceinline__ T ThreadLoadVolatilePointer(
T *ptr,
Int2Type<true> /*is_primitive*/)
{
T retval = *reinterpret_cast<volatile T*>(ptr);
return retval;
}
/**
* ThreadLoad definition for LOAD_VOLATILE modifier on non-primitive pointer types
*/
template <typename T>
__device__ __forceinline__ T ThreadLoadVolatilePointer(
T *ptr,
Int2Type<false> /*is_primitive*/)
{
typedef typename UnitWord<T>::VolatileWord VolatileWord; // Word type for memcopying
const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord);
T retval;
VolatileWord *words = reinterpret_cast<VolatileWord*>(&retval);
IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference(
reinterpret_cast<volatile VolatileWord*>(ptr),
words);
return retval;
}
/**
* ThreadLoad definition for LOAD_VOLATILE modifier on pointer types
*/
template <typename T>
__device__ __forceinline__ T ThreadLoad(
T *ptr,
Int2Type<LOAD_VOLATILE> /*modifier*/,
Int2Type<true> /*is_pointer*/)
{
// Apply tags for partial-specialization
return ThreadLoadVolatilePointer(ptr, Int2Type<Traits<T>::PRIMITIVE>());
}
/**
* ThreadLoad definition for generic modifiers on pointer types
*/
template <typename T, int MODIFIER>
__device__ __forceinline__ T ThreadLoad(
T const *ptr,
Int2Type<MODIFIER> /*modifier*/,
Int2Type<true> /*is_pointer*/)
{
typedef typename UnitWord<T>::DeviceWord DeviceWord;
const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord);
DeviceWord words[DEVICE_MULTIPLE];
IterateThreadLoad<0, DEVICE_MULTIPLE>::template Load<CacheLoadModifier(MODIFIER)>(
reinterpret_cast<DeviceWord*>(const_cast<T*>(ptr)),
words);
return *reinterpret_cast<T*>(words);
}
/**
* ThreadLoad definition for generic modifiers
*/
template <
CacheLoadModifier MODIFIER,
typename InputIteratorT>
__device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(InputIteratorT itr)
{
// Apply tags for partial-specialization
return ThreadLoad(
itr,
Int2Type<MODIFIER>(),
Int2Type<IsPointer<InputIteratorT>::VALUE>());
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
/** @} */ // end group UtilIo
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)