/****************************************************************************** | |
* Copyright (c) 2011, Duane Merrill. All rights reserved. | |
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. | |
* | |
* Redistribution and use in source and binary forms, with or without | |
* modification, are permitted provided that the following conditions are met: | |
* * Redistributions of source code must retain the above copyright | |
* notice, this list of conditions and the following disclaimer. | |
* * Redistributions in binary form must reproduce the above copyright | |
* notice, this list of conditions and the following disclaimer in the | |
* documentation and/or other materials provided with the distribution. | |
* * Neither the name of the NVIDIA CORPORATION nor the | |
* names of its contributors may be used to endorse or promote products | |
* derived from this software without specific prior written permission. | |
* | |
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | |
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY | |
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | |
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND | |
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
* | |
******************************************************************************/ | |
/** | |
* \file | |
* Thread utilities for reading memory using PTX cache modifiers. | |
*/ | |
#pragma once | |
#include <iterator> | |
#include "../config.cuh" | |
#include "../util_ptx.cuh" | |
#include "../util_type.cuh" | |
/// Optional outer namespace(s) | |
CUB_NS_PREFIX | |
/// CUB namespace | |
namespace cub { | |
/** | |
* \addtogroup UtilIo | |
* @{ | |
*/ | |
//----------------------------------------------------------------------------- | |
// Tags and constants | |
//----------------------------------------------------------------------------- | |
/**
 * \brief Enumeration of cache modifiers for memory load operations.
 *
 * Each enumerator selects the PTX \p ld cache qualifier emitted by the
 * ThreadLoad specializations below.  Enumerator order is part of the ABI of
 * this header (values are used as non-type template arguments); do not reorder.
 */
enum CacheLoadModifier
{
    LOAD_DEFAULT,       ///< Default (no modifier)
    LOAD_CA,            ///< Cache at all levels
    LOAD_CG,            ///< Cache at global level (ld.volatile.global fallback below SM20)
    LOAD_CS,            ///< Cache streaming (likely to be accessed once)
    LOAD_CV,            ///< Cache as volatile (including cached system lines)
    LOAD_LDG,           ///< Cache as texture (ld.global.nc; plain ld.global fallback below SM35)
    LOAD_VOLATILE,      ///< Volatile (any memory space)
};
/**
 * \name Thread I/O (cache modified)
 * @{
 */

/**
 * \brief Thread utility for reading memory using cub::CacheLoadModifier cache modifiers.  Can be used to load any data type.
 *
 * \par Example
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/thread/thread_load.cuh>
 *
 * // 32-bit load using cache-all modifier:
 * int *d_in;
 * int val = cub::ThreadLoad<cub::LOAD_CA>(d_in + threadIdx.x);
 *
 * // 16-bit load using default modifier
 * short *d_in;
 * short val = cub::ThreadLoad<cub::LOAD_DEFAULT>(d_in + threadIdx.x);
 *
 * // 256-bit load using cache-volatile modifier
 * double4 *d_in;
 * double4 val = cub::ThreadLoad<cub::LOAD_CV>(d_in + threadIdx.x);
 *
 * // 96-bit load using cache-streaming modifier
 * struct TestFoo { bool a; short b; };
 * TestFoo *d_struct;
 * TestFoo val = cub::ThreadLoad<cub::LOAD_CS>(d_struct + threadIdx.x);
 * \endcode
 *
 * \tparam MODIFIER             <b>[inferred]</b> CacheLoadModifier enumeration
 * \tparam InputIteratorT       <b>[inferred]</b> Input iterator type \iterator
 */
// Forward declaration only; the definition (tag-dispatching on modifier and
// pointer-ness) appears at the end of this header, after the helpers it uses.
template <
    CacheLoadModifier MODIFIER,
    typename InputIteratorT>
__device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(InputIteratorT itr);
//@} end member group | |
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document | |
/// Helper structure for templated load iteration (inductive case).
/// Compile-time recursion from COUNT up to (but not including) MAX.
template <int COUNT, int MAX>
struct IterateThreadLoad
{
    /// Load element COUNT via a cache-modified ThreadLoad, then recurse on COUNT + 1
    template <CacheLoadModifier MODIFIER, typename T>
    static __device__ __forceinline__ void Load(T const *in, T *out)
    {
        out[COUNT] = ThreadLoad<MODIFIER>(in + COUNT);
        IterateThreadLoad<COUNT + 1, MAX>::template Load<MODIFIER>(in, out);
    }

    /// Copy element COUNT through an ordinary iterator dereference, then recurse
    template <typename InputIteratorT, typename T>
    static __device__ __forceinline__ void Dereference(InputIteratorT in, T *out)
    {
        out[COUNT] = in[COUNT];
        IterateThreadLoad<COUNT + 1, MAX>::Dereference(in, out);
    }
};
/// Helper structure for templated load iteration (termination case).
/// Selected when COUNT reaches MAX; both operations become no-ops.
template <int MAX>
struct IterateThreadLoad<MAX, MAX>
{
    /// No-op: all MAX elements have been loaded
    template <CacheLoadModifier MODIFIER, typename T>
    static __device__ __forceinline__ void Load(T const * /*in*/, T * /*out*/) {}

    /// No-op: all MAX elements have been dereferenced
    template <typename InputIteratorT, typename T>
    static __device__ __forceinline__ void Dereference(InputIteratorT /*in*/, T * /*out*/) {}
};
/**
 * Define 16B ThreadLoad specializations (uint4 and ulonglong2) for the given
 * Cache load modifier.  \p cub_modifier is the CacheLoadModifier enumerator,
 * \p ptx_modifier the matching PTX \p ld qualifier spliced into the asm string.
 */
#define _CUB_LOAD_16(cub_modifier, ptx_modifier)                                            \
    template<>                                                                              \
    __device__ __forceinline__ uint4 ThreadLoad<cub_modifier, uint4 const *>(uint4 const *ptr)                  \
    {                                                                                       \
        uint4 retval;                                                                       \
        asm volatile ("ld."#ptx_modifier".v4.u32 {%0, %1, %2, %3}, [%4];" :                 \
            "=r"(retval.x),                                                                 \
            "=r"(retval.y),                                                                 \
            "=r"(retval.z),                                                                 \
            "=r"(retval.w) :                                                                \
            _CUB_ASM_PTR_(ptr));                                                            \
        return retval;                                                                      \
    }                                                                                       \
    template<>                                                                              \
    __device__ __forceinline__ ulonglong2 ThreadLoad<cub_modifier, ulonglong2 const *>(ulonglong2 const *ptr)   \
    {                                                                                       \
        ulonglong2 retval;                                                                  \
        asm volatile ("ld."#ptx_modifier".v2.u64 {%0, %1}, [%2];" :                         \
            "=l"(retval.x),                                                                 \
            "=l"(retval.y) :                                                                \
            _CUB_ASM_PTR_(ptr));                                                            \
        return retval;                                                                      \
    }
/**
 * Define 8B ThreadLoad specializations (ushort4, uint2, and unsigned long long)
 * for the given Cache load modifier.
 */
#define _CUB_LOAD_8(cub_modifier, ptx_modifier)                                             \
    template<>                                                                              \
    __device__ __forceinline__ ushort4 ThreadLoad<cub_modifier, ushort4 const *>(ushort4 const *ptr)            \
    {                                                                                       \
        ushort4 retval;                                                                     \
        asm volatile ("ld."#ptx_modifier".v4.u16 {%0, %1, %2, %3}, [%4];" :                 \
            "=h"(retval.x),                                                                 \
            "=h"(retval.y),                                                                 \
            "=h"(retval.z),                                                                 \
            "=h"(retval.w) :                                                                \
            _CUB_ASM_PTR_(ptr));                                                            \
        return retval;                                                                      \
    }                                                                                       \
    template<>                                                                              \
    __device__ __forceinline__ uint2 ThreadLoad<cub_modifier, uint2 const *>(uint2 const *ptr)                  \
    {                                                                                       \
        uint2 retval;                                                                       \
        asm volatile ("ld."#ptx_modifier".v2.u32 {%0, %1}, [%2];" :                         \
            "=r"(retval.x),                                                                 \
            "=r"(retval.y) :                                                                \
            _CUB_ASM_PTR_(ptr));                                                            \
        return retval;                                                                      \
    }                                                                                       \
    template<>                                                                              \
    __device__ __forceinline__ unsigned long long ThreadLoad<cub_modifier, unsigned long long const *>(unsigned long long const *ptr)   \
    {                                                                                       \
        unsigned long long retval;                                                          \
        asm volatile ("ld."#ptx_modifier".u64 %0, [%1];" :                                  \
            "=l"(retval) :                                                                  \
            _CUB_ASM_PTR_(ptr));                                                            \
        return retval;                                                                      \
    }
/**
 * Define a 4B ThreadLoad specialization (unsigned int) for the given Cache
 * load modifier.
 */
#define _CUB_LOAD_4(cub_modifier, ptx_modifier)                                             \
    template<>                                                                              \
    __device__ __forceinline__ unsigned int ThreadLoad<cub_modifier, unsigned int const *>(unsigned int const *ptr)     \
    {                                                                                       \
        unsigned int retval;                                                                \
        asm volatile ("ld."#ptx_modifier".u32 %0, [%1];" :                                  \
            "=r"(retval) :                                                                  \
            _CUB_ASM_PTR_(ptr));                                                            \
        return retval;                                                                      \
    }
/**
 * Define a 2B ThreadLoad specialization (unsigned short) for the given Cache
 * load modifier.
 */
#define _CUB_LOAD_2(cub_modifier, ptx_modifier)                                             \
    template<>                                                                              \
    __device__ __forceinline__ unsigned short ThreadLoad<cub_modifier, unsigned short const *>(unsigned short const *ptr)   \
    {                                                                                       \
        unsigned short retval;                                                              \
        asm volatile ("ld."#ptx_modifier".u16 %0, [%1];" :                                  \
            "=h"(retval) :                                                                  \
            _CUB_ASM_PTR_(ptr));                                                            \
        return retval;                                                                      \
    }
/**
 * Define a 1B ThreadLoad specialization (unsigned char) for the given Cache
 * load modifier.  PTX has no 8-bit register constraint, so the byte is loaded
 * into a .u8 scratch register and widened to u16 ("=h") before being narrowed
 * back to unsigned char on return.
 */
#define _CUB_LOAD_1(cub_modifier, ptx_modifier)                                             \
    template<>                                                                              \
    __device__ __forceinline__ unsigned char ThreadLoad<cub_modifier, unsigned char const *>(unsigned char const *ptr)  \
    {                                                                                       \
        unsigned short retval;                                                              \
        asm volatile (                                                                      \
        "{"                                                                                 \
        "   .reg .u8 datum;"                                                                \
        "    ld."#ptx_modifier".u8 datum, [%1];"                                            \
        "    cvt.u16.u8 %0, datum;"                                                         \
        "}" :                                                                               \
            "=h"(retval) :                                                                  \
            _CUB_ASM_PTR_(ptr));                                                            \
        return (unsigned char) retval;                                                      \
    }
/**
 * Define powers-of-two ThreadLoad specializations for the given Cache load
 * modifier.
 *
 * Note: the final expansion deliberately has no trailing line-continuation —
 * a dangling backslash after the last line would splice the next source line
 * into the macro definition.
 */
#define _CUB_LOAD_ALL(cub_modifier, ptx_modifier)                                           \
    _CUB_LOAD_16(cub_modifier, ptx_modifier)                                                \
    _CUB_LOAD_8(cub_modifier, ptx_modifier)                                                 \
    _CUB_LOAD_4(cub_modifier, ptx_modifier)                                                 \
    _CUB_LOAD_2(cub_modifier, ptx_modifier)                                                 \
    _CUB_LOAD_1(cub_modifier, ptx_modifier)
/**
 * Define powers-of-two ThreadLoad specializations for the various Cache load modifiers
 */
#if CUB_PTX_ARCH >= 200
    // SM20+ PTX supports the ld cache operators directly
    _CUB_LOAD_ALL(LOAD_CA, ca)
    _CUB_LOAD_ALL(LOAD_CG, cg)
    _CUB_LOAD_ALL(LOAD_CS, cs)
    _CUB_LOAD_ALL(LOAD_CV, cv)
#else
    _CUB_LOAD_ALL(LOAD_CA, global)
    // Use volatile to ensure coherent reads when this PTX is JIT'd to run on newer architectures with L1
    _CUB_LOAD_ALL(LOAD_CG, volatile.global)
    _CUB_LOAD_ALL(LOAD_CS, global)
    _CUB_LOAD_ALL(LOAD_CV, volatile.global)
#endif

// ld.global.nc (non-coherent, read-only cache path) requires SM35+
#if CUB_PTX_ARCH >= 350
    _CUB_LOAD_ALL(LOAD_LDG, global.nc)
#else
    _CUB_LOAD_ALL(LOAD_LDG, global)
#endif

// Macro cleanup: the generator macros are implementation details of this header
#undef _CUB_LOAD_ALL
#undef _CUB_LOAD_1
#undef _CUB_LOAD_2
#undef _CUB_LOAD_4
#undef _CUB_LOAD_8
#undef _CUB_LOAD_16
/**
 * ThreadLoad definition for LOAD_DEFAULT modifier on iterator types.
 * A plain dereference: the iterator performs whatever load it implements.
 */
template <typename InputIteratorT>
__device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(
    InputIteratorT itr,
    Int2Type<LOAD_DEFAULT> /*modifier*/,
    Int2Type<false> /*is_pointer*/)
{
    typename std::iterator_traits<InputIteratorT>::value_type val = *itr;
    return val;
}
/**
 * ThreadLoad definition for LOAD_DEFAULT modifier on pointer types.
 * An ordinary load with no PTX cache directive.
 */
template <typename T>
__device__ __forceinline__ T ThreadLoad(
    T *ptr,
    Int2Type<LOAD_DEFAULT> /*modifier*/,
    Int2Type<true> /*is_pointer*/)
{
    T val = *ptr;
    return val;
}
/**
 * ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types.
 * Primitive types can be read directly through a volatile-qualified pointer.
 */
template <typename T>
__device__ __forceinline__ T ThreadLoadVolatilePointer(
    T *ptr,
    Int2Type<true> /*is_primitive*/)
{
    return *reinterpret_cast<volatile T*>(ptr);
}
/**
 * ThreadLoad definition for LOAD_VOLATILE modifier on non-primitive pointer
 * types.  The value is assembled word-by-word through volatile loads of its
 * UnitWord::VolatileWord representation.
 */
template <typename T>
__device__ __forceinline__ T ThreadLoadVolatilePointer(
    T *ptr,
    Int2Type<false> /*is_primitive*/)
{
    // Word type for memcopying
    typedef typename UnitWord<T>::VolatileWord VolatileWord;
    const int NUM_WORDS = sizeof(T) / sizeof(VolatileWord);

    T retval;
    VolatileWord *alias = reinterpret_cast<VolatileWord*>(&retval);
    IterateThreadLoad<0, NUM_WORDS>::Dereference(
        reinterpret_cast<volatile VolatileWord*>(ptr),
        alias);

    return retval;
}
/**
 * ThreadLoad definition for LOAD_VOLATILE modifier on pointer types.
 * Tag-dispatches on whether T is a primitive type (direct volatile read)
 * or not (word-wise volatile copy).
 */
template <typename T>
__device__ __forceinline__ T ThreadLoad(
    T *ptr,
    Int2Type<LOAD_VOLATILE> /*modifier*/,
    Int2Type<true> /*is_pointer*/)
{
    // Apply tags for partial-specialization
    Int2Type<Traits<T>::PRIMITIVE> primitive_tag;
    return ThreadLoadVolatilePointer(ptr, primitive_tag);
}
/**
 * ThreadLoad definition for generic modifiers on pointer types.
 * Stages the value word-by-word through the cache-modified ThreadLoad
 * specializations for T's UnitWord::DeviceWord representation.
 */
template <typename T, int MODIFIER>
__device__ __forceinline__ T ThreadLoad(
    T const *ptr,
    Int2Type<MODIFIER> /*modifier*/,
    Int2Type<true> /*is_pointer*/)
{
    typedef typename UnitWord<T>::DeviceWord DeviceWord;
    const int NUM_WORDS = sizeof(T) / sizeof(DeviceWord);

    DeviceWord words[NUM_WORDS];
    IterateThreadLoad<0, NUM_WORDS>::template Load<CacheLoadModifier(MODIFIER)>(
        reinterpret_cast<DeviceWord*>(const_cast<T*>(ptr)),
        words);

    return *reinterpret_cast<T*>(words);
}
/**
 * ThreadLoad definition for generic modifiers.
 * Entry point: tag-dispatches on the cache modifier and on whether
 * InputIteratorT is a raw pointer.
 */
template <
    CacheLoadModifier MODIFIER,
    typename InputIteratorT>
__device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(InputIteratorT itr)
{
    // Apply tags for partial-specialization
    Int2Type<MODIFIER> modifier_tag;
    Int2Type<IsPointer<InputIteratorT>::VALUE> pointer_tag;
    return ThreadLoad(itr, modifier_tag, pointer_tag);
}
#endif // DOXYGEN_SHOULD_SKIP_THIS | |
/** @} */ // end group UtilIo | |
} // CUB namespace | |
CUB_NS_POSTFIX // Optional outer namespace(s) | |