/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* Thread utilities for writing memory using PTX cache modifiers.
*/
#pragma once
#include "../config.cuh"
#include "../util_ptx.cuh"
#include "../util_type.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \addtogroup UtilIo
* @{
*/
//-----------------------------------------------------------------------------
// Tags and constants
//-----------------------------------------------------------------------------
/**
* \brief Enumeration of cache modifiers for memory store operations.
*/
enum CacheStoreModifier
{
STORE_DEFAULT, ///< Default (no modifier)
STORE_WB, ///< Cache write-back all coherent levels
STORE_CG, ///< Cache at global level
STORE_CS, ///< Cache streaming (likely to be accessed once)
STORE_WT, ///< Cache write-through (to system memory)
STORE_VOLATILE, ///< Volatile shared (any memory space)
};
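// Illustrative mapping from each modifier to the PTX store instruction emitted
// for a 4-byte value on SM 2.0+ (a sketch for orientation; the authoritative
// mapping is the _CUB_STORE_ALL expansion further below):
//
//   STORE_DEFAULT  -> plain C++ assignment (*ptr = val)
//   STORE_WB       -> st.wb.u32
//   STORE_CG       -> st.cg.u32
//   STORE_CS       -> st.cs.u32
//   STORE_WT       -> st.wt.u32
//   STORE_VOLATILE -> volatile assignment (*(volatile int*)ptr = val)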
/**
* \name Thread I/O (cache modified)
* @{
*/
/**
* \brief Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. Can be used to store any data type.
*
* \par Example
* \code
* #include <cub/cub.cuh> // or equivalently <cub/thread/thread_store.cuh>
*
* // 32-bit store using cache-global modifier:
* int *d_out;
* int val;
* cub::ThreadStore<cub::STORE_CG>(d_out + threadIdx.x, val);
*
* // 16-bit store using default modifier
* short *d_out;
* short val;
* cub::ThreadStore<cub::STORE_DEFAULT>(d_out + threadIdx.x, val);
*
* // 256-bit store using write-through modifier
* double4 *d_out;
* double4 val;
* cub::ThreadStore<cub::STORE_WT>(d_out + threadIdx.x, val);
*
* // 32-bit struct store using cache-streaming modifier
* struct TestFoo { bool a; short b; };
* TestFoo *d_struct;
* TestFoo val;
* cub::ThreadStore<cub::STORE_CS>(d_struct + threadIdx.x, val);
* \endcode
*
* \tparam MODIFIER CacheStoreModifier enumeration
* \tparam OutputIteratorT <b>[inferred]</b> Output iterator type \iterator
* \tparam T <b>[inferred]</b> Data type of output value
*/
template <
CacheStoreModifier MODIFIER,
typename OutputIteratorT,
typename T>
__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val);
//@} end member group
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/// Helper structure for templated store iteration (inductive case)
template <int COUNT, int MAX>
struct IterateThreadStore
{
template <CacheStoreModifier MODIFIER, typename T>
static __device__ __forceinline__ void Store(T *ptr, T *vals)
{
ThreadStore<MODIFIER>(ptr + COUNT, vals[COUNT]);
IterateThreadStore<COUNT + 1, MAX>::template Store<MODIFIER>(ptr, vals);
}
template <typename OutputIteratorT, typename T>
static __device__ __forceinline__ void Dereference(OutputIteratorT ptr, T *vals)
{
ptr[COUNT] = vals[COUNT];
IterateThreadStore<COUNT + 1, MAX>::Dereference(ptr, vals);
}
};
/// Helper structure for templated store iteration (termination case)
template <int MAX>
struct IterateThreadStore<MAX, MAX>
{
template <CacheStoreModifier MODIFIER, typename T>
static __device__ __forceinline__ void Store(T * /*ptr*/, T * /*vals*/) {}
template <typename OutputIteratorT, typename T>
static __device__ __forceinline__ void Dereference(OutputIteratorT /*ptr*/, T * /*vals*/) {}
};
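// As a concrete illustration, IterateThreadStore<0, 4>::Store<STORE_CG>(ptr, vals)
// unrolls at compile time into four cache-modified stores:
//
//   ThreadStore<STORE_CG>(ptr + 0, vals[0]);
//   ThreadStore<STORE_CG>(ptr + 1, vals[1]);
//   ThreadStore<STORE_CG>(ptr + 2, vals[2]);
//   ThreadStore<STORE_CG>(ptr + 3, vals[3]);
//
// with the <MAX, MAX> specialization above terminating the recursion.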
/**
* Define 16B ThreadStore specializations (uint4, ulonglong2) for the given cache store modifier
*/
#define _CUB_STORE_16(cub_modifier, ptx_modifier) \
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, uint4*, uint4>(uint4* ptr, uint4 val) \
{ \
asm volatile ("st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" : : \
_CUB_ASM_PTR_(ptr), \
"r"(val.x), \
"r"(val.y), \
"r"(val.z), \
"r"(val.w)); \
} \
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, ulonglong2*, ulonglong2>(ulonglong2* ptr, ulonglong2 val) \
{ \
asm volatile ("st."#ptx_modifier".v2.u64 [%0], {%1, %2};" : : \
_CUB_ASM_PTR_(ptr), \
"l"(val.x), \
"l"(val.y)); \
}
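// For example, _CUB_STORE_16(STORE_CG, cg) expands to a specialization that
// stores a uint4 with a single vectorized, cache-modified instruction:
//
//   template<>
//   __device__ __forceinline__ void ThreadStore<STORE_CG, uint4*, uint4>(uint4* ptr, uint4 val)
//   {
//       asm volatile ("st.cg.v4.u32 [%0], {%1, %2, %3, %4};" : :
//           _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w));
//   }
//
// (plus the analogous ulonglong2 specialization).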
/**
* Define 8B ThreadStore specializations (ushort4, uint2, unsigned long long) for the given cache store modifier
*/
#define _CUB_STORE_8(cub_modifier, ptx_modifier) \
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, ushort4*, ushort4>(ushort4* ptr, ushort4 val) \
{ \
asm volatile ("st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" : : \
_CUB_ASM_PTR_(ptr), \
"h"(val.x), \
"h"(val.y), \
"h"(val.z), \
"h"(val.w)); \
} \
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, uint2*, uint2>(uint2* ptr, uint2 val) \
{ \
asm volatile ("st."#ptx_modifier".v2.u32 [%0], {%1, %2};" : : \
_CUB_ASM_PTR_(ptr), \
"r"(val.x), \
"r"(val.y)); \
} \
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, unsigned long long*, unsigned long long>(unsigned long long* ptr, unsigned long long val) \
{ \
asm volatile ("st."#ptx_modifier".u64 [%0], %1;" : : \
_CUB_ASM_PTR_(ptr), \
"l"(val)); \
}
/**
* Define an unsigned int (4B) ThreadStore specialization for the given cache store modifier
*/
#define _CUB_STORE_4(cub_modifier, ptx_modifier) \
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, unsigned int*, unsigned int>(unsigned int* ptr, unsigned int val) \
{ \
asm volatile ("st."#ptx_modifier".u32 [%0], %1;" : : \
_CUB_ASM_PTR_(ptr), \
"r"(val)); \
}
/**
* Define an unsigned short (2B) ThreadStore specialization for the given cache store modifier
*/
#define _CUB_STORE_2(cub_modifier, ptx_modifier) \
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, unsigned short*, unsigned short>(unsigned short* ptr, unsigned short val) \
{ \
asm volatile ("st."#ptx_modifier".u16 [%0], %1;" : : \
_CUB_ASM_PTR_(ptr), \
"h"(val)); \
}
/**
* Define an unsigned char (1B) ThreadStore specialization for the given cache store modifier
*/
#define _CUB_STORE_1(cub_modifier, ptx_modifier) \
template<> \
__device__ __forceinline__ void ThreadStore<cub_modifier, unsigned char*, unsigned char>(unsigned char* ptr, unsigned char val) \
{ \
asm volatile ( \
"{" \
" .reg .u8 datum;" \
" cvt.u8.u16 datum, %1;" \
" st."#ptx_modifier".u8 [%0], datum;" \
"}" : : \
_CUB_ASM_PTR_(ptr), \
"h"((unsigned short) val)); \
}
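// Note on the 1B case above: CUDA inline PTX has no 8-bit register constraint
// (the narrowest is "h", a 16-bit register), so the value is passed in as an
// unsigned short and narrowed into a .u8 register via cvt.u8.u16 before the
// sub-word store is issued.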
/**
* Define powers-of-two ThreadStore specializations for the given cache store modifier
*/
#define _CUB_STORE_ALL(cub_modifier, ptx_modifier) \
_CUB_STORE_16(cub_modifier, ptx_modifier) \
_CUB_STORE_8(cub_modifier, ptx_modifier) \
_CUB_STORE_4(cub_modifier, ptx_modifier) \
_CUB_STORE_2(cub_modifier, ptx_modifier) \
_CUB_STORE_1(cub_modifier, ptx_modifier)
/**
* Define ThreadStore specializations for the various cache store modifiers
*/
#if CUB_PTX_ARCH >= 200
_CUB_STORE_ALL(STORE_WB, wb)
_CUB_STORE_ALL(STORE_CG, cg)
_CUB_STORE_ALL(STORE_CS, cs)
_CUB_STORE_ALL(STORE_WT, wt)
#else
_CUB_STORE_ALL(STORE_WB, global)
_CUB_STORE_ALL(STORE_CG, global)
_CUB_STORE_ALL(STORE_CS, global)
_CUB_STORE_ALL(STORE_WT, volatile.global)
#endif
// Macro cleanup
#undef _CUB_STORE_ALL
#undef _CUB_STORE_1
#undef _CUB_STORE_2
#undef _CUB_STORE_4
#undef _CUB_STORE_8
#undef _CUB_STORE_16
/**
* ThreadStore definition for STORE_DEFAULT modifier on iterator types
*/
template <typename OutputIteratorT, typename T>
__device__ __forceinline__ void ThreadStore(
OutputIteratorT itr,
T val,
Int2Type<STORE_DEFAULT> /*modifier*/,
Int2Type<false> /*is_pointer*/)
{
*itr = val;
}
/**
* ThreadStore definition for STORE_DEFAULT modifier on pointer types
*/
template <typename T>
__device__ __forceinline__ void ThreadStore(
T *ptr,
T val,
Int2Type<STORE_DEFAULT> /*modifier*/,
Int2Type<true> /*is_pointer*/)
{
*ptr = val;
}
/**
* ThreadStore definition for STORE_VOLATILE modifier on primitive pointer types
*/
template <typename T>
__device__ __forceinline__ void ThreadStoreVolatilePtr(
T *ptr,
T val,
Int2Type<true> /*is_primitive*/)
{
*reinterpret_cast<volatile T*>(ptr) = val;
}
/**
* ThreadStore definition for STORE_VOLATILE modifier on non-primitive pointer types
*/
template <typename T>
__device__ __forceinline__ void ThreadStoreVolatilePtr(
T *ptr,
T val,
Int2Type<false> /*is_primitive*/)
{
// Create a temporary using shuffle-words, then store using volatile-words
typedef typename UnitWord<T>::VolatileWord VolatileWord;
typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord);
const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord);
VolatileWord words[VOLATILE_MULTIPLE];
#pragma unroll
for (int i = 0; i < SHUFFLE_MULTIPLE; ++i)
reinterpret_cast<ShuffleWord*>(words)[i] = reinterpret_cast<ShuffleWord*>(&val)[i];
IterateThreadStore<0, VOLATILE_MULTIPLE>::template Dereference(
reinterpret_cast<volatile VolatileWord*>(ptr),
words);
}
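// Worked example (assumes UnitWord's internals, which live in util_type.cuh):
// for a 12B struct with 4B alignment, ShuffleWord and VolatileWord are both
// unsigned int, so the loop above copies the value into words[0..2] one 4B
// word at a time, and Dereference then writes the three words through a
// volatile unsigned int*.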
/**
* ThreadStore definition for STORE_VOLATILE modifier on pointer types
*/
template <typename T>
__device__ __forceinline__ void ThreadStore(
T *ptr,
T val,
Int2Type<STORE_VOLATILE> /*modifier*/,
Int2Type<true> /*is_pointer*/)
{
ThreadStoreVolatilePtr(ptr, val, Int2Type<Traits<T>::PRIMITIVE>());
}
/**
* ThreadStore definition for generic modifiers on pointer types
*/
template <typename T, int MODIFIER>
__device__ __forceinline__ void ThreadStore(
T *ptr,
T val,
Int2Type<MODIFIER> /*modifier*/,
Int2Type<true> /*is_pointer*/)
{
// Create a temporary using shuffle-words, then store using device-words
typedef typename UnitWord<T>::DeviceWord DeviceWord;
typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord);
const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord);
DeviceWord words[DEVICE_MULTIPLE];
#pragma unroll
for (int i = 0; i < SHUFFLE_MULTIPLE; ++i)
reinterpret_cast<ShuffleWord*>(words)[i] = reinterpret_cast<ShuffleWord*>(&val)[i];
IterateThreadStore<0, DEVICE_MULTIPLE>::template Store<CacheStoreModifier(MODIFIER)>(
reinterpret_cast<DeviceWord*>(ptr),
words);
}
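// This is the same word-splitting pattern as the volatile path above, but it
// copies into DeviceWord-sized words and dispatches each one back through the
// cache-modified ThreadStore specializations. For the hypothetical 12B struct
// from the previous example, DEVICE_MULTIPLE would be 3 and the store would
// issue three st.<ptx_modifier>.u32 instructions.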
/**
* ThreadStore definition for generic modifiers
*/
template <CacheStoreModifier MODIFIER, typename OutputIteratorT, typename T>
__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val)
{
ThreadStore(
itr,
val,
Int2Type<MODIFIER>(),
Int2Type<IsPointer<OutputIteratorT>::VALUE>());
}
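// Dispatch note: only the Int2Type<STORE_DEFAULT>/Int2Type<false> overload
// above accepts non-pointer iterators, so requesting a cache-modified store
// (e.g. STORE_CG) through a generic output iterator will not compile; modifiers
// other than STORE_DEFAULT require raw pointer destinations.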
#endif // DOXYGEN_SHOULD_SKIP_THIS
/** @} */ // end group UtilIo
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)