/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* \file
* cub::AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction.
*/
#pragma once
#include <iterator>
#include "../block/block_load.cuh"
#include "../block/block_reduce.cuh"
#include "../grid/grid_mapping.cuh"
#include "../grid/grid_even_share.cuh"
#include "../config.cuh"
#include "../util_type.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/******************************************************************************
* Tuning policy types
******************************************************************************/
/**
* Parameterizable tuning policy type for AgentReduce
*/
template <
int NOMINAL_BLOCK_THREADS_4B, ///< Threads per thread block
int NOMINAL_ITEMS_PER_THREAD_4B, ///< Items per thread (per tile of input)
typename ComputeT, ///< Dominant compute type
int _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load
BlockReduceAlgorithm _BLOCK_ALGORITHM, ///< Cooperative block-wide reduction algorithm to use
CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements
typename ScalingType = MemBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT> >
struct AgentReducePolicy :
ScalingType
{
enum
{
VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load
};
static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; ///< Cooperative block-wide reduction algorithm to use
static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements
};
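/*
 * Example (illustrative only): a concrete instantiation roughly matching the tuning CUB
 * uses for recent GPU architectures -- 256 threads per block, 16 items per thread,
 * 4-item vectorized loads, warp-reduction-based block reduction, and LDG input loads.
 *
 *   typedef AgentReducePolicy<
 *           256,                            // Threads per thread block
 *           16,                             // Items per thread (per tile of input)
 *           int,                            // Dominant compute type (example)
 *           4,                              // Items per vectorized load
 *           BLOCK_REDUCE_WARP_REDUCTIONS,   // Block-wide reduction algorithm
 *           LOAD_LDG>                       // Cache load modifier
 *       ExampleReducePolicyT;
 */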
/******************************************************************************
* Thread block abstractions
******************************************************************************/
/**
* \brief AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction.
*
* Each thread reduces only the values it loads. If \p FIRST_TILE, this
* partial reduction is stored into \p thread_aggregate. Otherwise it is
* accumulated into \p thread_aggregate.
*/
template <
typename AgentReducePolicy, ///< Parameterized AgentReducePolicy tuning policy type
typename InputIteratorT, ///< Random-access iterator type for input
typename OutputIteratorT, ///< Random-access iterator type for output
typename OffsetT, ///< Signed integer type for global offsets
typename ReductionOp> ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
struct AgentReduce
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
/// The input value type
typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
/// The output value type
typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
/// Vector type of InputT for data movement
typedef typename CubVector<InputT, AgentReducePolicy::VECTOR_LOAD_LENGTH>::Type VectorT;
/// Input iterator wrapper type (for applying cache modifier)
typedef typename If<IsPointer<InputIteratorT>::VALUE,
CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, InputT, OffsetT>, // Wrap the native input pointer with CacheModifiedInputIterator
InputIteratorT>::Type // Directly use the supplied input iterator type
WrappedInputIteratorT;
/// Constants
enum
{
BLOCK_THREADS = AgentReducePolicy::BLOCK_THREADS,
ITEMS_PER_THREAD = AgentReducePolicy::ITEMS_PER_THREAD,
VECTOR_LOAD_LENGTH = CUB_MIN(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH),
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
// Can vectorize according to the policy if the input iterator is a native pointer to a primitive type
ATTEMPT_VECTORIZATION = (VECTOR_LOAD_LENGTH > 1) &&
(ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) &&
(IsPointer<InputIteratorT>::VALUE) && Traits<InputT>::PRIMITIVE,
};
static const CacheLoadModifier LOAD_MODIFIER = AgentReducePolicy::LOAD_MODIFIER;
static const BlockReduceAlgorithm BLOCK_ALGORITHM = AgentReducePolicy::BLOCK_ALGORITHM;
/// Parameterized BlockReduce primitive
typedef BlockReduce<OutputT, BLOCK_THREADS, AgentReducePolicy::BLOCK_ALGORITHM> BlockReduceT;
/// Shared memory type required by this thread block
struct _TempStorage
{
typename BlockReduceT::TempStorage reduce;
};
/// Alias wrapper allowing storage to be unioned
struct TempStorage : Uninitialized<_TempStorage> {};
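// Usage note (illustrative, assuming an enclosing reduction kernel): the wrapping kernel
// typically declares this storage in shared memory and passes it to the constructor, e.g.
//
//   __shared__ typename AgentReduceT::TempStorage temp_storage;
//
// where AgentReduceT is the kernel's AgentReduce specialization.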
//---------------------------------------------------------------------
// Per-thread fields
//---------------------------------------------------------------------
_TempStorage& temp_storage; ///< Reference to temp_storage
InputIteratorT d_in; ///< Input data to reduce
WrappedInputIteratorT d_wrapped_in; ///< Wrapped input data to reduce
ReductionOp reduction_op; ///< Binary reduction operator
//---------------------------------------------------------------------
// Utility
//---------------------------------------------------------------------
// Whether or not the input is aligned with the vector type (specialized for types we can vectorize)
template <typename Iterator>
static __device__ __forceinline__ bool IsAligned(
Iterator d_in,
Int2Type<true> /*can_vectorize*/)
{
return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0;
}
// Whether or not the input is aligned with the vector type (specialized for types we cannot vectorize)
template <typename Iterator>
static __device__ __forceinline__ bool IsAligned(
Iterator /*d_in*/,
Int2Type<false> /*can_vectorize*/)
{
return false;
}
//---------------------------------------------------------------------
// Constructor
//---------------------------------------------------------------------
/**
* Constructor
*/
__device__ __forceinline__ AgentReduce(
TempStorage& temp_storage, ///< Reference to temp_storage
InputIteratorT d_in, ///< Input data to reduce
ReductionOp reduction_op) ///< Binary reduction operator
:
temp_storage(temp_storage.Alias()),
d_in(d_in),
d_wrapped_in(d_in),
reduction_op(reduction_op)
{}
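// Illustrative use (hypothetical names): a reduction kernel usually constructs the agent
// as a temporary and immediately consumes its assigned portion of the input, e.g.
//
//   OutputT block_aggregate =
//       AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share);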
//---------------------------------------------------------------------
// Tile consumption
//---------------------------------------------------------------------
/**
* Consume a full tile of input (non-vectorized)
*/
template <int IS_FIRST_TILE>
__device__ __forceinline__ void ConsumeTile(
OutputT &thread_aggregate,
OffsetT block_offset, ///< The offset of the tile to consume
int /*valid_items*/, ///< The number of valid items in the tile
Int2Type<true> /*is_full_tile*/, ///< Whether or not this is a full tile
Int2Type<false> /*can_vectorize*/) ///< Whether or not we can vectorize loads
{
OutputT items[ITEMS_PER_THREAD];
// Load items in striped fashion
LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_wrapped_in + block_offset, items);
// Reduce items within each thread stripe
thread_aggregate = (IS_FIRST_TILE) ?
internal::ThreadReduce(items, reduction_op) :
internal::ThreadReduce(items, reduction_op, thread_aggregate);
}
/**
* Consume a full tile of input (vectorized)
*/
template <int IS_FIRST_TILE>
__device__ __forceinline__ void ConsumeTile(
OutputT &thread_aggregate,
OffsetT block_offset, ///< The offset of the tile to consume
int /*valid_items*/, ///< The number of valid items in the tile
Int2Type<true> /*is_full_tile*/, ///< Whether or not this is a full tile
Int2Type<true> /*can_vectorize*/) ///< Whether or not we can vectorize loads
{
// Alias items as an array of VectorT and load it in striped fashion
enum { WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH };
// Fabricate a vectorized input iterator
InputT *d_in_unqualified = const_cast<InputT*>(d_in) + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH);
CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, VectorT, OffsetT> d_vec_in(
reinterpret_cast<VectorT*>(d_in_unqualified));
// Load items as vector items
InputT input_items[ITEMS_PER_THREAD];
VectorT *vec_items = reinterpret_cast<VectorT*>(input_items);
#pragma unroll
for (int i = 0; i < WORDS; ++i)
vec_items[i] = d_vec_in[BLOCK_THREADS * i];
// Convert from input type to output type
OutputT items[ITEMS_PER_THREAD];
#pragma unroll
for (int i = 0; i < ITEMS_PER_THREAD; ++i)
items[i] = input_items[i];
// Reduce items within each thread stripe
thread_aggregate = (IS_FIRST_TILE) ?
internal::ThreadReduce(items, reduction_op) :
internal::ThreadReduce(items, reduction_op, thread_aggregate);
}
/**
* Consume a partial tile of input
*/
template <int IS_FIRST_TILE, int CAN_VECTORIZE>
__device__ __forceinline__ void ConsumeTile(
OutputT &thread_aggregate,
OffsetT block_offset, ///< The offset of the tile to consume
int valid_items, ///< The number of valid items in the tile
Int2Type<false> /*is_full_tile*/, ///< Whether or not this is a full tile
Int2Type<CAN_VECTORIZE> /*can_vectorize*/) ///< Whether or not we can vectorize loads
{
// Partial tile
int thread_offset = threadIdx.x;
// Read first item
if ((IS_FIRST_TILE) && (thread_offset < valid_items))
{
thread_aggregate = d_wrapped_in[block_offset + thread_offset];
thread_offset += BLOCK_THREADS;
}
// Continue reading items (block-striped)
while (thread_offset < valid_items)
{
OutputT item(d_wrapped_in[block_offset + thread_offset]);
thread_aggregate = reduction_op(thread_aggregate, item);
thread_offset += BLOCK_THREADS;
}
}
//---------------------------------------------------------------------
// Consume a contiguous segment of tiles
//---------------------------------------------------------------------
/**
* \brief Reduce a contiguous segment of input tiles
*/
template <int CAN_VECTORIZE>
__device__ __forceinline__ OutputT ConsumeRange(
GridEvenShare<OffsetT> &even_share, ///< GridEvenShare descriptor
Int2Type<CAN_VECTORIZE> can_vectorize) ///< Whether or not we can vectorize loads
{
OutputT thread_aggregate;
if (even_share.block_offset + TILE_ITEMS > even_share.block_end)
{
// First tile isn't full (not all threads have valid items)
int valid_items = even_share.block_end - even_share.block_offset;
ConsumeTile<true>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items);
}
// At least one full tile of input
ConsumeTile<true>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
even_share.block_offset += even_share.block_stride;
// Consume subsequent full tiles of input
while (even_share.block_offset + TILE_ITEMS <= even_share.block_end)
{
ConsumeTile<false>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
even_share.block_offset += even_share.block_stride;
}
// Consume a partially-full tile
if (even_share.block_offset < even_share.block_end)
{
int valid_items = even_share.block_end - even_share.block_offset;
ConsumeTile<false>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
}
// Compute block-wide reduction (all threads have valid items)
return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op);
}
/**
* \brief Reduce a contiguous segment of input tiles
*/
__device__ __forceinline__ OutputT ConsumeRange(
OffsetT block_offset, ///< [in] Threadblock begin offset (inclusive)
OffsetT block_end) ///< [in] Threadblock end offset (exclusive)
{
GridEvenShare<OffsetT> even_share;
even_share.template BlockInit<TILE_ITEMS>(block_offset, block_end);
return (IsAligned(d_in + block_offset, Int2Type<ATTEMPT_VECTORIZATION>())) ?
ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
}
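// Example (sketch, hypothetical variable names): a single-block reduction kernel can
// consume the entire input with explicit offsets:
//
//   OutputT block_aggregate =
//       AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange(OffsetT(0), num_items);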
/**
* Reduce a contiguous segment of input tiles
*/
__device__ __forceinline__ OutputT ConsumeTiles(
GridEvenShare<OffsetT> &even_share) ///< [in] GridEvenShare descriptor
{
// Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread block
even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_STRIP_MINE>();
return (IsAligned(d_in, Int2Type<ATTEMPT_VECTORIZATION>())) ?
ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
}
};
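/*
 * End-to-end sketch (not part of CUB; kernel and parameter names are hypothetical) of how a
 * multi-block reduction kernel typically drives AgentReduce: each thread block reduces its
 * even-share of the input and writes a single partial aggregate, and a follow-up pass
 * reduces those partials to the final result.
 *
 *   template <
 *       typename ReducePolicyT,
 *       typename InputIteratorT,
 *       typename OutputIteratorT,
 *       typename OffsetT,
 *       typename ReductionOpT>
 *   __global__ void ExampleReduceKernel(
 *       InputIteratorT          d_in,           // Input data
 *       OutputIteratorT         d_out,          // One partial aggregate per thread block
 *       GridEvenShare<OffsetT>  even_share,     // Even-share work descriptor
 *       ReductionOpT            reduction_op)   // Binary reduction operator
 *   {
 *       // Thread block abstraction for reducing input tiles
 *       typedef AgentReduce<ReducePolicyT, InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT> AgentReduceT;
 *
 *       // Shared memory for the cooperative block-wide reduction
 *       __shared__ typename AgentReduceT::TempStorage temp_storage;
 *
 *       // Consume this block's even-share of input tiles
 *       typename AgentReduceT::OutputT block_aggregate =
 *           AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share);
 *
 *       // Thread 0 writes the block-wide aggregate
 *       if (threadIdx.x == 0)
 *           d_out[blockIdx.x] = block_aggregate;
 *   }
 */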
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)