Spaces:

ma-xu
/

LIVE

Runtime error

LIVE / thrust /cub /agent /agent_spmv_orig.cuh

Xu Ma

update

1c3c0d9 over 2 years ago

27.4 kB

	/******************************************************************************
	* Copyright (c) 2011, Duane Merrill. All rights reserved.
	* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* * Neither the name of the NVIDIA CORPORATION nor the
	* names of its contributors may be used to endorse or promote products
	* derived from this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*
	******************************************************************************/

	/**
	* \file
	* cub::AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV.
	*/

	#pragma once

	#include <iterator>

	#include "../util_type.cuh"
	#include "../block/block_reduce.cuh"
	#include "../block/block_scan.cuh"
	#include "../block/block_exchange.cuh"
	#include "../config.cuh"
	#include "../thread/thread_search.cuh"
	#include "../thread/thread_operators.cuh"
	#include "../iterator/cache_modified_input_iterator.cuh"
	#include "../iterator/counting_input_iterator.cuh"
	#include "../iterator/tex_ref_input_iterator.cuh"

	/// Optional outer namespace(s)
	CUB_NS_PREFIX

	/// CUB namespace
	namespace cub {


	/******************************************************************************
	* Tuning policy
	******************************************************************************/

	/**
	* Parameterizable tuning policy type for AgentSpmv
	*/
	template <
	int _BLOCK_THREADS, ///< Threads per thread block
	int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
	CacheLoadModifier _ROW_OFFSETS_SEARCH_LOAD_MODIFIER, ///< Cache load modifier for reading CSR row-offsets during search
	CacheLoadModifier _ROW_OFFSETS_LOAD_MODIFIER, ///< Cache load modifier for reading CSR row-offsets
	CacheLoadModifier _COLUMN_INDICES_LOAD_MODIFIER, ///< Cache load modifier for reading CSR column-indices
	CacheLoadModifier _VALUES_LOAD_MODIFIER, ///< Cache load modifier for reading CSR values
	CacheLoadModifier _VECTOR_VALUES_LOAD_MODIFIER, ///< Cache load modifier for reading vector values
	bool _DIRECT_LOAD_NONZEROS, ///< Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through shared memory)
	BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use
	struct AgentSpmvPolicy
	{
	enum
	{
	BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block
	ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
	DIRECT_LOAD_NONZEROS = _DIRECT_LOAD_NONZEROS, ///< Whether to load nonzeros directly from global during sequential merging (pre-staged through shared memory)
	};

	static const CacheLoadModifier ROW_OFFSETS_SEARCH_LOAD_MODIFIER = _ROW_OFFSETS_SEARCH_LOAD_MODIFIER; ///< Cache load modifier for reading CSR row-offsets
	static const CacheLoadModifier ROW_OFFSETS_LOAD_MODIFIER = _ROW_OFFSETS_LOAD_MODIFIER; ///< Cache load modifier for reading CSR row-offsets
	static const CacheLoadModifier COLUMN_INDICES_LOAD_MODIFIER = _COLUMN_INDICES_LOAD_MODIFIER; ///< Cache load modifier for reading CSR column-indices
	static const CacheLoadModifier VALUES_LOAD_MODIFIER = _VALUES_LOAD_MODIFIER; ///< Cache load modifier for reading CSR values
	static const CacheLoadModifier VECTOR_VALUES_LOAD_MODIFIER = _VECTOR_VALUES_LOAD_MODIFIER; ///< Cache load modifier for reading vector values
	static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use

	};


	/******************************************************************************
	* Thread block abstractions
	******************************************************************************/

	template <
	typename ValueT, ///< Matrix and vector value type
	typename OffsetT> ///< Signed integer type for sequence offsets
	struct SpmvParams
	{
	ValueT* d_values; ///< Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
	OffsetT* d_row_end_offsets; ///< Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values
	OffsetT* d_column_indices; ///< Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>. (Indices are zero-valued.)
	ValueT* d_vector_x; ///< Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
	ValueT* d_vector_y; ///< Pointer to the array of \p num_rows values corresponding to the dense output vector <em>y</em>
	int num_rows; ///< Number of rows of matrix <b>A</b>.
	int num_cols; ///< Number of columns of matrix <b>A</b>.
	int num_nonzeros; ///< Number of nonzero elements of matrix <b>A</b>.
	ValueT alpha; ///< Alpha multiplicand
	ValueT beta; ///< Beta addend-multiplicand

	TexRefInputIterator<ValueT, 66778899, OffsetT> t_vector_x;
	};


	/**
	* \brief AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV.
	*/
	template <
	typename AgentSpmvPolicyT, ///< Parameterized AgentSpmvPolicy tuning policy type
	typename ValueT, ///< Matrix and vector value type
	typename OffsetT, ///< Signed integer type for sequence offsets
	bool HAS_ALPHA, ///< Whether the input parameter \p alpha is 1
	bool HAS_BETA, ///< Whether the input parameter \p beta is 0
	int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability
	struct AgentSpmv
	{
	//---------------------------------------------------------------------
	// Types and constants
	//---------------------------------------------------------------------

	/// Constants
	enum
	{
	BLOCK_THREADS = AgentSpmvPolicyT::BLOCK_THREADS,
	ITEMS_PER_THREAD = AgentSpmvPolicyT::ITEMS_PER_THREAD,
	TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
	};

	/// 2D merge path coordinate type
	typedef typename CubVector<OffsetT, 2>::Type CoordinateT;

	/// Input iterator wrapper types (for applying cache modifiers)

	typedef CacheModifiedInputIterator<
	AgentSpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER,
	OffsetT,
	OffsetT>
	RowOffsetsSearchIteratorT;

	typedef CacheModifiedInputIterator<
	AgentSpmvPolicyT::ROW_OFFSETS_LOAD_MODIFIER,
	OffsetT,
	OffsetT>
	RowOffsetsIteratorT;

	typedef CacheModifiedInputIterator<
	AgentSpmvPolicyT::COLUMN_INDICES_LOAD_MODIFIER,
	OffsetT,
	OffsetT>
	ColumnIndicesIteratorT;

	typedef CacheModifiedInputIterator<
	AgentSpmvPolicyT::VALUES_LOAD_MODIFIER,
	ValueT,
	OffsetT>
	ValueIteratorT;

	typedef CacheModifiedInputIterator<
	AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER,
	ValueT,
	OffsetT>
	VectorValueIteratorT;

	// Tuple type for scanning (pairs accumulated segment-value with segment-index)
	typedef KeyValuePair<OffsetT, ValueT> KeyValuePairT;

	// Reduce-value-by-segment scan operator
	typedef ReduceByKeyOp<cub::Sum> ReduceBySegmentOpT;

	// BlockReduce specialization
	typedef BlockReduce<
	ValueT,
	BLOCK_THREADS,
	BLOCK_REDUCE_WARP_REDUCTIONS>
	BlockReduceT;

	// BlockScan specialization
	typedef BlockScan<
	KeyValuePairT,
	BLOCK_THREADS,
	AgentSpmvPolicyT::SCAN_ALGORITHM>
	BlockScanT;

	// BlockScan specialization
	typedef BlockScan<
	ValueT,
	BLOCK_THREADS,
	AgentSpmvPolicyT::SCAN_ALGORITHM>
	BlockPrefixSumT;

	// BlockExchange specialization
	typedef BlockExchange<
	ValueT,
	BLOCK_THREADS,
	ITEMS_PER_THREAD>
	BlockExchangeT;

	/// Merge item type (either a non-zero value or a row-end offset)
	union MergeItem
	{
	// Value type to pair with index type OffsetT (NullType if loading values directly during merge)
	typedef typename If<AgentSpmvPolicyT::DIRECT_LOAD_NONZEROS, NullType, ValueT>::Type MergeValueT;

	OffsetT row_end_offset;
	MergeValueT nonzero;
	};

	/// Shared memory type required by this thread block
	struct _TempStorage
	{
	CoordinateT tile_coords[2];

	union Aliasable
	{
	// Smem needed for tile of merge items
	MergeItem merge_items[ITEMS_PER_THREAD + TILE_ITEMS + 1];

	// Smem needed for block exchange
	typename BlockExchangeT::TempStorage exchange;

	// Smem needed for block-wide reduction
	typename BlockReduceT::TempStorage reduce;

	// Smem needed for tile scanning
	typename BlockScanT::TempStorage scan;

	// Smem needed for tile prefix sum
	typename BlockPrefixSumT::TempStorage prefix_sum;

	} aliasable;
	};

	/// Temporary storage type (unionable)
	struct TempStorage : Uninitialized<_TempStorage> {};


	//---------------------------------------------------------------------
	// Per-thread fields
	//---------------------------------------------------------------------


	_TempStorage& temp_storage; /// Reference to temp_storage

	SpmvParams<ValueT, OffsetT>& spmv_params;

	ValueIteratorT wd_values; ///< Wrapped pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
	RowOffsetsIteratorT wd_row_end_offsets; ///< Wrapped Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values
	ColumnIndicesIteratorT wd_column_indices; ///< Wrapped Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>. (Indices are zero-valued.)
	VectorValueIteratorT wd_vector_x; ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
	VectorValueIteratorT wd_vector_y; ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>


	//---------------------------------------------------------------------
	// Interface
	//---------------------------------------------------------------------

	/**
	* Constructor
	*/
	__device__ __forceinline__ AgentSpmv(
	TempStorage& temp_storage, ///< Reference to temp_storage
	SpmvParams<ValueT, OffsetT>& spmv_params) ///< SpMV input parameter bundle
	:
	temp_storage(temp_storage.Alias()),
	spmv_params(spmv_params),
	wd_values(spmv_params.d_values),
	wd_row_end_offsets(spmv_params.d_row_end_offsets),
	wd_column_indices(spmv_params.d_column_indices),
	wd_vector_x(spmv_params.d_vector_x),
	wd_vector_y(spmv_params.d_vector_y)
	{}




	/**
	* Consume a merge tile, specialized for direct-load of nonzeros
	*/
	__device__ __forceinline__ KeyValuePairT ConsumeTile(
	int tile_idx,
	CoordinateT tile_start_coord,
	CoordinateT tile_end_coord,
	Int2Type<true> is_direct_load) ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch
	{
	int tile_num_rows = tile_end_coord.x - tile_start_coord.x;
	int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y;
	OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset;

	// Gather the row end-offsets for the merge tile into shared memory
	for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS)
	{
	s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item];
	}

	CTA_SYNC();

	// Search for the thread's starting coordinate within the merge tile
	CountingInputIterator<OffsetT> tile_nonzero_indices(tile_start_coord.y);
	CoordinateT thread_start_coord;

	MergePathSearch(
	OffsetT(threadIdx.x * ITEMS_PER_THREAD), // Diagonal
	s_tile_row_end_offsets, // List A
	tile_nonzero_indices, // List B
	tile_num_rows,
	tile_num_nonzeros,
	thread_start_coord);

	CTA_SYNC(); // Perf-sync

	// Compute the thread's merge path segment
	CoordinateT thread_current_coord = thread_start_coord;
	KeyValuePairT scan_segment[ITEMS_PER_THREAD];

	ValueT running_total = 0.0;

	#pragma unroll
	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
	{
	OffsetT nonzero_idx = CUB_MIN(tile_nonzero_indices[thread_current_coord.y], spmv_params.num_nonzeros - 1);
	OffsetT column_idx = wd_column_indices[nonzero_idx];
	ValueT value = wd_values[nonzero_idx];

	ValueT vector_value = spmv_params.t_vector_x[column_idx];
	#if (CUB_PTX_ARCH >= 350)
	vector_value = wd_vector_x[column_idx];
	#endif
	ValueT nonzero = value * vector_value;

	OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x];

	if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset)
	{
	// Move down (accumulate)
	running_total += nonzero;
	scan_segment[ITEM].value = running_total;
	scan_segment[ITEM].key = tile_num_rows;
	++thread_current_coord.y;
	}
	else
	{
	// Move right (reset)
	scan_segment[ITEM].value = running_total;
	scan_segment[ITEM].key = thread_current_coord.x;
	running_total = 0.0;
	++thread_current_coord.x;
	}
	}

	CTA_SYNC();

	// Block-wide reduce-value-by-segment
	KeyValuePairT tile_carry;
	ReduceBySegmentOpT scan_op;
	KeyValuePairT scan_item;

	scan_item.value = running_total;
	scan_item.key = thread_current_coord.x;

	BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry);

	if (tile_num_rows > 0)
	{
	if (threadIdx.x == 0)
	scan_item.key = -1;

	// Direct scatter
	#pragma unroll
	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
	{
	if (scan_segment[ITEM].key < tile_num_rows)
	{
	if (scan_item.key == scan_segment[ITEM].key)
	scan_segment[ITEM].value = scan_item.value + scan_segment[ITEM].value;

	if (HAS_ALPHA)
	{
	scan_segment[ITEM].value *= spmv_params.alpha;
	}

	if (HAS_BETA)
	{
	// Update the output vector element
	ValueT addend = spmv_params.beta * wd_vector_y[tile_start_coord.x + scan_segment[ITEM].key];
	scan_segment[ITEM].value += addend;
	}

	// Set the output vector element
	spmv_params.d_vector_y[tile_start_coord.x + scan_segment[ITEM].key] = scan_segment[ITEM].value;
	}
	}
	}

	// Return the tile's running carry-out
	return tile_carry;
	}



	/**
	* Consume a merge tile, specialized for indirect load of nonzeros
	*/
	__device__ __forceinline__ KeyValuePairT ConsumeTile(
	int tile_idx,
	CoordinateT tile_start_coord,
	CoordinateT tile_end_coord,
	Int2Type<false> is_direct_load) ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch
	{
	int tile_num_rows = tile_end_coord.x - tile_start_coord.x;
	int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y;

	#if (CUB_PTX_ARCH >= 520)

	OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset;
	ValueT* s_tile_nonzeros = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero;

	// Gather the nonzeros for the merge tile into shared memory
	#pragma unroll
	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
	{
	int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS);

	ValueIteratorT a = wd_values + tile_start_coord.y + nonzero_idx;
	ColumnIndicesIteratorT ci = wd_column_indices + tile_start_coord.y + nonzero_idx;
	ValueT* s = s_tile_nonzeros + nonzero_idx;

	if (nonzero_idx < tile_num_nonzeros)
	{

	OffsetT column_idx = *ci;
	ValueT value = *a;

	ValueT vector_value = spmv_params.t_vector_x[column_idx];
	vector_value = wd_vector_x[column_idx];

	ValueT nonzero = value * vector_value;

	*s = nonzero;
	}
	}


	#else

	OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset;
	ValueT* s_tile_nonzeros = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero;

	// Gather the nonzeros for the merge tile into shared memory
	if (tile_num_nonzeros > 0)
	{
	#pragma unroll
	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
	{
	int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS);
	nonzero_idx = CUB_MIN(nonzero_idx, tile_num_nonzeros - 1);

	OffsetT column_idx = wd_column_indices[tile_start_coord.y + nonzero_idx];
	ValueT value = wd_values[tile_start_coord.y + nonzero_idx];

	ValueT vector_value = spmv_params.t_vector_x[column_idx];
	#if (CUB_PTX_ARCH >= 350)
	vector_value = wd_vector_x[column_idx];
	#endif
	ValueT nonzero = value * vector_value;

	s_tile_nonzeros[nonzero_idx] = nonzero;
	}
	}

	#endif

	// Gather the row end-offsets for the merge tile into shared memory
	#pragma unroll 1
	for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS)
	{
	s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item];
	}

	CTA_SYNC();

	// Search for the thread's starting coordinate within the merge tile
	CountingInputIterator<OffsetT> tile_nonzero_indices(tile_start_coord.y);
	CoordinateT thread_start_coord;

	MergePathSearch(
	OffsetT(threadIdx.x * ITEMS_PER_THREAD), // Diagonal
	s_tile_row_end_offsets, // List A
	tile_nonzero_indices, // List B
	tile_num_rows,
	tile_num_nonzeros,
	thread_start_coord);

	CTA_SYNC(); // Perf-sync

	// Compute the thread's merge path segment
	CoordinateT thread_current_coord = thread_start_coord;
	KeyValuePairT scan_segment[ITEMS_PER_THREAD];
	ValueT running_total = 0.0;

	OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x];
	ValueT nonzero = s_tile_nonzeros[thread_current_coord.y];

	#pragma unroll
	for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
	{
	if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset)
	{
	// Move down (accumulate)
	scan_segment[ITEM].value = nonzero;
	running_total += nonzero;
	++thread_current_coord.y;
	nonzero = s_tile_nonzeros[thread_current_coord.y];
	}
	else
	{
	// Move right (reset)
	scan_segment[ITEM].value = 0.0;
	running_total = 0.0;
	++thread_current_coord.x;
	row_end_offset = s_tile_row_end_offsets[thread_current_coord.x];
	}

	scan_segment[ITEM].key = thread_current_coord.x;
	}

	CTA_SYNC();

	// Block-wide reduce-value-by-segment
	KeyValuePairT tile_carry;
	ReduceBySegmentOpT scan_op;
	KeyValuePairT scan_item;

	scan_item.value = running_total;
	scan_item.key = thread_current_coord.x;

	BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry);

	if (threadIdx.x == 0)
	{
	scan_item.key = thread_start_coord.x;
	scan_item.value = 0.0;
	}

	if (tile_num_rows > 0)
	{

	CTA_SYNC();

	// Scan downsweep and scatter
	ValueT* s_partials = &temp_storage.aliasable.merge_items[0].nonzero;

	if (scan_item.key != scan_segment[0].key)
	{
	s_partials[scan_item.key] = scan_item.value;
	}
	else
	{
	scan_segment[0].value += scan_item.value;
	}

	#pragma unroll
	for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
	{
	if (scan_segment[ITEM - 1].key != scan_segment[ITEM].key)
	{
	s_partials[scan_segment[ITEM - 1].key] = scan_segment[ITEM - 1].value;
	}
	else
	{
	scan_segment[ITEM].value += scan_segment[ITEM - 1].value;
	}
	}

	CTA_SYNC();

	#pragma unroll 1
	for (int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS)
	{
	spmv_params.d_vector_y[tile_start_coord.x + item] = s_partials[item];
	}
	}

	// Return the tile's running carry-out
	return tile_carry;
	}


	/**
	* Consume input tile
	*/
	__device__ __forceinline__ void ConsumeTile(
	CoordinateT* d_tile_coordinates, ///< [in] Pointer to the temporary array of tile starting coordinates
	KeyValuePairT* d_tile_carry_pairs, ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block
	int num_merge_tiles) ///< [in] Number of merge tiles
	{
	int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index

	if (tile_idx >= num_merge_tiles)
	return;

	// Read our starting coordinates
	if (threadIdx.x < 2)
	{
	if (d_tile_coordinates == NULL)
	{
	// Search our starting coordinates
	OffsetT diagonal = (tile_idx + threadIdx.x) * TILE_ITEMS;
	CoordinateT tile_coord;
	CountingInputIterator<OffsetT> nonzero_indices(0);

	// Search the merge path
	MergePathSearch(
	diagonal,
	RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets),
	nonzero_indices,
	spmv_params.num_rows,
	spmv_params.num_nonzeros,
	tile_coord);

	temp_storage.tile_coords[threadIdx.x] = tile_coord;
	}
	else
	{
	temp_storage.tile_coords[threadIdx.x] = d_tile_coordinates[tile_idx + threadIdx.x];
	}
	}

	CTA_SYNC();

	CoordinateT tile_start_coord = temp_storage.tile_coords[0];
	CoordinateT tile_end_coord = temp_storage.tile_coords[1];

	// Consume multi-segment tile
	KeyValuePairT tile_carry = ConsumeTile(
	tile_idx,
	tile_start_coord,
	tile_end_coord,
	Int2Type<AgentSpmvPolicyT::DIRECT_LOAD_NONZEROS>());

	// Output the tile's carry-out
	if (threadIdx.x == 0)
	{
	if (HAS_ALPHA)
	tile_carry.value *= spmv_params.alpha;

	tile_carry.key += tile_start_coord.x;
	d_tile_carry_pairs[tile_idx] = tile_carry;
	}
	}


	};




	} // CUB namespace
	CUB_NS_POSTFIX // Optional outer namespace(s)