// Spaces:
//
// ma-xu
// /
//
// LIVE
//
// Runtime error
//
// LIVE / thrust / cub / util_allocator.cuh
//
// Xu Ma
//
// update
//
// 1c3c0d9 over 2 years ago
//
// 28.7 kB

	/******************************************************************************
	* Copyright (c) 2011, Duane Merrill. All rights reserved.
	* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* * Neither the name of the NVIDIA CORPORATION nor the
	* names of its contributors may be used to endorse or promote products
	* derived from this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*
	******************************************************************************/

	/******************************************************************************
	* Simple caching allocator for device memory allocations. The allocator is
	* thread-safe and capable of managing device allocations on multiple devices.
	******************************************************************************/

	#pragma once

	#include "util_namespace.cuh"
	#include "util_debug.cuh"

	#include <set>
	#include <map>

	#include "host/mutex.cuh"
	#include <math.h>

	/// Optional outer namespace(s)
	CUB_NS_PREFIX

	/// CUB namespace
	namespace cub {


	/**
	* \addtogroup UtilMgmt
	* @{
	*/


	/******************************************************************************
	* CachingDeviceAllocator (host use)
	******************************************************************************/

	/**
	* \brief A simple caching allocator for device memory allocations.
	*
	* \par Overview
	* The allocator is thread-safe and stream-safe and is capable of managing cached
	* device allocations on multiple devices. It behaves as follows:
	*
	* \par
	* - Allocations from the allocator are associated with an \p active_stream. Once freed,
	* the allocation becomes available immediately for reuse within the \p active_stream
	* with which it was associated during allocation, and it becomes available for
	* reuse within other streams when all prior work submitted to \p active_stream has completed.
	* - Allocations are categorized and cached by bin size. A new allocation request of
	* a given size will only consider cached allocations within the corresponding bin.
	* - Bin limits progress geometrically in accordance with the growth factor
	* \p bin_growth provided during construction. Unused device allocations within
	* a larger bin cache are not reused for allocation requests that categorize to
	* smaller bin sizes.
	* - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to
	* (\p bin_growth ^ \p min_bin).
	* - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest
	* bin and are simply freed when they are deallocated instead of being returned
	* to a bin-cache.
	* - %If the total storage of cached allocations on a given device will exceed
	* \p max_cached_bytes, allocations for that device are simply freed when they are
	* deallocated instead of being returned to their bin-cache.
	*
	* \par
	* For example, the default-constructed CachingDeviceAllocator is configured with:
	* - \p bin_growth = 8
	* - \p min_bin = 3
	* - \p max_bin = 7
	* - \p max_cached_bytes = 6MB - 1B
	*
	* \par
	* which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB
	* and sets a maximum of 6,291,455 cached bytes per device
	*
	*/
	struct CachingDeviceAllocator
	{

	//---------------------------------------------------------------------
	// Constants
	//---------------------------------------------------------------------

	/// Out-of-bounds bin
	static const unsigned int INVALID_BIN = (unsigned int) -1;

	/// Invalid size
	static const size_t INVALID_SIZE = (size_t) -1;

	#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document

	/// Invalid device ordinal
	static const int INVALID_DEVICE_ORDINAL = -1;

	//---------------------------------------------------------------------
	// Type definitions and helper types
	//---------------------------------------------------------------------

	/**
	 * Bookkeeping record describing one device memory allocation
	 */
	struct BlockDescriptor
	{
	    void*           d_ptr;              // Device pointer
	    size_t          bytes;              // Size of allocation in bytes
	    unsigned int    bin;                // Bin enumeration
	    int             device;             // Device ordinal
	    cudaStream_t    associated_stream;  // Stream associated with this allocation
	    cudaEvent_t     ready_event;        // Signals when the associated stream has run to the point at which this block was freed

	    // Search-key constructor: identifies one specific block by its pointer and device
	    BlockDescriptor(void *d_ptr, int device)
	        : d_ptr(d_ptr),
	          bytes(0),
	          bin(INVALID_BIN),
	          device(device),
	          associated_stream(0),
	          ready_event(0)
	    {}

	    // Search-key constructor: identifies the start of the range of blocks on a given device
	    BlockDescriptor(int device)
	        : d_ptr(NULL),
	          bytes(0),
	          bin(INVALID_BIN),
	          device(device),
	          associated_stream(0),
	          ready_event(0)
	    {}

	    // Strict ordering by device ordinal first, then by device pointer
	    static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
	    {
	        return (a.device != b.device) ? (a.device < b.device) : (a.d_ptr < b.d_ptr);
	    }

	    // Strict ordering by device ordinal first, then by allocation size
	    static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
	    {
	        return (a.device != b.device) ? (a.device < b.device) : (a.bytes < b.bytes);
	    }
	};

	/// BlockDescriptor comparator function interface
	typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);

	// Pair of running byte totals kept per device; both start at zero
	class TotalBytes
	{
	public:
	    size_t free;    // Bytes held by cached blocks that are available for reuse
	    size_t live;    // Bytes held by blocks currently handed out to callers

	    TotalBytes() : free(0), live(0) {}
	};

	/// Set type for cached blocks (ordered by size)
	typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;

	/// Set type for live blocks (ordered by ptr)
	typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;

	/// Map type of device ordinals to the number of cached bytes cached by each device
	typedef std::map<int, TotalBytes> GpuCachedBytes;


	//---------------------------------------------------------------------
	// Utility functions
	//---------------------------------------------------------------------

	/**
	 * Integer power by repeated squaring: returns \p base raised to \p exp.
	 * All arithmetic is unsigned int, so results that do not fit wrap around.
	 */
	static unsigned int IntPow(
	    unsigned int base,
	    unsigned int exp)
	{
	    unsigned int result = 1;

	    // Consume the exponent one bit at a time, lowest bit first
	    for (; exp != 0; exp >>= 1)
	    {
	        if ((exp & 1) != 0)
	            result *= base;     // this bit is set: fold the current power into the result

	        base *= base;           // advance to the next power-of-two power of the base
	    }

	    return result;
	}


	/**
	 * Round \p value up to the nearest power of \p base.
	 *
	 * On return \p rounded_bytes holds the smallest power of \p base that is
	 * greater than or equal to \p value, and \p power holds the matching exponent
	 * (a \p value of 0 or 1 yields power 0 / rounded_bytes 1).
	 *
	 * Overflow: if \p value * \p base wraps, or if no representable power of
	 * \p base reaches \p value, \p power is set to the bit-width of size_t and
	 * \p rounded_bytes to the largest size_t value.
	 */
	void NearestPowerOf(
	    unsigned int    &power,             ///< [out] Exponent of the rounded value
	    size_t          &rounded_bytes,     ///< [out] \p value rounded up to a power of \p base
	    unsigned int    base,               ///< [in] Base of the power
	    size_t          value)              ///< [in] Value to round up
	{
	    const size_t max_size = size_t(0) - 1;

	    power = 0;
	    rounded_bytes = 1;

	    if (value * base < value)
	    {
	        // Overflow
	        power = sizeof(size_t) * 8;
	        rounded_bytes = max_size;
	        return;
	    }

	    while (rounded_bytes < value)
	    {
	        // The wrap test above does not catch every case: check that the next
	        // multiply is representable (and can make progress) before doing it,
	        // otherwise rounded_bytes would wrap and the loop might never terminate.
	        if ((base < 2) || (rounded_bytes > max_size / base))
	        {
	            power = sizeof(size_t) * 8;
	            rounded_bytes = max_size;
	            return;
	        }

	        rounded_bytes *= base;
	        power++;
	    }
	}


	//---------------------------------------------------------------------
	// Fields
	//---------------------------------------------------------------------

	cub::Mutex mutex; /// Mutex for thread-safety

	unsigned int bin_growth; /// Geometric growth factor for bin-sizes
	unsigned int min_bin; /// Minimum bin enumeration
	unsigned int max_bin; /// Maximum bin enumeration

	size_t min_bin_bytes; /// Minimum bin size
	size_t max_bin_bytes; /// Maximum bin size
	size_t max_cached_bytes; /// Maximum aggregate cached bytes per device

	const bool skip_cleanup; /// Whether or not to skip a call to FreeAllCached() when destructor is called. (The CUDA runtime may have already shut down for statically declared allocators)
	bool debug; /// Whether or not to print (de)allocation events to stdout

	GpuCachedBytes cached_bytes; /// Map of device ordinal to aggregate cached bytes on that device
	CachedBlocks cached_blocks; /// Set of cached device allocations available for reuse
	BusyBlocks live_blocks; /// Set of live device allocations currently in use

	#endif // DOXYGEN_SHOULD_SKIP_THIS

	//---------------------------------------------------------------------
	// Methods
	//---------------------------------------------------------------------

	/**
	* \brief Constructor.
	*
	* Stores the bin configuration, precomputes the minimum and maximum bin sizes
	* with IntPow(), and creates the cached-block set ordered by
	* BlockDescriptor::SizeCompare and the live-block set ordered by
	* BlockDescriptor::PtrCompare.
	*
	* NOTE(review): IntPow() works in unsigned int, so with the default
	* \p max_bin (INVALID_BIN) the value stored in max_bin_bytes is a wrapped
	* result and should not be read as a real byte count.
	*/
	CachingDeviceAllocator(
	unsigned int bin_growth, ///< Geometric growth factor for bin-sizes
	unsigned int min_bin = 1, ///< Minimum bin (default is bin_growth ^ 1)
	unsigned int max_bin = INVALID_BIN, ///< Maximum bin (default is no max bin)
	size_t max_cached_bytes = INVALID_SIZE, ///< Maximum aggregate cached bytes per device (default is no limit)
	bool skip_cleanup = false, ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate)
	bool debug = false) ///< Whether or not to print (de)allocation events to stdout (default is no output)
	:
	bin_growth(bin_growth),
	min_bin(min_bin),
	max_bin(max_bin),
	min_bin_bytes(IntPow(bin_growth, min_bin)),
	max_bin_bytes(IntPow(bin_growth, max_bin)),
	max_cached_bytes(max_cached_bytes),
	skip_cleanup(skip_cleanup),
	debug(debug),
	cached_blocks(BlockDescriptor::SizeCompare),
	live_blocks(BlockDescriptor::PtrCompare)
	{}


	/**
	* \brief Default constructor.
	*
	* Configured with:
	* \par
	* - \p bin_growth = 8
	* - \p min_bin = 3
	* - \p max_bin = 7
	* - \p max_cached_bytes = ((\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes
	*
	* which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and
	* sets a maximum of 6,291,455 cached bytes per device
	*/
	CachingDeviceAllocator(
	bool skip_cleanup = false, ///< Whether or not to skip the call to \p FreeAllCached() in the destructor
	bool debug = false) ///< Whether or not to print (de)allocation events
	:
	bin_growth(8),
	min_bin(3),
	max_bin(7),
	// The initializers below read the members set just above (no parameters shadow them here)
	min_bin_bytes(IntPow(bin_growth, min_bin)),
	max_bin_bytes(IntPow(bin_growth, max_bin)),
	max_cached_bytes((max_bin_bytes * 3) - 1),
	skip_cleanup(skip_cleanup),
	debug(debug),
	cached_blocks(BlockDescriptor::SizeCompare),
	live_blocks(BlockDescriptor::PtrCompare)
	{}


	/**
	 * \brief Sets the limit on the number of bytes this allocator is allowed to cache per device.
	 *
	 * Only the ceiling is updated (under the allocator mutex); no in-use or
	 * cached-in-reserve allocation is freed by this call. See \p FreeAllCached().
	 * Always returns cudaSuccess.
	 */
	cudaError_t SetMaxCachedBytes(
	    size_t max_cached_bytes)
	{
	    mutex.Lock();

	    if (debug)
	    {
	        _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes);
	    }

	    // The parameter shadows the member, hence the explicit this->
	    this->max_cached_bytes = max_cached_bytes;

	    mutex.Unlock();

	    return cudaSuccess;
	}


	/**
	 * \brief Provides a suitable allocation of device memory for the given size on the specified device.
	 *
	 * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
	 * with which it was associated during allocation, and it becomes available for reuse within other
	 * streams when all prior work submitted to \p active_stream has completed.
	 *
	 * The request is rounded up to its bin size and served from the cache when a
	 * block of that bin on \p device is reusable; otherwise a new block is obtained
	 * with cudaMalloc. If cudaMalloc reports cudaErrorMemoryAllocation, all cached
	 * blocks on \p device are freed and the allocation is retried once.
	 *
	 * Passing INVALID_DEVICE_ORDINAL as \p device selects the current device.
	 *
	 * \return cudaSuccess, or the first CUDA error encountered. On an error return
	 * taken after the current device has been switched to \p device, the current
	 * device is not switched back.
	 */
	cudaError_t DeviceAllocate(
	    int             device,             ///< [in] Device on which to place the allocation
	    void            **d_ptr,            ///< [out] Reference to pointer to the allocation
	    size_t          bytes,              ///< [in] Minimum number of bytes for the allocation
	    cudaStream_t    active_stream = 0)  ///< [in] The stream to be associated with this allocation
	{
	    *d_ptr                  = NULL;
	    int entrypoint_device   = INVALID_DEVICE_ORDINAL;
	    cudaError_t error       = cudaSuccess;

	    if (device == INVALID_DEVICE_ORDINAL)
	    {
	        if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
	        device = entrypoint_device;
	    }

	    // Create a block descriptor for the requested allocation
	    bool found = false;
	    BlockDescriptor search_key(device);
	    search_key.associated_stream = active_stream;
	    NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);

	    if (search_key.bin > max_bin)
	    {
	        // Bin is greater than our maximum bin: allocate the request
	        // exactly and give out-of-bounds bin. It will not be cached
	        // for reuse when returned.
	        search_key.bin      = INVALID_BIN;
	        search_key.bytes    = bytes;
	    }
	    else
	    {
	        // Search for a suitable cached allocation: lock
	        mutex.Lock();

	        if (search_key.bin < min_bin)
	        {
	            // Bin is less than minimum bin: round up
	            search_key.bin      = min_bin;
	            search_key.bytes    = min_bin_bytes;
	        }

	        // Iterate through the range of cached blocks on the same device in the same bin
	        CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
	        while ((block_itr != cached_blocks.end())
	                && (block_itr->device == device)
	                && (block_itr->bin == search_key.bin))
	        {
	            // To prevent races with reusing blocks returned by the host but still
	            // in use by the device, only consider cached blocks that are
	            // either (from the active stream) or (from an idle stream)
	            bool is_reusable = (active_stream == block_itr->associated_stream);
	            if (!is_reusable)
	            {
	                // Only query the event when the streams differ. Any status other than
	                // "not ready" (including an error status) makes the block eligible.
	                const cudaError_t event_status = cudaEventQuery(block_itr->ready_event);
	                if (event_status != cudaErrorNotReady)
	                {
	                    // Report the real query status (a no-op for cudaSuccess)
	                    CubDebug(event_status);
	                    is_reusable = true;
	                }
	            }

	            if (is_reusable)
	            {
	                // Reuse existing cache block.  Insert into live blocks.
	                found = true;
	                search_key = *block_itr;
	                search_key.associated_stream = active_stream;
	                live_blocks.insert(search_key);

	                // Remove from free blocks
	                cached_bytes[device].free -= search_key.bytes;
	                cached_bytes[device].live += search_key.bytes;

	                if (debug) _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n",
	                    device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) block_itr->associated_stream);

	                cached_blocks.erase(block_itr);

	                break;
	            }
	            block_itr++;
	        }

	        // Done searching: unlock
	        mutex.Unlock();
	    }

	    // Allocate the block if necessary
	    if (!found)
	    {
	        // Set runtime's current device to specified device (entrypoint may not be set)
	        if (device != entrypoint_device)
	        {
	            if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
	            if (CubDebug(error = cudaSetDevice(device))) return error;
	        }

	        // Attempt to allocate
	        error = CubDebug(cudaMalloc(&search_key.d_ptr, search_key.bytes));
	        if (error == cudaErrorMemoryAllocation)
	        {
	            // The allocation attempt failed: free all cached blocks on device and retry
	            if (debug) _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations",
	                  device, (long long) search_key.bytes, (long long) search_key.associated_stream);

	            error = cudaSuccess;    // Reset the error we will return
	            cudaGetLastError();     // Reset CUDART's error

	            // Lock
	            mutex.Lock();

	            // Iterate the range of free blocks on the same device
	            BlockDescriptor free_key(device);
	            CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key);

	            while ((block_itr != cached_blocks.end()) && (block_itr->device == device))
	            {
	                // No need to worry about synchronization with the device: cudaFree is
	                // blocking and will synchronize across all kernels executing
	                // on the current device

	                // Free device memory and destroy stream event.
	                if (CubDebug(error = cudaFree(block_itr->d_ptr))) break;
	                if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) break;

	                // Reduce balance and erase entry
	                cached_bytes[device].free -= block_itr->bytes;

	                if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
	                    device, (long long) block_itr->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);

	                // Advance before erasing: erase() invalidates the erased iterator,
	                // so it must not be incremented afterwards.
	                cached_blocks.erase(block_itr++);
	            }

	            // Unlock
	            mutex.Unlock();

	            // Return under error
	            if (error) return error;

	            // Try to allocate again
	            if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) return error;
	        }
	        else if (error != cudaSuccess)
	        {
	            // Any other allocation failure: report it now instead of letting the
	            // event-creation result below overwrite it.
	            return error;
	        }

	        // Create ready event
	        if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
	            return error;

	        // Insert into live blocks
	        mutex.Lock();
	        live_blocks.insert(search_key);
	        cached_bytes[device].live += search_key.bytes;
	        mutex.Unlock();

	        if (debug) _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n",
	                  device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream);

	        // Attempt to revert back to previous device if necessary
	        if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
	        {
	            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
	        }
	    }

	    // Copy device pointer to output parameter
	    *d_ptr = search_key.d_ptr;

	    if (debug) _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
	        (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);

	    return error;
	}


	/**
	 * \brief Provides a suitable allocation of device memory for the given size on the current device.
	 *
	 * Convenience overload: forwards to the device-specific DeviceAllocate() with
	 * INVALID_DEVICE_ORDINAL as the device, which that overload resolves to the
	 * current device.
	 *
	 * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
	 * with which it was associated during allocation, and it becomes available for reuse within other
	 * streams when all prior work submitted to \p active_stream has completed.
	 */
	cudaError_t DeviceAllocate(
	    void            **d_ptr,            ///< [out] Reference to pointer to the allocation
	    size_t          bytes,              ///< [in] Minimum number of bytes for the allocation
	    cudaStream_t    active_stream = 0)  ///< [in] The stream to be associated with this allocation
	{
	    const cudaError_t status = DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream);
	    return status;
	}


	/**
	 * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator.
	 *
	 * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
	 * with which it was associated during allocation, and it becomes available for reuse within other
	 * streams when all prior work submitted to \p active_stream has completed.
	 *
	 * A block is returned to the cache when its bin is valid and caching it would
	 * not push the device's cached total above max_cached_bytes; otherwise it is
	 * released with cudaFree and its ready event is destroyed.
	 *
	 * Passing INVALID_DEVICE_ORDINAL as \p device selects the current device.
	 *
	 * \return cudaSuccess, or the first CUDA error encountered. The allocator mutex
	 * is released on every return path.
	 */
	cudaError_t DeviceFree(
	    int             device,     ///< [in] Device on which the allocation lives
	    void*           d_ptr)      ///< [in] Device pointer previously handed out by this allocator
	{
	    int entrypoint_device   = INVALID_DEVICE_ORDINAL;
	    cudaError_t error       = cudaSuccess;

	    if (device == INVALID_DEVICE_ORDINAL)
	    {
	        if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
	            return error;
	        device = entrypoint_device;
	    }

	    // Lock
	    mutex.Lock();

	    // Find corresponding block descriptor
	    bool recached = false;
	    BlockDescriptor search_key(d_ptr, device);
	    BusyBlocks::iterator block_itr = live_blocks.find(search_key);
	    if (block_itr != live_blocks.end())
	    {
	        // Remove from live blocks
	        search_key = *block_itr;
	        live_blocks.erase(block_itr);
	        cached_bytes[device].live -= search_key.bytes;

	        // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold
	        if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes))
	        {
	            // Insert returned allocation into free blocks
	            recached = true;
	            cached_blocks.insert(search_key);
	            cached_bytes[device].free += search_key.bytes;

	            if (debug) _CubLog("\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
	                device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(),
	                (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
	        }
	    }

	    // First set to specified device (entrypoint may not be set)
	    if (device != entrypoint_device)
	    {
	        error = CubDebug(cudaGetDevice(&entrypoint_device));
	        if (error == cudaSuccess)
	            error = CubDebug(cudaSetDevice(device));
	    }

	    if ((error == cudaSuccess) && recached)
	    {
	        // Insert the ready event in the associated stream (must have current device set properly).
	        // Done while still holding the lock so the block cannot be picked up from the
	        // cache before its event has been recorded.
	        error = CubDebug(cudaEventRecord(search_key.ready_event, search_key.associated_stream));
	    }

	    // Unlock before any error return: returning with the mutex held would
	    // deadlock the next call into the allocator.
	    mutex.Unlock();

	    if (error != cudaSuccess) return error;

	    if (!recached)
	    {
	        // Free the allocation from the runtime and cleanup the event.
	        // NOTE(review): when d_ptr was not found in live_blocks, ready_event is still
	        // the null handle from the search key -- confirm how cudaEventDestroy treats it.
	        if (CubDebug(error = cudaFree(d_ptr))) return error;
	        if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error;

	        if (debug) _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
	            device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
	    }

	    // Reset device
	    if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
	    {
	        if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
	    }

	    return error;
	}


	/**
	 * \brief Frees a live allocation of device memory on the current device, returning it to the allocator.
	 *
	 * Convenience overload: forwards to the device-specific DeviceFree() with
	 * INVALID_DEVICE_ORDINAL as the device, which that overload resolves to the
	 * current device.
	 *
	 * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
	 * with which it was associated during allocation, and it becomes available for reuse within other
	 * streams when all prior work submitted to \p active_stream has completed.
	 */
	cudaError_t DeviceFree(
	    void*           d_ptr)      ///< [in] Device pointer previously handed out by this allocator
	{
	    const cudaError_t status = DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr);
	    return status;
	}


	/**
	 * \brief Frees all cached device allocations on all devices
	 *
	 * Walks the cached-block set from the front, switching the current device as
	 * needed, and for each block frees the device memory, destroys its ready
	 * event, and removes it from the cache. Stops at the first CUDA error. The
	 * device that was current on entry is restored afterwards.
	 *
	 * \return cudaSuccess, or the first CUDA error encountered. An error raised
	 * while freeing is still reported when the final device restore succeeds.
	 */
	cudaError_t FreeAllCached()
	{
	    cudaError_t error       = cudaSuccess;
	    int entrypoint_device   = INVALID_DEVICE_ORDINAL;
	    int current_device      = INVALID_DEVICE_ORDINAL;

	    mutex.Lock();

	    while (!cached_blocks.empty())
	    {
	        // Get first block
	        CachedBlocks::iterator begin = cached_blocks.begin();

	        // Get entry-point device ordinal if necessary
	        if (entrypoint_device == INVALID_DEVICE_ORDINAL)
	        {
	            if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
	        }

	        // Set current device ordinal if necessary
	        if (begin->device != current_device)
	        {
	            if (CubDebug(error = cudaSetDevice(begin->device))) break;
	            current_device = begin->device;
	        }

	        // Free device memory
	        if (CubDebug(error = cudaFree(begin->d_ptr))) break;
	        if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break;

	        // Reduce balance and erase entry
	        cached_bytes[current_device].free -= begin->bytes;

	        if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
	            current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].live);

	        cached_blocks.erase(begin);
	    }

	    mutex.Unlock();

	    // Attempt to revert back to entry-point device if necessary
	    if (entrypoint_device != INVALID_DEVICE_ORDINAL)
	    {
	        // Keep the restore status separate so that a successful restore cannot
	        // overwrite (and thereby hide) an error from the loop above.
	        const cudaError_t restore_error = CubDebug(cudaSetDevice(entrypoint_device));
	        if (error == cudaSuccess)
	            error = restore_error;
	    }

	    return error;
	}


	/**
	 * \brief Destructor
	 *
	 * Releases every cached allocation via FreeAllCached() unless the allocator
	 * was constructed with \p skip_cleanup set. Any error from FreeAllCached()
	 * is ignored.
	 */
	virtual ~CachingDeviceAllocator()
	{
	    if (skip_cleanup)
	        return;

	    FreeAllCached();
	}

	};




	/** @} */ // end group UtilMgmt

	} // CUB namespace
	CUB_NS_POSTFIX // Optional outer namespace(s)