LIVE / thrust /cub /util_allocator.cuh
Xu Ma
update
1c3c0d9
raw
history blame
28.7 kB
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
* Simple caching allocator for device memory allocations. The allocator is
* thread-safe and capable of managing device allocations on multiple devices.
******************************************************************************/
#pragma once
#include "util_namespace.cuh"
#include "util_debug.cuh"
#include <set>
#include <map>
#include "host/mutex.cuh"
#include <math.h>
/// Optional outer namespace(s)
CUB_NS_PREFIX
/// CUB namespace
namespace cub {
/**
* \addtogroup UtilMgmt
* @{
*/
/******************************************************************************
* CachingDeviceAllocator (host use)
******************************************************************************/
/**
* \brief A simple caching allocator for device memory allocations.
*
* \par Overview
* The allocator is thread-safe and stream-safe and is capable of managing cached
* device allocations on multiple devices. It behaves as follows:
*
* \par
* - Allocations from the allocator are associated with an \p active_stream. Once freed,
* the allocation becomes available immediately for reuse within the \p active_stream
* with which it was associated with during allocation, and it becomes available for
* reuse within other streams when all prior work submitted to \p active_stream has completed.
* - Allocations are categorized and cached by bin size. A new allocation request of
* a given size will only consider cached allocations within the corresponding bin.
* - Bin limits progress geometrically in accordance with the growth factor
* \p bin_growth provided during construction. Unused device allocations within
* a larger bin cache are not reused for allocation requests that categorize to
* smaller bin sizes.
* - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to
* (\p bin_growth ^ \p min_bin).
* - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest
* bin and are simply freed when they are deallocated instead of being returned
* to a bin-cache.
* - %If the total storage of cached allocations on a given device will exceed
* \p max_cached_bytes, allocations for that device are simply freed when they are
* deallocated instead of being returned to their bin-cache.
*
* \par
* For example, the default-constructed CachingDeviceAllocator is configured with:
* - \p bin_growth = 8
* - \p min_bin = 3
* - \p max_bin = 7
* - \p max_cached_bytes = 6MB - 1B
*
* \par
* which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB
* and sets a maximum of 6,291,455 cached bytes per device
*
*/
struct CachingDeviceAllocator
{
//---------------------------------------------------------------------
// Constants
//---------------------------------------------------------------------
/// Out-of-bounds bin
static const unsigned int INVALID_BIN = (unsigned int) -1;
/// Invalid size
static const size_t INVALID_SIZE = (size_t) -1;
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/// Invalid device ordinal
static const int INVALID_DEVICE_ORDINAL = -1;
//---------------------------------------------------------------------
// Type definitions and helper types
//---------------------------------------------------------------------
/**
* Descriptor for device memory allocations
*/
struct BlockDescriptor
{
void* d_ptr; // Device pointer
size_t bytes; // Size of allocation in bytes
unsigned int bin; // Bin enumeration
int device; // device ordinal
cudaStream_t associated_stream; // Associated associated_stream
cudaEvent_t ready_event; // Signal when associated stream has run to the point at which this block was freed
// Constructor (suitable for searching maps for a specific block, given its pointer and device)
BlockDescriptor(void *d_ptr, int device) :
d_ptr(d_ptr),
bytes(0),
bin(INVALID_BIN),
device(device),
associated_stream(0),
ready_event(0)
{}
// Constructor (suitable for searching maps for a range of suitable blocks, given a device)
BlockDescriptor(int device) :
d_ptr(NULL),
bytes(0),
bin(INVALID_BIN),
device(device),
associated_stream(0),
ready_event(0)
{}
// Comparison functor for comparing device pointers
static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
{
if (a.device == b.device)
return (a.d_ptr < b.d_ptr);
else
return (a.device < b.device);
}
// Comparison functor for comparing allocation sizes
static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
{
if (a.device == b.device)
return (a.bytes < b.bytes);
else
return (a.device < b.device);
}
};
/// BlockDescriptor comparator function interface
typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
class TotalBytes {
public:
size_t free;
size_t live;
TotalBytes() { free = live = 0; }
};
/// Set type for cached blocks (ordered by size)
typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;
/// Set type for live blocks (ordered by ptr)
typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;
/// Map type of device ordinals to the number of cached bytes cached by each device
typedef std::map<int, TotalBytes> GpuCachedBytes;
//---------------------------------------------------------------------
// Utility functions
//---------------------------------------------------------------------
/**
* Integer pow function for unsigned base and exponent
*/
static unsigned int IntPow(
unsigned int base,
unsigned int exp)
{
unsigned int retval = 1;
while (exp > 0)
{
if (exp & 1) {
retval = retval * base; // multiply the result by the current base
}
base = base * base; // square the base
exp = exp >> 1; // divide the exponent in half
}
return retval;
}
/**
* Round up to the nearest power-of
*/
void NearestPowerOf(
unsigned int &power,
size_t &rounded_bytes,
unsigned int base,
size_t value)
{
power = 0;
rounded_bytes = 1;
if (value * base < value)
{
// Overflow
power = sizeof(size_t) * 8;
rounded_bytes = size_t(0) - 1;
return;
}
while (rounded_bytes < value)
{
rounded_bytes *= base;
power++;
}
}
//---------------------------------------------------------------------
// Fields
//---------------------------------------------------------------------
cub::Mutex mutex; /// Mutex for thread-safety
unsigned int bin_growth; /// Geometric growth factor for bin-sizes
unsigned int min_bin; /// Minimum bin enumeration
unsigned int max_bin; /// Maximum bin enumeration
size_t min_bin_bytes; /// Minimum bin size
size_t max_bin_bytes; /// Maximum bin size
size_t max_cached_bytes; /// Maximum aggregate cached bytes per device
const bool skip_cleanup; /// Whether or not to skip a call to FreeAllCached() when destructor is called. (The CUDA runtime may have already shut down for statically declared allocators)
bool debug; /// Whether or not to print (de)allocation events to stdout
GpuCachedBytes cached_bytes; /// Map of device ordinal to aggregate cached bytes on that device
CachedBlocks cached_blocks; /// Set of cached device allocations available for reuse
BusyBlocks live_blocks; /// Set of live device allocations currently in use
#endif // DOXYGEN_SHOULD_SKIP_THIS
//---------------------------------------------------------------------
// Methods
//---------------------------------------------------------------------
/**
* \brief Constructor.
*/
CachingDeviceAllocator(
unsigned int bin_growth, ///< Geometric growth factor for bin-sizes
unsigned int min_bin = 1, ///< Minimum bin (default is bin_growth ^ 1)
unsigned int max_bin = INVALID_BIN, ///< Maximum bin (default is no max bin)
size_t max_cached_bytes = INVALID_SIZE, ///< Maximum aggregate cached bytes per device (default is no limit)
bool skip_cleanup = false, ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate)
bool debug = false) ///< Whether or not to print (de)allocation events to stdout (default is no stderr output)
:
bin_growth(bin_growth),
min_bin(min_bin),
max_bin(max_bin),
min_bin_bytes(IntPow(bin_growth, min_bin)),
max_bin_bytes(IntPow(bin_growth, max_bin)),
max_cached_bytes(max_cached_bytes),
skip_cleanup(skip_cleanup),
debug(debug),
cached_blocks(BlockDescriptor::SizeCompare),
live_blocks(BlockDescriptor::PtrCompare)
{}
/**
* \brief Default constructor.
*
* Configured with:
* \par
* - \p bin_growth = 8
* - \p min_bin = 3
* - \p max_bin = 7
* - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes
*
* which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and
* sets a maximum of 6,291,455 cached bytes per device
*/
CachingDeviceAllocator(
bool skip_cleanup = false,
bool debug = false)
:
bin_growth(8),
min_bin(3),
max_bin(7),
min_bin_bytes(IntPow(bin_growth, min_bin)),
max_bin_bytes(IntPow(bin_growth, max_bin)),
max_cached_bytes((max_bin_bytes * 3) - 1),
skip_cleanup(skip_cleanup),
debug(debug),
cached_blocks(BlockDescriptor::SizeCompare),
live_blocks(BlockDescriptor::PtrCompare)
{}
/**
* \brief Sets the limit on the number bytes this allocator is allowed to cache per device.
*
* Changing the ceiling of cached bytes does not cause any allocations (in-use or
* cached-in-reserve) to be freed. See \p FreeAllCached().
*/
cudaError_t SetMaxCachedBytes(
size_t max_cached_bytes)
{
// Lock
mutex.Lock();
if (debug) _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes);
this->max_cached_bytes = max_cached_bytes;
// Unlock
mutex.Unlock();
return cudaSuccess;
}
/**
* \brief Provides a suitable allocation of device memory for the given size on the specified device.
*
* Once freed, the allocation becomes available immediately for reuse within the \p active_stream
* with which it was associated with during allocation, and it becomes available for reuse within other
* streams when all prior work submitted to \p active_stream has completed.
*/
cudaError_t DeviceAllocate(
int device, ///< [in] Device on which to place the allocation
void **d_ptr, ///< [out] Reference to pointer to the allocation
size_t bytes, ///< [in] Minimum number of bytes for the allocation
cudaStream_t active_stream = 0) ///< [in] The stream to be associated with this allocation
{
*d_ptr = NULL;
int entrypoint_device = INVALID_DEVICE_ORDINAL;
cudaError_t error = cudaSuccess;
if (device == INVALID_DEVICE_ORDINAL)
{
if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
device = entrypoint_device;
}
// Create a block descriptor for the requested allocation
bool found = false;
BlockDescriptor search_key(device);
search_key.associated_stream = active_stream;
NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);
if (search_key.bin > max_bin)
{
// Bin is greater than our maximum bin: allocate the request
// exactly and give out-of-bounds bin. It will not be cached
// for reuse when returned.
search_key.bin = INVALID_BIN;
search_key.bytes = bytes;
}
else
{
// Search for a suitable cached allocation: lock
mutex.Lock();
if (search_key.bin < min_bin)
{
// Bin is less than minimum bin: round up
search_key.bin = min_bin;
search_key.bytes = min_bin_bytes;
}
// Iterate through the range of cached blocks on the same device in the same bin
CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
while ((block_itr != cached_blocks.end())
&& (block_itr->device == device)
&& (block_itr->bin == search_key.bin))
{
// To prevent races with reusing blocks returned by the host but still
// in use by the device, only consider cached blocks that are
// either (from the active stream) or (from an idle stream)
if ((active_stream == block_itr->associated_stream) ||
(CubDebug(cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)))
{
// Reuse existing cache block. Insert into live blocks.
found = true;
search_key = *block_itr;
search_key.associated_stream = active_stream;
live_blocks.insert(search_key);
// Remove from free blocks
cached_bytes[device].free -= search_key.bytes;
cached_bytes[device].live += search_key.bytes;
if (debug) _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n",
device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) block_itr->associated_stream);
cached_blocks.erase(block_itr);
break;
}
block_itr++;
}
// Done searching: unlock
mutex.Unlock();
}
// Allocate the block if necessary
if (!found)
{
// Set runtime's current device to specified device (entrypoint may not be set)
if (device != entrypoint_device)
{
if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
if (CubDebug(error = cudaSetDevice(device))) return error;
}
// Attempt to allocate
if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation)
{
// The allocation attempt failed: free all cached blocks on device and retry
if (debug) _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations",
device, (long long) search_key.bytes, (long long) search_key.associated_stream);
error = cudaSuccess; // Reset the error we will return
cudaGetLastError(); // Reset CUDART's error
// Lock
mutex.Lock();
// Iterate the range of free blocks on the same device
BlockDescriptor free_key(device);
CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key);
while ((block_itr != cached_blocks.end()) && (block_itr->device == device))
{
// No need to worry about synchronization with the device: cudaFree is
// blocking and will synchronize across all kernels executing
// on the current device
// Free device memory and destroy stream event.
if (CubDebug(error = cudaFree(block_itr->d_ptr))) break;
if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) break;
// Reduce balance and erase entry
cached_bytes[device].free -= block_itr->bytes;
if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
device, (long long) block_itr->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
cached_blocks.erase(block_itr);
block_itr++;
}
// Unlock
mutex.Unlock();
// Return under error
if (error) return error;
// Try to allocate again
if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) return error;
}
// Create ready event
if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
return error;
// Insert into live blocks
mutex.Lock();
live_blocks.insert(search_key);
cached_bytes[device].live += search_key.bytes;
mutex.Unlock();
if (debug) _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n",
device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream);
// Attempt to revert back to previous device if necessary
if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
{
if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
}
}
// Copy device pointer to output parameter
*d_ptr = search_key.d_ptr;
if (debug) _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
(long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
return error;
}
/**
* \brief Provides a suitable allocation of device memory for the given size on the current device.
*
* Once freed, the allocation becomes available immediately for reuse within the \p active_stream
* with which it was associated with during allocation, and it becomes available for reuse within other
* streams when all prior work submitted to \p active_stream has completed.
*/
cudaError_t DeviceAllocate(
void **d_ptr, ///< [out] Reference to pointer to the allocation
size_t bytes, ///< [in] Minimum number of bytes for the allocation
cudaStream_t active_stream = 0) ///< [in] The stream to be associated with this allocation
{
return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream);
}
/**
* \brief Frees a live allocation of device memory on the specified device, returning it to the allocator.
*
* Once freed, the allocation becomes available immediately for reuse within the \p active_stream
* with which it was associated with during allocation, and it becomes available for reuse within other
* streams when all prior work submitted to \p active_stream has completed.
*/
cudaError_t DeviceFree(
int device,
void* d_ptr)
{
int entrypoint_device = INVALID_DEVICE_ORDINAL;
cudaError_t error = cudaSuccess;
if (device == INVALID_DEVICE_ORDINAL)
{
if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
return error;
device = entrypoint_device;
}
// Lock
mutex.Lock();
// Find corresponding block descriptor
bool recached = false;
BlockDescriptor search_key(d_ptr, device);
BusyBlocks::iterator block_itr = live_blocks.find(search_key);
if (block_itr != live_blocks.end())
{
// Remove from live blocks
search_key = *block_itr;
live_blocks.erase(block_itr);
cached_bytes[device].live -= search_key.bytes;
// Keep the returned allocation if bin is valid and we won't exceed the max cached threshold
if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes))
{
// Insert returned allocation into free blocks
recached = true;
cached_blocks.insert(search_key);
cached_bytes[device].free += search_key.bytes;
if (debug) _CubLog("\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(),
(long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
}
}
// First set to specified device (entrypoint may not be set)
if (device != entrypoint_device)
{
if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
if (CubDebug(error = cudaSetDevice(device))) return error;
}
if (recached)
{
// Insert the ready event in the associated stream (must have current device set properly)
if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) return error;
}
// Unlock
mutex.Unlock();
if (!recached)
{
// Free the allocation from the runtime and cleanup the event.
if (CubDebug(error = cudaFree(d_ptr))) return error;
if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error;
if (debug) _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
}
// Reset device
if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
{
if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
}
return error;
}
/**
* \brief Frees a live allocation of device memory on the current device, returning it to the allocator.
*
* Once freed, the allocation becomes available immediately for reuse within the \p active_stream
* with which it was associated with during allocation, and it becomes available for reuse within other
* streams when all prior work submitted to \p active_stream has completed.
*/
cudaError_t DeviceFree(
void* d_ptr)
{
return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr);
}
/**
* \brief Frees all cached device allocations on all devices
*/
cudaError_t FreeAllCached()
{
cudaError_t error = cudaSuccess;
int entrypoint_device = INVALID_DEVICE_ORDINAL;
int current_device = INVALID_DEVICE_ORDINAL;
mutex.Lock();
while (!cached_blocks.empty())
{
// Get first block
CachedBlocks::iterator begin = cached_blocks.begin();
// Get entry-point device ordinal if necessary
if (entrypoint_device == INVALID_DEVICE_ORDINAL)
{
if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
}
// Set current device ordinal if necessary
if (begin->device != current_device)
{
if (CubDebug(error = cudaSetDevice(begin->device))) break;
current_device = begin->device;
}
// Free device memory
if (CubDebug(error = cudaFree(begin->d_ptr))) break;
if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break;
// Reduce balance and erase entry
cached_bytes[current_device].free -= begin->bytes;
if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].live);
cached_blocks.erase(begin);
}
mutex.Unlock();
// Attempt to revert back to entry-point device if necessary
if (entrypoint_device != INVALID_DEVICE_ORDINAL)
{
if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
}
return error;
}
/**
* \brief Destructor
*/
virtual ~CachingDeviceAllocator()
{
if (!skip_cleanup)
FreeAllCached();
}
};
/** @} */ // end group UtilMgmt
} // CUB namespace
CUB_NS_POSTFIX // Optional outer namespace(s)