/******************************************************************************
 * Copyright (c) 2011, Duane Merrill. All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/
/******************************************************************************
 * Simple caching allocator for device memory allocations. The allocator is
 * thread-safe and capable of managing device allocations on multiple devices.
 ******************************************************************************/

#pragma once

#include "util_namespace.cuh"
#include "util_debug.cuh"

#include <set>
#include <map>

#include "host/mutex.cuh"
#include <math.h>

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/**
 * \addtogroup UtilMgmt
 * @{
 */


/******************************************************************************
 * CachingDeviceAllocator (host use)
 ******************************************************************************/
/**
 * \brief A simple caching allocator for device memory allocations.
 *
 * \par Overview
 * The allocator is thread-safe and stream-safe and is capable of managing cached
 * device allocations on multiple devices. It behaves as follows:
 *
 * \par
 * - Allocations from the allocator are associated with an \p active_stream. Once freed,
 *   the allocation becomes available immediately for reuse within the \p active_stream
 *   with which it was associated during allocation, and it becomes available for
 *   reuse within other streams when all prior work submitted to \p active_stream has completed.
 * - Allocations are categorized and cached by bin size. A new allocation request of
 *   a given size will only consider cached allocations within the corresponding bin.
 * - Bin limits progress geometrically in accordance with the growth factor
 *   \p bin_growth provided during construction. Unused device allocations within
 *   a larger bin cache are not reused for allocation requests that categorize to
 *   smaller bin sizes.
 * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to
 *   (\p bin_growth ^ \p min_bin).
 * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest
 *   bin and are simply freed when they are deallocated instead of being returned
 *   to a bin-cache.
 * - %If the total storage of cached allocations on a given device will exceed
 *   \p max_cached_bytes, allocations for that device are simply freed when they are
 *   deallocated instead of being returned to their bin-cache.
 *
 * \par
 * For example, the default-constructed CachingDeviceAllocator is configured with:
 * - \p bin_growth = 8
 * - \p min_bin = 3
 * - \p max_bin = 7
 * - \p max_cached_bytes = 6MB - 1B
 *
 * \par
 * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB,
 * and sets a maximum of 6,291,455 cached bytes per device.
 *
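 * \par
 * A minimal usage sketch under this default configuration (the buffer size is
 * illustrative and error checking is omitted):
 * \code
 * cub::CachingDeviceAllocator allocator;   // bin_growth = 8, min_bin = 3, max_bin = 7
 *
 * // Request 2000 bytes on the current device; the request is rounded up
 * // to the 4KB bin (8^4 bytes)
 * void *d_scratch = NULL;
 * allocator.DeviceAllocate(&d_scratch, 2000);
 *
 * // ... launch kernels on the default stream that use d_scratch ...
 *
 * // Return the block to the allocator's 4KB bin for reuse
 * allocator.DeviceFree(d_scratch);
 * \endcode
 *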
 */
struct CachingDeviceAllocator
{

    //---------------------------------------------------------------------
    // Constants
    //---------------------------------------------------------------------

    /// Out-of-bounds bin
    static const unsigned int INVALID_BIN = (unsigned int) -1;

    /// Invalid size
    static const size_t INVALID_SIZE = (size_t) -1;

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

    /// Invalid device ordinal
    static const int INVALID_DEVICE_ORDINAL = -1;

    //---------------------------------------------------------------------
    // Type definitions and helper types
    //---------------------------------------------------------------------

    /**
     * Descriptor for device memory allocations
     */
    struct BlockDescriptor
    {
        void*           d_ptr;              // Device pointer
        size_t          bytes;              // Size of allocation in bytes
        unsigned int    bin;                // Bin enumeration
        int             device;             // Device ordinal
        cudaStream_t    associated_stream;  // Associated stream
        cudaEvent_t     ready_event;        // Signals when the associated stream has run to the point at which this block was freed

        // Constructor (suitable for searching maps for a specific block, given its pointer and device)
        BlockDescriptor(void *d_ptr, int device) :
            d_ptr(d_ptr),
            bytes(0),
            bin(INVALID_BIN),
            device(device),
            associated_stream(0),
            ready_event(0)
        {}

        // Constructor (suitable for searching maps for a range of suitable blocks, given a device)
        BlockDescriptor(int device) :
            d_ptr(NULL),
            bytes(0),
            bin(INVALID_BIN),
            device(device),
            associated_stream(0),
            ready_event(0)
        {}

        // Comparison functor for comparing device pointers
        static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
        {
            if (a.device == b.device)
                return (a.d_ptr < b.d_ptr);
            else
                return (a.device < b.device);
        }

        // Comparison functor for comparing allocation sizes
        static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
        {
            if (a.device == b.device)
                return (a.bytes < b.bytes);
            else
                return (a.device < b.device);
        }
    };

    /// BlockDescriptor comparator function interface
    typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);

    class TotalBytes
    {
    public:
        size_t free;
        size_t live;
        TotalBytes() { free = live = 0; }
    };

    /// Set type for cached blocks (ordered by size)
    typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;

    /// Set type for live blocks (ordered by ptr)
    typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;

    /// Map type of device ordinals to the number of bytes cached by each device
    typedef std::map<int, TotalBytes> GpuCachedBytes;

    //---------------------------------------------------------------------
    // Utility functions
    //---------------------------------------------------------------------

    /**
     * Integer pow function for unsigned base and exponent
     */
    static unsigned int IntPow(
        unsigned int base,
        unsigned int exp)
    {
        unsigned int retval = 1;
        while (exp > 0)
        {
            if (exp & 1) {
                retval = retval * base;     // multiply the result by the current base
            }
            base = base * base;             // square the base
            exp = exp >> 1;                 // divide the exponent in half
        }
        return retval;
    }
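
    // Example: IntPow uses exponentiation-by-squaring, so e.g. IntPow(8, 3) == 512,
    // which is the smallest bin of the default-constructed allocator.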

    /**
     * Round up to the nearest power of the growth factor \p base
     */
    void NearestPowerOf(
        unsigned int    &power,
        size_t          &rounded_bytes,
        unsigned int    base,
        size_t          value)
    {
        power = 0;
        rounded_bytes = 1;

        if (value * base < value)
        {
            // Overflow
            power = sizeof(size_t) * 8;
            rounded_bytes = size_t(0) - 1;
            return;
        }

        while (rounded_bytes < value)
        {
            rounded_bytes *= base;
            power++;
        }
    }
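
    // Example: NearestPowerOf(power, rounded_bytes, 8, 1100) yields power == 4
    // and rounded_bytes == 4096, i.e. requests are rounded up to the next
    // power of the growth factor.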

    //---------------------------------------------------------------------
    // Fields
    //---------------------------------------------------------------------

    cub::Mutex      mutex;              /// Mutex for thread-safety

    unsigned int    bin_growth;         /// Geometric growth factor for bin-sizes
    unsigned int    min_bin;            /// Minimum bin enumeration
    unsigned int    max_bin;            /// Maximum bin enumeration

    size_t          min_bin_bytes;      /// Minimum bin size
    size_t          max_bin_bytes;      /// Maximum bin size
    size_t          max_cached_bytes;   /// Maximum aggregate cached bytes per device

    const bool      skip_cleanup;       /// Whether or not to skip a call to FreeAllCached() when the destructor is called. (The CUDA runtime may have already shut down for statically declared allocators.)
    bool            debug;              /// Whether or not to print (de)allocation events to stdout

    GpuCachedBytes  cached_bytes;       /// Map of device ordinal to aggregate cached bytes on that device
    CachedBlocks    cached_blocks;      /// Set of cached device allocations available for reuse
    BusyBlocks      live_blocks;        /// Set of live device allocations currently in use

#endif // DOXYGEN_SHOULD_SKIP_THIS

    //---------------------------------------------------------------------
    // Methods
    //---------------------------------------------------------------------

    /**
     * \brief Constructor.
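     *
     * \par
     * For example, a configuration sketch (the parameter values are illustrative):
     * \code
     * // Four bins of 1KB, 4KB, 16KB, and 64KB (4^5 through 4^8 bytes),
     * // caching at most 16MB per device
     * cub::CachingDeviceAllocator allocator(4, 5, 8, 1024 * 1024 * 16);
     * \endcode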
     */
    CachingDeviceAllocator(
        unsigned int    bin_growth,                             ///< Geometric growth factor for bin-sizes
        unsigned int    min_bin             = 1,                ///< Minimum bin (default is bin_growth ^ 1)
        unsigned int    max_bin             = INVALID_BIN,      ///< Maximum bin (default is no max bin)
        size_t          max_cached_bytes    = INVALID_SIZE,     ///< Maximum aggregate cached bytes per device (default is no limit)
        bool            skip_cleanup        = false,            ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate)
        bool            debug               = false)            ///< Whether or not to print (de)allocation events to stdout (default is no output)
    :
        bin_growth(bin_growth),
        min_bin(min_bin),
        max_bin(max_bin),
        min_bin_bytes(IntPow(bin_growth, min_bin)),
        max_bin_bytes(IntPow(bin_growth, max_bin)),
        max_cached_bytes(max_cached_bytes),
        skip_cleanup(skip_cleanup),
        debug(debug),
        cached_blocks(BlockDescriptor::SizeCompare),
        live_blocks(BlockDescriptor::PtrCompare)
    {}

    /**
     * \brief Default constructor.
     *
     * Configured with:
     * \par
     * - \p bin_growth = 8
     * - \p min_bin = 3
     * - \p max_bin = 7
     * - \p max_cached_bytes = ((\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes
     *
     * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and
     * sets a maximum of 6,291,455 cached bytes per device
     */
    CachingDeviceAllocator(
        bool skip_cleanup = false,
        bool debug = false)
    :
        bin_growth(8),
        min_bin(3),
        max_bin(7),
        min_bin_bytes(IntPow(bin_growth, min_bin)),
        max_bin_bytes(IntPow(bin_growth, max_bin)),
        max_cached_bytes((max_bin_bytes * 3) - 1),
        skip_cleanup(skip_cleanup),
        debug(debug),
        cached_blocks(BlockDescriptor::SizeCompare),
        live_blocks(BlockDescriptor::PtrCompare)
    {}

    /**
     * \brief Sets the limit on the number of bytes this allocator is allowed to cache per device.
     *
     * Changing the ceiling of cached bytes does not cause any allocations (in-use or
     * cached-in-reserve) to be freed. See \p FreeAllCached().
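     *
     * \par
     * For example (the new ceiling is illustrative):
     * \code
     * allocator.SetMaxCachedBytes(32 * 1024 * 1024);  // allow up to 32MB of cached blocks per device
     * \endcode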
     */
    cudaError_t SetMaxCachedBytes(
        size_t max_cached_bytes)
    {
        // Lock
        mutex.Lock();

        if (debug) _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes);

        this->max_cached_bytes = max_cached_bytes;

        // Unlock
        mutex.Unlock();

        return cudaSuccess;
    }

    /**
     * \brief Provides a suitable allocation of device memory for the given size on the specified device.
     *
     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
     * with which it was associated during allocation, and it becomes available for reuse within other
     * streams when all prior work submitted to \p active_stream has completed.
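     *
     * \par
     * A sketch of stream-associated allocation (the device ordinal, size, and
     * stream are illustrative; \p allocator is an existing instance):
     * \code
     * cudaStream_t stream;
     * cudaStreamCreate(&stream);
     *
     * // With the default bins, a 100KB request on device 0 is rounded up
     * // to the 256KB bin and associated with stream
     * void *d_work = NULL;
     * allocator.DeviceAllocate(0, &d_work, 100 * 1024, stream);
     * \endcode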
     */
    cudaError_t DeviceAllocate(
        int             device,             ///< [in] Device on which to place the allocation
        void            **d_ptr,            ///< [out] Reference to pointer to the allocation
        size_t          bytes,              ///< [in] Minimum number of bytes for the allocation
        cudaStream_t    active_stream = 0)  ///< [in] The stream to be associated with this allocation
    {
        *d_ptr = NULL;
        int entrypoint_device = INVALID_DEVICE_ORDINAL;
        cudaError_t error = cudaSuccess;

        if (device == INVALID_DEVICE_ORDINAL)
        {
            if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
            device = entrypoint_device;
        }

        // Create a block descriptor for the requested allocation
        bool found = false;
        BlockDescriptor search_key(device);
        search_key.associated_stream = active_stream;
        NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);

        if (search_key.bin > max_bin)
        {
            // Bin is greater than our maximum bin: allocate the request
            // exactly and give out-of-bounds bin. It will not be cached
            // for reuse when returned.
            search_key.bin = INVALID_BIN;
            search_key.bytes = bytes;
        }
        else
        {
            // Search for a suitable cached allocation: lock
            mutex.Lock();

            if (search_key.bin < min_bin)
            {
                // Bin is less than minimum bin: round up
                search_key.bin = min_bin;
                search_key.bytes = min_bin_bytes;
            }

            // Iterate through the range of cached blocks on the same device in the same bin
            CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
            while ((block_itr != cached_blocks.end())
                && (block_itr->device == device)
                && (block_itr->bin == search_key.bin))
            {
                // To prevent races with reusing blocks returned by the host but still
                // in use by the device, only consider cached blocks that are
                // either (from the active stream) or (from an idle stream)
                if ((active_stream == block_itr->associated_stream) ||
                    (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady))
                {
                    // Reuse existing cache block. Insert into live blocks.
                    found = true;
                    search_key = *block_itr;
                    search_key.associated_stream = active_stream;
                    live_blocks.insert(search_key);

                    // Remove from free blocks
                    cached_bytes[device].free -= search_key.bytes;
                    cached_bytes[device].live += search_key.bytes;

                    if (debug) _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n",
                        device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) block_itr->associated_stream);

                    cached_blocks.erase(block_itr);

                    break;
                }
                block_itr++;
            }

            // Done searching: unlock
            mutex.Unlock();
        }

        // Allocate the block if necessary
        if (!found)
        {
            // Set runtime's current device to specified device (entrypoint may not be set)
            if (device != entrypoint_device)
            {
                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
                if (CubDebug(error = cudaSetDevice(device))) return error;
            }

            // Attempt to allocate
            if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation)
            {
                // The allocation attempt failed: free all cached blocks on device and retry
                if (debug) _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations",
                    device, (long long) search_key.bytes, (long long) search_key.associated_stream);

                error = cudaSuccess;    // Reset the error we will return
                cudaGetLastError();     // Reset CUDART's error

                // Lock
                mutex.Lock();

                // Iterate the range of free blocks on the same device
                BlockDescriptor free_key(device);
                CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key);

                while ((block_itr != cached_blocks.end()) && (block_itr->device == device))
                {
                    // No need to worry about synchronization with the device: cudaFree is
                    // blocking and will synchronize across all kernels executing
                    // on the current device

                    // Free device memory and destroy stream event.
                    if (CubDebug(error = cudaFree(block_itr->d_ptr))) break;
                    if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) break;

                    // Reduce balance and erase entry
                    cached_bytes[device].free -= block_itr->bytes;

                    if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
                        device, (long long) block_itr->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
                    // Erase the entry and advance: multiset::erase invalidates
                    // the erased iterator, so use the iterator it returns
                    block_itr = cached_blocks.erase(block_itr);
                }

                // Unlock
                mutex.Unlock();

                // Return under error
                if (error) return error;

                // Try to allocate again
                if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) return error;
            }

            // Create ready event
            if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
                return error;

            // Insert into live blocks
            mutex.Lock();
            live_blocks.insert(search_key);
            cached_bytes[device].live += search_key.bytes;
            mutex.Unlock();

            if (debug) _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n",
                device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream);

            // Attempt to revert back to previous device if necessary
            if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
            {
                if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
            }
        }

        // Copy device pointer to output parameter
        *d_ptr = search_key.d_ptr;

        if (debug) _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding (%lld bytes).\n",
            (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);

        return error;
    }

    /**
     * \brief Provides a suitable allocation of device memory for the given size on the current device.
     *
     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
     * with which it was associated during allocation, and it becomes available for reuse within other
     * streams when all prior work submitted to \p active_stream has completed.
     */
    cudaError_t DeviceAllocate(
        void            **d_ptr,            ///< [out] Reference to pointer to the allocation
        size_t          bytes,              ///< [in] Minimum number of bytes for the allocation
        cudaStream_t    active_stream = 0)  ///< [in] The stream to be associated with this allocation
    {
        return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream);
    }

    /**
     * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator.
     *
     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
     * with which it was associated during allocation, and it becomes available for reuse within other
     * streams when all prior work submitted to \p active_stream has completed.
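     *
     * \par
     * Continuing the allocation sketch above (names are illustrative):
     * \code
     * allocator.DeviceFree(0, d_work);
     *
     * // An equal-sized request on the same stream may immediately reuse the
     * // block just freed, without waiting for prior work on that stream
     * allocator.DeviceAllocate(0, &d_work, 100 * 1024, stream);
     * \endcode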
     */
    cudaError_t DeviceFree(
        int             device,
        void*           d_ptr)
    {
        int entrypoint_device = INVALID_DEVICE_ORDINAL;
        cudaError_t error = cudaSuccess;

        if (device == INVALID_DEVICE_ORDINAL)
        {
            if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
                return error;
            device = entrypoint_device;
        }

        // Lock
        mutex.Lock();

        // Find corresponding block descriptor
        bool recached = false;
        BlockDescriptor search_key(d_ptr, device);
        BusyBlocks::iterator block_itr = live_blocks.find(search_key);
        if (block_itr != live_blocks.end())
        {
            // Remove from live blocks
            search_key = *block_itr;
            live_blocks.erase(block_itr);
            cached_bytes[device].live -= search_key.bytes;

            // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold
            if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes))
            {
                // Insert returned allocation into free blocks
                recached = true;
                cached_blocks.insert(search_key);
                cached_bytes[device].free += search_key.bytes;

                if (debug) _CubLog("\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding (%lld bytes).\n",
                    device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(),
                    (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
            }
        }

        // First set the runtime to the specified device (entrypoint may not be set).
        // Unlock before any early return so the mutex is never held on error.
        if (device != entrypoint_device)
        {
            if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
            {
                mutex.Unlock();
                return error;
            }
            if (CubDebug(error = cudaSetDevice(device)))
            {
                mutex.Unlock();
                return error;
            }
        }

        if (recached)
        {
            // Insert the ready event in the associated stream (must have current device set properly)
            if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream)))
            {
                mutex.Unlock();
                return error;
            }
        }

        // Unlock
        mutex.Unlock();

        if (!recached)
        {
            // Free the allocation from the runtime and cleanup the event.
            if (CubDebug(error = cudaFree(d_ptr))) return error;
            if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error;

            if (debug) _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
                device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
        }

        // Reset device
        if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
        {
            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
        }

        return error;
    }

    /**
     * \brief Frees a live allocation of device memory on the current device, returning it to the allocator.
     *
     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
     * with which it was associated during allocation, and it becomes available for reuse within other
     * streams when all prior work submitted to \p active_stream has completed.
     */
    cudaError_t DeviceFree(
        void* d_ptr)
    {
        return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr);
    }

    /**
     * \brief Frees all cached device allocations on all devices
     */
    cudaError_t FreeAllCached()
    {
        cudaError_t error = cudaSuccess;
        int entrypoint_device = INVALID_DEVICE_ORDINAL;
        int current_device = INVALID_DEVICE_ORDINAL;

        mutex.Lock();

        while (!cached_blocks.empty())
        {
            // Get first block
            CachedBlocks::iterator begin = cached_blocks.begin();

            // Get entry-point device ordinal if necessary
            if (entrypoint_device == INVALID_DEVICE_ORDINAL)
            {
                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
            }

            // Set current device ordinal if necessary
            if (begin->device != current_device)
            {
                if (CubDebug(error = cudaSetDevice(begin->device))) break;
                current_device = begin->device;
            }

            // Free device memory
            if (CubDebug(error = cudaFree(begin->d_ptr))) break;
            if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break;

            // Reduce balance and erase entry
            cached_bytes[current_device].free -= begin->bytes;

            if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
                current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].live);

            cached_blocks.erase(begin);
        }

        mutex.Unlock();

        // Attempt to revert back to entry-point device if necessary
        if (entrypoint_device != INVALID_DEVICE_ORDINAL)
        {
            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
        }

        return error;
    }

    /**
     * \brief Destructor
     */
    virtual ~CachingDeviceAllocator()
    {
        if (!skip_cleanup)
            FreeAllCached();
    }

};


/** @} */  // end group UtilMgmt

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)