/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. */ #pragma once #include #include #include #include "dispatch/dispatch_histogram.cuh" #include "../config.cuh" /// Optional outer namespace(s) CUB_NS_PREFIX /// CUB namespace namespace cub { /** * \brief DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. ![](histogram_logo.png) * \ingroup SingleModule * * \par Overview * A histogram * counts the number of observations that fall into each of the disjoint categories (known as bins). * * \par Usage Considerations * \cdp_class{DeviceHistogram} * */ struct DeviceHistogram { /******************************************************************//** * \name Evenly-segmented bin ranges *********************************************************************/ //@{ /** * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins. * * \par * - The number of histogram bins is (\p num_levels - 1) * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1) * - \devicestorage * * \par Snippet * The code snippet below illustrates the computation of a six-bin histogram * from a sequence of float samples * * \par * \code * #include // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for input samples and * // output histogram * int num_samples; // e.g., 10 * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5] * int* d_histogram; // e.g., [ -, -, -, -, -, -, -, -] * int num_levels; // e.g., 7 (seven level boundaries for six bins) * float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin) * float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin) * ... * * // Determine temporary device storage requirements * void* d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, * d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Compute histograms * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, * d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples); * * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; * * \endcode * * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator * \tparam CounterT [inferred] Integer type for histogram bin counters * \tparam LevelT [inferred] Type for specifying boundaries (levels) * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 */ template < typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT> CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven( void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. LevelT lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin. LevelT upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin. OffsetT num_samples, ///< [in] The number of input samples (i.e., the length of \p d_samples) cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. { /// The sample value type of the input iterator typedef typename std::iterator_traits::value_type SampleT; CounterT* d_histogram1[1] = {d_histogram}; int num_levels1[1] = {num_levels}; LevelT lower_level1[1] = {lower_level}; LevelT upper_level1[1] = {upper_level}; return MultiHistogramEven<1, 1>( d_temp_storage, temp_storage_bytes, d_samples, d_histogram1, num_levels1, lower_level1, upper_level1, num_samples, 1, sizeof(SampleT) * num_samples, stream, debug_synchronous); } /** * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins. * * \par * - A two-dimensional region of interest within \p d_samples can be specified * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. * - The row stride must be a whole multiple of the sample data type * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. * - The number of histogram bins is (\p num_levels - 1) * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1) * - \devicestorage * * \par Snippet * The code snippet below illustrates the computation of a six-bin histogram * from a 2x5 region of interest within a flattened 2x7 array of float samples. * * \par * \code * #include // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for input samples and * // output histogram * int num_row_samples; // e.g., 5 * int num_rows; // e.g., 2; * size_t row_stride_bytes; // e.g., 7 * sizeof(float) * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, -, -, * // 0.3, 2.9, 2.0, 6.1, 999.5, -, -] * int* d_histogram; // e.g., [ -, -, -, -, -, -, -, -] * int num_levels; // e.g., 7 (seven level boundaries for six bins) * float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin) * float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin) * ... * * // Determine temporary device storage requirements * void* d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, * d_samples, d_histogram, num_levels, lower_level, upper_level, * num_row_samples, num_rows, row_stride_bytes); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Compute histograms * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, d_samples, d_histogram, * d_samples, d_histogram, num_levels, lower_level, upper_level, * num_row_samples, num_rows, row_stride_bytes); * * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; * * \endcode * * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator * \tparam CounterT [inferred] Integer type for histogram bin counters * \tparam LevelT [inferred] Type for specifying boundaries (levels) * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 */ template < typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT> CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven( void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. LevelT lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin. LevelT upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin. OffsetT num_row_samples, ///< [in] The number of data samples per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. { CounterT* d_histogram1[1] = {d_histogram}; int num_levels1[1] = {num_levels}; LevelT lower_level1[1] = {lower_level}; LevelT upper_level1[1] = {upper_level}; return MultiHistogramEven<1, 1>( d_temp_storage, temp_storage_bytes, d_samples, d_histogram1, num_levels1, lower_level1, upper_level1, num_row_samples, num_rows, row_stride_bytes, stream, debug_synchronous); } /** * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins. * * \par * - The input is a sequence of pixel structures, where each pixel comprises * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). * - Of the \p NUM_CHANNELS specified, the function will only compute histograms * for the first \p NUM_ACTIVE_CHANNELS (e.g., only RGB histograms from RGBA * pixel samples). * - The number of histogram bins for channeli is num_levels[i] - 1. * - For channeli, the range of values for all histogram bins * have the same width: (upper_level[i] - lower_level[i]) / ( num_levels[i] - 1) * - \devicestorage * * \par Snippet * The code snippet below illustrates the computation of three 256-bin RGB histograms * from a quad-channel sequence of RGBA pixels (8 bits per channel per pixel) * * \par * \code * #include // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for input samples * // and output histograms * int num_pixels; // e.g., 5 * unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), * // (0, 6, 7, 5), (3, 0, 2, 6)] * int* d_histogram[3]; // e.g., three device pointers to three device buffers, * // each allocated with 256 integer counters * int num_levels[3]; // e.g., {257, 257, 257}; * unsigned int lower_level[3]; // e.g., {0, 0, 0}; * unsigned int upper_level[3]; // e.g., {256, 256, 256}; * ... * * // Determine temporary device storage requirements * void* d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, * d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Compute histograms * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, * d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels); * * // d_histogram <-- [ [1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], * // [0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], * // [0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] * * \endcode * * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator * \tparam CounterT [inferred] Integer type for histogram bin counters * \tparam LevelT [inferred] Type for specifying boundaries (levels) * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 */ template < int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT> CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramEven( void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. OffsetT num_pixels, ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS) cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. { /// The sample value type of the input iterator typedef typename std::iterator_traits::value_type SampleT; return MultiHistogramEven( d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels, 1, sizeof(SampleT) * NUM_CHANNELS * num_pixels, stream, debug_synchronous); } /** * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins. * * \par * - The input is a sequence of pixel structures, where each pixel comprises * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). * - Of the \p NUM_CHANNELS specified, the function will only compute histograms * for the first \p NUM_ACTIVE_CHANNELS (e.g., only RGB histograms from RGBA * pixel samples). * - A two-dimensional region of interest within \p d_samples can be specified * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. * - The row stride must be a whole multiple of the sample data type * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. * - The number of histogram bins for channeli is num_levels[i] - 1. * - For channeli, the range of values for all histogram bins * have the same width: (upper_level[i] - lower_level[i]) / ( num_levels[i] - 1) * - \devicestorage * * \par Snippet * The code snippet below illustrates the computation of three 256-bin RGB histograms from a 2x3 region of * interest of within a flattened 2x4 array of quad-channel RGBA pixels (8 bits per channel per pixel). * * \par * \code * #include // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for input samples * // and output histograms * int num_row_pixels; // e.g., 3 * int num_rows; // e.g., 2 * size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS * unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), (-, -, -, -), * // (0, 6, 7, 5), (3, 0, 2, 6), (1, 1, 1, 1), (-, -, -, -)] * int* d_histogram[3]; // e.g., three device pointers to three device buffers, * // each allocated with 256 integer counters * int num_levels[3]; // e.g., {257, 257, 257}; * unsigned int lower_level[3]; // e.g., {0, 0, 0}; * unsigned int upper_level[3]; // e.g., {256, 256, 256}; * ... * * // Determine temporary device storage requirements * void* d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, * d_samples, d_histogram, num_levels, lower_level, upper_level, * num_row_pixels, num_rows, row_stride_bytes); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Compute histograms * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, * d_samples, d_histogram, num_levels, lower_level, upper_level, * num_row_pixels, num_rows, row_stride_bytes); * * // d_histogram <-- [ [1, 1, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], * // [0, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], * // [0, 1, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] * * \endcode * * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator * \tparam CounterT [inferred] Integer type for histogram bin counters * \tparam LevelT [inferred] Type for specifying boundaries (levels) * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 */ template < int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT> CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramEven( void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. { /// The sample value type of the input iterator typedef typename std::iterator_traits::value_type SampleT; Int2Type is_byte_sample; if ((sizeof(OffsetT) > sizeof(int)) && ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) INT_MAX)) { // Down-convert OffsetT data type return DipatchHistogram::DispatchEven( d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)), stream, debug_synchronous, is_byte_sample); } return DipatchHistogram::DispatchEven( d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)), stream, debug_synchronous, is_byte_sample); } //@} end member group /******************************************************************//** * \name Custom bin ranges *********************************************************************/ //@{ /** * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels. * * \par * - The number of histogram bins is (\p num_levels - 1) * - The value range for bini is [level[i], level[i+1]) * - \devicestorage * * \par Snippet * The code snippet below illustrates the computation of an six-bin histogram * from a sequence of float samples * * \par * \code * #include // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for input samples and * // output histogram * int num_samples; // e.g., 10 * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5] * int* d_histogram; // e.g., [ -, -, -, -, -, -, -, -] * int num_levels // e.g., 7 (seven level boundaries for six bins) * float* d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0] * ... * * // Determine temporary device storage requirements * void* d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, * d_samples, d_histogram, num_levels, d_levels, num_samples); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Compute histograms * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, * d_samples, d_histogram, num_levels, d_levels, num_samples); * * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; * * \endcode * * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator * \tparam CounterT [inferred] Integer type for histogram bin counters * \tparam LevelT [inferred] Type for specifying boundaries (levels) * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 */ template < typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT> CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange( void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. LevelT* d_levels, ///< [in] The pointer to the array of boundaries (levels). Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. OffsetT num_samples, ///< [in] The number of data samples per row in the region of interest cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. { /// The sample value type of the input iterator typedef typename std::iterator_traits::value_type SampleT; CounterT* d_histogram1[1] = {d_histogram}; int num_levels1[1] = {num_levels}; LevelT* d_levels1[1] = {d_levels}; return MultiHistogramRange<1, 1>( d_temp_storage, temp_storage_bytes, d_samples, d_histogram1, num_levels1, d_levels1, num_samples, (OffsetT)1, (size_t)(sizeof(SampleT) * num_samples), stream, debug_synchronous); } /** * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels. * * \par * - A two-dimensional region of interest within \p d_samples can be specified * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. * - The row stride must be a whole multiple of the sample data type * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. * - The number of histogram bins is (\p num_levels - 1) * - The value range for bini is [level[i], level[i+1]) * - \devicestorage * * \par Snippet * The code snippet below illustrates the computation of a six-bin histogram * from a 2x5 region of interest within a flattened 2x7 array of float samples. * * \par * \code * #include // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for input samples and * // output histogram * int num_row_samples; // e.g., 5 * int num_rows; // e.g., 2; * int row_stride_bytes; // e.g., 7 * sizeof(float) * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, -, -, * // 0.3, 2.9, 2.0, 6.1, 999.5, -, -] * int* d_histogram; // e.g., [ , , , , , , , ] * int num_levels // e.g., 7 (seven level boundaries for six bins) * float *d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0] * ... * * // Determine temporary device storage requirements * void* d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, * d_samples, d_histogram, num_levels, d_levels, * num_row_samples, num_rows, row_stride_bytes); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Compute histograms * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, * d_samples, d_histogram, num_levels, d_levels, * num_row_samples, num_rows, row_stride_bytes); * * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; * * \endcode * * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator * \tparam CounterT [inferred] Integer type for histogram bin counters * \tparam LevelT [inferred] Type for specifying boundaries (levels) * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 */ template < typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT> CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange( void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. LevelT* d_levels, ///< [in] The pointer to the array of boundaries (levels). Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. OffsetT num_row_samples, ///< [in] The number of data samples per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. { CounterT* d_histogram1[1] = {d_histogram}; int num_levels1[1] = {num_levels}; LevelT* d_levels1[1] = {d_levels}; return MultiHistogramRange<1, 1>( d_temp_storage, temp_storage_bytes, d_samples, d_histogram1, num_levels1, d_levels1, num_row_samples, num_rows, row_stride_bytes, stream, debug_synchronous); } /** * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels. * * \par * - The input is a sequence of pixel structures, where each pixel comprises * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). * - Of the \p NUM_CHANNELS specified, the function will only compute histograms * for the first \p NUM_ACTIVE_CHANNELS (e.g., RGB histograms from RGBA * pixel samples). * - The number of histogram bins for channeli is num_levels[i] - 1. * - For channeli, the range of values for all histogram bins * have the same width: (upper_level[i] - lower_level[i]) / ( num_levels[i] - 1) * - \devicestorage * * \par Snippet * The code snippet below illustrates the computation of three 4-bin RGB histograms * from a quad-channel sequence of RGBA pixels (8 bits per channel per pixel) * * \par * \code * #include // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for input samples * // and output histograms * int num_pixels; // e.g., 5 * unsigned char *d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(7, 0, 6, 2), * // (0, 6, 7, 5),(3, 0, 2, 6)] * unsigned int *d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; * int num_levels[3]; // e.g., {5, 5, 5}; * unsigned int *d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], * // [0, 2, 4, 6, 8], * // [0, 2, 4, 6, 8] ]; * ... * * // Determine temporary device storage requirements * void* d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, * d_samples, d_histogram, num_levels, d_levels, num_pixels); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Compute histograms * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, * d_samples, d_histogram, num_levels, d_levels, num_pixels); * * // d_histogram <-- [ [1, 3, 0, 1], * // [3, 0, 0, 2], * // [0, 2, 0, 3] ] * * \endcode * * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator * \tparam CounterT [inferred] Integer type for histogram bin counters * \tparam LevelT [inferred] Type for specifying boundaries (levels) * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 */ template < int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT> CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramRange( void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. LevelT* d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. OffsetT num_pixels, ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS) cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. { /// The sample value type of the input iterator typedef typename std::iterator_traits::value_type SampleT; return MultiHistogramRange( d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_pixels, (OffsetT)1, (size_t)(sizeof(SampleT) * NUM_CHANNELS * num_pixels), stream, debug_synchronous); } /** * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels. * * \par * - The input is a sequence of pixel structures, where each pixel comprises * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). * - Of the \p NUM_CHANNELS specified, the function will only compute histograms * for the first \p NUM_ACTIVE_CHANNELS (e.g., RGB histograms from RGBA * pixel samples). * - A two-dimensional region of interest within \p d_samples can be specified * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. * - The row stride must be a whole multiple of the sample data type * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. * - The number of histogram bins for channeli is num_levels[i] - 1. * - For channeli, the range of values for all histogram bins * have the same width: (upper_level[i] - lower_level[i]) / ( num_levels[i] - 1) * - \devicestorage * * \par Snippet * The code snippet below illustrates the computation of three 4-bin RGB histograms from a 2x3 region of * interest of within a flattened 2x4 array of quad-channel RGBA pixels (8 bits per channel per pixel). * * \par * \code * #include // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for input samples * // and output histograms * int num_row_pixels; // e.g., 3 * int num_rows; // e.g., 2 * size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS * unsigned char* d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(1, 1, 1, 1),(-, -, -, -), * // (7, 0, 6, 2),(0, 6, 7, 5),(3, 0, 2, 6),(-, -, -, -)] * int* d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; * int num_levels[3]; // e.g., {5, 5, 5}; * unsigned int* d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], * // [0, 2, 4, 6, 8], * // [0, 2, 4, 6, 8] ]; * ... * * // Determine temporary device storage requirements * void* d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, * d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Compute histograms * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, * d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes); * * // d_histogram <-- [ [2, 3, 0, 1], * // [3, 0, 0, 2], * // [1, 2, 0, 3] ] * * \endcode * * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator * \tparam CounterT [inferred] Integer type for histogram bin counters * \tparam LevelT [inferred] Type for specifying boundaries (levels) * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 */ template < int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT> CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramRange( void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. LevelT* d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest OffsetT num_rows, ///< [in] The number of rows in the region of interest size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. { /// The sample value type of the input iterator typedef typename std::iterator_traits::value_type SampleT; Int2Type is_byte_sample; if ((sizeof(OffsetT) > sizeof(int)) && ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) INT_MAX)) { // Down-convert OffsetT data type return DipatchHistogram::DispatchRange( d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)), stream, debug_synchronous, is_byte_sample); } return DipatchHistogram::DispatchRange( d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)), stream, debug_synchronous, is_byte_sample); } //@} end member group }; } // CUB namespace CUB_NS_POSTFIX // Optional outer namespace(s)