LIVE / thrust /dependencies /cub /experimental /histogram_compare.cu
Xu Ma
update
1c3c0d9
raw
history blame
21.8 kB
/******************************************************************************
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
#include <stdio.h>
#include <map>
#include <vector>
#include <algorithm>
#include <cstdio>
#include <fstream>
#include "histogram/histogram_gmem_atomics.h"
#include "histogram/histogram_smem_atomics.h"
#include "histogram/histogram_cub.h"
#include <cub/util_allocator.cuh>
#include <test/test_util.h>
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants, and type declarations
//---------------------------------------------------------------------
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR
bool g_verbose = false; // Whether to display input/output to console
bool g_report = false; // Whether to display a full report in CSV format
CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory
struct less_than_value
{
inline bool operator()(
const std::pair<std::string, double> &a,
const std::pair<std::string, double> &b)
{
return a.second < b.second;
}
};
//---------------------------------------------------------------------
// Targa (.tga) image file parsing
//---------------------------------------------------------------------
/**
* TGA image header info
*/
struct TgaHeader
{
char idlength;
char colormaptype;
char datatypecode;
short colormaporigin;
short colormaplength;
char colormapdepth;
short x_origin;
short y_origin;
short width;
short height;
char bitsperpixel;
char imagedescriptor;
void Parse (FILE *fptr)
{
idlength = fgetc(fptr);
colormaptype = fgetc(fptr);
datatypecode = fgetc(fptr);
fread(&colormaporigin, 2, 1, fptr);
fread(&colormaplength, 2, 1, fptr);
colormapdepth = fgetc(fptr);
fread(&x_origin, 2, 1, fptr);
fread(&y_origin, 2, 1, fptr);
fread(&width, 2, 1, fptr);
fread(&height, 2, 1, fptr);
bitsperpixel = fgetc(fptr);
imagedescriptor = fgetc(fptr);
}
void Display (FILE *fptr)
{
fprintf(fptr, "ID length: %d\n", idlength);
fprintf(fptr, "Color map type: %d\n", colormaptype);
fprintf(fptr, "Image type: %d\n", datatypecode);
fprintf(fptr, "Color map offset: %d\n", colormaporigin);
fprintf(fptr, "Color map length: %d\n", colormaplength);
fprintf(fptr, "Color map depth: %d\n", colormapdepth);
fprintf(fptr, "X origin: %d\n", x_origin);
fprintf(fptr, "Y origin: %d\n", y_origin);
fprintf(fptr, "Width: %d\n", width);
fprintf(fptr, "Height: %d\n", height);
fprintf(fptr, "Bits per pixel: %d\n", bitsperpixel);
fprintf(fptr, "Descriptor: %d\n", imagedescriptor);
}
};
/**
* Decode image byte data into pixel
*/
void ParseTgaPixel(uchar4 &pixel, unsigned char *tga_pixel, int bytes)
{
if (bytes == 4)
{
pixel.x = tga_pixel[2];
pixel.y = tga_pixel[1];
pixel.z = tga_pixel[0];
pixel.w = tga_pixel[3];
}
else if (bytes == 3)
{
pixel.x = tga_pixel[2];
pixel.y = tga_pixel[1];
pixel.z = tga_pixel[0];
pixel.w = 0;
}
else if (bytes == 2)
{
pixel.x = (tga_pixel[1] & 0x7c) << 1;
pixel.y = ((tga_pixel[1] & 0x03) << 6) | ((tga_pixel[0] & 0xe0) >> 2);
pixel.z = (tga_pixel[0] & 0x1f) << 3;
pixel.w = (tga_pixel[1] & 0x80);
}
}
/**
* Reads a .tga image file
*/
void ReadTga(uchar4* &pixels, int &width, int &height, const char *filename)
{
// Open the file
FILE *fptr;
if ((fptr = fopen(filename, "rb")) == NULL)
{
fprintf(stderr, "File open failed\n");
exit(-1);
}
// Parse header
TgaHeader header;
header.Parse(fptr);
// header.Display(stdout);
width = header.width;
height = header.height;
// Verify compatibility
if (header.datatypecode != 2 && header.datatypecode != 10)
{
fprintf(stderr, "Can only handle image type 2 and 10\n");
exit(-1);
}
if (header.bitsperpixel != 16 && header.bitsperpixel != 24 && header.bitsperpixel != 32)
{
fprintf(stderr, "Can only handle pixel depths of 16, 24, and 32\n");
exit(-1);
}
if (header.colormaptype != 0 && header.colormaptype != 1)
{
fprintf(stderr, "Can only handle color map types of 0 and 1\n");
exit(-1);
}
// Skip unnecessary header info
int skip_bytes = header.idlength + (header.colormaptype * header.colormaplength);
fseek(fptr, skip_bytes, SEEK_CUR);
// Read the image
int pixel_bytes = header.bitsperpixel / 8;
// Allocate and initialize pixel data
size_t image_bytes = width * height * sizeof(uchar4);
if ((pixels == NULL) && ((pixels = (uchar4*) malloc(image_bytes)) == NULL))
{
fprintf(stderr, "malloc of image failed\n");
exit(-1);
}
memset(pixels, 0, image_bytes);
// Parse pixels
unsigned char tga_pixel[5];
int current_pixel = 0;
while (current_pixel < header.width * header.height)
{
if (header.datatypecode == 2)
{
// Uncompressed
if (fread(tga_pixel, 1, pixel_bytes, fptr) != pixel_bytes)
{
fprintf(stderr, "Unexpected end of file at pixel %d (uncompressed)\n", current_pixel);
exit(-1);
}
ParseTgaPixel(pixels[current_pixel], tga_pixel, pixel_bytes);
current_pixel++;
}
else if (header.datatypecode == 10)
{
// Compressed
if (fread(tga_pixel, 1, pixel_bytes + 1, fptr) != pixel_bytes + 1)
{
fprintf(stderr, "Unexpected end of file at pixel %d (compressed)\n", current_pixel);
exit(-1);
}
int run_length = tga_pixel[0] & 0x7f;
ParseTgaPixel(pixels[current_pixel], &(tga_pixel[1]), pixel_bytes);
current_pixel++;
if (tga_pixel[0] & 0x80)
{
// RLE chunk
for (int i = 0; i < run_length; i++)
{
ParseTgaPixel(pixels[current_pixel], &(tga_pixel[1]), pixel_bytes);
current_pixel++;
}
}
else
{
// Normal chunk
for (int i = 0; i < run_length; i++)
{
if (fread(tga_pixel, 1, pixel_bytes, fptr) != pixel_bytes)
{
fprintf(stderr, "Unexpected end of file at pixel %d (normal)\n", current_pixel);
exit(-1);
}
ParseTgaPixel(pixels[current_pixel], tga_pixel, pixel_bytes);
current_pixel++;
}
}
}
}
// Close file
fclose(fptr);
}
//---------------------------------------------------------------------
// Random image generation
//---------------------------------------------------------------------
/**
* Generate a random image with specified entropy
*/
void GenerateRandomImage(uchar4* &pixels, int width, int height, int entropy_reduction)
{
int num_pixels = width * height;
size_t image_bytes = num_pixels * sizeof(uchar4);
if ((pixels == NULL) && ((pixels = (uchar4*) malloc(image_bytes)) == NULL))
{
fprintf(stderr, "malloc of image failed\n");
exit(-1);
}
for (int i = 0; i < num_pixels; ++i)
{
RandomBits(pixels[i].x, entropy_reduction);
RandomBits(pixels[i].y, entropy_reduction);
RandomBits(pixels[i].z, entropy_reduction);
RandomBits(pixels[i].w, entropy_reduction);
}
}
//---------------------------------------------------------------------
// Histogram verification
//---------------------------------------------------------------------
// Decode float4 pixel into bins
template <int NUM_BINS, int ACTIVE_CHANNELS>
void DecodePixelGold(float4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
{
float* samples = reinterpret_cast<float*>(&pixel);
for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
bins[CHANNEL] = (unsigned int) (samples[CHANNEL] * float(NUM_BINS));
}
// Decode uchar4 pixel into bins
template <int NUM_BINS, int ACTIVE_CHANNELS>
void DecodePixelGold(uchar4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
{
unsigned char* samples = reinterpret_cast<unsigned char*>(&pixel);
for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
bins[CHANNEL] = (unsigned int) (samples[CHANNEL]);
}
// Decode uchar1 pixel into bins
template <int NUM_BINS, int ACTIVE_CHANNELS>
void DecodePixelGold(uchar1 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
{
bins[0] = (unsigned int) pixel.x;
}
// Compute reference histogram. Specialized for uchar4
template <
int ACTIVE_CHANNELS,
int NUM_BINS,
typename PixelType>
void HistogramGold(PixelType *image, int width, int height, unsigned int* hist)
{
memset(hist, 0, ACTIVE_CHANNELS * NUM_BINS * sizeof(unsigned int));
for (int i = 0; i < width; i++)
{
for (int j = 0; j < height; j++)
{
PixelType pixel = image[i + j * width];
unsigned int bins[ACTIVE_CHANNELS];
DecodePixelGold<NUM_BINS>(pixel, bins);
for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
{
hist[(NUM_BINS * CHANNEL) + bins[CHANNEL]]++;
}
}
}
}
//---------------------------------------------------------------------
// Test execution
//---------------------------------------------------------------------
/**
* Run a specific histogram implementation
*/
template <
int ACTIVE_CHANNELS,
int NUM_BINS,
typename PixelType>
void RunTest(
std::vector<std::pair<std::string, double> >& timings,
PixelType* d_pixels,
const int width,
const int height,
unsigned int * d_hist,
unsigned int * h_hist,
int timing_iterations,
const char * long_name,
const char * short_name,
double (*f)(PixelType*, int, int, unsigned int*, bool))
{
if (!g_report) printf("%s ", long_name); fflush(stdout);
// Run single test to verify (and code cache)
(*f)(d_pixels, width, height, d_hist, !g_report);
int compare = CompareDeviceResults(h_hist, d_hist, ACTIVE_CHANNELS * NUM_BINS, true, g_verbose);
if (!g_report) printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
double elapsed_ms = 0;
for (int i = 0; i < timing_iterations; i++)
{
elapsed_ms += (*f)(d_pixels, width, height, d_hist, false);
}
double avg_us = (elapsed_ms / timing_iterations) * 1000; // average in us
timings.push_back(std::pair<std::string, double>(short_name, avg_us));
if (!g_report)
{
printf("Avg time %.3f us (%d iterations)\n", avg_us, timing_iterations); fflush(stdout);
}
else
{
printf("%.3f, ", avg_us); fflush(stdout);
}
AssertEquals(0, compare);
}
/**
* Evaluate corpus of histogram implementations
*/
template <
int NUM_CHANNELS,
int ACTIVE_CHANNELS,
int NUM_BINS,
typename PixelType>
void TestMethods(
PixelType* h_pixels,
int height,
int width,
int timing_iterations,
double bandwidth_GBs)
{
// Copy data to gpu
PixelType* d_pixels;
size_t pixel_bytes = width * height * sizeof(PixelType);
CubDebugExit(g_allocator.DeviceAllocate((void**) &d_pixels, pixel_bytes));
CubDebugExit(cudaMemcpy(d_pixels, h_pixels, pixel_bytes, cudaMemcpyHostToDevice));
if (g_report) printf("%.3f, ", double(pixel_bytes) / bandwidth_GBs / 1000);
// Allocate results arrays on cpu/gpu
unsigned int *h_hist;
unsigned int *d_hist;
size_t histogram_bytes = NUM_BINS * ACTIVE_CHANNELS * sizeof(unsigned int);
h_hist = (unsigned int *) malloc(histogram_bytes);
g_allocator.DeviceAllocate((void **) &d_hist, histogram_bytes);
// Compute reference cpu histogram
HistogramGold<ACTIVE_CHANNELS, NUM_BINS>(h_pixels, width, height, h_hist);
// Store timings
std::vector<std::pair<std::string, double> > timings;
// Run experiments
RunTest<ACTIVE_CHANNELS, NUM_BINS>(timings, d_pixels, width, height, d_hist, h_hist, timing_iterations,
"CUB", "CUB", run_cub_histogram<NUM_CHANNELS, ACTIVE_CHANNELS, NUM_BINS, PixelType>);
RunTest<ACTIVE_CHANNELS, NUM_BINS>(timings, d_pixels, width, height, d_hist, h_hist, timing_iterations,
"Shared memory atomics", "smem atomics", run_smem_atomics<ACTIVE_CHANNELS, NUM_BINS, PixelType>);
RunTest<ACTIVE_CHANNELS, NUM_BINS>(timings, d_pixels, width, height, d_hist, h_hist, timing_iterations,
"Global memory atomics", "gmem atomics", run_gmem_atomics<ACTIVE_CHANNELS, NUM_BINS, PixelType>);
// Report timings
if (!g_report)
{
std::sort(timings.begin(), timings.end(), less_than_value());
printf("Timings (us):\n");
for (int i = 0; i < timings.size(); i++)
{
double bandwidth = height * width * sizeof(PixelType) / timings[i].second / 1000;
printf("\t %.3f %s (%.3f GB/s, %.3f%% peak)\n", timings[i].second, timings[i].first.c_str(), bandwidth, bandwidth / bandwidth_GBs * 100);
}
printf("\n");
}
// Free data
CubDebugExit(g_allocator.DeviceFree(d_pixels));
CubDebugExit(g_allocator.DeviceFree(d_hist));
free(h_hist);
}
/**
* Test different problem genres
*/
void TestGenres(
uchar4* uchar4_pixels,
int height,
int width,
int timing_iterations,
double bandwidth_GBs)
{
int num_pixels = width * height;
{
if (!g_report) printf("1 channel uchar1 tests (256-bin):\n\n"); fflush(stdout);
size_t image_bytes = num_pixels * sizeof(uchar1);
uchar1* uchar1_pixels = (uchar1*) malloc(image_bytes);
// Convert to 1-channel (averaging first 3 channels)
for (int i = 0; i < num_pixels; ++i)
{
uchar1_pixels[i].x = (unsigned char)
(((unsigned int) uchar4_pixels[i].x +
(unsigned int) uchar4_pixels[i].y +
(unsigned int) uchar4_pixels[i].z) / 3);
}
TestMethods<1, 1, 256>(uchar1_pixels, width, height, timing_iterations, bandwidth_GBs);
free(uchar1_pixels);
if (g_report) printf(", ");
}
{
if (!g_report) printf("3/4 channel uchar4 tests (256-bin):\n\n"); fflush(stdout);
TestMethods<4, 3, 256>(uchar4_pixels, width, height, timing_iterations, bandwidth_GBs);
if (g_report) printf(", ");
}
{
if (!g_report) printf("3/4 channel float4 tests (256-bin):\n\n"); fflush(stdout);
size_t image_bytes = num_pixels * sizeof(float4);
float4* float4_pixels = (float4*) malloc(image_bytes);
// Convert to float4 with range [0.0, 1.0)
for (int i = 0; i < num_pixels; ++i)
{
float4_pixels[i].x = float(uchar4_pixels[i].x) / 256;
float4_pixels[i].y = float(uchar4_pixels[i].y) / 256;
float4_pixels[i].z = float(uchar4_pixels[i].z) / 256;
float4_pixels[i].w = float(uchar4_pixels[i].w) / 256;
}
TestMethods<4, 3, 256>(float4_pixels, width, height, timing_iterations, bandwidth_GBs);
free(float4_pixels);
if (g_report) printf("\n");
}
}
/**
* Main
*/
int main(int argc, char **argv)
{
// Initialize command line
CommandLineArgs args(argc, argv);
if (args.CheckCmdLineFlag("help"))
{
printf(
"%s "
"[--device=<device-id>] "
"[--v] "
"[--i=<timing iterations>] "
"\n\t"
"--file=<.tga filename> "
"\n\t"
"--entropy=<-1 (0%), 0 (100%), 1 (81%), 2 (54%), 3 (34%), 4 (20%), ..."
"[--height=<default: 1080>] "
"[--width=<default: 1920>] "
"\n", argv[0]);
exit(0);
}
std::string filename;
int timing_iterations = 100;
int entropy_reduction = 0;
int height = 1080;
int width = 1920;
g_verbose = args.CheckCmdLineFlag("v");
g_report = args.CheckCmdLineFlag("report");
args.GetCmdLineArgument("i", timing_iterations);
args.GetCmdLineArgument("file", filename);
args.GetCmdLineArgument("height", height);
args.GetCmdLineArgument("width", width);
args.GetCmdLineArgument("entropy", entropy_reduction);
// Initialize device
CubDebugExit(args.DeviceInit());
// Get GPU device bandwidth (GB/s)
int device_ordinal, bus_width, mem_clock_khz;
CubDebugExit(cudaGetDevice(&device_ordinal));
CubDebugExit(cudaDeviceGetAttribute(&bus_width, cudaDevAttrGlobalMemoryBusWidth, device_ordinal));
CubDebugExit(cudaDeviceGetAttribute(&mem_clock_khz, cudaDevAttrMemoryClockRate, device_ordinal));
double bandwidth_GBs = double(bus_width) * mem_clock_khz * 2 / 8 / 1000 / 1000;
// Run test(s)
uchar4* uchar4_pixels = NULL;
if (!g_report)
{
if (!filename.empty())
{
// Parse targa file
ReadTga(uchar4_pixels, width, height, filename.c_str());
printf("File %s: width(%d) height(%d)\n\n", filename.c_str(), width, height); fflush(stdout);
}
else
{
// Generate image
GenerateRandomImage(uchar4_pixels, width, height, entropy_reduction);
printf("Random image: entropy-reduction(%d) width(%d) height(%d)\n\n", entropy_reduction, width, height); fflush(stdout);
}
TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
}
else
{
// Run test suite
printf("Test, MIN, RLE CUB, SMEM, GMEM, , MIN, RLE_CUB, SMEM, GMEM, , MIN, RLE_CUB, SMEM, GMEM\n");
// Entropy reduction tests
for (entropy_reduction = 0; entropy_reduction < 5; ++entropy_reduction)
{
printf("entropy reduction %d, ", entropy_reduction);
GenerateRandomImage(uchar4_pixels, width, height, entropy_reduction);
TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
}
printf("entropy reduction -1, ");
GenerateRandomImage(uchar4_pixels, width, height, -1);
TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
printf("\n");
// File image tests
std::vector<std::string> file_tests;
file_tests.push_back("animals");
file_tests.push_back("apples");
file_tests.push_back("sunset");
file_tests.push_back("cheetah");
file_tests.push_back("nature");
file_tests.push_back("operahouse");
file_tests.push_back("austin");
file_tests.push_back("cityscape");
for (int i = 0; i < file_tests.size(); ++i)
{
printf("%s, ", file_tests[i].c_str());
std::string filename = std::string("histogram/benchmark/") + file_tests[i] + ".tga";
ReadTga(uchar4_pixels, width, height, filename.c_str());
TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
}
}
free(uchar4_pixels);
CubDebugExit(cudaDeviceSynchronize());
printf("\n\n");
return 0;
}