Spaces:
Runtime error
Runtime error
/****************************************************************************** | |
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. | |
* | |
* Redistribution and use in source and binary forms, with or without | |
* modification, are permitted provided that the following conditions are met: | |
* * Redistributions of source code must retain the above copyright | |
* notice, this list of conditions and the following disclaimer. | |
* * Redistributions in binary form must reproduce the above copyright | |
* notice, this list of conditions and the following disclaimer in the | |
* documentation and/or other materials provided with the distribution. | |
* * Neither the name of the NVIDIA CORPORATION nor the | |
* names of its contributors may be used to endorse or promote products | |
* derived from this software without specific prior written permission. | |
* | |
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | |
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY | |
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | |
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND | |
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
* | |
******************************************************************************/ | |
#include <stdio.h> | |
#include <map> | |
#include <vector> | |
#include <algorithm> | |
#include <cstdio> | |
#include <fstream> | |
#include "histogram/histogram_gmem_atomics.h" | |
#include "histogram/histogram_smem_atomics.h" | |
#include "histogram/histogram_cub.h" | |
#include <cub/util_allocator.cuh> | |
#include <test/test_util.h> | |
using namespace cub; | |
//--------------------------------------------------------------------- | |
// Globals, constants, and type declarations | |
//--------------------------------------------------------------------- | |
// Ensure printing of CUDA runtime errors to console | |
#define CUB_STDERR | |
bool g_verbose = false; // Whether to display input/output to console | |
bool g_report = false; // Whether to display a full report in CSV format | |
CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory | |
struct less_than_value | |
{ | |
inline bool operator()( | |
const std::pair<std::string, double> &a, | |
const std::pair<std::string, double> &b) | |
{ | |
return a.second < b.second; | |
} | |
}; | |
//--------------------------------------------------------------------- | |
// Targa (.tga) image file parsing | |
//--------------------------------------------------------------------- | |
/** | |
* TGA image header info | |
*/ | |
struct TgaHeader | |
{ | |
char idlength; | |
char colormaptype; | |
char datatypecode; | |
short colormaporigin; | |
short colormaplength; | |
char colormapdepth; | |
short x_origin; | |
short y_origin; | |
short width; | |
short height; | |
char bitsperpixel; | |
char imagedescriptor; | |
void Parse (FILE *fptr) | |
{ | |
idlength = fgetc(fptr); | |
colormaptype = fgetc(fptr); | |
datatypecode = fgetc(fptr); | |
fread(&colormaporigin, 2, 1, fptr); | |
fread(&colormaplength, 2, 1, fptr); | |
colormapdepth = fgetc(fptr); | |
fread(&x_origin, 2, 1, fptr); | |
fread(&y_origin, 2, 1, fptr); | |
fread(&width, 2, 1, fptr); | |
fread(&height, 2, 1, fptr); | |
bitsperpixel = fgetc(fptr); | |
imagedescriptor = fgetc(fptr); | |
} | |
void Display (FILE *fptr) | |
{ | |
fprintf(fptr, "ID length: %d\n", idlength); | |
fprintf(fptr, "Color map type: %d\n", colormaptype); | |
fprintf(fptr, "Image type: %d\n", datatypecode); | |
fprintf(fptr, "Color map offset: %d\n", colormaporigin); | |
fprintf(fptr, "Color map length: %d\n", colormaplength); | |
fprintf(fptr, "Color map depth: %d\n", colormapdepth); | |
fprintf(fptr, "X origin: %d\n", x_origin); | |
fprintf(fptr, "Y origin: %d\n", y_origin); | |
fprintf(fptr, "Width: %d\n", width); | |
fprintf(fptr, "Height: %d\n", height); | |
fprintf(fptr, "Bits per pixel: %d\n", bitsperpixel); | |
fprintf(fptr, "Descriptor: %d\n", imagedescriptor); | |
} | |
}; | |
/** | |
* Decode image byte data into pixel | |
*/ | |
void ParseTgaPixel(uchar4 &pixel, unsigned char *tga_pixel, int bytes) | |
{ | |
if (bytes == 4) | |
{ | |
pixel.x = tga_pixel[2]; | |
pixel.y = tga_pixel[1]; | |
pixel.z = tga_pixel[0]; | |
pixel.w = tga_pixel[3]; | |
} | |
else if (bytes == 3) | |
{ | |
pixel.x = tga_pixel[2]; | |
pixel.y = tga_pixel[1]; | |
pixel.z = tga_pixel[0]; | |
pixel.w = 0; | |
} | |
else if (bytes == 2) | |
{ | |
pixel.x = (tga_pixel[1] & 0x7c) << 1; | |
pixel.y = ((tga_pixel[1] & 0x03) << 6) | ((tga_pixel[0] & 0xe0) >> 2); | |
pixel.z = (tga_pixel[0] & 0x1f) << 3; | |
pixel.w = (tga_pixel[1] & 0x80); | |
} | |
} | |
/** | |
* Reads a .tga image file | |
*/ | |
void ReadTga(uchar4* &pixels, int &width, int &height, const char *filename) | |
{ | |
// Open the file | |
FILE *fptr; | |
if ((fptr = fopen(filename, "rb")) == NULL) | |
{ | |
fprintf(stderr, "File open failed\n"); | |
exit(-1); | |
} | |
// Parse header | |
TgaHeader header; | |
header.Parse(fptr); | |
// header.Display(stdout); | |
width = header.width; | |
height = header.height; | |
// Verify compatibility | |
if (header.datatypecode != 2 && header.datatypecode != 10) | |
{ | |
fprintf(stderr, "Can only handle image type 2 and 10\n"); | |
exit(-1); | |
} | |
if (header.bitsperpixel != 16 && header.bitsperpixel != 24 && header.bitsperpixel != 32) | |
{ | |
fprintf(stderr, "Can only handle pixel depths of 16, 24, and 32\n"); | |
exit(-1); | |
} | |
if (header.colormaptype != 0 && header.colormaptype != 1) | |
{ | |
fprintf(stderr, "Can only handle color map types of 0 and 1\n"); | |
exit(-1); | |
} | |
// Skip unnecessary header info | |
int skip_bytes = header.idlength + (header.colormaptype * header.colormaplength); | |
fseek(fptr, skip_bytes, SEEK_CUR); | |
// Read the image | |
int pixel_bytes = header.bitsperpixel / 8; | |
// Allocate and initialize pixel data | |
size_t image_bytes = width * height * sizeof(uchar4); | |
if ((pixels == NULL) && ((pixels = (uchar4*) malloc(image_bytes)) == NULL)) | |
{ | |
fprintf(stderr, "malloc of image failed\n"); | |
exit(-1); | |
} | |
memset(pixels, 0, image_bytes); | |
// Parse pixels | |
unsigned char tga_pixel[5]; | |
int current_pixel = 0; | |
while (current_pixel < header.width * header.height) | |
{ | |
if (header.datatypecode == 2) | |
{ | |
// Uncompressed | |
if (fread(tga_pixel, 1, pixel_bytes, fptr) != pixel_bytes) | |
{ | |
fprintf(stderr, "Unexpected end of file at pixel %d (uncompressed)\n", current_pixel); | |
exit(-1); | |
} | |
ParseTgaPixel(pixels[current_pixel], tga_pixel, pixel_bytes); | |
current_pixel++; | |
} | |
else if (header.datatypecode == 10) | |
{ | |
// Compressed | |
if (fread(tga_pixel, 1, pixel_bytes + 1, fptr) != pixel_bytes + 1) | |
{ | |
fprintf(stderr, "Unexpected end of file at pixel %d (compressed)\n", current_pixel); | |
exit(-1); | |
} | |
int run_length = tga_pixel[0] & 0x7f; | |
ParseTgaPixel(pixels[current_pixel], &(tga_pixel[1]), pixel_bytes); | |
current_pixel++; | |
if (tga_pixel[0] & 0x80) | |
{ | |
// RLE chunk | |
for (int i = 0; i < run_length; i++) | |
{ | |
ParseTgaPixel(pixels[current_pixel], &(tga_pixel[1]), pixel_bytes); | |
current_pixel++; | |
} | |
} | |
else | |
{ | |
// Normal chunk | |
for (int i = 0; i < run_length; i++) | |
{ | |
if (fread(tga_pixel, 1, pixel_bytes, fptr) != pixel_bytes) | |
{ | |
fprintf(stderr, "Unexpected end of file at pixel %d (normal)\n", current_pixel); | |
exit(-1); | |
} | |
ParseTgaPixel(pixels[current_pixel], tga_pixel, pixel_bytes); | |
current_pixel++; | |
} | |
} | |
} | |
} | |
// Close file | |
fclose(fptr); | |
} | |
//--------------------------------------------------------------------- | |
// Random image generation | |
//--------------------------------------------------------------------- | |
/** | |
* Generate a random image with specified entropy | |
*/ | |
void GenerateRandomImage(uchar4* &pixels, int width, int height, int entropy_reduction) | |
{ | |
int num_pixels = width * height; | |
size_t image_bytes = num_pixels * sizeof(uchar4); | |
if ((pixels == NULL) && ((pixels = (uchar4*) malloc(image_bytes)) == NULL)) | |
{ | |
fprintf(stderr, "malloc of image failed\n"); | |
exit(-1); | |
} | |
for (int i = 0; i < num_pixels; ++i) | |
{ | |
RandomBits(pixels[i].x, entropy_reduction); | |
RandomBits(pixels[i].y, entropy_reduction); | |
RandomBits(pixels[i].z, entropy_reduction); | |
RandomBits(pixels[i].w, entropy_reduction); | |
} | |
} | |
//--------------------------------------------------------------------- | |
// Histogram verification | |
//--------------------------------------------------------------------- | |
// Decode float4 pixel into bins | |
template <int NUM_BINS, int ACTIVE_CHANNELS> | |
void DecodePixelGold(float4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS]) | |
{ | |
float* samples = reinterpret_cast<float*>(&pixel); | |
for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) | |
bins[CHANNEL] = (unsigned int) (samples[CHANNEL] * float(NUM_BINS)); | |
} | |
// Decode uchar4 pixel into bins | |
template <int NUM_BINS, int ACTIVE_CHANNELS> | |
void DecodePixelGold(uchar4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS]) | |
{ | |
unsigned char* samples = reinterpret_cast<unsigned char*>(&pixel); | |
for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) | |
bins[CHANNEL] = (unsigned int) (samples[CHANNEL]); | |
} | |
// Decode uchar1 pixel into bins | |
template <int NUM_BINS, int ACTIVE_CHANNELS> | |
void DecodePixelGold(uchar1 pixel, unsigned int (&bins)[ACTIVE_CHANNELS]) | |
{ | |
bins[0] = (unsigned int) pixel.x; | |
} | |
// Compute reference histogram. Specialized for uchar4 | |
template < | |
int ACTIVE_CHANNELS, | |
int NUM_BINS, | |
typename PixelType> | |
void HistogramGold(PixelType *image, int width, int height, unsigned int* hist) | |
{ | |
memset(hist, 0, ACTIVE_CHANNELS * NUM_BINS * sizeof(unsigned int)); | |
for (int i = 0; i < width; i++) | |
{ | |
for (int j = 0; j < height; j++) | |
{ | |
PixelType pixel = image[i + j * width]; | |
unsigned int bins[ACTIVE_CHANNELS]; | |
DecodePixelGold<NUM_BINS>(pixel, bins); | |
for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) | |
{ | |
hist[(NUM_BINS * CHANNEL) + bins[CHANNEL]]++; | |
} | |
} | |
} | |
} | |
//--------------------------------------------------------------------- | |
// Test execution | |
//--------------------------------------------------------------------- | |
/** | |
* Run a specific histogram implementation | |
*/ | |
template < | |
int ACTIVE_CHANNELS, | |
int NUM_BINS, | |
typename PixelType> | |
void RunTest( | |
std::vector<std::pair<std::string, double> >& timings, | |
PixelType* d_pixels, | |
const int width, | |
const int height, | |
unsigned int * d_hist, | |
unsigned int * h_hist, | |
int timing_iterations, | |
const char * long_name, | |
const char * short_name, | |
double (*f)(PixelType*, int, int, unsigned int*, bool)) | |
{ | |
if (!g_report) printf("%s ", long_name); fflush(stdout); | |
// Run single test to verify (and code cache) | |
(*f)(d_pixels, width, height, d_hist, !g_report); | |
int compare = CompareDeviceResults(h_hist, d_hist, ACTIVE_CHANNELS * NUM_BINS, true, g_verbose); | |
if (!g_report) printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout); | |
double elapsed_ms = 0; | |
for (int i = 0; i < timing_iterations; i++) | |
{ | |
elapsed_ms += (*f)(d_pixels, width, height, d_hist, false); | |
} | |
double avg_us = (elapsed_ms / timing_iterations) * 1000; // average in us | |
timings.push_back(std::pair<std::string, double>(short_name, avg_us)); | |
if (!g_report) | |
{ | |
printf("Avg time %.3f us (%d iterations)\n", avg_us, timing_iterations); fflush(stdout); | |
} | |
else | |
{ | |
printf("%.3f, ", avg_us); fflush(stdout); | |
} | |
AssertEquals(0, compare); | |
} | |
/** | |
* Evaluate corpus of histogram implementations | |
*/ | |
template < | |
int NUM_CHANNELS, | |
int ACTIVE_CHANNELS, | |
int NUM_BINS, | |
typename PixelType> | |
void TestMethods( | |
PixelType* h_pixels, | |
int height, | |
int width, | |
int timing_iterations, | |
double bandwidth_GBs) | |
{ | |
// Copy data to gpu | |
PixelType* d_pixels; | |
size_t pixel_bytes = width * height * sizeof(PixelType); | |
CubDebugExit(g_allocator.DeviceAllocate((void**) &d_pixels, pixel_bytes)); | |
CubDebugExit(cudaMemcpy(d_pixels, h_pixels, pixel_bytes, cudaMemcpyHostToDevice)); | |
if (g_report) printf("%.3f, ", double(pixel_bytes) / bandwidth_GBs / 1000); | |
// Allocate results arrays on cpu/gpu | |
unsigned int *h_hist; | |
unsigned int *d_hist; | |
size_t histogram_bytes = NUM_BINS * ACTIVE_CHANNELS * sizeof(unsigned int); | |
h_hist = (unsigned int *) malloc(histogram_bytes); | |
g_allocator.DeviceAllocate((void **) &d_hist, histogram_bytes); | |
// Compute reference cpu histogram | |
HistogramGold<ACTIVE_CHANNELS, NUM_BINS>(h_pixels, width, height, h_hist); | |
// Store timings | |
std::vector<std::pair<std::string, double> > timings; | |
// Run experiments | |
RunTest<ACTIVE_CHANNELS, NUM_BINS>(timings, d_pixels, width, height, d_hist, h_hist, timing_iterations, | |
"CUB", "CUB", run_cub_histogram<NUM_CHANNELS, ACTIVE_CHANNELS, NUM_BINS, PixelType>); | |
RunTest<ACTIVE_CHANNELS, NUM_BINS>(timings, d_pixels, width, height, d_hist, h_hist, timing_iterations, | |
"Shared memory atomics", "smem atomics", run_smem_atomics<ACTIVE_CHANNELS, NUM_BINS, PixelType>); | |
RunTest<ACTIVE_CHANNELS, NUM_BINS>(timings, d_pixels, width, height, d_hist, h_hist, timing_iterations, | |
"Global memory atomics", "gmem atomics", run_gmem_atomics<ACTIVE_CHANNELS, NUM_BINS, PixelType>); | |
// Report timings | |
if (!g_report) | |
{ | |
std::sort(timings.begin(), timings.end(), less_than_value()); | |
printf("Timings (us):\n"); | |
for (int i = 0; i < timings.size(); i++) | |
{ | |
double bandwidth = height * width * sizeof(PixelType) / timings[i].second / 1000; | |
printf("\t %.3f %s (%.3f GB/s, %.3f%% peak)\n", timings[i].second, timings[i].first.c_str(), bandwidth, bandwidth / bandwidth_GBs * 100); | |
} | |
printf("\n"); | |
} | |
// Free data | |
CubDebugExit(g_allocator.DeviceFree(d_pixels)); | |
CubDebugExit(g_allocator.DeviceFree(d_hist)); | |
free(h_hist); | |
} | |
/** | |
* Test different problem genres | |
*/ | |
void TestGenres( | |
uchar4* uchar4_pixels, | |
int height, | |
int width, | |
int timing_iterations, | |
double bandwidth_GBs) | |
{ | |
int num_pixels = width * height; | |
{ | |
if (!g_report) printf("1 channel uchar1 tests (256-bin):\n\n"); fflush(stdout); | |
size_t image_bytes = num_pixels * sizeof(uchar1); | |
uchar1* uchar1_pixels = (uchar1*) malloc(image_bytes); | |
// Convert to 1-channel (averaging first 3 channels) | |
for (int i = 0; i < num_pixels; ++i) | |
{ | |
uchar1_pixels[i].x = (unsigned char) | |
(((unsigned int) uchar4_pixels[i].x + | |
(unsigned int) uchar4_pixels[i].y + | |
(unsigned int) uchar4_pixels[i].z) / 3); | |
} | |
TestMethods<1, 1, 256>(uchar1_pixels, width, height, timing_iterations, bandwidth_GBs); | |
free(uchar1_pixels); | |
if (g_report) printf(", "); | |
} | |
{ | |
if (!g_report) printf("3/4 channel uchar4 tests (256-bin):\n\n"); fflush(stdout); | |
TestMethods<4, 3, 256>(uchar4_pixels, width, height, timing_iterations, bandwidth_GBs); | |
if (g_report) printf(", "); | |
} | |
{ | |
if (!g_report) printf("3/4 channel float4 tests (256-bin):\n\n"); fflush(stdout); | |
size_t image_bytes = num_pixels * sizeof(float4); | |
float4* float4_pixels = (float4*) malloc(image_bytes); | |
// Convert to float4 with range [0.0, 1.0) | |
for (int i = 0; i < num_pixels; ++i) | |
{ | |
float4_pixels[i].x = float(uchar4_pixels[i].x) / 256; | |
float4_pixels[i].y = float(uchar4_pixels[i].y) / 256; | |
float4_pixels[i].z = float(uchar4_pixels[i].z) / 256; | |
float4_pixels[i].w = float(uchar4_pixels[i].w) / 256; | |
} | |
TestMethods<4, 3, 256>(float4_pixels, width, height, timing_iterations, bandwidth_GBs); | |
free(float4_pixels); | |
if (g_report) printf("\n"); | |
} | |
} | |
/** | |
* Main | |
*/ | |
int main(int argc, char **argv) | |
{ | |
// Initialize command line | |
CommandLineArgs args(argc, argv); | |
if (args.CheckCmdLineFlag("help")) | |
{ | |
printf( | |
"%s " | |
"[--device=<device-id>] " | |
"[--v] " | |
"[--i=<timing iterations>] " | |
"\n\t" | |
"--file=<.tga filename> " | |
"\n\t" | |
"--entropy=<-1 (0%), 0 (100%), 1 (81%), 2 (54%), 3 (34%), 4 (20%), ..." | |
"[--height=<default: 1080>] " | |
"[--width=<default: 1920>] " | |
"\n", argv[0]); | |
exit(0); | |
} | |
std::string filename; | |
int timing_iterations = 100; | |
int entropy_reduction = 0; | |
int height = 1080; | |
int width = 1920; | |
g_verbose = args.CheckCmdLineFlag("v"); | |
g_report = args.CheckCmdLineFlag("report"); | |
args.GetCmdLineArgument("i", timing_iterations); | |
args.GetCmdLineArgument("file", filename); | |
args.GetCmdLineArgument("height", height); | |
args.GetCmdLineArgument("width", width); | |
args.GetCmdLineArgument("entropy", entropy_reduction); | |
// Initialize device | |
CubDebugExit(args.DeviceInit()); | |
// Get GPU device bandwidth (GB/s) | |
int device_ordinal, bus_width, mem_clock_khz; | |
CubDebugExit(cudaGetDevice(&device_ordinal)); | |
CubDebugExit(cudaDeviceGetAttribute(&bus_width, cudaDevAttrGlobalMemoryBusWidth, device_ordinal)); | |
CubDebugExit(cudaDeviceGetAttribute(&mem_clock_khz, cudaDevAttrMemoryClockRate, device_ordinal)); | |
double bandwidth_GBs = double(bus_width) * mem_clock_khz * 2 / 8 / 1000 / 1000; | |
// Run test(s) | |
uchar4* uchar4_pixels = NULL; | |
if (!g_report) | |
{ | |
if (!filename.empty()) | |
{ | |
// Parse targa file | |
ReadTga(uchar4_pixels, width, height, filename.c_str()); | |
printf("File %s: width(%d) height(%d)\n\n", filename.c_str(), width, height); fflush(stdout); | |
} | |
else | |
{ | |
// Generate image | |
GenerateRandomImage(uchar4_pixels, width, height, entropy_reduction); | |
printf("Random image: entropy-reduction(%d) width(%d) height(%d)\n\n", entropy_reduction, width, height); fflush(stdout); | |
} | |
TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs); | |
} | |
else | |
{ | |
// Run test suite | |
printf("Test, MIN, RLE CUB, SMEM, GMEM, , MIN, RLE_CUB, SMEM, GMEM, , MIN, RLE_CUB, SMEM, GMEM\n"); | |
// Entropy reduction tests | |
for (entropy_reduction = 0; entropy_reduction < 5; ++entropy_reduction) | |
{ | |
printf("entropy reduction %d, ", entropy_reduction); | |
GenerateRandomImage(uchar4_pixels, width, height, entropy_reduction); | |
TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs); | |
} | |
printf("entropy reduction -1, "); | |
GenerateRandomImage(uchar4_pixels, width, height, -1); | |
TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs); | |
printf("\n"); | |
// File image tests | |
std::vector<std::string> file_tests; | |
file_tests.push_back("animals"); | |
file_tests.push_back("apples"); | |
file_tests.push_back("sunset"); | |
file_tests.push_back("cheetah"); | |
file_tests.push_back("nature"); | |
file_tests.push_back("operahouse"); | |
file_tests.push_back("austin"); | |
file_tests.push_back("cityscape"); | |
for (int i = 0; i < file_tests.size(); ++i) | |
{ | |
printf("%s, ", file_tests[i].c_str()); | |
std::string filename = std::string("histogram/benchmark/") + file_tests[i] + ".tga"; | |
ReadTga(uchar4_pixels, width, height, filename.c_str()); | |
TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs); | |
} | |
} | |
free(uchar4_pixels); | |
CubDebugExit(cudaDeviceSynchronize()); | |
printf("\n\n"); | |
return 0; | |
} | |