/******************************************************************************
 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/
//---------------------------------------------------------------------
// SpMV comparison tool
//---------------------------------------------------------------------

#include <stdio.h>
#include <map>
#include <vector>
#include <algorithm>
#include <cstdio>
#include <fstream>

#include <cusparse.h>

#include "sparse_matrix.h"

// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR

#include <cub/device/device_spmv.cuh>
#include <cub/util_allocator.cuh>
#include <cub/iterator/tex_ref_input_iterator.cuh>
#include <test/test_util.h>

using namespace cub;


//---------------------------------------------------------------------
// Globals, constants, and type declarations
//---------------------------------------------------------------------

bool g_quiet = false;       // Whether to display stats in CSV format
bool g_verbose = false;     // Whether to display output to console
bool g_verbose2 = false;    // Whether to display input to console
CachingDeviceAllocator g_allocator(true);   // Caching allocator for device memory


//---------------------------------------------------------------------
// SpMV verification
//---------------------------------------------------------------------

// Compute reference SpMV y = Ax
template <
    typename ValueT,
    typename OffsetT>
void SpmvGold(
    CsrMatrix<ValueT, OffsetT>& a,
    ValueT* vector_x,
    ValueT* vector_y_in,
    ValueT* vector_y_out,
    ValueT alpha,
    ValueT beta)
{
    for (OffsetT row = 0; row < a.num_rows; ++row)
    {
        ValueT partial = beta * vector_y_in[row];
        for (
            OffsetT offset = a.row_offsets[row];
            offset < a.row_offsets[row + 1];
            ++offset)
        {
            partial += alpha * a.values[offset] * vector_x[a.column_indices[offset]];
        }
        vector_y_out[row] = partial;
    }
}


//---------------------------------------------------------------------
// GPU I/O proxy
//---------------------------------------------------------------------

/**
 * Read every matrix nonzero value, read every corresponding vector value
 */
template <
    int BLOCK_THREADS,
    int ITEMS_PER_THREAD,
    typename ValueT,
    typename OffsetT,
    typename VectorItr>
__launch_bounds__ (int(BLOCK_THREADS))
__global__ void NonZeroIoKernel(
    SpmvParams<ValueT, OffsetT> params,
    VectorItr d_vector_x)
{
    enum
    {
        TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
    };

    ValueT nonzero = 0.0;

    int tile_idx = blockIdx.x;
    OffsetT block_offset = tile_idx * TILE_ITEMS;

    OffsetT column_indices[ITEMS_PER_THREAD];
    ValueT values[ITEMS_PER_THREAD];
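
    // Load a block-strided tile of column indices and nonzero values,
    // substituting zeros for lanes that fall past the end of the matrix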
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
    {
        OffsetT nonzero_idx = block_offset + (ITEM * BLOCK_THREADS) + threadIdx.x;

        OffsetT* ci = params.d_column_indices + nonzero_idx;
        ValueT* a = params.d_values + nonzero_idx;

        column_indices[ITEM] = (nonzero_idx < params.num_nonzeros) ? *ci : 0;
        values[ITEM] = (nonzero_idx < params.num_nonzeros) ? *a : 0.0;
    }

    __syncthreads();

    // Read vector
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
    {
        ValueT vector_value = ThreadLoad<LOAD_LDG>(params.d_vector_x + column_indices[ITEM]);
        nonzero += vector_value * values[ITEM];
    }

    __syncthreads();

    if (block_offset < params.num_rows)
    {
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
        {
            OffsetT row_idx = block_offset + (ITEM * BLOCK_THREADS) + threadIdx.x;
            if (row_idx < params.num_rows)
            {
                OffsetT row_end_offset = ThreadLoad<LOAD_DEFAULT>(params.d_row_end_offsets + row_idx);
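                // Predicating the store on the loaded data keeps the compiler from
                // eliding the reads above; (nonzero == nonzero) is only false for NaN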
                if ((row_end_offset >= 0) && (nonzero == nonzero))
                    params.d_vector_y[row_idx] = nonzero;
            }
        }
    }
}


/**
 * Run GPU I/O proxy
 */
template <
    typename ValueT,
    typename OffsetT>
float TestGpuCsrIoProxy(
    SpmvParams<ValueT, OffsetT>& params,
    int timing_iterations)
{
    enum {
        BLOCK_THREADS = 128,
        ITEMS_PER_THREAD = 7,
        TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD,
    };

    // size_t smem = 1024 * 16;
    size_t smem = 1024 * 0;

    unsigned int nonzero_blocks = (params.num_nonzeros + TILE_SIZE - 1) / TILE_SIZE;
    unsigned int row_blocks = (params.num_rows + TILE_SIZE - 1) / TILE_SIZE;
    unsigned int blocks = std::max(nonzero_blocks, row_blocks);
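
    // Bind x through a texture-reference iterator so the kernel's gathers go
    // through the texture cache (the 1234 template argument is just a unique id)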
    typedef TexRefInputIterator<ValueT, 1234, int> TexItr;
    TexItr x_itr;
    CubDebugExit(x_itr.BindTexture(params.d_vector_x));

    // Get device ordinal
    int device_ordinal;
    CubDebugExit(cudaGetDevice(&device_ordinal));

    // Get device SM version
    int sm_version;
    CubDebugExit(SmVersion(sm_version, device_ordinal));

    void (*kernel)(SpmvParams<ValueT, OffsetT>, TexItr) = NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD>;

    int spmv_sm_occupancy;
    CubDebugExit(MaxSmOccupancy(spmv_sm_occupancy, kernel, BLOCK_THREADS, smem));

    if (!g_quiet)
        printf("NonZeroIoKernel<%d,%d><<<%d, %d>>>, sm occupancy %d\n", BLOCK_THREADS, ITEMS_PER_THREAD, blocks, BLOCK_THREADS, spmv_sm_occupancy);

    // Warmup
    NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<blocks, BLOCK_THREADS, smem>>>(params, x_itr);

    // Check for failures
    CubDebugExit(cudaPeekAtLastError());
    CubDebugExit(SyncStream(0));

    // Timing
    GpuTimer timer;
    float elapsed_millis = 0.0;
    timer.Start();
    for (int it = 0; it < timing_iterations; ++it)
    {
        NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<blocks, BLOCK_THREADS, smem>>>(params, x_itr);
    }
    timer.Stop();
    elapsed_millis += timer.ElapsedMillis();

    CubDebugExit(x_itr.UnbindTexture());

    return elapsed_millis / timing_iterations;
}


//---------------------------------------------------------------------
// cuSparse HybMV
//---------------------------------------------------------------------

/**
 * Run cuSparse HYB SpMV (specialized for fp32)
 */
template <
    typename OffsetT>
float TestCusparseHybmv(
    float* vector_y_in,
    float* reference_vector_y_out,
    SpmvParams<float, OffsetT>& params,
    int timing_iterations,
    cusparseHandle_t cusparse)
{
    CpuTimer cpu_timer;
    cpu_timer.Start();

    // Construct Hyb matrix
    cusparseMatDescr_t mat_desc;
    cusparseHybMat_t hyb_desc;
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&mat_desc));
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateHybMat(&hyb_desc));
    cusparseStatus_t status = cusparseScsr2hyb(
        cusparse,
        params.num_rows, params.num_cols,
        mat_desc,
        params.d_values, params.d_row_end_offsets, params.d_column_indices,
        hyb_desc,
        0,
        CUSPARSE_HYB_PARTITION_AUTO);
    AssertEquals(CUSPARSE_STATUS_SUCCESS, status);

    cudaDeviceSynchronize();
    cpu_timer.Stop();
    float elapsed_millis = cpu_timer.ElapsedMillis();
    printf("HYB setup ms, %.5f, ", elapsed_millis);
    // Reset input/output vector y
    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(float) * params.num_rows, cudaMemcpyHostToDevice));

    // Warmup
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseShybmv(
        cusparse,
        CUSPARSE_OPERATION_NON_TRANSPOSE,
        &params.alpha, mat_desc,
        hyb_desc,
        params.d_vector_x, &params.beta, params.d_vector_y));

    if (!g_quiet)
    {
        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
    }

    // Timing
    elapsed_millis = 0.0;
    GpuTimer timer;
    timer.Start();
    for (int it = 0; it < timing_iterations; ++it)
    {
        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseShybmv(
            cusparse,
            CUSPARSE_OPERATION_NON_TRANSPOSE,
            &params.alpha, mat_desc,
            hyb_desc,
            params.d_vector_x, &params.beta, params.d_vector_y));
    }
    timer.Stop();
    elapsed_millis += timer.ElapsedMillis();

    // Cleanup
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyHybMat(hyb_desc));
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(mat_desc));

    return elapsed_millis / timing_iterations;
}


/**
 * Run cuSparse HYB SpMV (specialized for fp64)
 */
template <
    typename OffsetT>
float TestCusparseHybmv(
    double* vector_y_in,
    double* reference_vector_y_out,
    SpmvParams<double, OffsetT>& params,
    int timing_iterations,
    cusparseHandle_t cusparse)
{
    CpuTimer cpu_timer;
    cpu_timer.Start();

    // Construct Hyb matrix
    cusparseMatDescr_t mat_desc;
    cusparseHybMat_t hyb_desc;
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&mat_desc));
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateHybMat(&hyb_desc));
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDcsr2hyb(
        cusparse,
        params.num_rows, params.num_cols,
        mat_desc,
        params.d_values, params.d_row_end_offsets, params.d_column_indices,
        hyb_desc,
        0,
        CUSPARSE_HYB_PARTITION_AUTO));

    cudaDeviceSynchronize();
    cpu_timer.Stop();
    float elapsed_millis = cpu_timer.ElapsedMillis();
    printf("HYB setup ms, %.5f, ", elapsed_millis);

    // Reset input/output vector y
    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(double) * params.num_rows, cudaMemcpyHostToDevice));
    // Warmup
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDhybmv(
        cusparse,
        CUSPARSE_OPERATION_NON_TRANSPOSE,
        &params.alpha, mat_desc,
        hyb_desc,
        params.d_vector_x, &params.beta, params.d_vector_y));

    if (!g_quiet)
    {
        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
    }

    // Timing
    elapsed_millis = 0.0;
    GpuTimer timer;
    timer.Start();
    for (int it = 0; it < timing_iterations; ++it)
    {
        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDhybmv(
            cusparse,
            CUSPARSE_OPERATION_NON_TRANSPOSE,
            &params.alpha, mat_desc,
            hyb_desc,
            params.d_vector_x, &params.beta, params.d_vector_y));
    }
    timer.Stop();
    elapsed_millis += timer.ElapsedMillis();

    // Cleanup
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyHybMat(hyb_desc));
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(mat_desc));

    return elapsed_millis / timing_iterations;
}


//---------------------------------------------------------------------
// cuSparse CsrMV
//---------------------------------------------------------------------

/**
 * Run cuSparse SpMV (specialized for fp32)
 */
template <
    typename OffsetT>
float TestCusparseCsrmv(
    float* vector_y_in,
    float* reference_vector_y_out,
    SpmvParams<float, OffsetT>& params,
    int timing_iterations,
    cusparseHandle_t cusparse)
{
    cusparseMatDescr_t desc;
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&desc));

    // Reset input/output vector y
    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(float) * params.num_rows, cudaMemcpyHostToDevice));

    // Warmup
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseScsrmv(
        cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
        params.num_rows, params.num_cols, params.num_nonzeros, &params.alpha, desc,
        params.d_values, params.d_row_end_offsets, params.d_column_indices,
        params.d_vector_x, &params.beta, params.d_vector_y));

    if (!g_quiet)
    {
        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
    }

    // Timing
    float elapsed_millis = 0.0;
    GpuTimer timer;
    timer.Start();
    for (int it = 0; it < timing_iterations; ++it)
    {
        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseScsrmv(
            cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
            params.num_rows, params.num_cols, params.num_nonzeros, &params.alpha, desc,
            params.d_values, params.d_row_end_offsets, params.d_column_indices,
            params.d_vector_x, &params.beta, params.d_vector_y));
    }
    timer.Stop();
    elapsed_millis += timer.ElapsedMillis();

    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(desc));

    return elapsed_millis / timing_iterations;
}


/**
 * Run cuSparse SpMV (specialized for fp64)
 */
template <
    typename OffsetT>
float TestCusparseCsrmv(
    double* vector_y_in,
    double* reference_vector_y_out,
    SpmvParams<double, OffsetT>& params,
    int timing_iterations,
    cusparseHandle_t cusparse)
{
    cusparseMatDescr_t desc;
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&desc));

    // Reset input/output vector y
    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(double) * params.num_rows, cudaMemcpyHostToDevice));
    // Warmup
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDcsrmv(
        cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
        params.num_rows, params.num_cols, params.num_nonzeros, &params.alpha, desc,
        params.d_values, params.d_row_end_offsets, params.d_column_indices,
        params.d_vector_x, &params.beta, params.d_vector_y));

    if (!g_quiet)
    {
        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
    }

    // Timing
    float elapsed_millis = 0.0;
    GpuTimer timer;
    timer.Start();
    for (int it = 0; it < timing_iterations; ++it)
    {
        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDcsrmv(
            cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
            params.num_rows, params.num_cols, params.num_nonzeros, &params.alpha, desc,
            params.d_values, params.d_row_end_offsets, params.d_column_indices,
            params.d_vector_x, &params.beta, params.d_vector_y));
    }
    timer.Stop();
    elapsed_millis += timer.ElapsedMillis();

    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(desc));

    return elapsed_millis / timing_iterations;
}


//---------------------------------------------------------------------
// GPU Merge-based SpMV
//---------------------------------------------------------------------

/**
 * Run CUB SpMV
 */
template <
    typename ValueT,
    typename OffsetT>
float TestGpuMergeCsrmv(
    ValueT* vector_y_in,
    ValueT* reference_vector_y_out,
    SpmvParams<ValueT, OffsetT>& params,
    int timing_iterations)
{
    // Allocate temporary storage
    size_t temp_storage_bytes = 0;
    void *d_temp_storage = NULL;

    // Get amount of temporary storage needed
    CubDebugExit(DeviceSpmv::CsrMV(
        d_temp_storage, temp_storage_bytes,
        params.d_values, params.d_row_end_offsets, params.d_column_indices,
        params.d_vector_x, params.d_vector_y,
        params.num_rows, params.num_cols, params.num_nonzeros,
        // params.alpha, params.beta,
        (cudaStream_t) 0, false));
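
    // (the call above follows CUB's two-phase convention: with d_temp_storage == NULL
    // it only writes the required workspace size into temp_storage_bytes)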

    // Allocate
    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));

    // Reset input/output vector y
    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(ValueT) * params.num_rows, cudaMemcpyHostToDevice));

    // Warmup
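    // The trailing bool argument is CUB's debug_synchronous flag; outside of quiet
    // mode it synchronizes after each launch and logs kernel launch configurations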
    CubDebugExit(DeviceSpmv::CsrMV(
        d_temp_storage, temp_storage_bytes,
        params.d_values, params.d_row_end_offsets, params.d_column_indices,
        params.d_vector_x, params.d_vector_y,
        params.num_rows, params.num_cols, params.num_nonzeros,
        // params.alpha, params.beta,
        (cudaStream_t) 0, !g_quiet));

    if (!g_quiet)
    {
        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
    }

    // Timing
    GpuTimer timer;
    float elapsed_millis = 0.0;
    timer.Start();
    for (int it = 0; it < timing_iterations; ++it)
    {
        CubDebugExit(DeviceSpmv::CsrMV(
            d_temp_storage, temp_storage_bytes,
            params.d_values, params.d_row_end_offsets, params.d_column_indices,
            params.d_vector_x, params.d_vector_y,
            params.num_rows, params.num_cols, params.num_nonzeros,
            // params.alpha, params.beta,
            (cudaStream_t) 0, false));
    }
    timer.Stop();
    elapsed_millis += timer.ElapsedMillis();

    return elapsed_millis / timing_iterations;
}


//---------------------------------------------------------------------
// Test generation
//---------------------------------------------------------------------

/**
 * Display perf
 */
template <typename ValueT, typename OffsetT>
void DisplayPerf(
    float device_giga_bandwidth,
    double avg_millis,
    CsrMatrix<ValueT, OffsetT>& csr_matrix)
{
    double nz_throughput, effective_bandwidth;
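
    // Bytes moved per SpMV: every nonzero reads a matrix value, a column index, and a
    // gathered x value; every row reads one offset and writes one y value.  GFLOPS
    // assumes two flops (multiply + add) per nonzero.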
    size_t total_bytes =
        (csr_matrix.num_nonzeros * (sizeof(ValueT) * 2 + sizeof(OffsetT))) +
        (csr_matrix.num_rows) * (sizeof(OffsetT) + sizeof(ValueT));

    nz_throughput = double(csr_matrix.num_nonzeros) / avg_millis / 1.0e6;
    effective_bandwidth = double(total_bytes) / avg_millis / 1.0e6;

    if (!g_quiet)
        printf("fp%d: %.4f avg ms, %.5f gflops, %.3lf effective GB/s (%.2f%% peak)\n",
            int(sizeof(ValueT) * 8),
            avg_millis,
            2 * nz_throughput,
            effective_bandwidth,
            effective_bandwidth / device_giga_bandwidth * 100);
    else
        printf("%.5f, %.6f, %.3lf, %.2f%%, ",
            avg_millis,
            2 * nz_throughput,
            effective_bandwidth,
            effective_bandwidth / device_giga_bandwidth * 100);

    fflush(stdout);
}


/**
 * Run tests
 */
template <
    typename ValueT,
    typename OffsetT>
void RunTest(
    bool rcm_relabel,
    ValueT alpha,
    ValueT beta,
    CooMatrix<ValueT, OffsetT>& coo_matrix,
    int timing_iterations,
    CommandLineArgs& args)
{
    // Adaptive timing iterations: run 16 billion nonzeros through
    if (timing_iterations == -1)
        timing_iterations = std::min(50000ull, std::max(100ull, ((16ull << 30) / coo_matrix.num_nonzeros)));

    if (!g_quiet)
        printf("\t%d timing iterations\n", timing_iterations);

    // Convert to CSR
    CsrMatrix<ValueT, OffsetT> csr_matrix;
    csr_matrix.FromCoo(coo_matrix);
    if (!args.CheckCmdLineFlag("csrmv"))
        coo_matrix.Clear();

    // Relabel
    if (rcm_relabel)
    {
        if (!g_quiet)
        {
            csr_matrix.Stats().Display();
            printf("\n");
            csr_matrix.DisplayHistogram();
            printf("\n");
            if (g_verbose2)
                csr_matrix.Display();
            printf("\n");
        }

        RcmRelabel(csr_matrix, !g_quiet);

        if (!g_quiet) printf("\n");
    }

    // Display matrix info
    csr_matrix.Stats().Display(!g_quiet);
    if (!g_quiet)
    {
        printf("\n");
        csr_matrix.DisplayHistogram();
        printf("\n");
        if (g_verbose2)
            csr_matrix.Display();
        printf("\n");
    }
    fflush(stdout);

    // Allocate input and output vectors
    ValueT* vector_x = new ValueT[csr_matrix.num_cols];
    ValueT* vector_y_in = new ValueT[csr_matrix.num_rows];
    ValueT* vector_y_out = new ValueT[csr_matrix.num_rows];

    for (int col = 0; col < csr_matrix.num_cols; ++col)
        vector_x[col] = 1.0;
    for (int row = 0; row < csr_matrix.num_rows; ++row)
        vector_y_in[row] = 1.0;

    // Compute reference answer
    SpmvGold(csr_matrix, vector_x, vector_y_in, vector_y_out, alpha, beta);
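
    // The host result above is the reference that each GPU variant below is validated against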

    float avg_millis;

    if (g_quiet) {
        printf("%s, %s, ", args.deviceProp.name, (sizeof(ValueT) > 4) ? "fp64" : "fp32"); fflush(stdout);
    }

    // Get GPU device bandwidth (GB/s)
    float device_giga_bandwidth = args.device_giga_bandwidth;

    // Allocate and initialize GPU problem
    SpmvParams<ValueT, OffsetT> params;

    CubDebugExit(g_allocator.DeviceAllocate((void **) &params.d_values, sizeof(ValueT) * csr_matrix.num_nonzeros));
    CubDebugExit(g_allocator.DeviceAllocate((void **) &params.d_row_end_offsets, sizeof(OffsetT) * (csr_matrix.num_rows + 1)));
    CubDebugExit(g_allocator.DeviceAllocate((void **) &params.d_column_indices, sizeof(OffsetT) * csr_matrix.num_nonzeros));
    CubDebugExit(g_allocator.DeviceAllocate((void **) &params.d_vector_x, sizeof(ValueT) * csr_matrix.num_cols));
    CubDebugExit(g_allocator.DeviceAllocate((void **) &params.d_vector_y, sizeof(ValueT) * csr_matrix.num_rows));

    params.num_rows = csr_matrix.num_rows;
    params.num_cols = csr_matrix.num_cols;
    params.num_nonzeros = csr_matrix.num_nonzeros;
    params.alpha = alpha;
    params.beta = beta;

    CubDebugExit(cudaMemcpy(params.d_values, csr_matrix.values, sizeof(ValueT) * csr_matrix.num_nonzeros, cudaMemcpyHostToDevice));
    CubDebugExit(cudaMemcpy(params.d_row_end_offsets, csr_matrix.row_offsets, sizeof(OffsetT) * (csr_matrix.num_rows + 1), cudaMemcpyHostToDevice));
    CubDebugExit(cudaMemcpy(params.d_column_indices, csr_matrix.column_indices, sizeof(OffsetT) * csr_matrix.num_nonzeros, cudaMemcpyHostToDevice));
    CubDebugExit(cudaMemcpy(params.d_vector_x, vector_x, sizeof(ValueT) * csr_matrix.num_cols, cudaMemcpyHostToDevice));

    if (!g_quiet) printf("\n\n");
printf("GPU CSR I/O Prox, "); fflush(stdout); | |
    avg_millis = TestGpuCsrIoProxy(params, timing_iterations);
    DisplayPerf(device_giga_bandwidth, avg_millis, csr_matrix);

    if (args.CheckCmdLineFlag("csrmv"))
    {
        if (!g_quiet) printf("\n\n");
        printf("CUB, "); fflush(stdout);
        avg_millis = TestGpuMergeCsrmv(vector_y_in, vector_y_out, params, timing_iterations);
        DisplayPerf(device_giga_bandwidth, avg_millis, csr_matrix);
    }

    // Initialize cuSparse
    cusparseHandle_t cusparse;
    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreate(&cusparse));

    if (args.CheckCmdLineFlag("csrmv"))
    {
        if (!g_quiet) printf("\n\n");
        printf("Cusparse CsrMV, "); fflush(stdout);
        avg_millis = TestCusparseCsrmv(vector_y_in, vector_y_out, params, timing_iterations, cusparse);
        DisplayPerf(device_giga_bandwidth, avg_millis, csr_matrix);
    }

    if (args.CheckCmdLineFlag("hybmv"))
    {
        if (!g_quiet) printf("\n\n");
        printf("Cusparse HybMV, "); fflush(stdout);
        avg_millis = TestCusparseHybmv(vector_y_in, vector_y_out, params, timing_iterations, cusparse);
        DisplayPerf(device_giga_bandwidth, avg_millis, csr_matrix);
    }

    // Cleanup
    if (params.d_values) CubDebugExit(g_allocator.DeviceFree(params.d_values));
    if (params.d_row_end_offsets) CubDebugExit(g_allocator.DeviceFree(params.d_row_end_offsets));
    if (params.d_column_indices) CubDebugExit(g_allocator.DeviceFree(params.d_column_indices));
    if (params.d_vector_x) CubDebugExit(g_allocator.DeviceFree(params.d_vector_x));
    if (params.d_vector_y) CubDebugExit(g_allocator.DeviceFree(params.d_vector_y));

    if (vector_x) delete[] vector_x;
    if (vector_y_in) delete[] vector_y_in;
    if (vector_y_out) delete[] vector_y_out;
}


/**
 * Run tests
 */
template <
    typename ValueT,
    typename OffsetT>
void RunTests(
    bool rcm_relabel,
    ValueT alpha,
    ValueT beta,
    const std::string& mtx_filename,
    int grid2d,
    int grid3d,
    int wheel,
    int dense,
    int timing_iterations,
    CommandLineArgs& args)
{
    // Initialize matrix in COO form
    CooMatrix<ValueT, OffsetT> coo_matrix;

    if (!mtx_filename.empty())
    {
        // Parse matrix market file
        printf("%s, ", mtx_filename.c_str()); fflush(stdout);
        coo_matrix.InitMarket(mtx_filename, 1.0, !g_quiet);

        if ((coo_matrix.num_rows == 1) || (coo_matrix.num_cols == 1) || (coo_matrix.num_nonzeros == 1))
        {
            if (!g_quiet) printf("Trivial dataset\n");
            exit(0);
        }
    }
    else if (grid2d > 0)
    {
        // Generate 2D lattice
        printf("grid2d_%d, ", grid2d); fflush(stdout);
        coo_matrix.InitGrid2d(grid2d, false);
    }
    else if (grid3d > 0)
    {
        // Generate 3D lattice
        printf("grid3d_%d, ", grid3d); fflush(stdout);
        coo_matrix.InitGrid3d(grid3d, false);
    }
    else if (wheel > 0)
    {
        // Generate wheel graph
printf("wheel_%d, ", grid2d); fflush(stdout); | |
        coo_matrix.InitWheel(wheel);
    }
    else if (dense > 0)
    {
        // Generate dense graph
        OffsetT size = 1 << 24; // 16M nnz
        args.GetCmdLineArgument("size", size);
        OffsetT rows = size / dense;

        printf("dense_%d_x_%d, ", rows, dense); fflush(stdout);
        coo_matrix.InitDense(rows, dense);
    }
    else
    {
        fprintf(stderr, "No graph type specified.\n");
        exit(1);
    }

    RunTest(
        rcm_relabel,
        alpha,
        beta,
        coo_matrix,
        timing_iterations,
        args);
}


/**
 * Main
 */
int main(int argc, char **argv)
{
    // Initialize command line
    CommandLineArgs args(argc, argv);
    if (args.CheckCmdLineFlag("help"))
    {
        printf(
            "%s "
            "[--csrmv | --hybmv | --bsrmv ] "
            "[--device=<device-id>] "
            "[--quiet] "
            "[--v] "
            "[--i=<timing iterations>] "
            "[--fp64] "
            "[--rcm] "
            "[--alpha=<alpha scalar (default: 1.0)>] "
            "[--beta=<beta scalar (default: 0.0)>] "
            "\n\t"
            "--mtx=<matrix market file> "
            "\n\t"
            "--dense=<cols>"
            "\n\t"
            "--grid2d=<width>"
            "\n\t"
            "--grid3d=<width>"
            "\n\t"
            "--wheel=<spokes>"
            "\n", argv[0]);
        exit(0);
    }

    bool fp64;
    bool rcm_relabel;
    std::string mtx_filename;
    int grid2d = -1;
    int grid3d = -1;
    int wheel = -1;
    int dense = -1;
    int timing_iterations = -1;
    float alpha = 1.0;
    float beta = 0.0;

    g_verbose = args.CheckCmdLineFlag("v");
    g_verbose2 = args.CheckCmdLineFlag("v2");
    g_quiet = args.CheckCmdLineFlag("quiet");
    fp64 = args.CheckCmdLineFlag("fp64");
    rcm_relabel = args.CheckCmdLineFlag("rcm");
    args.GetCmdLineArgument("i", timing_iterations);
    args.GetCmdLineArgument("mtx", mtx_filename);
    args.GetCmdLineArgument("grid2d", grid2d);
    args.GetCmdLineArgument("grid3d", grid3d);
    args.GetCmdLineArgument("wheel", wheel);
    args.GetCmdLineArgument("dense", dense);
    args.GetCmdLineArgument("alpha", alpha);
    args.GetCmdLineArgument("beta", beta);

    // Initialize device
    CubDebugExit(args.DeviceInit());

    // Run test(s)
    if (fp64)
    {
        RunTests<double, int>(rcm_relabel, alpha, beta, mtx_filename, grid2d, grid3d, wheel, dense, timing_iterations, args);
    }
    else
    {
        RunTests<float, int>(rcm_relabel, alpha, beta, mtx_filename, grid2d, grid3d, wheel, dense, timing_iterations, args);
    }

    CubDebugExit(cudaDeviceSynchronize());
    printf("\n");

    return 0;
}