baseline_null / pc_util /src /cluster_gpu.cu
WatsonTang98's picture
Upload 21 files
e8ffc70 verified
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "cluster_gpu.h"
#include "cuda_utils.h"
__device__ float get_dis(float x1, float y1, float z1, float x2, float y2, float z2) {
float dis = (x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2) + (z1 - z2) * (z1 - z2);
return sqrt(dis);
}
/*
__device__ void dfs (int i, int c, int n, int min_pts, const int* pts_cnt, const int* pts_adj, int* idx, int label) {
idx[i] = c;
if(pts_cnt[i] < min_pts) return;
for(int j=0;j<n;j++) {
int adj = pts_adj[i * n + j];
printf("%d %d %d\n", i * n, i * n + j, adj);
if (adj == -1) break;
if (idx[adj] == -1)
dfs(adj, c, n, min_pts, pts_cnt, pts_adj, idx, label);
}
}
*/
__global__ void dbscan_kernel_fast(int b, int n, float eps, int min_pts, const float *__restrict__ xyz, int *__restrict__ idx,
int *__restrict__ pts_cnt, int *__restrict__ pts_adj, int *__restrict__ pts_stack) {
// xyz: (B, N, 3)
// output:
// idx: (B, N)
int bs_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (bs_idx >= b) return;
xyz += bs_idx * n * 3;
idx += bs_idx * n;
pts_cnt += bs_idx * n;
pts_stack += bs_idx * n;
pts_adj += bs_idx * n * n;
for(int i=0;i<n;i++) {
pts_cnt[i] = 0;
for(int j=0;j<n;j++) {
pts_adj[i * n + j] = -1;
if(i==j) continue;
float x1 = xyz[i * 3 + 0];
float y1 = xyz[i * 3 + 1];
float z1 = xyz[i * 3 + 2];
float x2 = xyz[j * 3 + 0];
float y2 = xyz[j * 3 + 1];
float z2 = xyz[j * 3 + 2];
if(get_dis(x2, y2, z2, -10.0, -10.0, -10.0) < 1e-3) continue;
if(get_dis(x1, y1, z1, x2, y2, z2) <= eps) {
pts_adj[i * n + pts_cnt[i]] = j;
pts_cnt[i] += 1;
}
}
}
int cluster_idx = 0;
for(int i=0;i<n;i++) {
if(idx[i] != -1) continue;
if(pts_cnt[i] >= min_pts) {
for(int j=0;j<n;j++)
pts_stack[j] = -1;
pts_stack[0] = i;
int stack_idx = 0;
int stack_len = 1;
while (stack_idx < n && pts_stack[stack_idx] != -1)
{
int pts_idx = pts_stack[stack_idx];
idx[pts_idx] = cluster_idx;
if(pts_cnt[pts_idx] < min_pts){
stack_idx += 1;
continue;
}
for(int j=0;j<n;j++) {
int adj = pts_adj[pts_idx * n + j];
if (adj == -1) break;
if (idx[adj] == -1)
{
idx[adj] = -2;
pts_stack[stack_len++] = adj;
}
}
stack_idx += 1;
}
cluster_idx += 1;
}
}
}
void dbscan_kernel_launcher_fast(int b, int n, float eps, int min_pts, const float *xyz, int *idx) {
// xyz: (B, N, 3)
// output:
// idx: (B, N)
cudaError_t err;
dim3 blocks(DIVUP(b, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK);
int* pts_cnt;
int* pts_stack;
int* pts_adj;
err = cudaMalloc((void**)&pts_cnt, b * n * sizeof(int));
if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
err = cudaMalloc((void**)&pts_stack, b * n * sizeof(int));
if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
err = cudaMalloc((void**)&pts_adj, b * n * n * sizeof(int));
if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
dbscan_kernel_fast<<<blocks, threads>>>(b, n, eps, min_pts, xyz, idx, pts_cnt, pts_adj, pts_stack);
// cudaDeviceSynchronize(); // for using printf in kernel function
cudaFree(pts_cnt);
cudaFree(pts_stack);
cudaFree(pts_adj);
err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
}
__global__ void cluster_pts_kernel_fast(int b, int n, int m, const float *__restrict__ xyz, const int *__restrict__ idx,
float *__restrict__ new_xyz, int *__restrict__ num) {
int bs_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (bs_idx >= b ) return;
xyz += bs_idx * n * 3;
idx += bs_idx * n;
new_xyz += bs_idx * m * 3;
num += bs_idx * m;
for(int i=0;i<n;i++) {
if (idx[i] == -1) continue;
int c_idx = idx[i];
new_xyz[c_idx * 3 + 0] += xyz[i * 3 + 0];
new_xyz[c_idx * 3 + 1] += xyz[i * 3 + 1];
new_xyz[c_idx * 3 + 2] += xyz[i * 3 + 2];
num[c_idx] += 1;
}
for(int i=0;i<m;i++) {
if (num[i] == 0) break;
new_xyz[i * 3 + 0] /= num[i];
new_xyz[i * 3 + 1] /= num[i];
new_xyz[i * 3 + 2] /= num[i];
}
}
void cluster_pts_kernel_launcher_fast(int b, int n, int m, const float *xyz, const int *idx, float *new_xyz, int *num) {
cudaError_t err;
dim3 blocks(DIVUP(b, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK);
cluster_pts_kernel_fast<<<blocks, threads>>>(b, n, m, xyz, idx, new_xyz, num);
// cudaDeviceSynchronize(); // for using printf in kernel function
err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
}