|
#include <math.h> |
|
#include <stdio.h> |
|
#include <stdlib.h> |
|
|
|
|
|
#include "cluster_gpu.h" |
|
#include "cuda_utils.h" |
|
|
|
|
|
__device__ float get_dis(float x1, float y1, float z1, float x2, float y2, float z2) { |
|
float dis = (x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2) + (z1 - z2) * (z1 - z2); |
|
return sqrt(dis); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__global__ void dbscan_kernel_fast(int b, int n, float eps, int min_pts, const float *__restrict__ xyz, int *__restrict__ idx, |
|
int *__restrict__ pts_cnt, int *__restrict__ pts_adj, int *__restrict__ pts_stack) { |
|
|
|
|
|
|
|
int bs_idx = blockIdx.x * blockDim.x + threadIdx.x; |
|
if (bs_idx >= b) return; |
|
|
|
xyz += bs_idx * n * 3; |
|
idx += bs_idx * n; |
|
pts_cnt += bs_idx * n; |
|
pts_stack += bs_idx * n; |
|
pts_adj += bs_idx * n * n; |
|
|
|
for(int i=0;i<n;i++) { |
|
pts_cnt[i] = 0; |
|
for(int j=0;j<n;j++) { |
|
pts_adj[i * n + j] = -1; |
|
if(i==j) continue; |
|
float x1 = xyz[i * 3 + 0]; |
|
float y1 = xyz[i * 3 + 1]; |
|
float z1 = xyz[i * 3 + 2]; |
|
float x2 = xyz[j * 3 + 0]; |
|
float y2 = xyz[j * 3 + 1]; |
|
float z2 = xyz[j * 3 + 2]; |
|
|
|
if(get_dis(x2, y2, z2, -10.0, -10.0, -10.0) < 1e-3) continue; |
|
if(get_dis(x1, y1, z1, x2, y2, z2) <= eps) { |
|
pts_adj[i * n + pts_cnt[i]] = j; |
|
pts_cnt[i] += 1; |
|
} |
|
|
|
} |
|
} |
|
|
|
int cluster_idx = 0; |
|
|
|
for(int i=0;i<n;i++) { |
|
if(idx[i] != -1) continue; |
|
|
|
if(pts_cnt[i] >= min_pts) { |
|
for(int j=0;j<n;j++) |
|
pts_stack[j] = -1; |
|
pts_stack[0] = i; |
|
int stack_idx = 0; |
|
int stack_len = 1; |
|
while (stack_idx < n && pts_stack[stack_idx] != -1) |
|
{ |
|
int pts_idx = pts_stack[stack_idx]; |
|
idx[pts_idx] = cluster_idx; |
|
if(pts_cnt[pts_idx] < min_pts){ |
|
stack_idx += 1; |
|
continue; |
|
} |
|
for(int j=0;j<n;j++) { |
|
int adj = pts_adj[pts_idx * n + j]; |
|
if (adj == -1) break; |
|
if (idx[adj] == -1) |
|
{ |
|
idx[adj] = -2; |
|
pts_stack[stack_len++] = adj; |
|
} |
|
} |
|
stack_idx += 1; |
|
} |
|
cluster_idx += 1; |
|
} |
|
} |
|
} |
|
|
|
|
|
void dbscan_kernel_launcher_fast(int b, int n, float eps, int min_pts, const float *xyz, int *idx) { |
|
|
|
|
|
|
|
|
|
cudaError_t err; |
|
|
|
dim3 blocks(DIVUP(b, THREADS_PER_BLOCK)); |
|
dim3 threads(THREADS_PER_BLOCK); |
|
|
|
int* pts_cnt; |
|
int* pts_stack; |
|
int* pts_adj; |
|
|
|
err = cudaMalloc((void**)&pts_cnt, b * n * sizeof(int)); |
|
if (cudaSuccess != err) { |
|
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); |
|
exit(-1); |
|
} |
|
|
|
err = cudaMalloc((void**)&pts_stack, b * n * sizeof(int)); |
|
if (cudaSuccess != err) { |
|
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); |
|
exit(-1); |
|
} |
|
|
|
err = cudaMalloc((void**)&pts_adj, b * n * n * sizeof(int)); |
|
if (cudaSuccess != err) { |
|
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); |
|
exit(-1); |
|
} |
|
|
|
dbscan_kernel_fast<<<blocks, threads>>>(b, n, eps, min_pts, xyz, idx, pts_cnt, pts_adj, pts_stack); |
|
|
|
cudaFree(pts_cnt); |
|
cudaFree(pts_stack); |
|
cudaFree(pts_adj); |
|
err = cudaGetLastError(); |
|
if (cudaSuccess != err) { |
|
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); |
|
exit(-1); |
|
} |
|
} |
|
|
|
|
|
|
|
__global__ void cluster_pts_kernel_fast(int b, int n, int m, const float *__restrict__ xyz, const int *__restrict__ idx, |
|
float *__restrict__ new_xyz, int *__restrict__ num) { |
|
int bs_idx = blockIdx.x * blockDim.x + threadIdx.x; |
|
if (bs_idx >= b ) return; |
|
|
|
xyz += bs_idx * n * 3; |
|
idx += bs_idx * n; |
|
new_xyz += bs_idx * m * 3; |
|
num += bs_idx * m; |
|
|
|
for(int i=0;i<n;i++) { |
|
if (idx[i] == -1) continue; |
|
int c_idx = idx[i]; |
|
new_xyz[c_idx * 3 + 0] += xyz[i * 3 + 0]; |
|
new_xyz[c_idx * 3 + 1] += xyz[i * 3 + 1]; |
|
new_xyz[c_idx * 3 + 2] += xyz[i * 3 + 2]; |
|
num[c_idx] += 1; |
|
} |
|
for(int i=0;i<m;i++) { |
|
if (num[i] == 0) break; |
|
new_xyz[i * 3 + 0] /= num[i]; |
|
new_xyz[i * 3 + 1] /= num[i]; |
|
new_xyz[i * 3 + 2] /= num[i]; |
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
void cluster_pts_kernel_launcher_fast(int b, int n, int m, const float *xyz, const int *idx, float *new_xyz, int *num) { |
|
cudaError_t err; |
|
|
|
dim3 blocks(DIVUP(b, THREADS_PER_BLOCK)); |
|
dim3 threads(THREADS_PER_BLOCK); |
|
|
|
cluster_pts_kernel_fast<<<blocks, threads>>>(b, n, m, xyz, idx, new_xyz, num); |
|
|
|
err = cudaGetLastError(); |
|
if (cudaSuccess != err) { |
|
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); |
|
exit(-1); |
|
} |
|
} |
|
|
|
|
|
|