#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include "ball_query_gpu.h"
#include "cuda_utils.h"
__global__ void ball_query_kernel_fast(int b, int n, int m, float radius, int nsample,
    const float *__restrict__ new_xyz, const float *__restrict__ xyz, int *__restrict__ idx) {
    // new_xyz: (B, M, 3) centers of the ball query
    // xyz: (B, N, 3) points to search
    // output:
    //      idx: (B, M, nsample) indices of the points inside each ball
    int bs_idx = blockIdx.y;
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (bs_idx >= b || pt_idx >= m) return;

    // Move the pointers to this thread's batch element / query point.
    new_xyz += bs_idx * m * 3 + pt_idx * 3;
    xyz += bs_idx * n * 3;
    idx += bs_idx * m * nsample + pt_idx * nsample;

    float radius2 = radius * radius;
    float new_x = new_xyz[0];
    float new_y = new_xyz[1];
    float new_z = new_xyz[2];

    int cnt = 0;
    for (int k = 0; k < n; ++k) {
        float x = xyz[k * 3 + 0];
        float y = xyz[k * 3 + 1];
        float z = xyz[k * 3 + 2];
        float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z);
        if (d2 < radius2){
            // The first hit pre-fills the whole output row, so slots that are never
            // overwritten still point at a valid in-ball neighbour.
            if (cnt == 0){
                for (int l = 0; l < nsample; ++l) {
                    idx[l] = k;
                }
            }
            idx[cnt] = k;
            ++cnt;
            if (cnt >= nsample) break;
        }
    }
}
void ball_query_kernel_launcher_fast(int b, int n, int m, float radius, int nsample,
    const float *new_xyz, const float *xyz, int *idx) {
    // new_xyz: (B, M, 3), xyz: (B, N, 3), idx: (B, M, nsample)
    cudaError_t err;

    // One thread per query point; blockIdx.y indexes the batch element.
    dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b);
    dim3 threads(THREADS_PER_BLOCK);

    ball_query_kernel_fast<<<blocks, threads>>>(b, n, m, radius, nsample, new_xyz, xyz, idx);

    err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
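
/*
 * Caller-side note (inferred from the kernel above): idx rows are only written for query
 * points that have at least one neighbour inside `radius`, so callers typically
 * zero-initialise idx before launching.
 */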
__global__ void ball_center_query_kernel_fast(int b, int n, int m, float radius,
    const float *__restrict__ point, const float *__restrict__ key_point, int *__restrict__ idx) {
    // point: (B, N, 3) input points
    // key_point: (B, M, 3) candidate centers, padded with (-1, -1, -1)
    // output:
    //      idx: (B, N) index of the closest key point within radius, per input point
    int bs_idx = blockIdx.y;
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (bs_idx >= b || pt_idx >= n) return;

    point += bs_idx * n * 3 + pt_idx * 3;
    key_point += bs_idx * m * 3;
    idx += bs_idx * n + pt_idx;

    float radius2 = radius * radius;
    float point_x = point[0];
    float point_y = point[1];
    float point_z = point[2];

    float bestd = 1e8;
    for (int k = 0; k < m; ++k) {
        float x = key_point[k * 3 + 0];
        float y = key_point[k * 3 + 1];
        float z = key_point[k * 3 + 2];
        // Key points are padded with (-1, -1, -1); stop at the first padding entry.
        if (((x + 1) * (x + 1) + (y + 1) * (y + 1) + (z + 1) * (z + 1)) < 1e-4) break;
        float d2 = (point_x - x) * (point_x - x) + (point_y - y) * (point_y - y) + (point_z - z) * (point_z - z);
        // Keep the closest key point that lies within the radius.
        if (d2 < radius2 && d2 < bestd){
            idx[0] = k;
            bestd = d2;
        }
    }
}
void ball_center_query_kernel_launcher_fast(int b, int n, int m, float radius,
    const float *point, const float *key_point, int *idx) {
    // point: (B, N, 3), key_point: (B, M, 3), idx: (B, N)
    cudaError_t err;

    // One thread per input point; blockIdx.y indexes the batch element.
    dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b);
    dim3 threads(THREADS_PER_BLOCK);

    ball_center_query_kernel_fast<<<blocks, threads>>>(b, n, m, radius, point, key_point, idx);

    err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
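
/*
 * Caller-side note (inferred from the kernel above): key_point rows equal to (-1, -1, -1)
 * act as padding and end the scan early, and idx is only written for points that have a
 * key point within `radius`, so callers are expected to pre-fill idx with their own
 * default value.
 */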
__global__ void knn_query_kernel_fast(int b, int n, int m, int nsample, const float *__restrict__ new_xyz,
    const float *__restrict__ xyz, float *__restrict__ dist2, int *__restrict__ idx) {
    // new_xyz: (B, M, 3) query points
    // xyz: (B, N, 3) points to search
    // output:
    //      dist2: (B, M, nsample) squared distances to the nearest neighbours
    //      idx: (B, M, nsample) indices of those neighbours
    int bs_idx = blockIdx.y;
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (bs_idx >= b || pt_idx >= m) return;

    new_xyz += bs_idx * m * 3 + pt_idx * 3;
    xyz += bs_idx * n * 3;
    dist2 += bs_idx * m * nsample + pt_idx * nsample;
    idx += bs_idx * m * nsample + pt_idx * nsample;

    float nx = new_xyz[0];
    float ny = new_xyz[1];
    float nz = new_xyz[2];

    for (int i = 0; i < n; ++i) {
        float x = xyz[i * 3 + 0];
        float y = xyz[i * 3 + 1];
        float z = xyz[i * 3 + 2];
        float d2 = (nx - x) * (nx - x) + (ny - y) * (ny - y) + (nz - z) * (nz - z);
        // Insertion sort: replace the current worst candidate, then bubble the new
        // distance up so dist2 / idx stay sorted by increasing distance.
        if (d2 < dist2[nsample - 1]) {
            dist2[nsample - 1] = d2;
            idx[nsample - 1] = i;
            for (int j = nsample - 2; j >= 0; j--) {
                if (d2 < dist2[j]){
                    dist2[j + 1] = dist2[j];
                    dist2[j] = d2;
                    idx[j + 1] = idx[j];
                    idx[j] = i;
                }
            }
        }
    }
}
void knn_query_kernel_launcher_fast(int b, int n, int m, int nsample,
    const float *new_xyz, const float *xyz, float *dist2, int *idx) {
    cudaError_t err;

    dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b);
    dim3 threads(THREADS_PER_BLOCK);

    knn_query_kernel_fast<<<blocks, threads>>>(b, n, m, nsample, new_xyz, xyz, dist2, idx);

    err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
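
/*
 * Caller-side note (inferred from the kernel above): dist2 doubles as the working buffer of
 * the per-thread insertion sort, so it must be pre-filled with values larger than any real
 * squared distance; on return, dist2 / idx hold the nsample nearest neighbours in order of
 * increasing distance.
 */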
__global__ void ball_query_kernel_stack(int B, int M, float radius, int nsample,
    const float *new_xyz, const int *new_xyz_batch_cnt, const float *xyz, const int *xyz_batch_cnt, int *idx) {
    // Stacked variant: the point clouds of all batch elements are concatenated.
    // new_xyz: (M1 + M2 + ..., 3) query centers
    // new_xyz_batch_cnt: (B,) number of query centers per batch element, [M1, M2, ...]
    // xyz: (N1 + N2 + ..., 3) points to search
    // xyz_batch_cnt: (B,) number of points per batch element, [N1, N2, ...]
    // output:
    //      idx: (M1 + M2 + ..., nsample)
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (pt_idx >= M) return;

    // Find which batch element this query point belongs to.
    int bs_idx = 0, pt_cnt = new_xyz_batch_cnt[0];
    for (int k = 1; k < B; k++){
        if (pt_idx < pt_cnt) break;
        pt_cnt += new_xyz_batch_cnt[k];
        bs_idx = k;
    }

    // Offset of this batch element's points inside the stacked xyz array.
    int xyz_batch_start_idx = 0;
    for (int k = 0; k < bs_idx; k++) xyz_batch_start_idx += xyz_batch_cnt[k];

    new_xyz += pt_idx * 3;
    xyz += xyz_batch_start_idx * 3;
    idx += pt_idx * nsample;

    float radius2 = radius * radius;
    float new_x = new_xyz[0];
    float new_y = new_xyz[1];
    float new_z = new_xyz[2];
    int n = xyz_batch_cnt[bs_idx];

    int cnt = 0;
    for (int k = 0; k < n; ++k) {
        float x = xyz[k * 3 + 0];
        float y = xyz[k * 3 + 1];
        float z = xyz[k * 3 + 2];
        float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z);
        if (d2 < radius2){
            // The first hit pre-fills the whole output row (same trick as the batched kernel).
            if (cnt == 0){
                for (int l = 0; l < nsample; ++l) {
                    idx[l] = k;
                }
            }
            idx[cnt] = k;
            ++cnt;
            if (cnt >= nsample) break;
        }
    }
    // Mark query points with no neighbour inside the radius.
    if (cnt == 0) idx[0] = -1;
}
void ball_query_kernel_launcher_stack(int B, int M, float radius, int nsample,
    const float *new_xyz, const int *new_xyz_batch_cnt, const float *xyz, const int *xyz_batch_cnt, int *idx){
    // new_xyz: (M1 + M2 + ..., 3), xyz: (N1 + N2 + ..., 3), idx: (M1 + M2 + ..., nsample)
    cudaError_t err;

    // One thread per stacked query point; the kernel recovers the batch index itself.
    dim3 blocks(DIVUP(M, THREADS_PER_BLOCK));
    dim3 threads(THREADS_PER_BLOCK);

    ball_query_kernel_stack<<<blocks, threads>>>(B, M, radius, nsample, new_xyz, new_xyz_batch_cnt, xyz, xyz_batch_cnt, idx);

    err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
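
// Hypothetical standalone demo (not part of the original project; the macro name and build
// line below are made up for illustration). It fills random points on the host, runs the
// batched ball query and prints the neighbour indices of the first query point. Build it
// only when explicitly requested, e.g.:
//   nvcc -DBALL_QUERY_STANDALONE_EXAMPLE -I. ball_query_gpu.cu -o ball_query_demo
#ifdef BALL_QUERY_STANDALONE_EXAMPLE
int main() {
    const int b = 1, n = 1024, m = 4, nsample = 8;
    const float radius = 0.2f;

    size_t xyz_bytes = (size_t)b * n * 3 * sizeof(float);
    size_t new_xyz_bytes = (size_t)b * m * 3 * sizeof(float);
    size_t idx_bytes = (size_t)b * m * nsample * sizeof(int);

    // Random points in the unit cube; the first m points double as the query centers,
    // so query 0 always finds at least itself.
    float *xyz_h = (float *)malloc(xyz_bytes);
    int *idx_h = (int *)malloc(idx_bytes);
    for (int i = 0; i < b * n * 3; ++i) xyz_h[i] = (float)rand() / (float)RAND_MAX;

    float *xyz_d, *new_xyz_d;
    int *idx_d;
    cudaMalloc(&xyz_d, xyz_bytes);
    cudaMalloc(&new_xyz_d, new_xyz_bytes);
    cudaMalloc(&idx_d, idx_bytes);
    cudaMemcpy(xyz_d, xyz_h, xyz_bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(new_xyz_d, xyz_h, new_xyz_bytes, cudaMemcpyHostToDevice);
    cudaMemset(idx_d, 0, idx_bytes);  // rows stay 0 when a center has no neighbour

    ball_query_kernel_launcher_fast(b, n, m, radius, nsample, new_xyz_d, xyz_d, idx_d);
    cudaMemcpy(idx_h, idx_d, idx_bytes, cudaMemcpyDeviceToHost);

    printf("neighbours of query 0:");
    for (int j = 0; j < nsample; ++j) printf(" %d", idx_h[j]);
    printf("\n");

    cudaFree(xyz_d); cudaFree(new_xyz_d); cudaFree(idx_d);
    free(xyz_h); free(idx_h);
    return 0;
}
#endif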