Skip to content

Commit 21a36ec

Browse files
committed
CUDA jacobian working
1 parent cb5b5f5 commit 21a36ec

File tree

4 files changed

+175
-149
lines changed

4 files changed

+175
-149
lines changed

make.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1+
# nvcc -Wno-deprecated-gpu-targets -Xcompiler -fPIC -shared -o roboticstoolbox/cuda/fknm.so roboticstoolbox/cuda/fknm.cu
12
nvcc -Wno-deprecated-gpu-targets -Xcompiler -fPIC -shared -o roboticstoolbox/cuda/fknm.so roboticstoolbox/cuda/fknm.cu

roboticstoolbox/cuda/fknm.cu

Lines changed: 105 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -17,45 +17,57 @@ __device__ void _eye(double *data);
1717
* Params
1818
* T: (N, 4, 4) the final transform matrix of all points (shared)
1919
* tool: (N, 4, 4) the tool transform matrix of all points (shared)
20-
* link_A: (cdim, 4, 4) the transformation matrix of all joints
21-
* link_axes: (cdim, ): axes of all links
22-
* link_isjoint: (cdim, ): 1/0 whether links are joints
20+
* link_A: (nlinks, 4, 4) the transformation matrix of all joints
21+
* link_axes: (nlinks, ): axes of all links
22+
* link_isjoint: (nlinks, ): 1/0 whether links are joints
2323
* N: (int) number of points
24-
* cdim: (int) number of joints
25-
* out: (N, 6, cdim)
24+
* nlinks: (int) number of links on the path
25+
* njoints: (int) number of joints
26+
* out: (N, 6, njoints)
2627
*/
2728
__global__ void _jacob0(double *T,
2829
double *tool,
29-
double *e_tool,
30+
double *etool,
3031
double *link_A,
31-
int *link_axes,
32-
int *link_isjoint,
32+
long *link_axes,
33+
long *link_isjoint,
3334
int N,
34-
int cdim,
35+
int nlinks,
36+
int njoints,
3537
double *out)
3638
{
3739
int tid = blockIdx.x * blockDim.x + threadIdx.x;
38-
double *T_i, *tool_i;
39-
double *U, *temp, *etool_i;
40+
double *T_i; // T
41+
double *tool_i;
42+
double *U;
43+
double *temp;
44+
double *etool_i;
4045
double *invU;
41-
double *link_iA;
46+
double *link_iA; // TODO: =ret ?
4247

43-
cudaMalloc((void**)&U, sizeof(double) * 16);
44-
cudaMalloc((void**)&invU, sizeof(double) * 16);
45-
cudaMalloc((void**)&temp, sizeof(double) * 16);
48+
U = (double*) malloc(sizeof(double) * 16);
49+
invU = (double*) malloc(sizeof(double) * 16);
50+
temp = (double*) malloc(sizeof(double) * 16);
4651
int j = 0;
4752

4853
T_i = &T[tid * 16];
4954
tool_i = &tool[tid * 16];
55+
etool_i = &etool[tid * 16];
5056
_eye(U);
51-
for (int i = 0; i < cdim; i++) {
5257

58+
if (tid >= N) {
59+
return;
60+
}
61+
62+
for (int i = 0; i < nlinks; i++) {
63+
64+
// printf("Hello from tid %d link_i %d link_axis %ld isjoint %ld \n", tid, i, link_axes[i], link_isjoint[i]);
5365
if (link_isjoint[i] == 1) {
5466
link_iA = &link_A[i * 16];
5567
mult(U, link_iA, temp);
5668
copy(temp, U);
5769

58-
if (i == cdim - 1) {
70+
if (i == nlinks - 1) {
5971
mult(U, etool_i, temp);
6072
copy(temp, U);
6173
mult(U, tool_i, temp);
@@ -65,72 +77,74 @@ __global__ void _jacob0(double *T,
6577
_inv(U, invU);
6678
mult(invU, T_i, temp);
6779

68-
double *out_tid = &out[tid + 16];
80+
double *out_tid = &out[tid * 6 * njoints];
6981

7082
if (link_axes[i] == 0) {
71-
out_tid[0 * tid + j] = U[0 * 4 + 2] * temp[1 * 4 + 3] - U[0 * 4 + 1] * temp[2 * 4 + 3];
72-
out_tid[1 * tid + j] = U[1 * 4 + 2] * temp[1 * 4 + 3] - U[1 * 4 + 1] * temp[2 * 4 + 3];
73-
out_tid[2 * tid + j] = U[2 * 4 + 2] * temp[1 * 4 + 3] - U[2 * 4 + 1] * temp[2 * 4 + 3];
74-
out_tid[3 * tid + j] = U[0 * 4 + 2];
75-
out_tid[4 * tid + j] = U[1 * 4 + 2];
76-
out_tid[5 * tid + j] = U[2 * 4 + 2];
83+
out_tid[0 * njoints + j] = U[0 * 4 + 2] * temp[1 * 4 + 3] - U[0 * 4 + 1] * temp[2 * 4 + 3];
84+
out_tid[1 * njoints + j] = U[1 * 4 + 2] * temp[1 * 4 + 3] - U[1 * 4 + 1] * temp[2 * 4 + 3];
85+
out_tid[2 * njoints + j] = U[2 * 4 + 2] * temp[1 * 4 + 3] - U[2 * 4 + 1] * temp[2 * 4 + 3];
86+
out_tid[3 * njoints + j] = U[0 * 4 + 2];
87+
out_tid[4 * njoints + j] = U[1 * 4 + 2];
88+
out_tid[5 * njoints + j] = U[2 * 4 + 2];
7789
}
7890
else if (link_axes[i] == 1)
7991
{
80-
out_tid[0 * tid + j] = U[0 * 4 + 0] * temp[2 * 4 + 3] - U[0 * 4 + 2] * temp[0 * 4 + 3];
81-
out_tid[1 * tid + j] = U[1 * 4 + 0] * temp[2 * 4 + 3] - U[1 * 4 + 2] * temp[0 * 4 + 3];
82-
out_tid[2 * tid + j] = U[2 * 4 + 0] * temp[2 * 4 + 3] - U[2 * 4 + 2] * temp[0 * 4 + 3];
83-
out_tid[3 * tid + j] = U[0 * 4 + 1];
84-
out_tid[4 * tid + j] = U[1 * 4 + 1];
85-
out_tid[5 * tid + j] = U[2 * 4 + 1];
92+
out_tid[0 * njoints + j] = U[0 * 4 + 0] * temp[2 * 4 + 3] - U[0 * 4 + 2] * temp[0 * 4 + 3];
93+
out_tid[1 * njoints + j] = U[1 * 4 + 0] * temp[2 * 4 + 3] - U[1 * 4 + 2] * temp[0 * 4 + 3];
94+
out_tid[2 * njoints + j] = U[2 * 4 + 0] * temp[2 * 4 + 3] - U[2 * 4 + 2] * temp[0 * 4 + 3];
95+
out_tid[3 * njoints + j] = U[0 * 4 + 1];
96+
out_tid[4 * njoints + j] = U[1 * 4 + 1];
97+
out_tid[5 * njoints + j] = U[2 * 4 + 1];
8698
}
8799
else if (link_axes[i] == 2)
88100
{
89-
out_tid[0 * tid + j] = U[0 * 4 + 1] * temp[0 * 4 + 3] - U[0 * 4 + 0] * temp[1 * 4 + 3];
90-
out_tid[1 * tid + j] = U[1 * 4 + 1] * temp[0 * 4 + 3] - U[1 * 4 + 0] * temp[1 * 4 + 3];
91-
out_tid[2 * tid + j] = U[2 * 4 + 1] * temp[0 * 4 + 3] - U[2 * 4 + 0] * temp[1 * 4 + 3];
92-
out_tid[3 * tid + j] = U[0 * 4 + 2];
93-
out_tid[4 * tid + j] = U[1 * 4 + 2];
94-
out_tid[5 * tid + j] = U[2 * 4 + 2];
101+
out_tid[0 * njoints + j] = U[0 * 4 + 1] * temp[0 * 4 + 3] - U[0 * 4 + 0] * temp[1 * 4 + 3];
102+
out_tid[1 * njoints + j] = U[1 * 4 + 1] * temp[0 * 4 + 3] - U[1 * 4 + 0] * temp[1 * 4 + 3];
103+
out_tid[2 * njoints + j] = U[2 * 4 + 1] * temp[0 * 4 + 3] - U[2 * 4 + 0] * temp[1 * 4 + 3];
104+
out_tid[3 * njoints + j] = U[0 * 4 + 2];
105+
out_tid[4 * njoints + j] = U[1 * 4 + 2];
106+
out_tid[5 * njoints + j] = U[2 * 4 + 2];
95107
}
96108
else if (link_axes[i] == 3)
97109
{
98-
out_tid[0 * tid + j] = U[0 * 4 + 0];
99-
out_tid[1 * tid + j] = U[1 * 4 + 0];
100-
out_tid[2 * tid + j] = U[2 * 4 + 0];
101-
out_tid[3 * tid + j] = 0.0;
102-
out_tid[4 * tid + j] = 0.0;
103-
out_tid[5 * tid + j] = 0.0;
110+
out_tid[0 * njoints + j] = U[0 * 4 + 0];
111+
out_tid[1 * njoints + j] = U[1 * 4 + 0];
112+
out_tid[2 * njoints + j] = U[2 * 4 + 0];
113+
out_tid[3 * njoints + j] = 0.0;
114+
out_tid[4 * njoints + j] = 0.0;
115+
out_tid[5 * njoints + j] = 0.0;
104116
}
105117
else if (link_axes[i] == 4)
106118
{
107-
out_tid[0 * tid + j] = U[0 * 4 + 1];
108-
out_tid[1 * tid + j] = U[1 * 4 + 1];
109-
out_tid[2 * tid + j] = U[2 * 4 + 1];
110-
out_tid[3 * tid + j] = 0.0;
111-
out_tid[4 * tid + j] = 0.0;
112-
out_tid[5 * tid + j] = 0.0;
119+
out_tid[0 * njoints + j] = U[0 * 4 + 1];
120+
out_tid[1 * njoints + j] = U[1 * 4 + 1];
121+
out_tid[2 * njoints + j] = U[2 * 4 + 1];
122+
out_tid[3 * njoints + j] = 0.0;
123+
out_tid[4 * njoints + j] = 0.0;
124+
out_tid[5 * njoints + j] = 0.0;
113125
}
114126
else if (link_axes[i] == 5)
115127
{
116-
out_tid[0 * tid + j] = U[0 * 4 + 2];
117-
out_tid[1 * tid + j] = U[1 * 4 + 2];
118-
out_tid[2 * tid + j] = U[2 * 4 + 2];
119-
out_tid[3 * tid + j] = 0.0;
120-
out_tid[4 * tid + j] = 0.0;
121-
out_tid[5 * tid + j] = 0.0;
128+
out_tid[0 * njoints + j] = U[0 * 4 + 2];
129+
out_tid[1 * njoints + j] = U[1 * 4 + 2];
130+
out_tid[2 * njoints + j] = U[2 * 4 + 2];
131+
out_tid[3 * njoints + j] = 0.0;
132+
out_tid[4 * njoints + j] = 0.0;
133+
out_tid[5 * njoints + j] = 0.0;
122134
}
123135
j++;
124-
} else {
136+
}
137+
else
138+
{
125139
link_iA = &link_A[i * 16];
126140
mult(U, link_iA, temp);
127141
copy(temp, U);
128142
}
129143
}
130144

131-
cudaFree(U);
132-
cudaFree(invU);
133-
cudaFree(temp);
145+
free(U);
146+
free(invU);
147+
free(temp);
134148
}
135149

136150

@@ -197,8 +211,7 @@ __device__ void mult(double *A, double *B, double *C)
197211

198212
__device__ int _inv(double *m, double *invOut)
199213
{
200-
double *inv;
201-
cudaMalloc((void**)&inv, sizeof(double) * 16);
214+
double *inv = (double*) malloc(sizeof(double) * 16);
202215
double det;
203216
int i;
204217

@@ -324,7 +337,7 @@ __device__ int _inv(double *m, double *invOut)
324337
for (i = 0; i < 16; i++)
325338
invOut[i] = inv[i] * det;
326339

327-
cudaFree(inv);
340+
free(inv);
328341
return 1;
329342
}
330343

@@ -336,67 +349,72 @@ extern "C"{
336349
* Params
337350
* T: (N, 4, 4) the final transform matrix of all points (shared)
338351
* tool: (N, 4, 4) the end transform matrix of all points (shared)
339-
* link_A: (cdim, 4, 4) the transformation matrix of all joints
340-
* link_axes: (cdim, ): axes of all links
341-
* link_isjoint: (cdim, ): 1/0 whether links are joints
352+
* link_A: (nlinks, 4, 4) the transformation matrix of all joints
353+
* link_axes: (nlinks, ): axes of all links
354+
* link_isjoint: (nlinks, ): 1/0 whether links are joints
342355
* N: (int) number of points
343-
* cdim: (int) number of joints
344-
* out: (N, 6, cdim)
356+
* nlinks: (int) number of links
357+
* njoints: (int) number of joints
358+
* out: (N, 6, njoints)
345359
*/
346360
void jacob0(double *T,
347361
double *tool,
348362
double *etool,
349363
double *link_A,
350-
int *link_axes,
351-
int *link_isjoint,
364+
long *link_axes,
365+
long *link_isjoint,
352366
int N,
353-
int cdim,
367+
int nlinks,
368+
int njoints,
354369
double *out)
355-
// affine_T[N]
356-
// link_axes[cdim]
357-
// link_A[cdim]
358-
// link_isjoint[cdim]
359-
// out
360370
{
361371
double *d_T, *d_tool, *d_etool, *d_link_A;
362-
int *d_link_axes, *d_link_isjoint;
372+
long *d_link_axes, *d_link_isjoint;
363373
double *d_out;
364374

365375
cudaMalloc((void**)&d_T, sizeof(double) * N * 16);
366376
cudaMalloc((void**)&d_tool, sizeof(double) * N * 16);
367377
cudaMalloc((void**)&d_etool, sizeof(double) * N * 16);
368-
cudaMalloc((void**)&d_link_A, sizeof(double) * cdim * 16);
369-
cudaMalloc((void**)&d_link_axes, sizeof(int) * cdim);
370-
cudaMalloc((void**)&d_link_isjoint, sizeof(int) * cdim);
371-
cudaMalloc((void**)&d_out, sizeof(double) * 6 * cdim);
378+
cudaMalloc((void**)&d_link_A, sizeof(double) * nlinks * 16);
379+
cudaMalloc((void**)&d_link_axes, sizeof(long) * nlinks);
380+
cudaMalloc((void**)&d_link_isjoint, sizeof(long) * nlinks);
381+
cudaMalloc((void**)&d_out, sizeof(double) * N * 6 * njoints);
372382

373383

374384
// Transfer data from host to device memory
375385
cudaMemcpy(d_T, T, sizeof(double) * N * 16, cudaMemcpyHostToDevice);
376386
cudaMemcpy(d_tool, tool, sizeof(double) * N * 16, cudaMemcpyHostToDevice);
377387
cudaMemcpy(d_etool, etool, sizeof(double) * N * 16, cudaMemcpyHostToDevice);
378-
cudaMemcpy(d_link_A, link_A, sizeof(double) * cdim * 16, cudaMemcpyHostToDevice);
379-
cudaMemcpy(d_link_axes, link_axes, sizeof(int) * cdim, cudaMemcpyHostToDevice);
380-
cudaMemcpy(d_link_isjoint, link_isjoint, sizeof(int) * cdim, cudaMemcpyHostToDevice);
381-
cudaMemcpy(d_out, out, sizeof(double) * 6 * cdim, cudaMemcpyHostToDevice);
388+
cudaMemcpy(d_link_A, link_A, sizeof(double) * nlinks * 16, cudaMemcpyHostToDevice);
389+
cudaMemcpy(d_link_axes, link_axes, sizeof(long) * nlinks, cudaMemcpyHostToDevice);
390+
cudaMemcpy(d_link_isjoint, link_isjoint, sizeof(long) * nlinks, cudaMemcpyHostToDevice);
391+
cudaMemcpy(d_out, out, sizeof(double) * N * 6 * njoints, cudaMemcpyHostToDevice);
382392

383393

384394
int block_size = 256;
385395
int grid_size = ((N + block_size) / block_size);
396+
// printf("Block size %d gid size %d\n", block_size, grid_size);
386397
_jacob0<<<grid_size,block_size>>>(d_T,
387398
d_tool,
388399
d_etool,
389400
d_link_A,
390401
d_link_axes,
391402
d_link_isjoint,
392403
N,
393-
cdim,
404+
nlinks,
405+
njoints,
394406
d_out);
395407

396-
// memset(out, 1, N * 6 * cdim);
408+
// cudaDeviceSynchronize();
409+
// cudaError_t cudaerr = cudaDeviceSynchronize();
410+
// if (cudaerr != cudaSuccess)
411+
// printf("kernel launch failed with error \"%s\".\n",
412+
// cudaGetErrorString(cudaerr));
413+
414+
// memset(out, 1, N * 6 * njoints);
397415
// out[0] = 1;
398-
cudaMemcpy(out, d_out, sizeof(double) * 6 * cdim, cudaMemcpyDeviceToHost);
399-
printf("Out size %d %d %f %f %f %f %f", N, cdim, out[0], out[1], out[2], out[3], out[4]);
416+
cudaMemcpy(out, d_out, sizeof(double) * N * 6 * njoints, cudaMemcpyDeviceToHost);
417+
// printf("Out size %d %d %f %f %f %f %f", N, njoints, d_out[0], d_out[1], d_out[2], d_out[3], d_out[4]);
400418

401419
// Deallocate device memory
402420
cudaFree(d_T);

0 commit comments

Comments
 (0)