@@ -17,45 +17,57 @@ __device__ void _eye(double *data);
17
17
* Params
18
18
* T: (N, 4, 4) the final transform matrix of all points (shared)
19
19
* tool: (N, 4, 4) the tool transform matrix of all points (shared)
20
- * link_A: (cdim , 4, 4) the transformation matrix of all joints
21
- * link_axes: (cdim , ): axes of all links
22
- * link_isjoint: (cdim , ): 1/0 whether links are joints
20
+ * link_A: (nlinks , 4, 4) the transformation matrix of all joints
21
+ * link_axes: (nlinks , ): axes of all links
22
+ * link_isjoint: (nlinks , ): 1/0 whether links are joints
23
23
* N: (int) number of points
24
- * cdim: (int) number of joints
25
- * out: (N, 6, cdim)
24
+ * nlinks: (int) number of links on the path
25
+ * njoints: (int) number of joints
26
+ * out: (N, 6, njoints)
26
27
*/
27
28
__global__ void _jacob0 (double *T,
28
29
double *tool,
29
- double *e_tool ,
30
+ double *etool ,
30
31
double *link_A,
31
- int *link_axes,
32
- int *link_isjoint,
32
+ long *link_axes,
33
+ long *link_isjoint,
33
34
int N,
34
- int cdim,
35
+ int nlinks,
36
+ int njoints,
35
37
double *out)
36
38
{
37
39
int tid = blockIdx .x * blockDim .x + threadIdx .x ;
38
- double *T_i, *tool_i;
39
- double *U, *temp, *etool_i;
40
+ double *T_i; // T
41
+ double *tool_i;
42
+ double *U;
43
+ double *temp;
44
+ double *etool_i;
40
45
double *invU;
41
- double *link_iA;
46
+ double *link_iA; // TODO: =ret ?
42
47
43
- cudaMalloc (( void **)&U, sizeof (double ) * 16 );
44
- cudaMalloc (( void **)&invU, sizeof (double ) * 16 );
45
- cudaMalloc (( void **)&temp, sizeof (double ) * 16 );
48
+ U = ( double *) malloc ( sizeof (double ) * 16 );
49
+ invU = ( double *) malloc ( sizeof (double ) * 16 );
50
+ temp = ( double *) malloc ( sizeof (double ) * 16 );
46
51
int j = 0 ;
47
52
48
53
T_i = &T[tid * 16 ];
49
54
tool_i = &tool[tid * 16 ];
55
+ etool_i = &etool[tid * 16 ];
50
56
_eye (U);
51
- for (int i = 0 ; i < cdim; i++) {
52
57
58
+ if (tid >= N) {
59
+ return ;
60
+ }
61
+
62
+ for (int i = 0 ; i < nlinks; i++) {
63
+
64
+ // printf("Hello from tid %d link_i %d link_axis %ld isjoint %ld \n", tid, i, link_axes[i], link_isjoint[i]);
53
65
if (link_isjoint[i] == 1 ) {
54
66
link_iA = &link_A[i * 16 ];
55
67
mult (U, link_iA, temp);
56
68
copy (temp, U);
57
69
58
- if (i == cdim - 1 ) {
70
+ if (i == nlinks - 1 ) {
59
71
mult (U, etool_i, temp);
60
72
copy (temp, U);
61
73
mult (U, tool_i, temp);
@@ -65,72 +77,74 @@ __global__ void _jacob0(double *T,
65
77
_inv (U, invU);
66
78
mult (invU, T_i, temp);
67
79
68
- double *out_tid = &out[tid + 16 ];
80
+ double *out_tid = &out[tid * 6 * njoints ];
69
81
70
82
if (link_axes[i] == 0 ) {
71
- out_tid[0 * tid + j] = U[0 * 4 + 2 ] * temp[1 * 4 + 3 ] - U[0 * 4 + 1 ] * temp[2 * 4 + 3 ];
72
- out_tid[1 * tid + j] = U[1 * 4 + 2 ] * temp[1 * 4 + 3 ] - U[1 * 4 + 1 ] * temp[2 * 4 + 3 ];
73
- out_tid[2 * tid + j] = U[2 * 4 + 2 ] * temp[1 * 4 + 3 ] - U[2 * 4 + 1 ] * temp[2 * 4 + 3 ];
74
- out_tid[3 * tid + j] = U[0 * 4 + 2 ];
75
- out_tid[4 * tid + j] = U[1 * 4 + 2 ];
76
- out_tid[5 * tid + j] = U[2 * 4 + 2 ];
83
+ out_tid[0 * njoints + j] = U[0 * 4 + 2 ] * temp[1 * 4 + 3 ] - U[0 * 4 + 1 ] * temp[2 * 4 + 3 ];
84
+ out_tid[1 * njoints + j] = U[1 * 4 + 2 ] * temp[1 * 4 + 3 ] - U[1 * 4 + 1 ] * temp[2 * 4 + 3 ];
85
+ out_tid[2 * njoints + j] = U[2 * 4 + 2 ] * temp[1 * 4 + 3 ] - U[2 * 4 + 1 ] * temp[2 * 4 + 3 ];
86
+ out_tid[3 * njoints + j] = U[0 * 4 + 2 ];
87
+ out_tid[4 * njoints + j] = U[1 * 4 + 2 ];
88
+ out_tid[5 * njoints + j] = U[2 * 4 + 2 ];
77
89
}
78
90
else if (link_axes[i] == 1 )
79
91
{
80
- out_tid[0 * tid + j] = U[0 * 4 + 0 ] * temp[2 * 4 + 3 ] - U[0 * 4 + 2 ] * temp[0 * 4 + 3 ];
81
- out_tid[1 * tid + j] = U[1 * 4 + 0 ] * temp[2 * 4 + 3 ] - U[1 * 4 + 2 ] * temp[0 * 4 + 3 ];
82
- out_tid[2 * tid + j] = U[2 * 4 + 0 ] * temp[2 * 4 + 3 ] - U[2 * 4 + 2 ] * temp[0 * 4 + 3 ];
83
- out_tid[3 * tid + j] = U[0 * 4 + 1 ];
84
- out_tid[4 * tid + j] = U[1 * 4 + 1 ];
85
- out_tid[5 * tid + j] = U[2 * 4 + 1 ];
92
+ out_tid[0 * njoints + j] = U[0 * 4 + 0 ] * temp[2 * 4 + 3 ] - U[0 * 4 + 2 ] * temp[0 * 4 + 3 ];
93
+ out_tid[1 * njoints + j] = U[1 * 4 + 0 ] * temp[2 * 4 + 3 ] - U[1 * 4 + 2 ] * temp[0 * 4 + 3 ];
94
+ out_tid[2 * njoints + j] = U[2 * 4 + 0 ] * temp[2 * 4 + 3 ] - U[2 * 4 + 2 ] * temp[0 * 4 + 3 ];
95
+ out_tid[3 * njoints + j] = U[0 * 4 + 1 ];
96
+ out_tid[4 * njoints + j] = U[1 * 4 + 1 ];
97
+ out_tid[5 * njoints + j] = U[2 * 4 + 1 ];
86
98
}
87
99
else if (link_axes[i] == 2 )
88
100
{
89
- out_tid[0 * tid + j] = U[0 * 4 + 1 ] * temp[0 * 4 + 3 ] - U[0 * 4 + 0 ] * temp[1 * 4 + 3 ];
90
- out_tid[1 * tid + j] = U[1 * 4 + 1 ] * temp[0 * 4 + 3 ] - U[1 * 4 + 0 ] * temp[1 * 4 + 3 ];
91
- out_tid[2 * tid + j] = U[2 * 4 + 1 ] * temp[0 * 4 + 3 ] - U[2 * 4 + 0 ] * temp[1 * 4 + 3 ];
92
- out_tid[3 * tid + j] = U[0 * 4 + 2 ];
93
- out_tid[4 * tid + j] = U[1 * 4 + 2 ];
94
- out_tid[5 * tid + j] = U[2 * 4 + 2 ];
101
+ out_tid[0 * njoints + j] = U[0 * 4 + 1 ] * temp[0 * 4 + 3 ] - U[0 * 4 + 0 ] * temp[1 * 4 + 3 ];
102
+ out_tid[1 * njoints + j] = U[1 * 4 + 1 ] * temp[0 * 4 + 3 ] - U[1 * 4 + 0 ] * temp[1 * 4 + 3 ];
103
+ out_tid[2 * njoints + j] = U[2 * 4 + 1 ] * temp[0 * 4 + 3 ] - U[2 * 4 + 0 ] * temp[1 * 4 + 3 ];
104
+ out_tid[3 * njoints + j] = U[0 * 4 + 2 ];
105
+ out_tid[4 * njoints + j] = U[1 * 4 + 2 ];
106
+ out_tid[5 * njoints + j] = U[2 * 4 + 2 ];
95
107
}
96
108
else if (link_axes[i] == 3 )
97
109
{
98
- out_tid[0 * tid + j] = U[0 * 4 + 0 ];
99
- out_tid[1 * tid + j] = U[1 * 4 + 0 ];
100
- out_tid[2 * tid + j] = U[2 * 4 + 0 ];
101
- out_tid[3 * tid + j] = 0.0 ;
102
- out_tid[4 * tid + j] = 0.0 ;
103
- out_tid[5 * tid + j] = 0.0 ;
110
+ out_tid[0 * njoints + j] = U[0 * 4 + 0 ];
111
+ out_tid[1 * njoints + j] = U[1 * 4 + 0 ];
112
+ out_tid[2 * njoints + j] = U[2 * 4 + 0 ];
113
+ out_tid[3 * njoints + j] = 0.0 ;
114
+ out_tid[4 * njoints + j] = 0.0 ;
115
+ out_tid[5 * njoints + j] = 0.0 ;
104
116
}
105
117
else if (link_axes[i] == 4 )
106
118
{
107
- out_tid[0 * tid + j] = U[0 * 4 + 1 ];
108
- out_tid[1 * tid + j] = U[1 * 4 + 1 ];
109
- out_tid[2 * tid + j] = U[2 * 4 + 1 ];
110
- out_tid[3 * tid + j] = 0.0 ;
111
- out_tid[4 * tid + j] = 0.0 ;
112
- out_tid[5 * tid + j] = 0.0 ;
119
+ out_tid[0 * njoints + j] = U[0 * 4 + 1 ];
120
+ out_tid[1 * njoints + j] = U[1 * 4 + 1 ];
121
+ out_tid[2 * njoints + j] = U[2 * 4 + 1 ];
122
+ out_tid[3 * njoints + j] = 0.0 ;
123
+ out_tid[4 * njoints + j] = 0.0 ;
124
+ out_tid[5 * njoints + j] = 0.0 ;
113
125
}
114
126
else if (link_axes[i] == 5 )
115
127
{
116
- out_tid[0 * tid + j] = U[0 * 4 + 2 ];
117
- out_tid[1 * tid + j] = U[1 * 4 + 2 ];
118
- out_tid[2 * tid + j] = U[2 * 4 + 2 ];
119
- out_tid[3 * tid + j] = 0.0 ;
120
- out_tid[4 * tid + j] = 0.0 ;
121
- out_tid[5 * tid + j] = 0.0 ;
128
+ out_tid[0 * njoints + j] = U[0 * 4 + 2 ];
129
+ out_tid[1 * njoints + j] = U[1 * 4 + 2 ];
130
+ out_tid[2 * njoints + j] = U[2 * 4 + 2 ];
131
+ out_tid[3 * njoints + j] = 0.0 ;
132
+ out_tid[4 * njoints + j] = 0.0 ;
133
+ out_tid[5 * njoints + j] = 0.0 ;
122
134
}
123
135
j++;
124
- } else {
136
+ }
137
+ else
138
+ {
125
139
link_iA = &link_A[i * 16 ];
126
140
mult (U, link_iA, temp);
127
141
copy (temp, U);
128
142
}
129
143
}
130
144
131
- cudaFree (U);
132
- cudaFree (invU);
133
- cudaFree (temp);
145
+ free (U);
146
+ free (invU);
147
+ free (temp);
134
148
}
135
149
136
150
@@ -197,8 +211,7 @@ __device__ void mult(double *A, double *B, double *C)
197
211
198
212
__device__ int _inv (double *m, double *invOut)
199
213
{
200
- double *inv;
201
- cudaMalloc ((void **)&inv, sizeof (double ) * 16 );
214
+ double *inv = (double *) malloc (sizeof (double ) * 16 );
202
215
double det;
203
216
int i;
204
217
@@ -324,7 +337,7 @@ __device__ int _inv(double *m, double *invOut)
324
337
for (i = 0 ; i < 16 ; i++)
325
338
invOut[i] = inv[i] * det;
326
339
327
- cudaFree (inv);
340
+ free (inv);
328
341
return 1 ;
329
342
}
330
343
@@ -336,67 +349,72 @@ extern "C"{
336
349
* Params
337
350
* T: (N, 4, 4) the final transform matrix of all points (shared)
338
351
* tool: (N, 4, 4) the end transform matrix of all points (shared)
339
- * link_A: (cdim , 4, 4) the transformation matrix of all joints
340
- * link_axes: (cdim , ): axes of all links
341
- * link_isjoint: (cdim , ): 1/0 whether links are joints
352
+ * link_A: (nlinks , 4, 4) the transformation matrix of all joints
353
+ * link_axes: (nlinks , ): axes of all links
354
+ * link_isjoint: (nlinks , ): 1/0 whether links are joints
342
355
* N: (int) number of points
343
- * cdim: (int) number of joints
344
- * out: (N, 6, cdim)
356
+ * nlinks: (int) number of links
357
+ * njoints: (int) number of joints
358
+ * out: (N, 6, njoints)
345
359
*/
346
360
void jacob0 (double *T,
347
361
double *tool,
348
362
double *etool,
349
363
double *link_A,
350
- int *link_axes,
351
- int *link_isjoint,
364
+ long *link_axes,
365
+ long *link_isjoint,
352
366
int N,
353
- int cdim,
367
+ int nlinks,
368
+ int njoints,
354
369
double *out)
355
- // affine_T[N]
356
- // link_axes[cdim]
357
- // link_A[cdim]
358
- // link_isjoint[cdim]
359
- // out
360
370
{
361
371
double *d_T, *d_tool, *d_etool, *d_link_A;
362
- int *d_link_axes, *d_link_isjoint;
372
+ long *d_link_axes, *d_link_isjoint;
363
373
double *d_out;
364
374
365
375
cudaMalloc ((void **)&d_T, sizeof (double ) * N * 16 );
366
376
cudaMalloc ((void **)&d_tool, sizeof (double ) * N * 16 );
367
377
cudaMalloc ((void **)&d_etool, sizeof (double ) * N * 16 );
368
- cudaMalloc ((void **)&d_link_A, sizeof (double ) * cdim * 16 );
369
- cudaMalloc ((void **)&d_link_axes, sizeof (int ) * cdim );
370
- cudaMalloc ((void **)&d_link_isjoint, sizeof (int ) * cdim );
371
- cudaMalloc ((void **)&d_out, sizeof (double ) * 6 * cdim );
378
+ cudaMalloc ((void **)&d_link_A, sizeof (double ) * nlinks * 16 );
379
+ cudaMalloc ((void **)&d_link_axes, sizeof (long ) * nlinks );
380
+ cudaMalloc ((void **)&d_link_isjoint, sizeof (long ) * nlinks );
381
+ cudaMalloc ((void **)&d_out, sizeof (double ) * N * 6 * njoints );
372
382
373
383
374
384
// Transfer data from host to device memory
375
385
cudaMemcpy (d_T, T, sizeof (double ) * N * 16 , cudaMemcpyHostToDevice);
376
386
cudaMemcpy (d_tool, tool, sizeof (double ) * N * 16 , cudaMemcpyHostToDevice);
377
387
cudaMemcpy (d_etool, etool, sizeof (double ) * N * 16 , cudaMemcpyHostToDevice);
378
- cudaMemcpy (d_link_A, link_A, sizeof (double ) * cdim * 16 , cudaMemcpyHostToDevice);
379
- cudaMemcpy (d_link_axes, link_axes, sizeof (int ) * cdim , cudaMemcpyHostToDevice);
380
- cudaMemcpy (d_link_isjoint, link_isjoint, sizeof (int ) * cdim , cudaMemcpyHostToDevice);
381
- cudaMemcpy (d_out, out, sizeof (double ) * 6 * cdim , cudaMemcpyHostToDevice);
388
+ cudaMemcpy (d_link_A, link_A, sizeof (double ) * nlinks * 16 , cudaMemcpyHostToDevice);
389
+ cudaMemcpy (d_link_axes, link_axes, sizeof (long ) * nlinks , cudaMemcpyHostToDevice);
390
+ cudaMemcpy (d_link_isjoint, link_isjoint, sizeof (long ) * nlinks , cudaMemcpyHostToDevice);
391
+ cudaMemcpy (d_out, out, sizeof (double ) * N * 6 * njoints , cudaMemcpyHostToDevice);
382
392
383
393
384
394
int block_size = 256 ;
385
395
int grid_size = ((N + block_size) / block_size);
396
+ // printf("Block size %d gid size %d\n", block_size, grid_size);
386
397
_jacob0<<<grid_size,block_size>>> (d_T,
387
398
d_tool,
388
399
d_etool,
389
400
d_link_A,
390
401
d_link_axes,
391
402
d_link_isjoint,
392
403
N,
393
- cdim,
404
+ nlinks,
405
+ njoints,
394
406
d_out);
395
407
396
- // memset(out, 1, N * 6 * cdim);
408
+ // cudaDeviceSynchronize();
409
+ // cudaError_t cudaerr = cudaDeviceSynchronize();
410
+ // if (cudaerr != cudaSuccess)
411
+ // printf("kernel launch failed with error \"%s\".\n",
412
+ // cudaGetErrorString(cudaerr));
413
+
414
+ // memset(out, 1, N * 6 * njoints);
397
415
// out[0] = 1;
398
- cudaMemcpy (out, d_out, sizeof (double ) * 6 * cdim , cudaMemcpyDeviceToHost);
399
- printf (" Out size %d %d %f %f %f %f %f" , N, cdim, out [0 ], out [1 ], out [2 ], out [3 ], out [4 ]);
416
+ cudaMemcpy (out, d_out, sizeof (double ) * N * 6 * njoints , cudaMemcpyDeviceToHost);
417
+ // printf("Out size %d %d %f %f %f %f %f", N, njoints, d_out [0], d_out [1], d_out [2], d_out [3], d_out [4]);
400
418
401
419
// Deallocate device memory
402
420
cudaFree (d_T);
0 commit comments