@@ -189,10 +189,8 @@ __kernel void ConvolveBasic(
189
189
// NDRange: (output_width+pad)/OUT_BLOCK_WIDTH, (output_height+pad)/OUT_BLOCK_HEIGHT, NUM_FILTERS/OUT_BLOCK_DEPTH
190
190
191
191
// NOTE: for beignet, this reqd_work_group_size does not guarantee that SIMD16 mode will be used; the compiler could choose to use two SIMD8 threads instead, and if that happens the code will break.
192
- #ifndef __BEIGNET__
193
192
__attribute__((reqd_work_group_size (1 , 1 , SIMD_SIZE )))
194
193
__attribute__((intel_reqd_sub_group_size (SIMD_SIZE )))
195
- #endif
196
194
__kernel void
197
195
convolve_simd (
198
196
ELTWISE_DATA_ARG
@@ -232,12 +230,12 @@ convolve_simd(
232
230
233
231
int curr_local_y = ( lid / ( TILE_X / 4 ) );
234
232
int curr_local_x = ( lid % ( TILE_X / 4 ) ) * 4 ;
235
- int curr_y = or * STRIDE_Y + INPUT_START_Y + curr_local_y ;
236
- int curr_x = oc * STRIDE_X + INPUT_START_X + curr_local_x ;
233
+ int curr_y = or * STRIDE_Y + curr_local_y ;
234
+ int curr_x = oc * STRIDE_X + curr_local_x ;
237
235
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0
238
236
int saved_y = curr_y ;
239
237
#endif
240
- in_addr = input_batch_offset + INPUT_START_Z * input_height * input_width
238
+ in_addr = input_batch_offset
241
239
+ (curr_y - INPUT_PAD_H ) * input_width // y tile offset
242
240
+ curr_x - INPUT_PAD_W ; // x tile offset
243
241
union {
@@ -363,7 +361,7 @@ convolve_simd(
363
361
fm = fm % ALIGNED_NUM_FILTERS ;
364
362
365
363
if ((ALIGNED_NUM_FILTERS == NUM_FILTERS || fm < NUM_FILTERS )) {
366
- unsigned int out_addr = OUT_BUFF_OFFSET + ( num_in_batch * TOTAL_OUTPUT_DEPTH + fm ) * output_width * output_height ;
364
+ unsigned int out_addr = ( num_in_batch * TOTAL_OUTPUT_DEPTH + fm ) * output_width * output_height ;
367
365
out_addr += or * output_width + oc ;
368
366
// we need this address calculation for biases because we support views and batching
369
367
#if APPLY_BIAS
0 commit comments