@@ -155,9 +155,13 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
155
155
int n1 = y > 0 ? -(int )img1.step : 0 , s1 = y < img1.rows -1 ? (int )img1.step : 0 ;
156
156
int n2 = y > 0 ? -(int )img2.step : 0 , s2 = y < img2.rows -1 ? (int )img2.step : 0 ;
157
157
158
+ int minX_cmn = std::min (minX1,minX2)-1 ;
159
+ int maxX_cmn = std::max (maxX1,maxX2)+1 ;
160
+ minX_cmn = std::max (minX_cmn, 1 );
161
+ maxX_cmn = std::min (maxX_cmn, width - 1 );
158
162
if ( cn == 1 )
159
163
{
160
- for ( x = 1 ; x < width- 1 ; x++ )
164
+ for ( x = minX_cmn ; x < maxX_cmn ; x++ )
161
165
{
162
166
prow1[x] = tab[(row1[x+1 ] - row1[x-1 ])*2 + row1[x+n1+1 ] - row1[x+n1-1 ] + row1[x+s1+1 ] - row1[x+s1-1 ]];
163
167
prow2[width-1 -x] = tab[(row2[x+1 ] - row2[x-1 ])*2 + row2[x+n2+1 ] - row2[x+n2-1 ] + row2[x+s2+1 ] - row2[x+s2-1 ]];
@@ -168,7 +172,7 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
168
172
}
169
173
else
170
174
{
171
- for ( x = 1 ; x < width- 1 ; x++ )
175
+ for ( x = minX_cmn ; x < maxX_cmn ; x++ )
172
176
{
173
177
prow1[x] = tab[(row1[x*3 +3 ] - row1[x*3 -3 ])*2 + row1[x*3 +n1+3 ] - row1[x*3 +n1-3 ] + row1[x*3 +s1+3 ] - row1[x*3 +s1-3 ]];
174
178
prow1[x+width] = tab[(row1[x*3 +4 ] - row1[x*3 -2 ])*2 + row1[x*3 +n1+4 ] - row1[x*3 +n1-2 ] + row1[x*3 +s1+4 ] - row1[x*3 +s1-2 ]];
@@ -864,6 +868,7 @@ struct CalcVerticalSums: public ParallelLoopBody
864
868
Cbuf = alignedBuf;
865
869
Sbuf = Cbuf + CSBufSize;
866
870
hsumBuf = Sbuf + CSBufSize;
871
+ useSIMD = hasSIMD128 ();
867
872
}
868
873
869
874
void operator ()( const Range& range ) const
@@ -951,6 +956,24 @@ struct CalcVerticalSums: public ParallelLoopBody
951
956
const CostType* pixAdd = pixDiff + std::min (x + SW2*D, (width1-1 )*D);
952
957
const CostType* pixSub = pixDiff + std::max (x - (SW2+1 )*D, 0 );
953
958
959
+ #if CV_SIMD128
960
+ if ( useSIMD )
961
+ {
962
+ for ( d = 0 ; d < D; d += 8 )
963
+ {
964
+ v_int16x8 hv = v_load (hsumAdd + x - D + d);
965
+ v_int16x8 Cx = v_load (Cprev + x + d);
966
+ v_int16x8 psub = v_load (pixSub + d);
967
+ v_int16x8 padd = v_load (pixAdd + d);
968
+ hv = (hv - psub + padd);
969
+ psub = v_load (hsumSub + x + d);
970
+ Cx = Cx - psub + hv;
971
+ v_store (hsumAdd + x + d, hv);
972
+ v_store (C + x + d, Cx);
973
+ }
974
+ }
975
+ else
976
+ #endif
954
977
{
955
978
for ( d = 0 ; d < D; d++ )
956
979
{
@@ -1010,6 +1033,46 @@ struct CalcVerticalSums: public ParallelLoopBody
1010
1033
const CostType* Cp = C + x*D;
1011
1034
CostType* Sp = S + x*D;
1012
1035
1036
+ #if CV_SIMD128
1037
+ if ( useSIMD )
1038
+ {
1039
+ v_int16x8 _P1 = v_setall_s16 ((short )P1);
1040
+
1041
+ v_int16x8 _delta = v_setall_s16 ((short )delta);
1042
+ v_int16x8 _minL = v_setall_s16 ((short )MAX_COST);
1043
+
1044
+ for ( d = 0 ; d < D; d += 8 )
1045
+ {
1046
+ v_int16x8 Cpd = v_load (Cp + d);
1047
+ v_int16x8 L;
1048
+
1049
+ L = v_load (Lr_ppr + d);
1050
+
1051
+ L = v_min (L, (v_load (Lr_ppr + d - 1 ) + _P1));
1052
+ L = v_min (L, (v_load (Lr_ppr + d + 1 ) + _P1));
1053
+
1054
+ L = v_min (L, _delta);
1055
+ L = ((L - _delta) + Cpd);
1056
+
1057
+ v_store (Lr_p + d, L);
1058
+
1059
+ // Fold L into the running lane-wise minimum
1060
+ _minL = v_min (_minL, L);
1061
+
1062
+ v_int16x8 Sval = v_load (Sp + d);
1063
+
1064
+ Sval = Sval + L;
1065
+
1066
+ v_store (Sp + d, Sval);
1067
+ }
1068
+
1069
+ v_int32x4 min1, min2, min12;
1070
+ v_expand (_minL, min1, min2);
1071
+ min12 = v_min (min1,min2);
1072
+ minLr[0 ][x] = (CostType)v_reduce_min (min12);
1073
+ }
1074
+ else
1075
+ #endif
1013
1076
{
1014
1077
int minL = MAX_COST;
1015
1078
@@ -1058,6 +1121,7 @@ struct CalcVerticalSums: public ParallelLoopBody
1058
1121
size_t LrSize;
1059
1122
size_t hsumBufNRows;
1060
1123
int ftzero;
1124
+ bool useSIMD;
1061
1125
};
1062
1126
1063
1127
struct CalcHorizontalSums : public ParallelLoopBody
@@ -1085,6 +1149,7 @@ struct CalcHorizontalSums: public ParallelLoopBody
1085
1149
LrSize = 2 * D2;
1086
1150
Cbuf = alignedBuf;
1087
1151
Sbuf = Cbuf + CSBufSize;
1152
+ useSIMD = hasSIMD128 ();
1088
1153
}
1089
1154
1090
1155
void operator ()( const Range& range ) const
@@ -1138,20 +1203,60 @@ struct CalcHorizontalSums: public ParallelLoopBody
1138
1203
const CostType* Cp = C + x*D;
1139
1204
CostType* Sp = S + x*D;
1140
1205
1141
- int minL = MAX_COST;
1206
+ #if CV_SIMD128
1207
+ if ( useSIMD )
1208
+ {
1209
+ v_int16x8 _P1 = v_setall_s16 ((short )P1);
1210
+
1211
+ v_int16x8 _delta = v_setall_s16 ((short )delta);
1212
+ v_int16x8 _minL = v_setall_s16 ((short )MAX_COST);
1142
1213
1143
- for ( d = 0 ; d < D; d++ )
1214
+ for ( d = 0 ; d < D; d += 8 )
1215
+ {
1216
+ v_int16x8 Cpd = v_load (Cp + d);
1217
+ v_int16x8 L;
1218
+
1219
+ L = v_load (Lr_ppr + d);
1220
+
1221
+ L = v_min (L, (v_load (Lr_ppr + d - 1 ) + _P1));
1222
+ L = v_min (L, (v_load (Lr_ppr + d + 1 ) + _P1));
1223
+
1224
+ L = v_min (L, _delta);
1225
+ L = ((L - _delta) + Cpd);
1226
+
1227
+ v_store (Lr_p + d, L);
1228
+
1229
+ // Fold L into the running lane-wise minimum
1230
+ _minL = v_min (_minL, L);
1231
+
1232
+ v_int16x8 Sval = v_load (Sp + d);
1233
+
1234
+ Sval = Sval + L;
1235
+
1236
+ v_store (Sp + d, Sval);
1237
+ }
1238
+
1239
+ v_int32x4 min1, min2, min12;
1240
+ v_expand (_minL, min1, min2);
1241
+ min12 = v_min (min1,min2);
1242
+ minLr = (CostType)v_reduce_min (min12);
1243
+ }
1244
+ else
1245
+ #endif
1144
1246
{
1145
- int Cpd = Cp[d], L;
1247
+ minLr = MAX_COST;
1248
+ for ( d = 0 ; d < D; d++ )
1249
+ {
1250
+ int Cpd = Cp[d], L;
1146
1251
1147
- L = Cpd + std::min ((int )Lr_ppr[d], std::min (Lr_ppr[d-1 ] + P1, std::min (Lr_ppr[d+1 ] + P1, delta))) - delta;
1252
+ L = Cpd + std::min ((int )Lr_ppr[d], std::min (Lr_ppr[d-1 ] + P1, std::min (Lr_ppr[d+1 ] + P1, delta))) - delta;
1148
1253
1149
- Lr_p[d] = (CostType)L;
1150
- minL = std::min (minL , L);
1254
+ Lr_p[d] = (CostType)L;
1255
+ minLr = (CostType) std::min (( int )minLr , L);
1151
1256
1152
- Sp[d] = saturate_cast<CostType>(Sp[d] + L);
1257
+ Sp[d] = saturate_cast<CostType>(Sp[d] + L);
1258
+ }
1153
1259
}
1154
- minLr = (CostType)minL;
1155
1260
}
1156
1261
1157
1262
memset ( Lr - 8 , 0 , LrSize*sizeof (CostType) );
@@ -1169,26 +1274,82 @@ struct CalcHorizontalSums: public ParallelLoopBody
1169
1274
const CostType* Cp = C + x*D;
1170
1275
CostType* Sp = S + x*D;
1171
1276
int minS = MAX_COST, bestDisp = -1 ;
1277
+ minLr = MAX_COST;
1172
1278
1173
- int minL = MAX_COST;
1174
-
1175
- for ( d = 0 ; d < D; d++ )
1279
+ #if CV_SIMD128
1280
+ if ( useSIMD )
1176
1281
{
1177
- int Cpd = Cp[d], L ;
1282
+ v_int16x8 _P1 = v_setall_s16 (( short )P1) ;
1178
1283
1179
- L = Cpd + std::min ((int )Lr_ppr[d], std::min (Lr_ppr[d-1 ] + P1, std::min (Lr_ppr[d+1 ] + P1, delta))) - delta;
1284
+ v_int16x8 _delta = v_setall_s16 ((short )delta);
1285
+ v_int16x8 _minL = v_setall_s16 ((short )MAX_COST);
1180
1286
1181
- Lr_p[d] = (CostType)L ;
1182
- minL = std::min (minL, L );
1287
+ v_int16x8 _minS = v_setall_s16 (MAX_COST), _bestDisp = v_setall_s16 (- 1 ) ;
1288
+ v_int16x8 _d8 = v_int16x8 ( 0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 ), _8 = v_setall_s16 ( 8 );
1183
1289
1184
- Sp[d] = saturate_cast<CostType>(Sp[d] + L);
1185
- if ( Sp[d] < minS )
1290
+ for ( d = 0 ; d < D; d+= 8 )
1186
1291
{
1187
- minS = Sp[d];
1188
- bestDisp = d;
1292
+ v_int16x8 Cpd = v_load (Cp + d);
1293
+ v_int16x8 L;
1294
+
1295
+ L = v_load (Lr_ppr + d);
1296
+
1297
+ L = v_min (L, (v_load (Lr_ppr + d - 1 ) + _P1));
1298
+ L = v_min (L, (v_load (Lr_ppr + d + 1 ) + _P1));
1299
+
1300
+ L = v_min (L, _delta);
1301
+ L = ((L - _delta) + Cpd);
1302
+
1303
+ v_store (Lr_p + d, L);
1304
+
1305
+ // Fold L into the running lane-wise minimum
1306
+ _minL = v_min (_minL, L);
1307
+
1308
+ v_int16x8 Sval = v_load (Sp + d);
1309
+
1310
+ Sval = Sval + L;
1311
+
1312
+ v_int16x8 mask = Sval < _minS;
1313
+ _minS = v_min ( Sval, _minS );
1314
+ _bestDisp = _bestDisp ^ ((_bestDisp ^ _d8) & mask);
1315
+ _d8 = _d8 + _8;
1316
+
1317
+ v_store (Sp + d, Sval);
1318
+ }
1319
+ v_int32x4 min1, min2, min12;
1320
+ v_expand (_minL, min1, min2);
1321
+ min12 = v_min (min1,min2);
1322
+ minLr = (CostType)v_reduce_min (min12);
1323
+
1324
+ v_int32x4 _d0, _d1;
1325
+ v_expand (_minS, _d0, _d1);
1326
+ minS = (int )std::min (v_reduce_min (_d0), v_reduce_min (_d1));
1327
+ v_int16x8 v_mask = v_setall_s16 ((short )minS) == _minS;
1328
+
1329
+ _bestDisp = (_bestDisp & v_mask) | (v_setall_s16 (SHRT_MAX) & ~v_mask);
1330
+ v_expand (_bestDisp, _d0, _d1);
1331
+ bestDisp = (int )std::min (v_reduce_min (_d0), v_reduce_min (_d1));
1332
+ }
1333
+ else
1334
+ #endif
1335
+ {
1336
+ for ( d = 0 ; d < D; d++ )
1337
+ {
1338
+ int Cpd = Cp[d], L;
1339
+
1340
+ L = Cpd + std::min ((int )Lr_ppr[d], std::min (Lr_ppr[d-1 ] + P1, std::min (Lr_ppr[d+1 ] + P1, delta))) - delta;
1341
+
1342
+ Lr_p[d] = (CostType)L;
1343
+ minLr = (CostType)std::min ((int )minLr, L);
1344
+
1345
+ Sp[d] = saturate_cast<CostType>(Sp[d] + L);
1346
+ if ( Sp[d] < minS )
1347
+ {
1348
+ minS = Sp[d];
1349
+ bestDisp = d;
1350
+ }
1189
1351
}
1190
1352
}
1191
- minLr = (CostType)minL;
1192
1353
// Some postprocessing procedures and saving
1193
1354
for ( d = 0 ; d < D; d++ )
1194
1355
{
@@ -1263,6 +1424,7 @@ struct CalcHorizontalSums: public ParallelLoopBody
1263
1424
int INVALID_DISP_SCALED;
1264
1425
int uniquenessRatio;
1265
1426
int disp12MaxDiff;
1427
+ bool useSIMD;
1266
1428
};
1267
1429
/*
1268
1430
computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.
0 commit comments