Skip to content

Commit 31c7966

Browse files
committed
Merge pull request opencv#8803 from 4ekmah:sgbm_modehh4_SIMD
2 parents 7b8d107 + a113e8f commit 31c7966

File tree

2 files changed

+185
-23
lines changed

2 files changed

+185
-23
lines changed

modules/calib3d/perf/perf_stereosgbm.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,7 @@ using namespace cv;
4747

4848
void MakeArtificialExample(RNG rng, Mat& dst_left_view, Mat& dst_view);
4949

50-
CV_ENUM(SGBMModes, StereoSGBM::MODE_SGBM, StereoSGBM::MODE_SGBM_3WAY);
50+
CV_ENUM(SGBMModes, StereoSGBM::MODE_SGBM, StereoSGBM::MODE_SGBM_3WAY, StereoSGBM::MODE_HH4);
5151
typedef tuple<Size, int, SGBMModes> SGBMParams;
5252
typedef TestBaseWithParam<SGBMParams> TestStereoCorresp;
5353

modules/calib3d/src/stereosgbm.cpp

Lines changed: 184 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -155,9 +155,13 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
155155
int n1 = y > 0 ? -(int)img1.step : 0, s1 = y < img1.rows-1 ? (int)img1.step : 0;
156156
int n2 = y > 0 ? -(int)img2.step : 0, s2 = y < img2.rows-1 ? (int)img2.step : 0;
157157

158+
int minX_cmn = std::min(minX1,minX2)-1;
159+
int maxX_cmn = std::max(maxX1,maxX2)+1;
160+
minX_cmn = std::max(minX_cmn, 1);
161+
maxX_cmn = std::min(maxX_cmn, width - 1);
158162
if( cn == 1 )
159163
{
160-
for( x = 1; x < width-1; x++ )
164+
for( x = minX_cmn; x < maxX_cmn; x++ )
161165
{
162166
prow1[x] = tab[(row1[x+1] - row1[x-1])*2 + row1[x+n1+1] - row1[x+n1-1] + row1[x+s1+1] - row1[x+s1-1]];
163167
prow2[width-1-x] = tab[(row2[x+1] - row2[x-1])*2 + row2[x+n2+1] - row2[x+n2-1] + row2[x+s2+1] - row2[x+s2-1]];
@@ -168,7 +172,7 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
168172
}
169173
else
170174
{
171-
for( x = 1; x < width-1; x++ )
175+
for( x = minX_cmn; x < maxX_cmn; x++ )
172176
{
173177
prow1[x] = tab[(row1[x*3+3] - row1[x*3-3])*2 + row1[x*3+n1+3] - row1[x*3+n1-3] + row1[x*3+s1+3] - row1[x*3+s1-3]];
174178
prow1[x+width] = tab[(row1[x*3+4] - row1[x*3-2])*2 + row1[x*3+n1+4] - row1[x*3+n1-2] + row1[x*3+s1+4] - row1[x*3+s1-2]];
@@ -864,6 +868,7 @@ struct CalcVerticalSums: public ParallelLoopBody
864868
Cbuf = alignedBuf;
865869
Sbuf = Cbuf + CSBufSize;
866870
hsumBuf = Sbuf + CSBufSize;
871+
useSIMD = hasSIMD128();
867872
}
868873

869874
void operator()( const Range& range ) const
@@ -951,6 +956,24 @@ struct CalcVerticalSums: public ParallelLoopBody
951956
const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1-1)*D);
952957
const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*D, 0);
953958

959+
#if CV_SIMD128
960+
if( useSIMD )
961+
{
962+
for( d = 0; d < D; d += 8 )
963+
{
964+
v_int16x8 hv = v_load(hsumAdd + x - D + d);
965+
v_int16x8 Cx = v_load(Cprev + x + d);
966+
v_int16x8 psub = v_load(pixSub + d);
967+
v_int16x8 padd = v_load(pixAdd + d);
968+
hv = (hv - psub + padd);
969+
psub = v_load(hsumSub + x + d);
970+
Cx = Cx - psub + hv;
971+
v_store(hsumAdd + x + d, hv);
972+
v_store(C + x + d, Cx);
973+
}
974+
}
975+
else
976+
#endif
954977
{
955978
for( d = 0; d < D; d++ )
956979
{
@@ -1010,6 +1033,46 @@ struct CalcVerticalSums: public ParallelLoopBody
10101033
const CostType* Cp = C + x*D;
10111034
CostType* Sp = S + x*D;
10121035

1036+
#if CV_SIMD128
1037+
if( useSIMD )
1038+
{
1039+
v_int16x8 _P1 = v_setall_s16((short)P1);
1040+
1041+
v_int16x8 _delta = v_setall_s16((short)delta);
1042+
v_int16x8 _minL = v_setall_s16((short)MAX_COST);
1043+
1044+
for( d = 0; d < D; d += 8 )
1045+
{
1046+
v_int16x8 Cpd = v_load(Cp + d);
1047+
v_int16x8 L;
1048+
1049+
L = v_load(Lr_ppr + d);
1050+
1051+
L = v_min(L, (v_load(Lr_ppr + d - 1) + _P1));
1052+
L = v_min(L, (v_load(Lr_ppr + d + 1) + _P1));
1053+
1054+
L = v_min(L, _delta);
1055+
L = ((L - _delta) + Cpd);
1056+
1057+
v_store(Lr_p + d, L);
1058+
1059+
// Accumulate the element-wise minimum of L; it is reduced to a scalar after the loop
1060+
_minL = v_min(_minL, L);
1061+
1062+
v_int16x8 Sval = v_load(Sp + d);
1063+
1064+
Sval = Sval + L;
1065+
1066+
v_store(Sp + d, Sval);
1067+
}
1068+
1069+
v_int32x4 min1, min2, min12;
1070+
v_expand(_minL, min1, min2);
1071+
min12 = v_min(min1,min2);
1072+
minLr[0][x] = (CostType)v_reduce_min(min12);
1073+
}
1074+
else
1075+
#endif
10131076
{
10141077
int minL = MAX_COST;
10151078

@@ -1058,6 +1121,7 @@ struct CalcVerticalSums: public ParallelLoopBody
10581121
size_t LrSize;
10591122
size_t hsumBufNRows;
10601123
int ftzero;
1124+
bool useSIMD;
10611125
};
10621126

10631127
struct CalcHorizontalSums: public ParallelLoopBody
@@ -1085,6 +1149,7 @@ struct CalcHorizontalSums: public ParallelLoopBody
10851149
LrSize = 2 * D2;
10861150
Cbuf = alignedBuf;
10871151
Sbuf = Cbuf + CSBufSize;
1152+
useSIMD = hasSIMD128();
10881153
}
10891154

10901155
void operator()( const Range& range ) const
@@ -1138,20 +1203,60 @@ struct CalcHorizontalSums: public ParallelLoopBody
11381203
const CostType* Cp = C + x*D;
11391204
CostType* Sp = S + x*D;
11401205

1141-
int minL = MAX_COST;
1206+
#if CV_SIMD128
1207+
if( useSIMD )
1208+
{
1209+
v_int16x8 _P1 = v_setall_s16((short)P1);
1210+
1211+
v_int16x8 _delta = v_setall_s16((short)delta);
1212+
v_int16x8 _minL = v_setall_s16((short)MAX_COST);
11421213

1143-
for( d = 0; d < D; d++ )
1214+
for( d = 0; d < D; d += 8 )
1215+
{
1216+
v_int16x8 Cpd = v_load(Cp + d);
1217+
v_int16x8 L;
1218+
1219+
L = v_load(Lr_ppr + d);
1220+
1221+
L = v_min(L, (v_load(Lr_ppr + d - 1) + _P1));
1222+
L = v_min(L, (v_load(Lr_ppr + d + 1) + _P1));
1223+
1224+
L = v_min(L, _delta);
1225+
L = ((L - _delta) + Cpd);
1226+
1227+
v_store(Lr_p + d, L);
1228+
1229+
// Accumulate the element-wise minimum of L; it is reduced to a scalar after the loop
1230+
_minL = v_min(_minL, L);
1231+
1232+
v_int16x8 Sval = v_load(Sp + d);
1233+
1234+
Sval = Sval + L;
1235+
1236+
v_store(Sp + d, Sval);
1237+
}
1238+
1239+
v_int32x4 min1, min2, min12;
1240+
v_expand(_minL, min1, min2);
1241+
min12 = v_min(min1,min2);
1242+
minLr = (CostType)v_reduce_min(min12);
1243+
}
1244+
else
1245+
#endif
11441246
{
1145-
int Cpd = Cp[d], L;
1247+
minLr = MAX_COST;
1248+
for( d = 0; d < D; d++ )
1249+
{
1250+
int Cpd = Cp[d], L;
11461251

1147-
L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
1252+
L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
11481253

1149-
Lr_p[d] = (CostType)L;
1150-
minL = std::min(minL, L);
1254+
Lr_p[d] = (CostType)L;
1255+
minLr = (CostType)std::min((int)minLr, L);
11511256

1152-
Sp[d] = saturate_cast<CostType>(Sp[d] + L);
1257+
Sp[d] = saturate_cast<CostType>(Sp[d] + L);
1258+
}
11531259
}
1154-
minLr = (CostType)minL;
11551260
}
11561261

11571262
memset( Lr - 8, 0, LrSize*sizeof(CostType) );
@@ -1169,26 +1274,82 @@ struct CalcHorizontalSums: public ParallelLoopBody
11691274
const CostType* Cp = C + x*D;
11701275
CostType* Sp = S + x*D;
11711276
int minS = MAX_COST, bestDisp = -1;
1277+
minLr = MAX_COST;
11721278

1173-
int minL = MAX_COST;
1174-
1175-
for( d = 0; d < D; d++ )
1279+
#if CV_SIMD128
1280+
if( useSIMD )
11761281
{
1177-
int Cpd = Cp[d], L;
1282+
v_int16x8 _P1 = v_setall_s16((short)P1);
11781283

1179-
L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
1284+
v_int16x8 _delta = v_setall_s16((short)delta);
1285+
v_int16x8 _minL = v_setall_s16((short)MAX_COST);
11801286

1181-
Lr_p[d] = (CostType)L;
1182-
minL = std::min(minL, L);
1287+
v_int16x8 _minS = v_setall_s16(MAX_COST), _bestDisp = v_setall_s16(-1);
1288+
v_int16x8 _d8 = v_int16x8(0, 1, 2, 3, 4, 5, 6, 7), _8 = v_setall_s16(8);
11831289

1184-
Sp[d] = saturate_cast<CostType>(Sp[d] + L);
1185-
if( Sp[d] < minS )
1290+
for( d = 0; d < D; d+= 8 )
11861291
{
1187-
minS = Sp[d];
1188-
bestDisp = d;
1292+
v_int16x8 Cpd = v_load(Cp + d);
1293+
v_int16x8 L;
1294+
1295+
L = v_load(Lr_ppr + d);
1296+
1297+
L = v_min(L, (v_load(Lr_ppr + d - 1) + _P1));
1298+
L = v_min(L, (v_load(Lr_ppr + d + 1) + _P1));
1299+
1300+
L = v_min(L, _delta);
1301+
L = ((L - _delta) + Cpd);
1302+
1303+
v_store(Lr_p + d, L);
1304+
1305+
// Accumulate the element-wise minimum of L; it is reduced to a scalar after the loop
1306+
_minL = v_min(_minL, L);
1307+
1308+
v_int16x8 Sval = v_load(Sp + d);
1309+
1310+
Sval = Sval + L;
1311+
1312+
v_int16x8 mask = Sval < _minS;
1313+
_minS = v_min( Sval, _minS );
1314+
_bestDisp = _bestDisp ^ ((_bestDisp ^ _d8) & mask);
1315+
_d8 = _d8 + _8;
1316+
1317+
v_store(Sp + d, Sval);
1318+
}
1319+
v_int32x4 min1, min2, min12;
1320+
v_expand(_minL, min1, min2);
1321+
min12 = v_min(min1,min2);
1322+
minLr = (CostType)v_reduce_min(min12);
1323+
1324+
v_int32x4 _d0, _d1;
1325+
v_expand(_minS, _d0, _d1);
1326+
minS = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1));
1327+
v_int16x8 v_mask = v_setall_s16((short)minS) == _minS;
1328+
1329+
_bestDisp = (_bestDisp & v_mask) | (v_setall_s16(SHRT_MAX) & ~v_mask);
1330+
v_expand(_bestDisp, _d0, _d1);
1331+
bestDisp = (int)std::min(v_reduce_min(_d0), v_reduce_min(_d1));
1332+
}
1333+
else
1334+
#endif
1335+
{
1336+
for( d = 0; d < D; d++ )
1337+
{
1338+
int Cpd = Cp[d], L;
1339+
1340+
L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
1341+
1342+
Lr_p[d] = (CostType)L;
1343+
minLr = (CostType)std::min((int)minLr, L);
1344+
1345+
Sp[d] = saturate_cast<CostType>(Sp[d] + L);
1346+
if( Sp[d] < minS )
1347+
{
1348+
minS = Sp[d];
1349+
bestDisp = d;
1350+
}
11891351
}
11901352
}
1191-
minLr = (CostType)minL;
11921353
//Some postprocessing procedures and saving
11931354
for( d = 0; d < D; d++ )
11941355
{
@@ -1263,6 +1424,7 @@ struct CalcHorizontalSums: public ParallelLoopBody
12631424
int INVALID_DISP_SCALED;
12641425
int uniquenessRatio;
12651426
int disp12MaxDiff;
1427+
bool useSIMD;
12661428
};
12671429
/*
12681430
computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.

0 commit comments

Comments
 (0)