
Commit 77264dc

AVX optimized implementation of haar migrated to separate file
1 parent 20f603a commit 77264dc

File tree

3 files changed: +492 -372 lines changed

modules/objdetect/src/haar.avx.cpp

Lines changed: 369 additions & 0 deletions
@@ -0,0 +1,369 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                        Intel License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of Intel Corporation may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/* Haar features calculation */

#include "precomp.hpp"
#include "haar.hpp"

namespace cv_haar_avx
{

// AVX version icvEvalHidHaarClassifier. Process 8 CvHidHaarClassifiers per call. Check AVX support before invocation!!
#if CV_HAAR_USE_AVX
double icvEvalHidHaarClassifierAVX(CvHidHaarClassifier* classifier,
                                   double variance_norm_factor, size_t p_offset)
{
    int CV_DECL_ALIGNED(32) idxV[8] = { 0,0,0,0,0,0,0,0 };
    uchar flags[8] = { 0,0,0,0,0,0,0,0 };
    CvHidHaarTreeNode* nodes[8];
    double res = 0;
    uchar exitConditionFlag = 0;
    for (;;)
    {
        float CV_DECL_ALIGNED(32) tmp[8] = { 0,0,0,0,0,0,0,0 };
        nodes[0] = (classifier + 0)->node + idxV[0];
        nodes[1] = (classifier + 1)->node + idxV[1];
        nodes[2] = (classifier + 2)->node + idxV[2];
        nodes[3] = (classifier + 3)->node + idxV[3];
        nodes[4] = (classifier + 4)->node + idxV[4];
        nodes[5] = (classifier + 5)->node + idxV[5];
        nodes[6] = (classifier + 6)->node + idxV[6];
        nodes[7] = (classifier + 7)->node + idxV[7];

        __m256 t = _mm256_set1_ps(static_cast<float>(variance_norm_factor));

        t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,
                                           nodes[6]->threshold,
                                           nodes[5]->threshold,
                                           nodes[4]->threshold,
                                           nodes[3]->threshold,
                                           nodes[2]->threshold,
                                           nodes[1]->threshold,
                                           nodes[0]->threshold));

        __m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset),
                                      calc_sumf(nodes[6]->feature.rect[0], p_offset),
                                      calc_sumf(nodes[5]->feature.rect[0], p_offset),
                                      calc_sumf(nodes[4]->feature.rect[0], p_offset),
                                      calc_sumf(nodes[3]->feature.rect[0], p_offset),
                                      calc_sumf(nodes[2]->feature.rect[0], p_offset),
                                      calc_sumf(nodes[1]->feature.rect[0], p_offset),
                                      calc_sumf(nodes[0]->feature.rect[0], p_offset));

        __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,
                                      nodes[6]->feature.rect[0].weight,
                                      nodes[5]->feature.rect[0].weight,
                                      nodes[4]->feature.rect[0].weight,
                                      nodes[3]->feature.rect[0].weight,
                                      nodes[2]->feature.rect[0].weight,
                                      nodes[1]->feature.rect[0].weight,
                                      nodes[0]->feature.rect[0].weight);

        __m256 sum = _mm256_mul_ps(offset, weight);

        offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset),
                               calc_sumf(nodes[6]->feature.rect[1], p_offset),
                               calc_sumf(nodes[5]->feature.rect[1], p_offset),
                               calc_sumf(nodes[4]->feature.rect[1], p_offset),
                               calc_sumf(nodes[3]->feature.rect[1], p_offset),
                               calc_sumf(nodes[2]->feature.rect[1], p_offset),
                               calc_sumf(nodes[1]->feature.rect[1], p_offset),
                               calc_sumf(nodes[0]->feature.rect[1], p_offset));

        weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,
                               nodes[6]->feature.rect[1].weight,
                               nodes[5]->feature.rect[1].weight,
                               nodes[4]->feature.rect[1].weight,
                               nodes[3]->feature.rect[1].weight,
                               nodes[2]->feature.rect[1].weight,
                               nodes[1]->feature.rect[1].weight,
                               nodes[0]->feature.rect[1].weight);

        sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight));

        if (nodes[0]->feature.rect[2].p0)
            tmp[0] = calc_sumf(nodes[0]->feature.rect[2], p_offset) * nodes[0]->feature.rect[2].weight;
        if (nodes[1]->feature.rect[2].p0)
            tmp[1] = calc_sumf(nodes[1]->feature.rect[2], p_offset) * nodes[1]->feature.rect[2].weight;
        if (nodes[2]->feature.rect[2].p0)
            tmp[2] = calc_sumf(nodes[2]->feature.rect[2], p_offset) * nodes[2]->feature.rect[2].weight;
        if (nodes[3]->feature.rect[2].p0)
            tmp[3] = calc_sumf(nodes[3]->feature.rect[2], p_offset) * nodes[3]->feature.rect[2].weight;
        if (nodes[4]->feature.rect[2].p0)
            tmp[4] = calc_sumf(nodes[4]->feature.rect[2], p_offset) * nodes[4]->feature.rect[2].weight;
        if (nodes[5]->feature.rect[2].p0)
            tmp[5] = calc_sumf(nodes[5]->feature.rect[2], p_offset) * nodes[5]->feature.rect[2].weight;
        if (nodes[6]->feature.rect[2].p0)
            tmp[6] = calc_sumf(nodes[6]->feature.rect[2], p_offset) * nodes[6]->feature.rect[2].weight;
        if (nodes[7]->feature.rect[2].p0)
            tmp[7] = calc_sumf(nodes[7]->feature.rect[2], p_offset) * nodes[7]->feature.rect[2].weight;

        sum = _mm256_add_ps(sum, _mm256_load_ps(tmp));

        __m256 left = _mm256_set_ps(static_cast<float>(nodes[7]->left), static_cast<float>(nodes[6]->left),
                                    static_cast<float>(nodes[5]->left), static_cast<float>(nodes[4]->left),
                                    static_cast<float>(nodes[3]->left), static_cast<float>(nodes[2]->left),
                                    static_cast<float>(nodes[1]->left), static_cast<float>(nodes[0]->left));
        __m256 right = _mm256_set_ps(static_cast<float>(nodes[7]->right), static_cast<float>(nodes[6]->right),
                                     static_cast<float>(nodes[5]->right), static_cast<float>(nodes[4]->right),
                                     static_cast<float>(nodes[3]->right), static_cast<float>(nodes[2]->right),
                                     static_cast<float>(nodes[1]->right), static_cast<float>(nodes[0]->right));

        _mm256_store_si256((__m256i*)idxV, _mm256_cvttps_epi32(_mm256_blendv_ps(right, left, _mm256_cmp_ps(sum, t, _CMP_LT_OQ))));

        for (int i = 0; i < 8; i++)
        {
            if (idxV[i] <= 0)
            {
                if (!flags[i])
                {
                    exitConditionFlag++;
                    flags[i] = 1;
                    res += (classifier + i)->alpha[-idxV[i]];
                }
                idxV[i] = 0;
            }
        }
        if (exitConditionFlag == 8)
            return res;
    }
}

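// AVX version of the stump-classifier evaluation. Each of the 8 classifiers is a single-node
// stump, so no tree traversal is needed: the feature sum of each lane is compared against its
// threshold once and the selected alpha values are horizontally added. Check AVX support before invocation!!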
double icvEvalHidHaarStumpClassifierAVX(CvHidHaarClassifier* classifier,
                                        double variance_norm_factor, size_t p_offset)
{
    float CV_DECL_ALIGNED(32) tmp[8] = { 0,0,0,0,0,0,0,0 };
    CvHidHaarTreeNode* nodes[8];

    nodes[0] = classifier[0].node;
    nodes[1] = classifier[1].node;
    nodes[2] = classifier[2].node;
    nodes[3] = classifier[3].node;
    nodes[4] = classifier[4].node;
    nodes[5] = classifier[5].node;
    nodes[6] = classifier[6].node;
    nodes[7] = classifier[7].node;

    __m256 t = _mm256_set1_ps(static_cast<float>(variance_norm_factor));

    t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,
                                       nodes[6]->threshold,
                                       nodes[5]->threshold,
                                       nodes[4]->threshold,
                                       nodes[3]->threshold,
                                       nodes[2]->threshold,
                                       nodes[1]->threshold,
                                       nodes[0]->threshold));

    __m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset),
                                  calc_sumf(nodes[6]->feature.rect[0], p_offset),
                                  calc_sumf(nodes[5]->feature.rect[0], p_offset),
                                  calc_sumf(nodes[4]->feature.rect[0], p_offset),
                                  calc_sumf(nodes[3]->feature.rect[0], p_offset),
                                  calc_sumf(nodes[2]->feature.rect[0], p_offset),
                                  calc_sumf(nodes[1]->feature.rect[0], p_offset),
                                  calc_sumf(nodes[0]->feature.rect[0], p_offset));

    __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,
                                  nodes[6]->feature.rect[0].weight,
                                  nodes[5]->feature.rect[0].weight,
                                  nodes[4]->feature.rect[0].weight,
                                  nodes[3]->feature.rect[0].weight,
                                  nodes[2]->feature.rect[0].weight,
                                  nodes[1]->feature.rect[0].weight,
                                  nodes[0]->feature.rect[0].weight);

    __m256 sum = _mm256_mul_ps(offset, weight);

    offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset),
                           calc_sumf(nodes[6]->feature.rect[1], p_offset),
                           calc_sumf(nodes[5]->feature.rect[1], p_offset),
                           calc_sumf(nodes[4]->feature.rect[1], p_offset),
                           calc_sumf(nodes[3]->feature.rect[1], p_offset),
                           calc_sumf(nodes[2]->feature.rect[1], p_offset),
                           calc_sumf(nodes[1]->feature.rect[1], p_offset),
                           calc_sumf(nodes[0]->feature.rect[1], p_offset));

    weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,
                           nodes[6]->feature.rect[1].weight,
                           nodes[5]->feature.rect[1].weight,
                           nodes[4]->feature.rect[1].weight,
                           nodes[3]->feature.rect[1].weight,
                           nodes[2]->feature.rect[1].weight,
                           nodes[1]->feature.rect[1].weight,
                           nodes[0]->feature.rect[1].weight);

    sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight));

    if (nodes[0]->feature.rect[2].p0)
        tmp[0] = calc_sumf(nodes[0]->feature.rect[2], p_offset) * nodes[0]->feature.rect[2].weight;
    if (nodes[1]->feature.rect[2].p0)
        tmp[1] = calc_sumf(nodes[1]->feature.rect[2], p_offset) * nodes[1]->feature.rect[2].weight;
    if (nodes[2]->feature.rect[2].p0)
        tmp[2] = calc_sumf(nodes[2]->feature.rect[2], p_offset) * nodes[2]->feature.rect[2].weight;
    if (nodes[3]->feature.rect[2].p0)
        tmp[3] = calc_sumf(nodes[3]->feature.rect[2], p_offset) * nodes[3]->feature.rect[2].weight;
    if (nodes[4]->feature.rect[2].p0)
        tmp[4] = calc_sumf(nodes[4]->feature.rect[2], p_offset) * nodes[4]->feature.rect[2].weight;
    if (nodes[5]->feature.rect[2].p0)
        tmp[5] = calc_sumf(nodes[5]->feature.rect[2], p_offset) * nodes[5]->feature.rect[2].weight;
    if (nodes[6]->feature.rect[2].p0)
        tmp[6] = calc_sumf(nodes[6]->feature.rect[2], p_offset) * nodes[6]->feature.rect[2].weight;
    if (nodes[7]->feature.rect[2].p0)
        tmp[7] = calc_sumf(nodes[7]->feature.rect[2], p_offset) * nodes[7]->feature.rect[2].weight;

    sum = _mm256_add_ps(sum, _mm256_load_ps(tmp));

    __m256 alpha0 = _mm256_set_ps(classifier[7].alpha[0],
                                  classifier[6].alpha[0],
                                  classifier[5].alpha[0],
                                  classifier[4].alpha[0],
                                  classifier[3].alpha[0],
                                  classifier[2].alpha[0],
                                  classifier[1].alpha[0],
                                  classifier[0].alpha[0]);
    __m256 alpha1 = _mm256_set_ps(classifier[7].alpha[1],
                                  classifier[6].alpha[1],
                                  classifier[5].alpha[1],
                                  classifier[4].alpha[1],
                                  classifier[3].alpha[1],
                                  classifier[2].alpha[1],
                                  classifier[1].alpha[1],
                                  classifier[0].alpha[1]);

    __m256 outBuf = _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ));
    outBuf = _mm256_hadd_ps(outBuf, outBuf);
    outBuf = _mm256_hadd_ps(outBuf, outBuf);
    _mm256_store_ps(tmp, outBuf);
    return (tmp[0] + tmp[4]);
}

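// Same as icvEvalHidHaarStumpClassifierAVX, but for stumps whose features use only two
// rectangles (rect[2] is unused), so the third-rectangle accumulation is skipped.
// Check AVX support before invocation!!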
double icvEvalHidHaarStumpClassifierTwoRectAVX(CvHidHaarClassifier* classifier,
                                               double variance_norm_factor, size_t p_offset)
{
    float CV_DECL_ALIGNED(32) buf[8];
    CvHidHaarTreeNode* nodes[8];
    nodes[0] = classifier[0].node;
    nodes[1] = classifier[1].node;
    nodes[2] = classifier[2].node;
    nodes[3] = classifier[3].node;
    nodes[4] = classifier[4].node;
    nodes[5] = classifier[5].node;
    nodes[6] = classifier[6].node;
    nodes[7] = classifier[7].node;

    __m256 t = _mm256_set1_ps(static_cast<float>(variance_norm_factor));
    t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,
                                       nodes[6]->threshold,
                                       nodes[5]->threshold,
                                       nodes[4]->threshold,
                                       nodes[3]->threshold,
                                       nodes[2]->threshold,
                                       nodes[1]->threshold,
                                       nodes[0]->threshold));

    __m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset),
                                  calc_sumf(nodes[6]->feature.rect[0], p_offset),
                                  calc_sumf(nodes[5]->feature.rect[0], p_offset),
                                  calc_sumf(nodes[4]->feature.rect[0], p_offset),
                                  calc_sumf(nodes[3]->feature.rect[0], p_offset),
                                  calc_sumf(nodes[2]->feature.rect[0], p_offset),
                                  calc_sumf(nodes[1]->feature.rect[0], p_offset),
                                  calc_sumf(nodes[0]->feature.rect[0], p_offset));

    __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,
                                  nodes[6]->feature.rect[0].weight,
                                  nodes[5]->feature.rect[0].weight,
                                  nodes[4]->feature.rect[0].weight,
                                  nodes[3]->feature.rect[0].weight,
                                  nodes[2]->feature.rect[0].weight,
                                  nodes[1]->feature.rect[0].weight,
                                  nodes[0]->feature.rect[0].weight);

    __m256 sum = _mm256_mul_ps(offset, weight);

    offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset),
                           calc_sumf(nodes[6]->feature.rect[1], p_offset),
                           calc_sumf(nodes[5]->feature.rect[1], p_offset),
                           calc_sumf(nodes[4]->feature.rect[1], p_offset),
                           calc_sumf(nodes[3]->feature.rect[1], p_offset),
                           calc_sumf(nodes[2]->feature.rect[1], p_offset),
                           calc_sumf(nodes[1]->feature.rect[1], p_offset),
                           calc_sumf(nodes[0]->feature.rect[1], p_offset));

    weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,
                           nodes[6]->feature.rect[1].weight,
                           nodes[5]->feature.rect[1].weight,
                           nodes[4]->feature.rect[1].weight,
                           nodes[3]->feature.rect[1].weight,
                           nodes[2]->feature.rect[1].weight,
                           nodes[1]->feature.rect[1].weight,
                           nodes[0]->feature.rect[1].weight);

    sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight));

    __m256 alpha0 = _mm256_set_ps(classifier[7].alpha[0],
                                  classifier[6].alpha[0],
                                  classifier[5].alpha[0],
                                  classifier[4].alpha[0],
                                  classifier[3].alpha[0],
                                  classifier[2].alpha[0],
                                  classifier[1].alpha[0],
                                  classifier[0].alpha[0]);
    __m256 alpha1 = _mm256_set_ps(classifier[7].alpha[1],
                                  classifier[6].alpha[1],
                                  classifier[5].alpha[1],
                                  classifier[4].alpha[1],
                                  classifier[3].alpha[1],
                                  classifier[2].alpha[1],
                                  classifier[1].alpha[1],
                                  classifier[0].alpha[1]);

    _mm256_store_ps(buf, _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ)));
    return (buf[0] + buf[1] + buf[2] + buf[3] + buf[4] + buf[5] + buf[6] + buf[7]);
}

#endif //CV_HAAR_USE_AVX

}

/* End of file. */
