Skip to content

Commit 0f86453

Browse files
authored
Merge pull request #2144 from kleisauke/ensure-composite-alignment
Ensure max_band vector is aligned on a 16-byte boundary
2 parents 3dad989 + 305714c commit 0f86453

File tree

2 files changed

+84
-28
lines changed

2 files changed

+84
-28
lines changed

configure.ac

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,7 @@ AM_GLIB_GNU_GETTEXT
272272
# [ax_gcc_version_option=yes],
273273
# [ax_gcc_version_option=no]
274274
# )
275-
AC_MSG_CHECKING([for gcc version])
275+
AC_MSG_CHECKING([for $CC version])
276276
GCC_VERSION=""
277277
version=$($CC -dumpversion)
278278
if test $? = 0; then
@@ -326,7 +326,7 @@ AC_TYPE_SIZE_T
326326

327327
# g++/gcc 4.x and 5.x have rather broken vector support ... 5.4.1 seems to
328328
# work, but 5.4.0 fails to even compile
329-
AC_MSG_CHECKING([for gcc with working vector support])
329+
AC_MSG_CHECKING([for $CC with working vector support])
330330
if test x"$GCC_VERSION_MAJOR" != x"4" -a x"$GCC_VERSION_MAJOR" != x"5"; then
331331
AC_MSG_RESULT([yes])
332332
else
@@ -339,7 +339,7 @@ if test x"$ax_cv_have_var_attribute_vector_size" = x"yes"; then
339339
AC_MSG_CHECKING([for C++ vector shuffle])
340340
AC_LANG_PUSH([C++])
341341
AC_TRY_COMPILE([
342-
typedef float v4f __attribute__((vector_size(4 * sizeof(float))));
342+
typedef float v4f __attribute__((vector_size(4 * sizeof(float)),aligned(16)));
343343
],[
344344
v4f f; f[3] = 99;
345345
],[
@@ -362,7 +362,7 @@ if test x"$have_vector_shuffle" = x"yes"; then
362362
AC_MSG_CHECKING([for C++ vector arithmetic])
363363
AC_LANG_PUSH([C++])
364364
AC_TRY_COMPILE([
365-
typedef float v4f __attribute__((vector_size(4 * sizeof(float))));
365+
typedef float v4f __attribute__((vector_size(4 * sizeof(float)),aligned(16)));
366366
],[
367367
v4f f = {1, 2, 3, 4}; f *= 12.0;
368368
v4f g = {5, 6, 7, 8}; f = g > 0 ? g : -1 * g;
@@ -382,7 +382,7 @@ if test x"$have_vector_arith" = x"yes"; then
382382
AC_MSG_CHECKING([for C++ signed constants in vector templates])
383383
AC_LANG_PUSH([C++])
384384
AC_TRY_COMPILE([
385-
typedef float v4f __attribute__((vector_size(4 * sizeof(float))));
385+
typedef float v4f __attribute__((vector_size(4 * sizeof(float)),aligned(16)));
386386
template <typename T>
387387
static void
388388
h( v4f B )
@@ -409,7 +409,7 @@ fi
409409
AC_FUNC_MEMCMP
410410
AC_FUNC_MMAP
411411
AC_FUNC_VPRINTF
412-
AC_CHECK_FUNCS([getcwd gettimeofday getwd memset munmap putenv realpath strcasecmp strchr strcspn strdup strerror strrchr strspn vsnprintf realpath mkstemp mktemp random rand sysconf atexit])
412+
AC_CHECK_FUNCS([getcwd gettimeofday getwd memset munmap putenv realpath strcasecmp strchr strcspn strdup strerror strrchr strspn vsnprintf realpath mkstemp mktemp random rand sysconf atexit _aligned_malloc posix_memalign memalign])
413413
AC_CHECK_LIB(m,cbrt,[AC_DEFINE(HAVE_CBRT,1,[have cbrt() in libm.])])
414414
AC_CHECK_LIB(m,hypot,[AC_DEFINE(HAVE_HYPOT,1,[have hypot() in libm.])])
415415
AC_CHECK_LIB(m,atan2,[AC_DEFINE(HAVE_ATAN2,1,[have atan2() in libm.])])

libvips/conversion/composite.cpp

Lines changed: 78 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -55,13 +55,17 @@
5555

5656
#include <stdio.h>
5757
#include <string.h>
58-
#if _MSC_VER
58+
#ifdef _MSC_VER
5959
#include <cstdlib>
6060
#else
6161
#include <stdlib.h>
6262
#endif
6363
#include <math.h>
6464

65+
#if defined(HAVE__ALIGNED_MALLOC) || defined(HAVE_MEMALIGN)
66+
#include <malloc.h>
67+
#endif
68+
6569
#include <vips/vips.h>
6670
#include <vips/internal.h>
6771
#include <vips/debug.h>
@@ -81,7 +85,7 @@
8185
#ifdef HAVE_VECTOR_ARITH
8286
/* A vector of four floats.
8387
*/
84-
typedef float v4f __attribute__((vector_size(4 * sizeof(float))));
88+
typedef float v4f __attribute__((vector_size(4 * sizeof(float)),aligned(16)));
8589
#endif /*HAVE_VECTOR_ARITH*/
8690

8791
typedef struct _VipsCompositeBase {
@@ -130,12 +134,6 @@ typedef struct _VipsCompositeBase {
130134
*/
131135
gboolean skippable;
132136

133-
#ifdef HAVE_VECTOR_ARITH
134-
/* max_band as a vector, for the RGBA case.
135-
*/
136-
v4f max_band_vec;
137-
#endif /*HAVE_VECTOR_ARITH*/
138-
139137
} VipsCompositeBase;
140138

141139
typedef VipsConversionClass VipsCompositeBaseClass;
@@ -168,6 +166,14 @@ vips_composite_base_dispose( GObject *gobject )
168166
/* Our sequence value.
169167
*/
170168
typedef struct {
169+
#ifdef HAVE_VECTOR_ARITH
170+
/* max_band as a vector, for the RGBA case. This must be
171+
* defined first to ensure that the member is aligned
172+
* on a 16-byte boundary.
173+
*/
174+
v4f max_band_vec;
175+
#endif /*HAVE_VECTOR_ARITH*/
176+
171177
VipsCompositeBase *composite;
172178

173179
/* Full set of input regions, each made on the corresponding input
@@ -196,6 +202,39 @@ typedef struct {
196202

197203
} VipsCompositeSequence;
198204

205+
#ifdef HAVE_VECTOR_ARITH
206+
/* Allocate aligned memory. The return value can be released
207+
* by calling the vips_free_aligned() function, for example:
208+
* VIPS_FREEF( vips_free_aligned, ptr );
209+
*/
210+
static inline void *
211+
vips_alloc_aligned( size_t sz, size_t align )
212+
{
/* Request sz bytes aligned to align bytes from whichever platform
 * allocator was selected at compile time, and return that pointer.
 */
213+
/* Assert that align has at most one bit set. Note that align == 0
 * also satisfies this expression.
 */
g_assert( !(align & (align - 1)) );
214+
/* The branches are tried in this order, keyed on the HAVE_* macros
 * (presumably produced by the configure function checks -- confirm).
 */
#ifdef HAVE__ALIGNED_MALLOC
215+
/* Argument order here is size, then alignment.
 */
return _aligned_malloc( sz, align );
216+
#elif defined(HAVE_POSIX_MEMALIGN)
217+
void *ptr;
218+
/* A non-zero result from posix_memalign() is reported to the caller
 * as NULL.
 */
if( posix_memalign( &ptr, align, sz ) ) return NULL;
219+
return ptr;
220+
#elif defined(HAVE_MEMALIGN)
221+
/* Argument order here is alignment, then size.
 */
return memalign( align, sz );
222+
#else
223+
/* None of the three allocators is available: stop the build.
 */
#error Missing aligned alloc implementation
224+
#endif
225+
}
226+
227+
static inline void
228+
vips_free_aligned( void* ptr )
229+
{
/* Release ptr with the deallocator that pairs with the allocator
 * branch chosen by the same HAVE_* macros in vips_alloc_aligned().
 */
230+
#ifdef HAVE__ALIGNED_MALLOC
231+
/* Blocks obtained from _aligned_malloc() are handed to
 * _aligned_free().
 */
_aligned_free( ptr );
232+
#else /*defined(HAVE_POSIX_MEMALIGN) || defined(HAVE_MEMALIGN)*/
233+
/* Both the posix_memalign() and memalign() branches release
 * through plain free().
 */
free( ptr );
234+
#endif
235+
}
236+
#endif /*HAVE_VECTOR_ARITH*/
237+
199238
static int
200239
vips_composite_stop( void *vseq, void *a, void *b )
201240
{
@@ -216,7 +255,11 @@ vips_composite_stop( void *vseq, void *a, void *b )
216255
VIPS_FREE( seq->enabled );
217256
VIPS_FREE( seq->p );
218257

258+
#ifdef HAVE_VECTOR_ARITH
259+
VIPS_FREEF( vips_free_aligned, seq );
260+
#else /*!defined(HAVE_VECTOR_ARITH)*/
219261
VIPS_FREE( seq );
262+
#endif /*HAVE_VECTOR_ARITH*/
220263

221264
return( 0 );
222265
}
@@ -230,7 +273,14 @@ vips_composite_start( VipsImage *out, void *a, void *b )
230273
VipsCompositeSequence *seq;
231274
int i, n;
232275

276+
#ifdef HAVE_VECTOR_ARITH
277+
/* Ensure that the memory is aligned on a 16-byte boundary.
278+
*/
279+
if( !(seq = ((VipsCompositeSequence *) vips_alloc_aligned(
280+
sizeof( VipsCompositeSequence ), 16 ))) )
281+
#else /*!defined(HAVE_VECTOR_ARITH)*/
233282
if( !(seq = VIPS_NEW( NULL, VipsCompositeSequence )) )
283+
#endif /*HAVE_VECTOR_ARITH*/
234284
return( NULL );
235285

236286
seq->composite = composite;
@@ -280,7 +330,19 @@ vips_composite_start( VipsImage *out, void *a, void *b )
280330
return( NULL );
281331
}
282332
}
283-
333+
334+
#ifdef HAVE_VECTOR_ARITH
335+
/* We need a float version for the vector path.
336+
*/
337+
if( composite->bands == 3 )
338+
seq->max_band_vec = (v4f){
339+
(float) composite->max_band[0],
340+
(float) composite->max_band[1],
341+
(float) composite->max_band[2],
342+
(float) composite->max_band[3]
343+
};
344+
#endif
345+
284346
return( seq );
285347
}
286348

@@ -664,9 +726,11 @@ vips_composite_base_blend( VipsCompositeBase *composite,
664726
*/
665727
template <typename T>
666728
static void
667-
vips_composite_base_blend3( VipsCompositeBase *composite,
729+
vips_composite_base_blend3( VipsCompositeSequence *seq,
668730
VipsBlendMode mode, v4f &B, T * restrict p )
669731
{
732+
VipsCompositeBase *composite = seq->composite;
733+
670734
v4f A;
671735
float aA;
672736
float aB;
@@ -684,7 +748,7 @@ vips_composite_base_blend3( VipsCompositeBase *composite,
684748
A[2] = p[2];
685749
A[3] = p[3];
686750

687-
A /= composite->max_band_vec;
751+
A /= seq->max_band_vec;
688752

689753
aA = A[3];
690754
aB = B[3];
@@ -975,7 +1039,7 @@ vips_combine_pixels3( VipsCompositeSequence *seq, VipsPel *q )
9751039

9761040
/* Scale the base pixel to 0 - 1.
9771041
*/
978-
B /= composite->max_band_vec;
1042+
B /= seq->max_band_vec;
9791043
aB = B[3];
9801044

9811045
if( !composite->premultiplied ) {
@@ -987,7 +1051,7 @@ vips_combine_pixels3( VipsCompositeSequence *seq, VipsPel *q )
9871051
int j = seq->enabled[i];
9881052
VipsBlendMode m = n_mode == 1 ? mode[0] : mode[j - 1];
9891053

990-
vips_composite_base_blend3<T>( composite, m, B, tp[i] );
1054+
vips_composite_base_blend3<T>( seq, m, B, tp[i] );
9911055
}
9921056

9931057
/* Unpremultiply, if necessary.
@@ -1006,7 +1070,7 @@ vips_combine_pixels3( VipsCompositeSequence *seq, VipsPel *q )
10061070

10071071
/* Write back as a full range pixel, clipping to range.
10081072
*/
1009-
B *= composite->max_band_vec;
1073+
B *= seq->max_band_vec;
10101074
if( min_T != 0 ||
10111075
max_T != 0 ) {
10121076
float low = min_T;
@@ -1386,14 +1450,6 @@ vips_composite_base_build( VipsObject *object )
13861450
return( -1 );
13871451
}
13881452

1389-
#ifdef HAVE_VECTOR_ARITH
1390-
/* We need a float version for the vector path.
1391-
*/
1392-
if( composite->bands == 3 )
1393-
for( int b = 0; b <= 3; b++ )
1394-
composite->max_band_vec[b] = composite->max_band[b];
1395-
#endif /*HAVE_VECTOR_ARITH*/
1396-
13971453
/* Transform the input images to match in format. We may have
13981454
* mixed float and double, for example.
13991455
*/

0 commit comments

Comments
 (0)