55
55
56
56
#include < stdio.h>
57
57
#include < string.h>
58
- #if _MSC_VER
58
+ #ifdef _MSC_VER
59
59
#include < cstdlib>
60
60
#else
61
61
#include < stdlib.h>
62
62
#endif
63
63
#include < math.h>
64
64
65
+ #if defined(HAVE__ALIGNED_MALLOC) || defined(HAVE_MEMALIGN)
66
+ #include < malloc.h>
67
+ #endif
68
+
65
69
#include < vips/vips.h>
66
70
#include < vips/internal.h>
67
71
#include < vips/debug.h>
81
85
#ifdef HAVE_VECTOR_ARITH
82
86
/* A vector of four floats.
83
87
*/
84
- typedef float v4f __attribute__ ((vector_size(4 * sizeof (float ))));
88
+ typedef float v4f __attribute__ ((vector_size(4 * sizeof (float )),aligned( 16 ) ));
85
89
#endif /* HAVE_VECTOR_ARITH*/
86
90
87
91
typedef struct _VipsCompositeBase {
@@ -130,12 +134,6 @@ typedef struct _VipsCompositeBase {
130
134
*/
131
135
gboolean skippable;
132
136
133
- #ifdef HAVE_VECTOR_ARITH
134
- /* max_band as a vector, for the RGBA case.
135
- */
136
- v4f max_band_vec;
137
- #endif /* HAVE_VECTOR_ARITH*/
138
-
139
137
} VipsCompositeBase;
140
138
141
139
typedef VipsConversionClass VipsCompositeBaseClass;
@@ -168,6 +166,14 @@ vips_composite_base_dispose( GObject *gobject )
168
166
/* Our sequence value.
169
167
*/
170
168
typedef struct {
169
+ #ifdef HAVE_VECTOR_ARITH
170
+ /* max_band as a vector, for the RGBA case. This must be
171
+ * defined first to ensure that the member is aligned
172
+ * on a 16-byte boundary.
173
+ */
174
+ v4f max_band_vec;
175
+ #endif /* HAVE_VECTOR_ARITH*/
176
+
171
177
VipsCompositeBase *composite;
172
178
173
179
/* Full set of input regions, each made on the corresponding input
@@ -196,6 +202,39 @@ typedef struct {
196
202
197
203
} VipsCompositeSequence;
198
204
205
#ifdef HAVE_VECTOR_ARITH
/* Allocate @sz bytes whose address is aligned on an @align-byte boundary.
 * @align must be a non-zero power of two.
 *
 * Returns NULL if the underlying allocator fails. The result must be
 * released with vips_free_aligned(), for example:
 *	VIPS_FREEF( vips_free_aligned, ptr );
 * Do not hand it to plain free(): when the _aligned_malloc() backend is
 * selected, the matching release call is _aligned_free().
 */
static inline void *
vips_alloc_aligned( size_t sz, size_t align )
{
	/* Zero also passes the single-bit test, so reject it explicitly.
	 */
	g_assert( align != 0 && !(align & (align - 1)) );

#ifdef HAVE__ALIGNED_MALLOC
	return( _aligned_malloc( sz, align ) );
#elif defined(HAVE_POSIX_MEMALIGN)
	void *ptr;

	/* posix_memalign() fails with EINVAL unless the alignment is a
	 * multiple of sizeof( void * ). Rounding a smaller alignment up to
	 * the pointer size still satisfies the caller's request.
	 */
	if( align < sizeof( void * ) )
		align = sizeof( void * );

	if( posix_memalign( &ptr, align, sz ) )
		return( NULL );

	return( ptr );
#elif defined(HAVE_MEMALIGN)
	return( memalign( align, sz ) );
#else
#error Missing aligned alloc implementation
#endif
}

/* Release a block obtained from vips_alloc_aligned(). This picks the
 * release call that matches the allocator vips_alloc_aligned() was
 * compiled with.
 */
static inline void
vips_free_aligned( void *ptr )
{
#ifdef HAVE__ALIGNED_MALLOC
	_aligned_free( ptr );
#else /*defined(HAVE_POSIX_MEMALIGN) || defined(HAVE_MEMALIGN)*/
	free( ptr );
#endif
}
#endif /*HAVE_VECTOR_ARITH*/
237
+
199
238
static int
200
239
vips_composite_stop ( void *vseq, void *a, void *b )
201
240
{
@@ -216,7 +255,11 @@ vips_composite_stop( void *vseq, void *a, void *b )
216
255
VIPS_FREE ( seq->enabled );
217
256
VIPS_FREE ( seq->p );
218
257
258
+ #ifdef HAVE_VECTOR_ARITH
259
+ VIPS_FREEF ( vips_free_aligned, seq );
260
+ #else /* !defined(HAVE_VECTOR_ARITH)*/
219
261
VIPS_FREE ( seq );
262
+ #endif /* HAVE_VECTOR_ARITH*/
220
263
221
264
return ( 0 );
222
265
}
@@ -230,7 +273,14 @@ vips_composite_start( VipsImage *out, void *a, void *b )
230
273
VipsCompositeSequence *seq;
231
274
int i, n;
232
275
276
+ #ifdef HAVE_VECTOR_ARITH
277
+ /* Ensure that the memory is aligned on a 16-byte boundary.
278
+ */
279
+ if ( !(seq = ((VipsCompositeSequence *) vips_alloc_aligned (
280
+ sizeof ( VipsCompositeSequence ), 16 ))) )
281
+ #else /* !defined(HAVE_VECTOR_ARITH)*/
233
282
if ( !(seq = VIPS_NEW ( NULL , VipsCompositeSequence )) )
283
+ #endif /* HAVE_VECTOR_ARITH*/
234
284
return ( NULL );
235
285
236
286
seq->composite = composite;
@@ -280,7 +330,19 @@ vips_composite_start( VipsImage *out, void *a, void *b )
280
330
return ( NULL );
281
331
}
282
332
}
283
-
333
+
334
+ #ifdef HAVE_VECTOR_ARITH
335
+ /* We need a float version for the vector path.
336
+ */
337
+ if ( composite->bands == 3 )
338
+ seq->max_band_vec = (v4f){
339
+ (float ) composite->max_band [0 ],
340
+ (float ) composite->max_band [1 ],
341
+ (float ) composite->max_band [2 ],
342
+ (float ) composite->max_band [3 ]
343
+ };
344
+ #endif
345
+
284
346
return ( seq );
285
347
}
286
348
@@ -664,9 +726,11 @@ vips_composite_base_blend( VipsCompositeBase *composite,
664
726
*/
665
727
template <typename T>
666
728
static void
667
- vips_composite_base_blend3 ( VipsCompositeBase *composite,
729
+ vips_composite_base_blend3 ( VipsCompositeSequence *seq,
668
730
VipsBlendMode mode, v4f &B, T * restrict p )
669
731
{
732
+ VipsCompositeBase *composite = seq->composite ;
733
+
670
734
v4f A;
671
735
float aA;
672
736
float aB;
@@ -684,7 +748,7 @@ vips_composite_base_blend3( VipsCompositeBase *composite,
684
748
A[2 ] = p[2 ];
685
749
A[3 ] = p[3 ];
686
750
687
- A /= composite ->max_band_vec ;
751
+ A /= seq ->max_band_vec ;
688
752
689
753
aA = A[3 ];
690
754
aB = B[3 ];
@@ -975,7 +1039,7 @@ vips_combine_pixels3( VipsCompositeSequence *seq, VipsPel *q )
975
1039
976
1040
/* Scale the base pixel to 0 - 1.
977
1041
*/
978
- B /= composite ->max_band_vec ;
1042
+ B /= seq ->max_band_vec ;
979
1043
aB = B[3 ];
980
1044
981
1045
if ( !composite->premultiplied ) {
@@ -987,7 +1051,7 @@ vips_combine_pixels3( VipsCompositeSequence *seq, VipsPel *q )
987
1051
int j = seq->enabled [i];
988
1052
VipsBlendMode m = n_mode == 1 ? mode[0 ] : mode[j - 1 ];
989
1053
990
- vips_composite_base_blend3<T>( composite , m, B, tp[i] );
1054
+ vips_composite_base_blend3<T>( seq , m, B, tp[i] );
991
1055
}
992
1056
993
1057
/* Unpremultiply, if necessary.
@@ -1006,7 +1070,7 @@ vips_combine_pixels3( VipsCompositeSequence *seq, VipsPel *q )
1006
1070
1007
1071
/* Write back as a full range pixel, clipping to range.
1008
1072
*/
1009
- B *= composite ->max_band_vec ;
1073
+ B *= seq ->max_band_vec ;
1010
1074
if ( min_T != 0 ||
1011
1075
max_T != 0 ) {
1012
1076
float low = min_T;
@@ -1386,14 +1450,6 @@ vips_composite_base_build( VipsObject *object )
1386
1450
return ( -1 );
1387
1451
}
1388
1452
1389
- #ifdef HAVE_VECTOR_ARITH
1390
- /* We need a float version for the vector path.
1391
- */
1392
- if ( composite->bands == 3 )
1393
- for ( int b = 0 ; b <= 3 ; b++ )
1394
- composite->max_band_vec [b] = composite->max_band [b];
1395
- #endif /* HAVE_VECTOR_ARITH*/
1396
-
1397
1453
/* Transform the input images to match in format. We may have
1398
1454
* mixed float and double, for example.
1399
1455
*/
0 commit comments