76
76
* - remove pts for a small speedup
77
77
* 12/10/17
78
78
* - fix leak of vectors, thanks MHeimbuc
79
+ * 14/10/17
80
+ * - switch to half-float for vector path
79
81
*/
80
82
81
83
/*
108
110
/*
109
111
#define DEBUG
110
112
#define DEBUG_PIXELS
111
- */
112
113
#define DEBUG_COMPILE
114
+ */
113
115
114
116
#ifdef HAVE_CONFIG_H
115
117
#include <config.h>
124
126
125
127
#include "pconvolution.h"
126
128
127
- /* We do the 8-bit vector path with fixed-point arithmetic. We use 3.5 bits
128
- * for the mask coefficients, so our range is -4 to +3.99, after using scale
129
- * on the mask.
130
- */
131
- #define FIXED_BITS (5)
132
- #define FIXED_SCALE (1 << FIXED_BITS)
133
-
134
129
/* Larger than this and we fall back to C.
135
130
*/
136
131
#define MAX_PASS (20)
@@ -154,22 +149,22 @@ typedef struct {
154
149
/* An int version of M.
155
150
*/
156
151
VipsImage * iM ;
152
+ int n_point ; /* w * h for our matrix */
157
153
158
154
/* We make a smaller version of the mask with the zeros squeezed out.
159
155
*/
160
156
int nnz ; /* Number of non-zero mask elements */
161
157
int * coeff ; /* Array of non-zero mask coefficients */
162
158
int * coeff_pos ; /* Index of each nnz element in mask->coeff */
163
159
164
- /* And a half float version for a vector path. mant has the signed
160
+ /* And a half float version for the vector path. mant has the signed
165
161
* 8-bit mantissas in [-1, +1), sexp has the exponent shift after the
166
162
* mul and before the add, and exp has the final exponent shift before
167
163
* write-back.
168
164
*/
169
165
int * mant ;
170
166
int sexp ;
171
167
int exp ;
172
- int n_point ; /* Number of points in fixed-point array */
173
168
174
169
/* The set of passes we need for this mask.
175
170
*/
@@ -180,10 +175,6 @@ typedef struct {
180
175
*/
181
176
int r ;
182
177
VipsVector * vector ;
183
-
184
- /* Remove later.
185
- */
186
- int * fixed ;
187
178
} VipsConvi ;
188
179
189
180
typedef VipsConvolutionClass VipsConviClass ;
@@ -353,11 +344,13 @@ vips_convi_compile_section( VipsConvi *convi, VipsImage *in, Pass *pass )
353
344
354
345
char source [256 ];
355
346
char off [256 ];
347
+ char rnd [256 ];
348
+ char sexp [256 ];
356
349
char coeff [256 ];
357
350
358
351
/* Exclude zero elements.
359
352
*/
360
- if ( !convi -> fixed [i ] )
353
+ if ( !convi -> mant [i ] )
361
354
continue ;
362
355
363
356
/* The source. sl0 is the first scanline in the mask.
@@ -379,9 +372,16 @@ vips_convi_compile_section( VipsConvi *convi, VipsImage *in, Pass *pass )
379
372
* of the image and coefficient are interesting, so we can take
380
373
* the bottom half of a 16x16->32 multiply.
381
374
*/
382
- CONST ( coeff , convi -> fixed [i ], 2 );
375
+ CONST ( coeff , convi -> mant [i ], 2 );
383
376
ASM3 ( "mullw" , "value" , "value" , coeff );
384
377
378
+ /* Round and shift right before accumulating, to prevent overflow on large masks.
379
+ */
380
+ CONST ( sexp , convi -> sexp , 2 );
381
+ CONST ( rnd , 1 << (convi -> sexp - 1 ), 2 );
382
+ ASM3 ( "addw" , "value" , "value" , rnd );
383
+ ASM3 ( "shrsw" , "value" , "value" , sexp );
384
+
385
385
/* We accumulate the signed 16-bit result in sum. Saturated
386
386
* add.
387
387
*/
@@ -420,8 +420,8 @@ vips_convi_compile_clip( VipsConvi *convi )
420
420
int offset = VIPS_RINT ( vips_image_get_offset ( M ) );
421
421
422
422
VipsVector * v ;
423
- char c16 [256 ];
424
- char c5 [256 ];
423
+ char rnd [256 ];
424
+ char exp [256 ];
425
425
char c0 [256 ];
426
426
char c255 [256 ];
427
427
char off [256 ];
@@ -436,10 +436,10 @@ vips_convi_compile_clip( VipsConvi *convi )
436
436
*/
437
437
TEMP ( "value" , 2 );
438
438
439
- CONST ( c16 , 16 , 2 );
440
- ASM3 ( "addw" , "value" , "r" , c16 );
441
- CONST ( c5 , 5 , 2 );
442
- ASM3 ( "shrsw" , "value" , "value" , c5 );
439
+ CONST ( rnd , 1 << ( convi -> exp - 1 ) , 2 );
440
+ ASM3 ( "addw" , "value" , "r" , rnd );
441
+ CONST ( exp , convi -> exp , 2 );
442
+ ASM3 ( "shrsw" , "value" , "value" , exp );
443
443
444
444
CONST ( off , offset , 2 );
445
445
ASM3 ( "addw" , "value" , "value" , off );
@@ -852,8 +852,7 @@ vips__image_intize( VipsImage *in, VipsImage **out )
852
852
static int
853
853
vips_convi_intize ( VipsConvi * convi , VipsImage * M )
854
854
{
855
- int n_point = M -> Xsize * M -> Ysize ;
856
-
855
+ int n_point ;
857
856
VipsImage * t ;
858
857
double scale ;
859
858
double * scaled ;
@@ -862,6 +861,10 @@ vips_convi_intize( VipsConvi *convi, VipsImage *M )
862
861
int shift ;
863
862
int i ;
864
863
864
+ n_point = M -> Xsize * M -> Ysize ;
865
+
866
+ g_assert ( convi -> n_point == n_point );
867
+
865
868
if ( vips_check_matrix ( "vips2imask" , M , & t ) )
866
869
return ( -1 );
867
870
@@ -1003,7 +1006,7 @@ vips_convi_build( VipsObject *object )
1003
1006
1004
1007
in = convolution -> in ;
1005
1008
M = convolution -> M ;
1006
- convi -> n_point = n_point = M -> Xsize * M -> Ysize ;
1009
+ convi -> n_point = M -> Xsize * M -> Ysize ;
1007
1010
1008
1011
if ( vips_embed ( in , & t [0 ],
1009
1012
M -> Xsize / 2 , M -> Ysize / 2 ,
@@ -1042,12 +1045,15 @@ vips_convi_build( VipsObject *object )
1042
1045
convi -> iM = M = t [1 ];
1043
1046
1044
1047
coeff = VIPS_MATRIX ( M , 0 , 0 );
1048
+ n_point = M -> Xsize * M -> Ysize ;
1045
1049
if ( !(convi -> coeff = VIPS_ARRAY ( object , n_point , int )) ||
1046
- !(convi -> coeff_pos = VIPS_ARRAY ( object , n_point , int )) )
1050
+ !(convi -> coeff_pos =
1051
+ VIPS_ARRAY ( object , n_point , int )) )
1047
1052
return ( -1 );
1048
1053
1049
1054
/* Squeeze out zero mask elements.
1050
1055
*/
1056
+ convi -> nnz = 0 ;
1051
1057
for ( i = 0 ; i < n_point ; i ++ )
1052
1058
if ( coeff [i ] ) {
1053
1059
convi -> coeff [convi -> nnz ] = coeff [i ];
@@ -1127,7 +1133,7 @@ vips_convi_init( VipsConvi *convi )
1127
1133
* The output image always has the same #VipsBandFormat as the input image.
1128
1134
*
1129
1135
* For #VIPS_FORMAT_UCHAR images, vips_convi() uses a fast vector path based on
1130
- * fixed-point arithmetic. This can produce slightly different results.
1136
+ * half-float arithmetic. Results can differ slightly from the non-vector path.
1131
1137
* Disable the vector path with `--vips-novector` or `VIPS_NOVECTOR` or
1132
1138
* vips_vector_set_enabled().
1133
1139
*
0 commit comments