Skip to content

Commit b2527da

Browse files
authored
allow utf-8 header for svg detection (libvips#2481)
* allow utf-8 header for svg detection

  We were checking that the first 24 chars of an SVG were plain ASCII, but that's not always the case, for example:

  <svg id="レイヤー_1のコピー" data-name="レイヤー 1のコピー" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100"> </svg>

  We now test for the string "<svg" being in the first 1000 bytes, and everything up to that being valid utf-8. See libvips#2438

* raise priority of webpload

  It was very low priority before, for some reason.
1 parent e1a7063 commit b2527da

File tree

2 files changed

+89
-16
lines changed

2 files changed

+89
-16
lines changed

libvips/foreign/svgload.c

Lines changed: 87 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
* 11/6/21
2323
* - switch to rsvg_handle_render_document()
2424
* - librsvg can no longer render very large images :(
25+
* 14/10/21
26+
* - allow utf-8 headers for svg detection
2527
*/
2628

2729
/*
@@ -131,6 +133,85 @@ vips_foreign_load_svg_zfree( void *opaque, void *ptr )
131133
}
132134
#endif /*HANDLE_SVGZ*/
133135

136+
/* Find a utf-8 substring within the first len_bytes (not characters).
137+
*
138+
* - case-insensitive
139+
* - needle must be zero-terminated, but haystack need not be
140+
* - haystack can be null-terminated
141+
* - if haystack is shorter than len bytes, that'll end the search
142+
* - if we hit invalid utf-8, we return NULL
143+
*/
144+
static const char *
145+
vips_utf8_strcasestr( const char *haystack_start, const char *needle_start,
146+
int len_bytes )
147+
{
148+
int needle_len = g_utf8_strlen( needle_start, -1 );
149+
int needle_len_bytes = strlen( needle_start );
150+
151+
const char *haystack;
152+
153+
for( haystack = haystack_start;
154+
haystack - haystack_start <= len_bytes - needle_len_bytes;
155+
haystack = g_utf8_find_next_char( haystack, NULL ) ) {
156+
const char *needle_char;
157+
const char *haystack_char;
158+
int i;
159+
160+
haystack_char = haystack;
161+
needle_char = needle_start;
162+
for( i = 0; i < needle_len; i++ ) {
163+
/* Haystack isn't necessarily null-terminated and
164+
* might end half-way through a utf-8 character, so we
165+
* need to be careful not to run off the end.
166+
*/
167+
gunichar a =
168+
g_utf8_get_char_validated( haystack_char,
169+
haystack_start + len_bytes - haystack );
170+
gunichar b =
171+
g_utf8_get_char_validated( needle_char, -1 );
172+
173+
/* Invalid utf8?
174+
*
175+
* gunichar is a uint32, so we can't compare < 0, we
176+
* have to look for -1 and -2 (the two possible error
177+
* values).
178+
*/
179+
if( a == (gunichar) -1 ||
180+
a == (gunichar) -2 ||
181+
b == (gunichar) -1 ||
182+
b == (gunichar) -2 )
183+
return( NULL );
184+
185+
/* End of haystack. There can't be a complete needle
186+
* anywhere.
187+
*/
188+
if( a == (gunichar) 0 )
189+
return( NULL );
190+
191+
/* Mismatch.
192+
*/
193+
if( g_unichar_tolower( a ) != g_unichar_tolower( b ) )
194+
break;
195+
196+
haystack_char =
197+
g_utf8_find_next_char( haystack_char,
198+
haystack_start + len_bytes );
199+
needle_char =
200+
g_utf8_find_next_char( needle_char, NULL );
201+
}
202+
203+
if( i == needle_len )
204+
/* Walked the whole of needle, so we must have found a
205+
* complete match.
206+
*/
207+
return( haystack );
208+
}
209+
210+
/* Walked the whole of haystack without finding a match.
211+
*/
212+
return( NULL );
213+
}
214+
134215
/* This is used by both the file and buffer subclasses.
135216
*/
136217
static gboolean
@@ -145,8 +226,6 @@ vips_foreign_load_svg_is_a( const void *buf, size_t len )
145226
char obuf[SVG_HEADER_SIZE];
146227
#endif /*HANDLE_SVGZ*/
147228

148-
int i;
149-
150229
/* Start with str pointing at the argument buffer, swap to it pointing
151230
* into obuf if we see zip data.
152231
*/
@@ -200,23 +279,17 @@ vips_foreign_load_svg_is_a( const void *buf, size_t len )
200279
*
201280
* But there can be a doctype in there too. And case and whitespace can
202281
* vary a lot. And the <?xml can be missing. And you can have a comment
203-
* before the <svg line.
282+
* before the <svg line. And it can be utf-8, so it may contain non-ASCII characters.
204283
*
205-
* Simple rules:
206-
* - first 24 chars are plain ascii (x09-x7F)
207-
* - first SVG_HEADER_SIZE chars contain "<svg", upper or lower case.
284+
* All we do is look for "<svg", any case, within the first
285+
* SVG_HEADER_SIZE bytes, where the bytestream up to the "<svg" is
286+
* valid utf-8.
208287
*
209288
* We could rsvg_handle_new_from_data() on the buffer, but that can be
210289
* horribly slow for large documents.
211290
*/
212-
if( len < 24 )
213-
return( 0 );
214-
for( i = 0; i < 24; i++ )
215-
if( !isascii( str[i] ) || str[i] < 9 )
216-
return( FALSE );
217-
for( i = 0; i < SVG_HEADER_SIZE && i < len - 5; i++ )
218-
if( g_ascii_strncasecmp( str + i, "<svg", 4 ) == 0 )
219-
return( TRUE );
291+
if( vips_utf8_strcasestr( str, "<svg", len ) )
292+
return( TRUE );
220293

221294
return( FALSE );
222295
}

libvips/foreign/webpload.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -166,9 +166,9 @@ vips_foreign_load_webp_class_init( VipsForeignLoadWebpClass *class )
166166
object_class->description = _( "load webp" );
167167
object_class->build = vips_foreign_load_webp_build;
168168

169-
/* is_a() is not that quick ... lower the priority.
169+
/* We are fast at is_a(), so high priority.
170170
*/
171-
foreign_class->priority = -50;
171+
foreign_class->priority = 200;
172172

173173
load_class->get_flags_filename =
174174
vips_foreign_load_webp_get_flags_filename;

0 commit comments

Comments
 (0)