Skip to content

Commit 7bb282b

Browse files
committed
Fix conversion of EUC-TW text (and add test suite)
- Treat text which ends abruptly in the middle of a multi-byte character as erroneous.
- Don't allow ASCII control characters to appear in the middle of a multi-byte character.
- If an illegal byte appears in the middle of a multi-byte character, go back to the initial state rather than trying to finish the multi-byte character.
- There was a bug in the file with the conversion tables: the 'maximum codepoint which can be converted using table A2' was computed from the size of table A1, not table A2. As a result, several hundred Unicode codepoints which should have been convertible to EUC-TW were flagged as erroneous instead.
- When a sequence which cannot possibly be a prefix of a valid multi-byte character is found, immediately flag it as an error, rather than waiting to read more bytes first.
- Allow characters in CNS-11643 plane 1 to be encoded as 4-byte sequences (although they can also be encoded as 2-byte sequences). The standard allows this for EUC-TW text.
1 parent b0eeb58 commit 7bb282b

File tree

4 files changed

+23806
-81
lines changed

4 files changed

+23806
-81
lines changed

ext/mbstring/libmbfl/filters/mbfilter_euc_tw.c

Lines changed: 69 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232

3333
#include "unicode_table_cns11643.h"
3434

35+
static int mbfl_filt_conv_euctw_wchar_flush(mbfl_convert_filter *filter);
36+
3537
static const unsigned char mblen_table_euctw[] = { /* 0xA1-0xFE */
3638
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3739
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -71,7 +73,7 @@ const struct mbfl_convert_vtbl vtbl_euctw_wchar = {
7173
mbfl_filt_conv_common_ctor,
7274
NULL,
7375
mbfl_filt_conv_euctw_wchar,
74-
mbfl_filt_conv_common_flush,
76+
mbfl_filt_conv_euctw_wchar_flush,
7577
NULL,
7678
};
7779

@@ -87,117 +89,97 @@ const struct mbfl_convert_vtbl vtbl_wchar_euctw = {
8789

8890
#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)
8991

90-
/*
91-
* EUC-TW => wchar
92-
*/
93-
int
94-
mbfl_filt_conv_euctw_wchar(int c, mbfl_convert_filter *filter)
92+
int mbfl_filt_conv_euctw_wchar(int c, mbfl_convert_filter *filter)
9593
{
96-
int c1, s, w, plane;
94+
int c1, s, w;
9795

9896
switch (filter->status) {
9997
case 0:
100-
if (c >= 0 && c < 0x80) { /* latin */
98+
if (c >= 0 && c < 0x80) { /* latin */
10199
CK((*filter->output_function)(c, filter->data));
102-
} else if (c > 0xa0 && c < 0xff) { /* dbcs first byte */
100+
} else if (((c >= 0xA1 && c <= 0xA6) || (c >= 0xC2 && c <= 0xFD)) && c != 0xC3) { /* 2-byte character, first byte */
103101
filter->status = 1;
104102
filter->cache = c;
105-
} else if (c == 0x8e) { /* mbcs first byte */
103+
} else if (c == 0x8E) { /* 4-byte character, first byte */
106104
filter->status = 2;
107-
filter->cache = c;
108105
} else {
109-
w = c & MBFL_WCSGROUP_MASK;
110-
w |= MBFL_WCSGROUP_THROUGH;
111-
CK((*filter->output_function)(w, filter->data));
106+
CK((*filter->output_function)(c | MBFL_WCSGROUP_THROUGH, filter->data));
112107
}
113108
break;
114109

115-
case 1: /* mbcs second byte */
110+
case 1: /* 2-byte character, second byte */
116111
filter->status = 0;
117112
c1 = filter->cache;
118-
if (c > 0xa0 && c < 0xff) {
119-
w = (c1 - 0xa1)*94 + (c - 0xa1);
113+
if (c > 0xA0 && c < 0xFF) {
114+
w = (c1 - 0xA1)*94 + (c - 0xA1);
120115
if (w >= 0 && w < cns11643_1_ucs_table_size) {
121116
w = cns11643_1_ucs_table[w];
122117
} else {
123118
w = 0;
124119
}
125120
if (w <= 0) {
126-
w = (c1 << 8) | c;
127-
w &= MBFL_WCSPLANE_MASK;
128-
w |= MBFL_WCSPLANE_CNS11643;
121+
w = (c1 << 8) | c | MBFL_WCSPLANE_CNS11643;
129122
}
130123
CK((*filter->output_function)(w, filter->data));
131-
} else if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */
132-
CK((*filter->output_function)(c, filter->data));
133124
} else {
134-
w = (c1 << 8) | c;
135-
w &= MBFL_WCSGROUP_MASK;
136-
w |= MBFL_WCSGROUP_THROUGH;
125+
filter->status = filter->cache = 0;
126+
w = (c1 << 8) | c | MBFL_WCSGROUP_THROUGH;
137127
CK((*filter->output_function)(w, filter->data));
138128
}
139129
break;
140130

141-
case 2: /* got 0x8e, first char */
142-
c1 = filter->cache;
143-
if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */
144-
CK((*filter->output_function)(c, filter->data));
145-
filter->status = 0;
146-
} else if (c > 0xa0 && c < 0xaf) {
131+
case 2: /* got 0x8e, second byte */
132+
if (c == 0xA1 || c == 0xA2 || c == 0xAE) {
147133
filter->status = 3;
148-
filter->cache = c - 0xa1;
134+
filter->cache = c - 0xA1;
149135
} else {
150-
w = (c1 << 8) | c;
151-
w &= MBFL_WCSGROUP_MASK;
152-
w |= MBFL_WCSGROUP_THROUGH;
136+
filter->status = filter->cache = 0;
137+
w = 0x8E00 | c | MBFL_WCSGROUP_THROUGH;
153138
CK((*filter->output_function)(w, filter->data));
154139
}
155140
break;
156141

157-
case 3: /* got 0x8e, third char */
142+
case 3: /* got 0x8e, third byte */
158143
filter->status = 0;
159144
c1 = filter->cache;
160-
if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */
161-
CK((*filter->output_function)(c, filter->data));
162-
filter->status = 0;
163-
} else if (c > 0xa0 && c < 0xff) {
145+
if (c >= 0xA1 && ((c1 == 0 && ((c >= 0xA1 && c <= 0xA6) || (c >= 0xC2 && c <= 0xFD)) && c != 0xC3) ||
146+
(c1 == 1 && c <= 0xF2) || (c1 == 13 && c <= 0xE7))) {
164147
filter->status = 4;
165-
filter->cache = (c1 << 8) + c - 0xa1;
148+
filter->cache = (c1 << 8) + c - 0xA1;
166149
} else {
167-
w = (c1 << 8) | c;
168-
w &= MBFL_WCSGROUP_MASK;
169-
w |= MBFL_WCSGROUP_THROUGH;
150+
filter->status = filter->cache = 0;
151+
w = (c1 << 8) | c | MBFL_WCSGROUP_THROUGH;
170152
CK((*filter->output_function)(w, filter->data));
171153
}
172154
break;
173155

174-
case 4: /* mbcs fourth char */
156+
case 4: /* multi-byte character, fourth byte */
175157
filter->status = 0;
176158
c1 = filter->cache;
177-
if (c1 >= 0x100 && c1 <= 0xdff && c > 0xa0 && c < 0xff) {
178-
plane = (c1 & 0xf00) >> 8;
179-
s = (c1 & 0xff)*94 + c - 0xa1;
159+
if (c1 <= 0xDFF && c > 0xA0 && c < 0xFF) {
160+
int plane = (c1 & 0xF00) >> 8; /* This is actually the CNS-11643 plane minus one */
161+
s = (c1 & 0xFF)*94 + c - 0xA1;
180162
w = 0;
181163
if (s >= 0) {
182-
if (plane == 1 && s < cns11643_2_ucs_table_size) {
164+
/* A later version of CNS-11643 moved all the characters in "plane 14" to "plane 3",
165+
* and added tens of thousands more characters in planes 4, 5, 6, and 7
166+
* We only support the older version of CNS-11643
167+
* This is the same as iconv from glibc 2.2 */
168+
if (plane == 0 && s < cns11643_1_ucs_table_size) {
169+
w = cns11643_1_ucs_table[s];
170+
} else if (plane == 1 && s < cns11643_2_ucs_table_size) {
183171
w = cns11643_2_ucs_table[s];
184-
}
185-
if (plane == 13 && s < cns11643_14_ucs_table_size) {
172+
} else if (plane == 13 && s < cns11643_14_ucs_table_size) {
186173
w = cns11643_14_ucs_table[s];
187174
}
188175
}
189176
if (w <= 0) {
190-
w = ((c1 & 0x7f) << 8) | (c & 0x7f);
191-
w &= MBFL_WCSPLANE_MASK;
192-
w |= MBFL_WCSPLANE_CNS11643;
177+
w = ((c1 & 0x7F) << 8) | (c & 0x7F) | MBFL_WCSPLANE_CNS11643;
193178
}
194179
CK((*filter->output_function)(w, filter->data));
195-
} else if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */
196-
CK((*filter->output_function)(c, filter->data));
197180
} else {
198-
w = (c1 << 8) | c | 0x8e0000;
199-
w &= MBFL_WCSGROUP_MASK;
200-
w |= MBFL_WCSGROUP_THROUGH;
181+
filter->status = filter->cache = 0;
182+
w = (c1 << 8) | c | 0x8e0000 | MBFL_WCSGROUP_THROUGH;
201183
CK((*filter->output_function)(w, filter->data));
202184
}
203185
break;
@@ -210,15 +192,10 @@ mbfl_filt_conv_euctw_wchar(int c, mbfl_convert_filter *filter)
210192
return c;
211193
}
212194

213-
/*
214-
* wchar => EUC-TW
215-
*/
216-
int
217-
mbfl_filt_conv_wchar_euctw(int c, mbfl_convert_filter *filter)
195+
int mbfl_filt_conv_wchar_euctw(int c, mbfl_convert_filter *filter)
218196
{
219-
int c1, s, plane;
197+
int s = 0;
220198

221-
s = 0;
222199
if (c >= ucs_a1_cns11643_table_min && c < ucs_a1_cns11643_table_max) {
223200
s = ucs_a1_cns11643_table[c - ucs_a1_cns11643_table_min];
224201
} else if (c >= ucs_a2_cns11643_table_min && c < ucs_a2_cns11643_table_max) {
@@ -230,36 +207,48 @@ mbfl_filt_conv_wchar_euctw(int c, mbfl_convert_filter *filter)
230207
} else if (c >= ucs_r_cns11643_table_min && c < ucs_r_cns11643_table_max) {
231208
s = ucs_r_cns11643_table[c - ucs_r_cns11643_table_min];
232209
}
210+
233211
if (s <= 0) {
234-
c1 = c & ~MBFL_WCSPLANE_MASK;
235-
if (c1 == MBFL_WCSPLANE_CNS11643) {
236-
s = c & MBFL_WCSPLANE_MASK;
237-
}
238212
if (c == 0) {
239213
s = 0;
240214
} else if (s <= 0) {
241215
s = -1;
242216
}
243217
}
218+
244219
if (s >= 0) {
245-
plane = (s & 0x1f0000) >> 16;
246-
if (plane <= 1){
247-
if (s < 0x80) { /* latin */
220+
int plane = (s & 0x1F0000) >> 16;
221+
if (plane <= 1) {
222+
if (s < 0x80) { /* latin */
248223
CK((*filter->output_function)(s, filter->data));
249224
} else {
250-
s = (s & 0xffff) | 0x8080;
251-
CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
252-
CK((*filter->output_function)(s & 0xff, filter->data));
225+
s = (s & 0xFFFF) | 0x8080;
226+
CK((*filter->output_function)((s >> 8) & 0xFF, filter->data));
227+
CK((*filter->output_function)(s & 0xFF, filter->data));
253228
}
254229
} else {
255-
s = (0x8ea00000 + (plane << 16)) | ((s & 0xffff) | 0x8080);
230+
s = (0x8EA00000 + (plane << 16)) | ((s & 0xFFFF) | 0x8080);
256231
CK((*filter->output_function)(0x8e , filter->data));
257-
CK((*filter->output_function)((s >> 16) & 0xff, filter->data));
258-
CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
259-
CK((*filter->output_function)(s & 0xff, filter->data));
232+
CK((*filter->output_function)((s >> 16) & 0xFF, filter->data));
233+
CK((*filter->output_function)((s >> 8) & 0xFF, filter->data));
234+
CK((*filter->output_function)(s & 0xFF, filter->data));
260235
}
261236
} else {
262237
CK(mbfl_filt_conv_illegal_output(c, filter));
263238
}
264239
return c;
265240
}
241+
242+
/*
 * Flush handler for the EUC-TW => wchar conversion filter.
 *
 * If the filter is still in a non-zero state (input ended after the first
 * byte(s) of a multi-byte character), the cached partial value is passed to
 * the output function tagged with MBFL_WCSGROUP_THROUGH. Afterwards the
 * filter's flush_function, if one is set, is invoked on filter->data.
 *
 * Returns 0, or -1 (via CK) when the output function returns a negative value;
 * in that case flush_function is not called.
 *
 * NOTE(review): filter->status and filter->cache are not reset here -- confirm
 * that callers never reuse the filter after flushing.
 */
static int mbfl_filt_conv_euctw_wchar_flush(mbfl_convert_filter *filter)
243+
{
244+
if (filter->status) {
245+
/* 2-byte or 4-byte character was truncated: emit the bytes cached so far as a MBFL_WCSGROUP_THROUGH value */
246+
CK((*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data));
247+
}
248+
249+
if (filter->flush_function) {
250+
(*filter->flush_function)(filter->data); /* return value is not checked */
251+
}
252+
253+
return 0;
254+
}

ext/mbstring/libmbfl/filters/unicode_table_cns11643.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3175,6 +3175,7 @@ static const unsigned int ucs_a2_cns11643_table[] = {
31753175
0x00000,0x00000,0x00000,0x00000,0x00000,0x00000,0x00000,0x00000,
31763176
0x00000,0x00000,0x00000,0x00000,0x00000,0x00000,0x00000,0x00000,
31773177
0x00000,0x00000,0x00000,0x00000,0x00000,0x00000,0x00000,0x00000,
3178+
/* 0x2400 */
31783179
0x14221,0x14222,0x14223,0x14224,0x14225,0x14226,0x14227,0x14228,
31793180
0x14229,0x1422a,0x1422b,0x1422c,0x1422d,0x1422e,0x1422f,0x14230,
31803181
0x14231,0x14232,0x14233,0x14234,0x14235,0x14236,0x14237,0x14238,
@@ -3207,6 +3208,7 @@ static const unsigned int ucs_a2_cns11643_table[] = {
32073208
0x00000,0x00000,0x00000,0x00000,0x00000,0x00000,0x00000,0x00000,
32083209
0x00000,0x00000,0x00000,0x00000,0x00000,0x00000,0x00000,0x00000,
32093210
0x00000,0x00000,0x00000,0x00000,0x00000,0x00000,0x00000,0x00000,
3211+
/* 0x2500 */
32103212
0x12339,0x00000,0x1233a,0x00000,0x00000,0x00000,0x00000,0x00000,
32113213
0x00000,0x00000,0x00000,0x00000,0x1233c,0x00000,0x00000,0x00000,
32123214
0x1233d,0x00000,0x00000,0x00000,0x1233e,0x00000,0x00000,0x00000,
@@ -3251,7 +3253,7 @@ static const unsigned int ucs_a2_cns11643_table[] = {
32513253
0x12251,0x12253,0x12252};
32523254

32533255
static const int ucs_a2_cns11643_table_min = 0x2000;
3254-
static const int ucs_a2_cns11643_table_max = 0x2000 + (sizeof (ucs_a1_cns11643_table) / sizeof (unsigned int));
3256+
static const int ucs_a2_cns11643_table_max = 0x2000 + (sizeof (ucs_a2_cns11643_table) / sizeof (unsigned int));
32553257

32563258
static const unsigned int ucs_a3_cns11643_table[] = {
32573259
/* 0x2f00 */

0 commit comments

Comments
 (0)