Skip to content

Commit e731daf

Browse files
committed
Merge pull request locutusjs#71 from kirilloid/master
utf-16 surrogate pairs
2 parents 86156bb + 1f86e5f commit e731daf

File tree

2 files changed

+36
-8
lines changed

2 files changed

+36
-8
lines changed

functions/xml/utf8_decode.js

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,17 @@ function utf8_decode (str_data) {
88
// + bugfixed by: Onno Marsman
99
// + input by: Brett Zamir (http://brett-zamir.me)
1010
// + bugfixed by: Kevin van Zonneveld (http://kevin.vanzonneveld.net)
11+
// + bugfixed by: kirilloid
1112
// * example 1: utf8_decode('Kevin van Zonneveld');
1213
// * returns 1: 'Kevin van Zonneveld'
14+
1315
var tmp_arr = [],
1416
i = 0,
1517
ac = 0,
1618
c1 = 0,
1719
c2 = 0,
18-
c3 = 0;
20+
c3 = 0,
21+
c4 = 0;
1922

2023
str_data += '';
2124

@@ -24,19 +27,25 @@ function utf8_decode (str_data) {
2427
if (c1 <= 191) {
2528
tmp_arr[ac++] = String.fromCharCode(c1);
2629
i++;
27-
} else if (c1 >= 192 && c1 <= 223) {
30+
} else if (c1 <= 223) {
2831
c2 = str_data.charCodeAt(i + 1);
2932
tmp_arr[ac++] = String.fromCharCode(((c1 & 31) << 6) | (c2 & 63));
3033
i += 2;
31-
} else if (c1 >= 224 && c1 <= 239) {
34+
} else if (c1 <= 239) {
3235
// http://en.wikipedia.org/wiki/UTF-8#Codepage_layout
3336
c2 = str_data.charCodeAt(i + 1);
3437
c3 = str_data.charCodeAt(i + 2);
3538
tmp_arr[ac++] = String.fromCharCode(((c1 & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63));
3639
i += 3;
3740
} else {
38-
tmp_arr[ac++] = String.fromCharCode(c1);
39-
i++;
41+
c2 = str_data.charCodeAt(i + 1);
42+
c3 = str_data.charCodeAt(i + 2);
43+
c4 = str_data.charCodeAt(i + 3);
44+
c1 = ((c1 & 7) << 18) | ((c2 & 63) << 12) | ((c3 & 63) << 6) | (c4 & 63);
45+
c1 -= 0x10000;
46+
tmp_arr[ac++] = String.fromCharCode(0xD800 | ((c1>>10) & 0x3FF));
47+
tmp_arr[ac++] = String.fromCharCode(0xDC00 | (c1 & 0x3FF));
48+
i += 4;
4049
}
4150
}
4251

functions/xml/utf8_encode.js

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ function utf8_encode (argString) {
1010
// + bugfixed by: Ulrich
1111
// + bugfixed by: Rafal Kukawski
1212
// + improved by: kirilloid
13+
// + bugfixed by: kirilloid
1314
// * example 1: utf8_encode('Kevin van Zonneveld');
1415
// * returns 1: 'Kevin van Zonneveld'
1516

@@ -30,9 +31,27 @@ function utf8_encode (argString) {
3031
if (c1 < 128) {
3132
end++;
3233
} else if (c1 > 127 && c1 < 2048) {
33-
enc = String.fromCharCode((c1 >> 6) | 192, (c1 & 63) | 128);
34-
} else {
35-
enc = String.fromCharCode((c1 >> 12) | 224, ((c1 >> 6) & 63) | 128, (c1 & 63) | 128);
34+
enc = String.fromCharCode(
35+
(c1 >> 6) | 192,
36+
( c1 & 63) | 128
37+
);
38+
} else if (c1 & 0xF800 != 0xD800) {
39+
enc = String.fromCharCode(
40+
(c1 >> 12) | 224,
41+
((c1 >> 6) & 63) | 128,
42+
( c1 & 63) | 128
43+
);
44+
} else { // surrogate pairs
45+
if (c1 & 0xFC00 != 0xD800) { throw new RangeError("Unmatched trail surrogate at " + n); }
46+
var c2 = string.charCodeAt(++n);
47+
if (c2 & 0xFC00 != 0xDC00) { throw new RangeError("Unmatched lead surrogate at " + (n-1)); }
48+
c1 = ((c1 & 0x3FF) << 10) + (c2 & 0x3FF) + 0x10000;
49+
enc = String.fromCharCode(
50+
(c1 >> 18) | 240,
51+
((c1 >> 12) & 63) | 128,
52+
((c1 >> 6) & 63) | 128,
53+
( c1 & 63) | 128
54+
);
3655
}
3756
if (enc !== null) {
3857
if (end > start) {

0 commit comments

Comments
 (0)