Merge pull request locutusjs#71 from kirilloid/master

kvz · kvz · commit e731daf2cfad · 2013-02-18T05:42:32.000-08:00
utf-16 surrogate pairs
diff --git a/functions/xml/utf8_decode.js b/functions/xml/utf8_decode.js
@@ -8,14 +8,17 @@ function utf8_decode (str_data) {
   // +   bugfixed by: Onno Marsman
   // +      input by: Brett Zamir (http://brett-zamir.me)
   // +   bugfixed by: Kevin van Zonneveld (http://kevin.vanzonneveld.net)
+  // +   bugfixed by: kirilloid
   // *     example 1: utf8_decode('Kevin van Zonneveld');
   // *     returns 1: 'Kevin van Zonneveld'
+
   var tmp_arr = [],
     i = 0,
     ac = 0,
     c1 = 0,
     c2 = 0,
-    c3 = 0;
+    c3 = 0,
+    c4 = 0;
 
   str_data += '';
 
@@ -24,19 +27,25 @@ function utf8_decode (str_data) {
     if (c1 <= 191) {
       tmp_arr[ac++] = String.fromCharCode(c1);
       i++;
-    } else if (c1 >= 192 && c1 <= 223) {
+    } else if (c1 <= 223) {
       c2 = str_data.charCodeAt(i + 1);
       tmp_arr[ac++] = String.fromCharCode(((c1 & 31) << 6) | (c2 & 63));
       i += 2;
-    } else if (c1 >= 224 && c1 <= 239) {
+    } else if (c1 <= 239) {
       // http://en.wikipedia.org/wiki/UTF-8#Codepage_layout
       c2 = str_data.charCodeAt(i + 1);
       c3 = str_data.charCodeAt(i + 2);
       tmp_arr[ac++] = String.fromCharCode(((c1 & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63));
       i += 3;
     } else {
-      tmp_arr[ac++] = String.fromCharCode(c1);
-      i++;
+      c2 = str_data.charCodeAt(i + 1);
+      c3 = str_data.charCodeAt(i + 2);
+      c4 = str_data.charCodeAt(i + 3);
+      c1 = ((c1 & 7) << 18) | ((c2 & 63) << 12) | ((c3 & 63) << 6) | (c4 & 63);
+      c1 -= 0x10000;
+      tmp_arr[ac++] = String.fromCharCode(0xD800 | ((c1>>10) & 0x3FF));
+      tmp_arr[ac++] = String.fromCharCode(0xDC00 | (c1 & 0x3FF));
+      i += 4;
     }
   }
 
diff --git a/functions/xml/utf8_encode.js b/functions/xml/utf8_encode.js
@@ -10,6 +10,7 @@ function utf8_encode (argString) {
   // +   bugfixed by: Ulrich
   // +   bugfixed by: Rafal Kukawski
   // +   improved by: kirilloid
+  // +   bugfixed by: kirilloid
   // *     example 1: utf8_encode('Kevin van Zonneveld');
   // *     returns 1: 'Kevin van Zonneveld'
 
@@ -30,9 +31,27 @@ function utf8_encode (argString) {
     if (c1 < 128) {
       end++;
     } else if (c1 > 127 && c1 < 2048) {
-      enc = String.fromCharCode((c1 >> 6) | 192, (c1 & 63) | 128);
-    } else {
-      enc = String.fromCharCode((c1 >> 12) | 224, ((c1 >> 6) & 63) | 128, (c1 & 63) | 128);
+      enc = String.fromCharCode(
+         (c1 >> 6)        | 192,
+        ( c1        & 63) | 128
+      );
+    } else if (c1 & 0xF800 != 0xD800) {
+      enc = String.fromCharCode(
+         (c1 >> 12)       | 224,
+        ((c1 >> 6)  & 63) | 128,
+        ( c1        & 63) | 128
+      );
+    } else { // surrogate pairs
+      if (c1 & 0xFC00 != 0xD800) { throw new RangeError("Unmatched trail surrogate at " + n); }
+      var c2 = string.charCodeAt(++n);
+      if (c2 & 0xFC00 != 0xDC00) { throw new RangeError("Unmatched lead surrogate at " + (n-1)); }
+      c1 = ((c1 & 0x3FF) << 10) + (c2 & 0x3FF) + 0x10000;
+      enc = String.fromCharCode(
+         (c1 >> 18)       | 240,
+        ((c1 >> 12) & 63) | 128,
+        ((c1 >> 6)  & 63) | 128,
+        ( c1        & 63) | 128
+      );
     }
     if (enc !== null) {
       if (end > start) {