|
| 1 | +;;;; Modified version of trivial-utf-8 by Marijn Haverbeke. See the |
| 2 | +;;;; license information at the end of this file. |
| 3 | + |
| 4 | +(in-package #:ql-utf-8) |
| 5 | + |
| 6 | +;;; Minimal utf-8 decoding and encoding library. |
| 7 | +;;; |
| 8 | +;;; See http://common-lisp.net/project/trivial-utf-8/ |
| 9 | + |
| 10 | + |
| 11 | +(eval-when (:compile-toplevel :load-toplevel :execute) |
| 12 | + (defparameter *optimize* nil)) |
| 13 | + |
| 14 | +(deftype vector-index (&optional (size array-dimension-limit)) |
| 15 | + `(integer 0 (,size))) |
| 16 | + |
| 17 | +(deftype octet () |
| 18 | + `(unsigned-byte 8)) |
| 19 | + |
| 20 | +(define-condition utf-8-decoding-error (simple-error) |
| 21 | + ((message :initarg :message) |
| 22 | + (byte :initarg :byte :initform nil)) |
| 23 | + (:report (lambda (err stream) |
| 24 | + (format stream (slot-value err 'message) |
| 25 | + (slot-value err 'byte))))) |
| 26 | + |
| 27 | +(declaim (inline utf-8-group-size)) |
| 28 | +(defun utf-8-group-size (byte) |
| 29 | + "Determine the amount of bytes that are part of the character |
| 30 | +starting with a given byte." |
| 31 | + (check-type byte octet) |
| 32 | + (cond ((zerop (logand byte #b10000000)) 1) |
| 33 | + ((= (logand byte #b11100000) #b11000000) 2) |
| 34 | + ((= (logand byte #b11110000) #b11100000) 3) |
| 35 | + ((= (logand byte #b11111000) #b11110000) 4) |
| 36 | + (t (error 'utf-8-decoding-error :byte byte |
| 37 | + :message "Invalid byte at start of character: 0x~X")))) |
| 38 | + |
| 39 | +(defun utf-8-string-length (bytes &key (start 0) (end (length bytes))) |
| 40 | + "Calculate the length of the string encoded by the given bytes." |
| 41 | + (check-type bytes (simple-array octet (*))) |
| 42 | + (check-type start vector-index) |
| 43 | + (check-type end vector-index) |
| 44 | + (loop :with i :of-type fixnum = start |
| 45 | + :with string-length = 0 |
| 46 | + :while (< i end) |
| 47 | + :do (progn |
| 48 | + (incf (the fixnum string-length) 1) |
| 49 | + (incf i (utf-8-group-size (elt bytes i)))) |
| 50 | + :finally (return string-length))) |
| 51 | + |
| 52 | +(defun get-utf-8-character (bytes group-size &optional (start 0)) |
| 53 | + "Given an array of bytes and the amount of bytes to use, |
| 54 | +extract the character starting at the given start position." |
| 55 | + (check-type bytes (simple-array (unsigned-byte 8) (*))) |
| 56 | + (check-type start vector-index) |
| 57 | + (check-type group-size fixnum) |
| 58 | + (macrolet ((next-byte () |
| 59 | + '(prog1 (elt bytes start) |
| 60 | + (incf start))) |
| 61 | + (six-bits (byte) |
| 62 | + (let ((b (gensym))) |
| 63 | + `(let ((,b ,byte)) |
| 64 | + (unless (= (logand ,b #b11000000) #b10000000) |
| 65 | + (error 'utf-8-decoding-error :byte ,b |
| 66 | + :message "Invalid byte 0x~X inside a character.")) |
| 67 | + (ldb (byte 6 0) ,b)))) |
| 68 | + (test-overlong (byte min-size) |
| 69 | + (let ((b (gensym))) |
| 70 | + `(let ((,b ,byte)) |
| 71 | + (unless (>= ,b ,min-size) |
| 72 | + (error 'utf-8-decoding-error :byte ,b |
| 73 | + :message "Overlong byte sequence found.")) |
| 74 | + ,b)))) |
| 75 | + (case group-size |
| 76 | + (1 (next-byte)) |
| 77 | + (2 (test-overlong (logior (ash (ldb (byte 5 0) (next-byte)) 6) |
| 78 | + (six-bits (next-byte))) 128)) |
| 79 | + (3 (test-overlong (logior (ash (ldb (byte 4 0) (next-byte)) 12) |
| 80 | + (ash (six-bits (next-byte)) 6) |
| 81 | + (six-bits (next-byte))) 2048)) |
| 82 | + (4 (test-overlong (logior (ash (ldb (byte 3 0) (next-byte)) 18) |
| 83 | + (ash (six-bits (next-byte)) 12) |
| 84 | + (ash (six-bits (next-byte)) 6) |
| 85 | + (six-bits (next-byte))) 65536))))) |
| 86 | + |
| 87 | +(defun decode-utf-8 (bytes-in &key (start 0) (end (length bytes-in))) |
| 88 | + "Convert a byte array containing utf-8 encoded characters into |
| 89 | +the string it encodes." |
| 90 | + (check-type bytes-in vector) |
| 91 | + (check-type start vector-index) |
| 92 | + (check-type end vector-index) |
| 93 | + (loop :with bytes = (coerce bytes-in '(simple-array (unsigned-byte 8) (*))) |
| 94 | + :with buffer = (make-string (utf-8-string-length bytes :start start :end end) :element-type 'character) |
| 95 | + :with array-position :of-type fixnum = start |
| 96 | + :with string-position :of-type fixnum = 0 |
| 97 | + :while (< array-position end) |
| 98 | + :do (let* ((char (elt bytes array-position)) |
| 99 | + (current-group (utf-8-group-size char))) |
| 100 | + (when (> (+ current-group array-position) end) |
| 101 | + (error 'utf-8-decoding-error |
| 102 | + :message "Unfinished character at end of byte array.")) |
| 103 | + (setf (char buffer string-position) |
| 104 | + (code-char (get-utf-8-character bytes current-group |
| 105 | + array-position))) |
| 106 | + (incf string-position 1) |
| 107 | + (incf array-position current-group)) |
| 108 | + :finally (return buffer))) |
| 109 | + |
| 110 | +;;; Copyright (c) Marijn Haverbeke |
| 111 | +;;; |
| 112 | +;;; This software is provided 'as-is', without any express or implied |
| 113 | +;;; warranty. In no event will the authors be held liable for any |
| 114 | +;;; damages arising from the use of this software. |
| 115 | +;;; |
| 116 | +;;; Permission is granted to anyone to use this software for any |
| 117 | +;;; purpose, including commercial applications, and to alter it and |
| 118 | +;;; redistribute it freely, subject to the following restrictions: |
| 119 | +;;; |
| 120 | +;;; 1. The origin of this software must not be misrepresented; you must |
| 121 | +;;; not claim that you wrote the original software. If you use this |
| 122 | +;;; software in a product, an acknowledgment in the product |
| 123 | +;;; documentation would be appreciated but is not required. |
| 124 | +;;; |
| 125 | +;;; 2. Altered source versions must be plainly marked as such, and must |
| 126 | +;;; not be misrepresented as being the original software. |
| 127 | +;;; |
| 128 | +;;; 3. This notice may not be removed or altered from any source |
| 129 | +;;; distribution. |
0 commit comments