Skip to content

Commit ef0417d

Browse files
committed
First cut at UTF-8 functions for provenance table. Taken from TRIVIAL-UTF-8.
1 parent 8ce78a6 commit ef0417d

File tree

2 files changed

+135
-0
lines changed

2 files changed

+135
-0
lines changed

quicklisp/package.lisp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,12 @@
1818
#:safely-read-file
1919
#:make-versions-url))
2020

21+
(defpackage #:ql-utf-8
22+
(:documentation
23+
"Functions to convert octet vectors encoded as UTF-8 to strings.")
24+
(:use #:cl)
25+
(:export #:decode-utf-8))
26+
2127
(defpackage #:ql-setup
2228
(:documentation
2329
"Functions and variables initialized early in the Quicklisp client

quicklisp/utf-8.lisp

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
;;;; Modified version of trivial-utf-8 by Marijn Haverbeke. See the
2+
;;;; license information at the end of this file.
3+
4+
(in-package #:ql-utf-8)
5+
6+
;;; Minimal utf-8 decoding and encoding library.
7+
;;;
8+
;;; See http://common-lisp.net/project/trivial-utf-8/
9+
10+
11+
(eval-when (:compile-toplevel :load-toplevel :execute)
12+
(defparameter *optimize* nil))
13+
14+
(deftype vector-index (&optional (size array-dimension-limit))
15+
`(integer 0 (,size)))
16+
17+
(deftype octet ()
18+
`(unsigned-byte 8))
19+
20+
(define-condition utf-8-decoding-error (simple-error)
21+
((message :initarg :message)
22+
(byte :initarg :byte :initform nil))
23+
(:report (lambda (err stream)
24+
(format stream (slot-value err 'message)
25+
(slot-value err 'byte)))))
26+
27+
(declaim (inline utf-8-group-size))
28+
(defun utf-8-group-size (byte)
29+
"Determine the amount of bytes that are part of the character
30+
starting with a given byte."
31+
(check-type byte octet)
32+
(cond ((zerop (logand byte #b10000000)) 1)
33+
((= (logand byte #b11100000) #b11000000) 2)
34+
((= (logand byte #b11110000) #b11100000) 3)
35+
((= (logand byte #b11111000) #b11110000) 4)
36+
(t (error 'utf-8-decoding-error :byte byte
37+
:message "Invalid byte at start of character: 0x~X"))))
38+
39+
(defun utf-8-string-length (bytes &key (start 0) (end (length bytes)))
40+
"Calculate the length of the string encoded by the given bytes."
41+
(check-type bytes (simple-array octet (*)))
42+
(check-type start vector-index)
43+
(check-type end vector-index)
44+
(loop :with i :of-type fixnum = start
45+
:with string-length = 0
46+
:while (< i end)
47+
:do (progn
48+
(incf (the fixnum string-length) 1)
49+
(incf i (utf-8-group-size (elt bytes i))))
50+
:finally (return string-length)))
51+
52+
(defun get-utf-8-character (bytes group-size &optional (start 0))
53+
"Given an array of bytes and the amount of bytes to use,
54+
extract the character starting at the given start position."
55+
(check-type bytes (simple-array (unsigned-byte 8) (*)))
56+
(check-type start vector-index)
57+
(check-type group-size fixnum)
58+
(macrolet ((next-byte ()
59+
'(prog1 (elt bytes start)
60+
(incf start)))
61+
(six-bits (byte)
62+
(let ((b (gensym)))
63+
`(let ((,b ,byte))
64+
(unless (= (logand ,b #b11000000) #b10000000)
65+
(error 'utf-8-decoding-error :byte ,b
66+
:message "Invalid byte 0x~X inside a character."))
67+
(ldb (byte 6 0) ,b))))
68+
(test-overlong (byte min-size)
69+
(let ((b (gensym)))
70+
`(let ((,b ,byte))
71+
(unless (>= ,b ,min-size)
72+
(error 'utf-8-decoding-error :byte ,b
73+
:message "Overlong byte sequence found."))
74+
,b))))
75+
(case group-size
76+
(1 (next-byte))
77+
(2 (test-overlong (logior (ash (ldb (byte 5 0) (next-byte)) 6)
78+
(six-bits (next-byte))) 128))
79+
(3 (test-overlong (logior (ash (ldb (byte 4 0) (next-byte)) 12)
80+
(ash (six-bits (next-byte)) 6)
81+
(six-bits (next-byte))) 2048))
82+
(4 (test-overlong (logior (ash (ldb (byte 3 0) (next-byte)) 18)
83+
(ash (six-bits (next-byte)) 12)
84+
(ash (six-bits (next-byte)) 6)
85+
(six-bits (next-byte))) 65536)))))
86+
87+
(defun decode-utf-8 (bytes-in &key (start 0) (end (length bytes-in)))
88+
"Convert a byte array containing utf-8 encoded characters into
89+
the string it encodes."
90+
(check-type bytes-in vector)
91+
(check-type start vector-index)
92+
(check-type end vector-index)
93+
(loop :with bytes = (coerce bytes-in '(simple-array (unsigned-byte 8) (*)))
94+
:with buffer = (make-string (utf-8-string-length bytes :start start :end end) :element-type 'character)
95+
:with array-position :of-type fixnum = start
96+
:with string-position :of-type fixnum = 0
97+
:while (< array-position end)
98+
:do (let* ((char (elt bytes array-position))
99+
(current-group (utf-8-group-size char)))
100+
(when (> (+ current-group array-position) end)
101+
(error 'utf-8-decoding-error
102+
:message "Unfinished character at end of byte array."))
103+
(setf (char buffer string-position)
104+
(code-char (get-utf-8-character bytes current-group
105+
array-position)))
106+
(incf string-position 1)
107+
(incf array-position current-group))
108+
:finally (return buffer)))
109+
110+
;;; Copyright (c) Marijn Haverbeke
111+
;;;
112+
;;; This software is provided 'as-is', without any express or implied
113+
;;; warranty. In no event will the authors be held liable for any
114+
;;; damages arising from the use of this software.
115+
;;;
116+
;;; Permission is granted to anyone to use this software for any
117+
;;; purpose, including commercial applications, and to alter it and
118+
;;; redistribute it freely, subject to the following restrictions:
119+
;;;
120+
;;; 1. The origin of this software must not be misrepresented; you must
121+
;;; not claim that you wrote the original software. If you use this
122+
;;; software in a product, an acknowledgment in the product
123+
;;; documentation would be appreciated but is not required.
124+
;;;
125+
;;; 2. Altered source versions must be plainly marked as such, and must
126+
;;; not be misrepresented as being the original software.
127+
;;;
128+
;;; 3. This notice may not be removed or altered from any source
129+
;;; distribution.

0 commit comments

Comments
 (0)