PostgreSQL Source Code git master
wchar.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * wchar.c
4 * Functions for working with multibyte characters in various encodings.
5 *
6 * Portions Copyright (c) 1998-2025, PostgreSQL Global Development Group
7 *
8 * IDENTIFICATION
9 * src/common/wchar.c
10 *
11 *-------------------------------------------------------------------------
12 */
13#include "c.h"
14
15#include <limits.h>
16
17#include "mb/pg_wchar.h"
18#include "utils/ascii.h"
19
20
21/*
22 * In today's multibyte encodings other than UTF8, this two-byte sequence
23 * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
24 *
25 * For historical reasons, several verifychar implementations opt to reject
26 * this pair specifically. Byte pair range constraints, in encoding
27 * originator documentation, always excluded this pair. No core conversion
28 * could translate it. However, longstanding verifychar implementations
29 * accepted any non-NUL byte. big5_to_euc_tw and big5_to_mic even translate
30 * pairs not valid per encoding originator documentation. To avoid tightening
31 * core or non-core conversions in a security patch, we sought this one pair.
32 *
33 * PQescapeString() historically used spaces for BYTE1; many other values
34 * could suffice for BYTE1.
35 */
/* First byte of the deliberately-invalid two-byte sequence described above. */
#define NONUTF8_INVALID_BYTE0 (0x8d)
/* Second byte of that sequence: an ASCII space. */
#define NONUTF8_INVALID_BYTE1 (' ')
38
39
40/*
41 * Operations on multi-byte encodings are driven by a table of helper
42 * functions.
43 *
44 * To add an encoding support, define mblen(), dsplen(), verifychar() and
45 * verifystr() for the encoding. For server-encodings, also define mb2wchar()
46 * and wchar2mb() conversion functions.
47 *
48 * These functions generally assume that their input is validly formed.
49 * The "verifier" functions, further down in the file, have to be more
50 * paranoid.
51 *
52 * We expect that mblen() does not need to examine more than the first byte
53 * of the character to discover the correct length. GB18030 is an exception
54 * to that rule, though, as it also looks at second byte. But even that
55 * behaves in a predictable way, if you only pass the first byte: it will
56 * treat 4-byte encoded characters as two 2-byte encoded characters, which is
57 * good enough for all current uses.
58 *
59 * Note: for the display output of psql to work properly, the return values
60 * of the dsplen functions must conform to the Unicode standard. In particular
61 * the NUL character is zero width and control characters are generally
62 * width -1. It is recommended that non-ASCII encodings refer their ASCII
63 * subset to the ASCII routines to ensure consistency.
64 */
65
66/*
67 * SQL/ASCII
68 */
69static int
70pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
71{
72 int cnt = 0;
73
74 while (len > 0 && *from)
75 {
76 *to++ = *from++;
77 len--;
78 cnt++;
79 }
80 *to = 0;
81 return cnt;
82}
83
/*
 * Byte length of a character in a single-byte encoding: always 1, whatever
 * "s" points at.
 */
static int
pg_ascii_mblen(const unsigned char *s)
{
    (void) s;                   /* the input byte is never examined */
    return 1;
}
89
/*
 * Display width of the single byte at "s": 0 for NUL, -1 for bytes below
 * 0x20 and for 0x7f, and 1 for everything else.
 */
static int
pg_ascii_dsplen(const unsigned char *s)
{
    const unsigned char ch = *s;

    if (ch == '\0')
        return 0;

    /* control characters and DEL have no defined width */
    return (ch < 0x20 || ch == 0x7f) ? -1 : 1;
}
100
101/*
102 * EUC
103 */
104static int
105pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
106{
107 int cnt = 0;
108
109 while (len > 0 && *from)
110 {
111 if (*from == SS2 && len >= 2) /* JIS X 0201 (so called "1 byte
112 * KANA") */
113 {
114 from++;
115 *to = (SS2 << 8) | *from++;
116 len -= 2;
117 }
118 else if (*from == SS3 && len >= 3) /* JIS X 0212 KANJI */
119 {
120 from++;
121 *to = (SS3 << 16) | (*from++ << 8);
122 *to |= *from++;
123 len -= 3;
124 }
125 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
126 {
127 *to = *from++ << 8;
128 *to |= *from++;
129 len -= 2;
130 }
131 else /* must be ASCII */
132 {
133 *to = *from++;
134 len--;
135 }
136 to++;
137 cnt++;
138 }
139 *to = 0;
140 return cnt;
141}
142
143static inline int
144pg_euc_mblen(const unsigned char *s)
145{
146 int len;
147
148 if (*s == SS2)
149 len = 2;
150 else if (*s == SS3)
151 len = 3;
152 else if (IS_HIGHBIT_SET(*s))
153 len = 2;
154 else
155 len = 1;
156 return len;
157}
158
159static inline int
160pg_euc_dsplen(const unsigned char *s)
161{
162 int len;
163
164 if (*s == SS2)
165 len = 2;
166 else if (*s == SS3)
167 len = 2;
168 else if (IS_HIGHBIT_SET(*s))
169 len = 2;
170 else
171 len = pg_ascii_dsplen(s);
172 return len;
173}
174
175/*
176 * EUC_JP
177 */
178static int
179pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
180{
181 return pg_euc2wchar_with_len(from, to, len);
182}
183
/* Byte length of an EUC_JP character; delegates to the generic EUC rule. */
static int
pg_eucjp_mblen(const unsigned char *s)
{
    const int   nbytes = pg_euc_mblen(s);

    return nbytes;
}
189
190static int
191pg_eucjp_dsplen(const unsigned char *s)
192{
193 int len;
194
195 if (*s == SS2)
196 len = 1;
197 else if (*s == SS3)
198 len = 2;
199 else if (IS_HIGHBIT_SET(*s))
200 len = 2;
201 else
202 len = pg_ascii_dsplen(s);
203 return len;
204}
205
206/*
207 * EUC_KR
208 */
209static int
210pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
211{
212 return pg_euc2wchar_with_len(from, to, len);
213}
214
/* Byte length of an EUC_KR character; delegates to the generic EUC rule. */
static int
pg_euckr_mblen(const unsigned char *s)
{
    const int   nbytes = pg_euc_mblen(s);

    return nbytes;
}
220
/* Display width of an EUC_KR character; delegates to the generic EUC rule. */
static int
pg_euckr_dsplen(const unsigned char *s)
{
    const int   width = pg_euc_dsplen(s);

    return width;
}
226
227/*
228 * EUC_CN
229 *
230 */
231static int
232pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
233{
234 int cnt = 0;
235
236 while (len > 0 && *from)
237 {
238 if (*from == SS2 && len >= 3) /* code set 2 (unused?) */
239 {
240 from++;
241 *to = (SS2 << 16) | (*from++ << 8);
242 *to |= *from++;
243 len -= 3;
244 }
245 else if (*from == SS3 && len >= 3) /* code set 3 (unused ?) */
246 {
247 from++;
248 *to = (SS3 << 16) | (*from++ << 8);
249 *to |= *from++;
250 len -= 3;
251 }
252 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
253 {
254 *to = *from++ << 8;
255 *to |= *from++;
256 len -= 2;
257 }
258 else
259 {
260 *to = *from++;
261 len--;
262 }
263 to++;
264 cnt++;
265 }
266 *to = 0;
267 return cnt;
268}
269
/* Byte length of an EUC_CN character: 2 for a high-bit lead byte, else 1. */
static int
pg_euccn_mblen(const unsigned char *s)
{
    return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
281
/*
 * Display width of an EUC_CN character: 2 for a high-bit lead byte,
 * otherwise the ASCII width.
 */
static int
pg_euccn_dsplen(const unsigned char *s)
{
    if (IS_HIGHBIT_SET(*s))
        return 2;

    return pg_ascii_dsplen(s);
}
293
294/*
295 * EUC_TW
296 *
297 */
298static int
299pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
300{
301 int cnt = 0;
302
303 while (len > 0 && *from)
304 {
305 if (*from == SS2 && len >= 4) /* code set 2 */
306 {
307 from++;
308 *to = (((uint32) SS2) << 24) | (*from++ << 16);
309 *to |= *from++ << 8;
310 *to |= *from++;
311 len -= 4;
312 }
313 else if (*from == SS3 && len >= 3) /* code set 3 (unused?) */
314 {
315 from++;
316 *to = (SS3 << 16) | (*from++ << 8);
317 *to |= *from++;
318 len -= 3;
319 }
320 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 2 */
321 {
322 *to = *from++ << 8;
323 *to |= *from++;
324 len -= 2;
325 }
326 else
327 {
328 *to = *from++;
329 len--;
330 }
331 to++;
332 cnt++;
333 }
334 *to = 0;
335 return cnt;
336}
337
338static int
339pg_euctw_mblen(const unsigned char *s)
340{
341 int len;
342
343 if (*s == SS2)
344 len = 4;
345 else if (*s == SS3)
346 len = 3;
347 else if (IS_HIGHBIT_SET(*s))
348 len = 2;
349 else
350 len = 1;
351 return len;
352}
353
354static int
355pg_euctw_dsplen(const unsigned char *s)
356{
357 int len;
358
359 if (*s == SS2)
360 len = 2;
361 else if (*s == SS3)
362 len = 2;
363 else if (IS_HIGHBIT_SET(*s))
364 len = 2;
365 else
366 len = pg_ascii_dsplen(s);
367 return len;
368}
369
370/*
371 * Convert pg_wchar to EUC_* encoding.
372 * caller must allocate enough space for "to", including a trailing zero!
373 * len: length of from.
374 * "from" not necessarily null terminated.
375 */
376static int
377pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
378{
379 int cnt = 0;
380
381 while (len > 0 && *from)
382 {
383 unsigned char c;
384
385 if ((c = (*from >> 24)))
386 {
387 *to++ = c;
388 *to++ = (*from >> 16) & 0xff;
389 *to++ = (*from >> 8) & 0xff;
390 *to++ = *from & 0xff;
391 cnt += 4;
392 }
393 else if ((c = (*from >> 16)))
394 {
395 *to++ = c;
396 *to++ = (*from >> 8) & 0xff;
397 *to++ = *from & 0xff;
398 cnt += 3;
399 }
400 else if ((c = (*from >> 8)))
401 {
402 *to++ = c;
403 *to++ = *from & 0xff;
404 cnt += 2;
405 }
406 else
407 {
408 *to++ = *from;
409 cnt++;
410 }
411 from++;
412 len--;
413 }
414 *to = 0;
415 return cnt;
416}
417
418
419/*
420 * JOHAB
421 */
/* Byte length of a JOHAB character; delegates to the generic EUC rule. */
static int
pg_johab_mblen(const unsigned char *s)
{
    const int   nbytes = pg_euc_mblen(s);

    return nbytes;
}
427
/* Display width of a JOHAB character; delegates to the generic EUC rule. */
static int
pg_johab_dsplen(const unsigned char *s)
{
    const int   width = pg_euc_dsplen(s);

    return width;
}
433
434/*
435 * convert UTF8 string to pg_wchar (UCS-4)
436 * caller must allocate enough space for "to", including a trailing zero!
437 * len: length of from.
438 * "from" not necessarily null terminated.
439 */
440static int
441pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
442{
443 int cnt = 0;
444 uint32 c1,
445 c2,
446 c3,
447 c4;
448
449 while (len > 0 && *from)
450 {
451 if ((*from & 0x80) == 0)
452 {
453 *to = *from++;
454 len--;
455 }
456 else if ((*from & 0xe0) == 0xc0)
457 {
458 if (len < 2)
459 break; /* drop trailing incomplete char */
460 c1 = *from++ & 0x1f;
461 c2 = *from++ & 0x3f;
462 *to = (c1 << 6) | c2;
463 len -= 2;
464 }
465 else if ((*from & 0xf0) == 0xe0)
466 {
467 if (len < 3)
468 break; /* drop trailing incomplete char */
469 c1 = *from++ & 0x0f;
470 c2 = *from++ & 0x3f;
471 c3 = *from++ & 0x3f;
472 *to = (c1 << 12) | (c2 << 6) | c3;
473 len -= 3;
474 }
475 else if ((*from & 0xf8) == 0xf0)
476 {
477 if (len < 4)
478 break; /* drop trailing incomplete char */
479 c1 = *from++ & 0x07;
480 c2 = *from++ & 0x3f;
481 c3 = *from++ & 0x3f;
482 c4 = *from++ & 0x3f;
483 *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
484 len -= 4;
485 }
486 else
487 {
488 /* treat a bogus char as length 1; not ours to raise error */
489 *to = *from++;
490 len--;
491 }
492 to++;
493 cnt++;
494 }
495 *to = 0;
496 return cnt;
497}
498
499
500/*
501 * Trivial conversion from pg_wchar to UTF-8.
502 * caller should allocate enough space for "to"
503 * len: length of from.
504 * "from" not necessarily null terminated.
505 */
506static int
507pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
508{
509 int cnt = 0;
510
511 while (len > 0 && *from)
512 {
513 int char_len;
514
515 unicode_to_utf8(*from, to);
516 char_len = pg_utf_mblen(to);
517 cnt += char_len;
518 to += char_len;
519 from++;
520 len--;
521 }
522 *to = 0;
523 return cnt;
524}
525
526/*
527 * Return the byte length of a UTF8 character pointed to by s
528 *
529 * Note: in the current implementation we do not support UTF8 sequences
530 * of more than 4 bytes; hence do NOT return a value larger than 4.
531 * We return "1" for any leading byte that is either flat-out illegal or
532 * indicates a length larger than we support.
533 *
534 * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
535 * other places would need to be fixed to change this.
536 */
int
pg_utf_mblen(const unsigned char *s)
{
    const unsigned char lead = *s;

    if ((lead & 0x80) == 0)
        return 1;               /* 0xxxxxxx */
    if ((lead & 0xe0) == 0xc0)
        return 2;               /* 110xxxxx */
    if ((lead & 0xf0) == 0xe0)
        return 3;               /* 1110xxxx */
    if ((lead & 0xf8) == 0xf0)
        return 4;               /* 11110xxx */
#ifdef NOT_USED
    if ((lead & 0xfc) == 0xf8)
        return 5;
    if ((lead & 0xfe) == 0xfc)
        return 6;
#endif

    /* any other lead byte is reported as length 1 */
    return 1;
}
560
561/*
562 * This is an implementation of wcwidth() and wcswidth() as defined in
563 * "The Single UNIX Specification, Version 2, The Open Group, 1997"
564 * <http://www.unix.org/online.html>
565 *
566 * Markus Kuhn -- 2001-09-08 -- public domain
567 *
568 * customised for PostgreSQL
569 *
570 * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
571 */
572
/*
 * One entry of an interval table: the inclusive range first..last, as
 * searched by mbbisearch().
 */
struct mbinterval
{
    unsigned int first;
    unsigned int last;
};
578
579/* auxiliary function for binary search in interval table */
580static int
581mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
582{
583 int min = 0;
584 int mid;
585
586 if (ucs < table[0].first || ucs > table[max].last)
587 return 0;
588 while (max >= min)
589 {
590 mid = (min + max) / 2;
591 if (ucs > table[mid].last)
592 min = mid + 1;
593 else if (ucs < table[mid].first)
594 max = mid - 1;
595 else
596 return 1;
597 }
598
599 return 0;
600}
601
602
603/* The following functions define the column width of an ISO 10646
604 * character as follows:
605 *
606 * - The null character (U+0000) has a column width of 0.
607 *
608 * - Other C0/C1 control characters and DEL will lead to a return
609 * value of -1.
610 *
611 * - Non-spacing and enclosing combining characters (general
612 * category code Mn, Me or Cf in the Unicode database) have a
613 * column width of 0.
614 *
615 * - Spacing characters in the East Asian Wide (W) or East Asian
616 * FullWidth (F) category as defined in Unicode Technical
617 * Report #11 have a column width of 2.
618 *
619 * - All remaining characters (including all printable
620 * ISO 8859-1 and WGL4 characters, Unicode control characters,
621 * etc.) have a column width of 1.
622 *
623 * This implementation assumes that wchar_t characters are encoded
624 * in ISO 10646.
625 */
626
627static int
629{
632
633 /* test for 8-bit control characters */
634 if (ucs == 0)
635 return 0;
636
637 if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
638 return -1;
639
640 /*
641 * binary search in table of non-spacing characters
642 *
643 * XXX: In the official Unicode sources, it is possible for a character to
644 * be described as both non-spacing and wide at the same time. As of
645 * Unicode 13.0, treating the non-spacing property as the determining
646 * factor for display width leads to the correct behavior, so do that
647 * search first.
648 */
649 if (mbbisearch(ucs, nonspacing,
650 sizeof(nonspacing) / sizeof(struct mbinterval) - 1))
651 return 0;
652
653 /* binary search in table of wide characters */
654 if (mbbisearch(ucs, east_asian_fw,
655 sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
656 return 2;
657
658 return 1;
659}
660
/*
 * Display width of the UTF-8 character starting at "s": decode it with
 * utf8_to_unicode() and pass the result to ucs_wcwidth().
 */
static int
pg_utf_dsplen(const unsigned char *s)
{
    const int   width = ucs_wcwidth(utf8_to_unicode(s));

    return width;
}
666
667/*
668 * convert mule internal code to pg_wchar
669 * caller should allocate enough space for "to"
670 * len: length of from.
671 * "from" not necessarily null terminated.
672 */
673static int
674pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
675{
676 int cnt = 0;
677
678 while (len > 0 && *from)
679 {
680 if (IS_LC1(*from) && len >= 2)
681 {
682 *to = *from++ << 16;
683 *to |= *from++;
684 len -= 2;
685 }
686 else if (IS_LCPRV1(*from) && len >= 3)
687 {
688 from++;
689 *to = *from++ << 16;
690 *to |= *from++;
691 len -= 3;
692 }
693 else if (IS_LC2(*from) && len >= 3)
694 {
695 *to = *from++ << 16;
696 *to |= *from++ << 8;
697 *to |= *from++;
698 len -= 3;
699 }
700 else if (IS_LCPRV2(*from) && len >= 4)
701 {
702 from++;
703 *to = *from++ << 16;
704 *to |= *from++ << 8;
705 *to |= *from++;
706 len -= 4;
707 }
708 else
709 { /* assume ASCII */
710 *to = (unsigned char) *from++;
711 len--;
712 }
713 to++;
714 cnt++;
715 }
716 *to = 0;
717 return cnt;
718}
719
720/*
721 * convert pg_wchar to mule internal code
722 * caller should allocate enough space for "to"
723 * len: length of from.
724 * "from" not necessarily null terminated.
725 */
726static int
727pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
728{
729 int cnt = 0;
730
731 while (len > 0 && *from)
732 {
733 unsigned char lb;
734
735 lb = (*from >> 16) & 0xff;
736 if (IS_LC1(lb))
737 {
738 *to++ = lb;
739 *to++ = *from & 0xff;
740 cnt += 2;
741 }
742 else if (IS_LC2(lb))
743 {
744 *to++ = lb;
745 *to++ = (*from >> 8) & 0xff;
746 *to++ = *from & 0xff;
747 cnt += 3;
748 }
749 else if (IS_LCPRV1_A_RANGE(lb))
750 {
751 *to++ = LCPRV1_A;
752 *to++ = lb;
753 *to++ = *from & 0xff;
754 cnt += 3;
755 }
756 else if (IS_LCPRV1_B_RANGE(lb))
757 {
758 *to++ = LCPRV1_B;
759 *to++ = lb;
760 *to++ = *from & 0xff;
761 cnt += 3;
762 }
763 else if (IS_LCPRV2_A_RANGE(lb))
764 {
765 *to++ = LCPRV2_A;
766 *to++ = lb;
767 *to++ = (*from >> 8) & 0xff;
768 *to++ = *from & 0xff;
769 cnt += 4;
770 }
771 else if (IS_LCPRV2_B_RANGE(lb))
772 {
773 *to++ = LCPRV2_B;
774 *to++ = lb;
775 *to++ = (*from >> 8) & 0xff;
776 *to++ = *from & 0xff;
777 cnt += 4;
778 }
779 else
780 {
781 *to++ = *from & 0xff;
782 cnt += 1;
783 }
784 from++;
785 len--;
786 }
787 *to = 0;
788 return cnt;
789}
790
791/* exported for direct use by conv.c */
int
pg_mule_mblen(const unsigned char *s)
{
    if (IS_LC1(*s))
        return 2;
    if (IS_LCPRV1(*s) || IS_LC2(*s))
        return 3;
    if (IS_LCPRV2(*s))
        return 4;

    return 1;                   /* assume ASCII */
}
809
/*
 * Display width of the MULE character starting at "s".
 *
 * Note: it's not really appropriate to assume that all multibyte charsets
 * are double-wide on screen.  But this seems an okay approximation for the
 * MULE charsets we currently support.
 */
static int
pg_mule_dsplen(const unsigned char *s)
{
    if (IS_LC1(*s) || IS_LCPRV1(*s))
        return 1;
    if (IS_LC2(*s) || IS_LCPRV2(*s))
        return 2;

    return 1;                   /* assume ASCII */
}
834
835/*
836 * ISO8859-1
837 */
838static int
839pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
840{
841 int cnt = 0;
842
843 while (len > 0 && *from)
844 {
845 *to++ = *from++;
846 len--;
847 cnt++;
848 }
849 *to = 0;
850 return cnt;
851}
852
853/*
854 * Trivial conversion from pg_wchar to single byte encoding. Just ignores
855 * high bits.
856 * caller should allocate enough space for "to"
857 * len: length of from.
858 * "from" not necessarily null terminated.
859 */
860static int
861pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
862{
863 int cnt = 0;
864
865 while (len > 0 && *from)
866 {
867 *to++ = *from++;
868 len--;
869 cnt++;
870 }
871 *to = 0;
872 return cnt;
873}
874
/* Byte length of a character in a single-byte encoding: always 1. */
static int
pg_latin1_mblen(const unsigned char *s)
{
    (void) s;                   /* the input byte is never examined */
    return 1;
}
880
/* Display width of a single-byte character; uses the ASCII rule. */
static int
pg_latin1_dsplen(const unsigned char *s)
{
    const int   width = pg_ascii_dsplen(s);

    return width;
}
886
887/*
888 * SJIS
889 */
/*
 * Byte length of the SJIS character starting at "s": 1 for bytes in
 * 0xa1..0xdf (1 byte kana?), 2 for any other high-bit byte (kanji?), and 1
 * otherwise (should be ASCII).
 */
static int
pg_sjis_mblen(const unsigned char *s)
{
    const unsigned char lead = *s;

    if (lead >= 0xa1 && lead <= 0xdf)
        return 1;

    return IS_HIGHBIT_SET(lead) ? 2 : 1;
}
903
/*
 * Display width of the SJIS character starting at "s": 1 for bytes in
 * 0xa1..0xdf (1 byte kana?), 2 for any other high-bit byte (kanji?), and the
 * ASCII width otherwise.
 */
static int
pg_sjis_dsplen(const unsigned char *s)
{
    const unsigned char lead = *s;

    if (lead >= 0xa1 && lead <= 0xdf)
        return 1;
    if (IS_HIGHBIT_SET(lead))
        return 2;

    return pg_ascii_dsplen(s);
}
917
918/*
919 * Big5
920 */
/* Byte length of a Big5 character: 2 for a high-bit lead byte, else 1. */
static int
pg_big5_mblen(const unsigned char *s)
{
    return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
932
/*
 * Display width of a Big5 character: 2 for a high-bit lead byte, otherwise
 * the ASCII width.
 */
static int
pg_big5_dsplen(const unsigned char *s)
{
    if (IS_HIGHBIT_SET(*s))
        return 2;

    return pg_ascii_dsplen(s);
}
944
945/*
946 * GBK
947 */
/* Byte length of a GBK character: 2 for a high-bit lead byte, else 1. */
static int
pg_gbk_mblen(const unsigned char *s)
{
    return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
959
/*
 * Display width of a GBK character: 2 for a high-bit lead byte, otherwise
 * the ASCII width.
 */
static int
pg_gbk_dsplen(const unsigned char *s)
{
    if (IS_HIGHBIT_SET(*s))
        return 2;

    return pg_ascii_dsplen(s);
}
971
972/*
973 * UHC
974 */
/* Byte length of a UHC character: 2 for a high-bit lead byte, else 1. */
static int
pg_uhc_mblen(const unsigned char *s)
{
    return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
986
/*
 * Display width of a UHC character: 2 for a high-bit lead byte, otherwise
 * the ASCII width.
 */
static int
pg_uhc_dsplen(const unsigned char *s)
{
    if (IS_HIGHBIT_SET(*s))
        return 2;

    return pg_ascii_dsplen(s);
}
998
999/*
1000 * GB18030
1001 * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
1002 */
1003
1004/*
1005 * Unlike all other mblen() functions, this also looks at the second byte of
1006 * the input. However, if you only pass the first byte of a multi-byte
1007 * string, and \0 as the second byte, this still works in a predictable way:
1008 * a 4-byte character will be reported as two 2-byte characters. That's
1009 * enough for all current uses, as a client-only encoding. It works that
1010 * way, because in any valid 4-byte GB18030-encoded character, the third and
1011 * fourth byte look like a 2-byte encoded character, when looked at
1012 * separately.
1013 */
static int
pg_gb18030_mblen(const unsigned char *s)
{
    if (!IS_HIGHBIT_SET(*s))
        return 1;               /* ASCII */

    /* a second byte in 0x30..0x39 marks a 4-byte character */
    if (s[1] >= 0x30 && s[1] <= 0x39)
        return 4;

    return 2;
}
1027
/*
 * Display width of a GB18030 character: 2 for a high-bit lead byte,
 * otherwise the ASCII width.
 */
static int
pg_gb18030_dsplen(const unsigned char *s)
{
    if (IS_HIGHBIT_SET(*s))
        return 2;

    return pg_ascii_dsplen(s);  /* ASCII */
}
1039
1040/*
1041 *-------------------------------------------------------------------
1042 * multibyte sequence validators
1043 *
1044 * The verifychar functions accept "s", a pointer to the first byte of a
1045 * string, and "len", the remaining length of the string. If there is a
1046 * validly encoded character beginning at *s, return its length in bytes;
1047 * else return -1.
1048 *
1049 * The verifystr functions also accept "s", a pointer to a string and "len",
1050 * the length of the string. They verify the whole string, and return the
1051 * number of input bytes (<= len) that are valid. In other words, if the
1052 * whole string is valid, verifystr returns "len", otherwise it returns the
1053 * byte offset of the first invalid character. The verifystr functions must
1054 * test for and reject zeroes in the input.
1055 *
1056 * The verifychar functions can assume that len > 0 and that *s != '\0', but
1057 * they must test for and reject zeroes in any additional bytes of a
1058 * multibyte character. Note that this definition allows the function for a
1059 * single-byte encoding to be just "return 1".
1060 *-------------------------------------------------------------------
1061 */
/*
 * Single-byte encoding: every character is one byte long, so the answer is
 * always 1 and neither argument is examined.
 */
static int
pg_ascii_verifychar(const unsigned char *s, int len)
{
    (void) s;
    (void) len;
    return 1;
}
1067
/*
 * Return the number of leading bytes of "s" that precede the first zero
 * byte, or "len" when the first "len" bytes contain no zero.
 */
static int
pg_ascii_verifystr(const unsigned char *s, int len)
{
    const unsigned char *zero = memchr(s, 0, len);

    return (zero != NULL) ? (int) (zero - s) : len;
}
1078
/* True when byte "c" lies in 0xa1..0xfe inclusive; "c" is evaluated twice. */
#define IS_EUC_RANGE_VALID(c) ((c) >= 0xa1 && (c) <= 0xfe)
1080
1081static int
1082pg_eucjp_verifychar(const unsigned char *s, int len)
1083{
1084 int l;
1085 unsigned char c1,
1086 c2;
1087
1088 c1 = *s++;
1089
1090 switch (c1)
1091 {
1092 case SS2: /* JIS X 0201 */
1093 l = 2;
1094 if (l > len)
1095 return -1;
1096 c2 = *s++;
1097 if (c2 < 0xa1 || c2 > 0xdf)
1098 return -1;
1099 break;
1100
1101 case SS3: /* JIS X 0212 */
1102 l = 3;
1103 if (l > len)
1104 return -1;
1105 c2 = *s++;
1106 if (!IS_EUC_RANGE_VALID(c2))
1107 return -1;
1108 c2 = *s++;
1109 if (!IS_EUC_RANGE_VALID(c2))
1110 return -1;
1111 break;
1112
1113 default:
1114 if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1115 {
1116 l = 2;
1117 if (l > len)
1118 return -1;
1119 if (!IS_EUC_RANGE_VALID(c1))
1120 return -1;
1121 c2 = *s++;
1122 if (!IS_EUC_RANGE_VALID(c2))
1123 return -1;
1124 }
1125 else
1126 /* must be ASCII */
1127 {
1128 l = 1;
1129 }
1130 break;
1131 }
1132
1133 return l;
1134}
1135
/*
 * Verify an EUC_JP string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if everything passes, otherwise the offset of
 * the first zero byte or invalid character.
 */
static int
pg_eucjp_verifystr(const unsigned char *s, int len)
{
    int         done = 0;

    while (done < len)
    {
        const unsigned char *cur = s + done;
        int         chlen;

        if (IS_HIGHBIT_SET(*cur))
        {
            chlen = pg_eucjp_verifychar(cur, len - done);
            if (chlen == -1)
                break;
        }
        else if (*cur == '\0')
            break;              /* embedded zero byte is rejected */
        else
            chlen = 1;          /* fast path for ASCII-subset characters */

        done += chlen;
    }

    return done;
}
1164
/*
 * Verify one EUC_KR character at "s" with "len" bytes available.  Returns
 * its byte length, or -1 if the sequence is truncated or a byte is out of
 * range.
 */
static int
pg_euckr_verifychar(const unsigned char *s, int len)
{
    const unsigned char c1 = s[0];

    /* must be ASCII */
    if (!IS_HIGHBIT_SET(c1))
        return 1;

    if (len < 2)
        return -1;
    if (!IS_EUC_RANGE_VALID(c1))
        return -1;
    if (!IS_EUC_RANGE_VALID(s[1]))
        return -1;

    return 2;
}
1193
/*
 * Verify an EUC_KR string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if everything passes, otherwise the offset of
 * the first zero byte or invalid character.
 */
static int
pg_euckr_verifystr(const unsigned char *s, int len)
{
    int         done = 0;

    while (done < len)
    {
        const unsigned char *cur = s + done;
        int         chlen;

        if (IS_HIGHBIT_SET(*cur))
        {
            chlen = pg_euckr_verifychar(cur, len - done);
            if (chlen == -1)
                break;
        }
        else if (*cur == '\0')
            break;              /* embedded zero byte is rejected */
        else
            chlen = 1;          /* fast path for ASCII-subset characters */

        done += chlen;
    }

    return done;
}
1222
/*
 * EUC-CN byte sequences are exactly same as EUC-KR, so the EUC_CN verifier
 * names are plain aliases for the EUC_KR functions.
 */
#define pg_euccn_verifychar pg_euckr_verifychar
#define pg_euccn_verifystr pg_euckr_verifystr
1226
1227static int
1228pg_euctw_verifychar(const unsigned char *s, int len)
1229{
1230 int l;
1231 unsigned char c1,
1232 c2;
1233
1234 c1 = *s++;
1235
1236 switch (c1)
1237 {
1238 case SS2: /* CNS 11643 Plane 1-7 */
1239 l = 4;
1240 if (l > len)
1241 return -1;
1242 c2 = *s++;
1243 if (c2 < 0xa1 || c2 > 0xa7)
1244 return -1;
1245 c2 = *s++;
1246 if (!IS_EUC_RANGE_VALID(c2))
1247 return -1;
1248 c2 = *s++;
1249 if (!IS_EUC_RANGE_VALID(c2))
1250 return -1;
1251 break;
1252
1253 case SS3: /* unused */
1254 return -1;
1255
1256 default:
1257 if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
1258 {
1259 l = 2;
1260 if (l > len)
1261 return -1;
1262 /* no further range check on c1? */
1263 c2 = *s++;
1264 if (!IS_EUC_RANGE_VALID(c2))
1265 return -1;
1266 }
1267 else
1268 /* must be ASCII */
1269 {
1270 l = 1;
1271 }
1272 break;
1273 }
1274 return l;
1275}
1276
/*
 * Verify an EUC_TW string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if everything passes, otherwise the offset of
 * the first zero byte or invalid character.
 */
static int
pg_euctw_verifystr(const unsigned char *s, int len)
{
    int         done = 0;

    while (done < len)
    {
        const unsigned char *cur = s + done;
        int         chlen;

        if (IS_HIGHBIT_SET(*cur))
        {
            chlen = pg_euctw_verifychar(cur, len - done);
            if (chlen == -1)
                break;
        }
        else if (*cur == '\0')
            break;              /* embedded zero byte is rejected */
        else
            chlen = 1;          /* fast path for ASCII-subset characters */

        done += chlen;
    }

    return done;
}
1305
/*
 * Verify one JOHAB character at "s" with "len" bytes available.  Returns
 * its byte length as given by pg_johab_mblen(), or -1 if the sequence is
 * truncated or a trailing byte falls outside the EUC range.
 */
static int
pg_johab_verifychar(const unsigned char *s, int len)
{
    const int   mbl = pg_johab_mblen(s);
    int         i;

    if (len < mbl)
        return -1;

    /* a lead byte without the high bit needs no further checking */
    if (!IS_HIGHBIT_SET(*s))
        return mbl;

    for (i = 1; i < mbl; i++)
    {
        if (!IS_EUC_RANGE_VALID(s[i]))
            return -1;
    }

    return mbl;
}
1329
/*
 * Verify a JOHAB string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if everything passes, otherwise the offset of
 * the first zero byte or invalid character.
 */
static int
pg_johab_verifystr(const unsigned char *s, int len)
{
    int         done = 0;

    while (done < len)
    {
        const unsigned char *cur = s + done;
        int         chlen;

        if (IS_HIGHBIT_SET(*cur))
        {
            chlen = pg_johab_verifychar(cur, len - done);
            if (chlen == -1)
                break;
        }
        else if (*cur == '\0')
            break;              /* embedded zero byte is rejected */
        else
            chlen = 1;          /* fast path for ASCII-subset characters */

        done += chlen;
    }

    return done;
}
1358
/*
 * Verify one MULE character at "s" with "len" bytes available.  Returns its
 * byte length as given by pg_mule_mblen(), or -1 if the sequence is
 * truncated or any trailing byte lacks the high bit.
 */
static int
pg_mule_verifychar(const unsigned char *s, int len)
{
    const int   mbl = pg_mule_mblen(s);
    int         i;

    if (len < mbl)
        return -1;

    for (i = 1; i < mbl; i++)
    {
        if (!IS_HIGHBIT_SET(s[i]))
            return -1;
    }

    return mbl;
}
1379
/*
 * Verify a MULE string of "len" bytes.  Returns the number of leading bytes
 * that are valid: "len" if everything passes, otherwise the offset of the
 * first zero byte or invalid character.
 */
static int
pg_mule_verifystr(const unsigned char *s, int len)
{
    int         done = 0;

    while (done < len)
    {
        const unsigned char *cur = s + done;
        int         chlen;

        if (IS_HIGHBIT_SET(*cur))
        {
            chlen = pg_mule_verifychar(cur, len - done);
            if (chlen == -1)
                break;
        }
        else if (*cur == '\0')
            break;              /* embedded zero byte is rejected */
        else
            chlen = 1;          /* fast path for ASCII-subset characters */

        done += chlen;
    }

    return done;
}
1408
/*
 * Single-byte encoding: every character is one byte long, so the answer is
 * always 1 and neither argument is examined.
 */
static int
pg_latin1_verifychar(const unsigned char *s, int len)
{
    (void) s;
    (void) len;
    return 1;
}
1414
/*
 * Return the number of leading bytes of "s" that precede the first zero
 * byte, or "len" when the first "len" bytes contain no zero.
 */
static int
pg_latin1_verifystr(const unsigned char *s, int len)
{
    const unsigned char *zero = memchr(s, 0, len);

    return (zero != NULL) ? (int) (zero - s) : len;
}
1425
/*
 * Verify one SJIS character at "s" with "len" bytes available.  Returns its
 * byte length as given by pg_sjis_mblen(), or -1 if the sequence is
 * truncated or a two-byte character fails the head/tail byte checks.
 */
static int
pg_sjis_verifychar(const unsigned char *s, int len)
{
    const int   mbl = pg_sjis_mblen(s);
    unsigned char head;
    unsigned char tail;

    if (len < mbl)
        return -1;

    if (mbl == 1)               /* pg_sjis_mblen already verified it */
        return mbl;

    head = s[0];
    tail = s[1];
    if (ISSJISHEAD(head) && ISSJISTAIL(tail))
        return mbl;

    return -1;
}
1448
/*
 * Verify an SJIS string of "len" bytes.  Returns the number of leading bytes
 * that are valid: "len" if everything passes, otherwise the offset of the
 * first zero byte or invalid character.
 */
static int
pg_sjis_verifystr(const unsigned char *s, int len)
{
    int         done = 0;

    while (done < len)
    {
        const unsigned char *cur = s + done;
        int         chlen;

        if (IS_HIGHBIT_SET(*cur))
        {
            chlen = pg_sjis_verifychar(cur, len - done);
            if (chlen == -1)
                break;
        }
        else if (*cur == '\0')
            break;              /* embedded zero byte is rejected */
        else
            chlen = 1;          /* fast path for ASCII-subset characters */

        done += chlen;
    }

    return done;
}
1477
1478static int
1479pg_big5_verifychar(const unsigned char *s, int len)
1480{
1481 int l,
1482 mbl;
1483
1484 l = mbl = pg_big5_mblen(s);
1485
1486 if (len < l)
1487 return -1;
1488
1489 if (l == 2 &&
1490 s[0] == NONUTF8_INVALID_BYTE0 &&
1491 s[1] == NONUTF8_INVALID_BYTE1)
1492 return -1;
1493
1494 while (--l > 0)
1495 {
1496 if (*++s == '\0')
1497 return -1;
1498 }
1499
1500 return mbl;
1501}
1502
/*
 * Verify a Big5 string of "len" bytes.  Returns the number of leading bytes
 * that are valid: "len" if everything passes, otherwise the offset of the
 * first zero byte or invalid character.
 */
static int
pg_big5_verifystr(const unsigned char *s, int len)
{
    int         done = 0;

    while (done < len)
    {
        const unsigned char *cur = s + done;
        int         chlen;

        if (IS_HIGHBIT_SET(*cur))
        {
            chlen = pg_big5_verifychar(cur, len - done);
            if (chlen == -1)
                break;
        }
        else if (*cur == '\0')
            break;              /* embedded zero byte is rejected */
        else
            chlen = 1;          /* fast path for ASCII-subset characters */

        done += chlen;
    }

    return done;
}
1531
1532static int
1533pg_gbk_verifychar(const unsigned char *s, int len)
1534{
1535 int l,
1536 mbl;
1537
1538 l = mbl = pg_gbk_mblen(s);
1539
1540 if (len < l)
1541 return -1;
1542
1543 if (l == 2 &&
1544 s[0] == NONUTF8_INVALID_BYTE0 &&
1545 s[1] == NONUTF8_INVALID_BYTE1)
1546 return -1;
1547
1548 while (--l > 0)
1549 {
1550 if (*++s == '\0')
1551 return -1;
1552 }
1553
1554 return mbl;
1555}
1556
/*
 * Verify a GBK string of "len" bytes.  Returns the number of leading bytes
 * that are valid: "len" if everything passes, otherwise the offset of the
 * first zero byte or invalid character.
 */
static int
pg_gbk_verifystr(const unsigned char *s, int len)
{
    int         done = 0;

    while (done < len)
    {
        const unsigned char *cur = s + done;
        int         chlen;

        if (IS_HIGHBIT_SET(*cur))
        {
            chlen = pg_gbk_verifychar(cur, len - done);
            if (chlen == -1)
                break;
        }
        else if (*cur == '\0')
            break;              /* embedded zero byte is rejected */
        else
            chlen = 1;          /* fast path for ASCII-subset characters */

        done += chlen;
    }

    return done;
}
1585
1586static int
1587pg_uhc_verifychar(const unsigned char *s, int len)
1588{
1589 int l,
1590 mbl;
1591
1592 l = mbl = pg_uhc_mblen(s);
1593
1594 if (len < l)
1595 return -1;
1596
1597 if (l == 2 &&
1598 s[0] == NONUTF8_INVALID_BYTE0 &&
1599 s[1] == NONUTF8_INVALID_BYTE1)
1600 return -1;
1601
1602 while (--l > 0)
1603 {
1604 if (*++s == '\0')
1605 return -1;
1606 }
1607
1608 return mbl;
1609}
1610
static int
pg_uhc_verifystr(const unsigned char *s, int len)
{
	const unsigned char *cur = s;
	int			left = len;

	/*
	 * Walk the input one character at a time, stopping at the first NUL byte
	 * or the first character the per-character verifier rejects.
	 */
	while (left > 0)
	{
		int			step = 1;

		if (IS_HIGHBIT_SET(*cur))
		{
			/* multibyte lead byte: defer to the per-character verifier */
			step = pg_uhc_verifychar(cur, left);
			if (step == -1)
				break;
		}
		else if (*cur == '\0')
		{
			/* an embedded NUL ends the valid prefix */
			break;
		}

		cur += step;
		left -= step;
	}

	/* number of leading bytes that verified cleanly */
	return cur - s;
}
1639
static int
pg_gb18030_verifychar(const unsigned char *s, int len)
{
	int			lead_ok;

	/* A byte without the high bit is a one-byte (ASCII) character. */
	if (*s < 0x80)
		return 1;

	lead_ok = (*s >= 0x81 && *s <= 0xfe);

	/*
	 * A second byte in 0x30..0x39 marks the four-byte form.  The second byte
	 * is only inspected when at least four bytes are available.
	 */
	if (len >= 4 && s[1] >= 0x30 && s[1] <= 0x39)
	{
		if (lead_ok &&
			s[2] >= 0x81 && s[2] <= 0xfe &&
			s[3] >= 0x30 && s[3] <= 0x39)
			return 4;
		return -1;
	}

	/* Otherwise it must be the two-byte form with a valid trail byte. */
	if (len >= 2 && lead_ok)
	{
		if ((s[1] >= 0x40 && s[1] <= 0x7e) ||
			(s[1] >= 0x80 && s[1] <= 0xfe))
			return 2;
		return -1;
	}

	return -1;
}
1670
static int
pg_gb18030_verifystr(const unsigned char *s, int len)
{
	const unsigned char *cur = s;
	int			left = len;

	/*
	 * Walk the input one character at a time, stopping at the first NUL byte
	 * or the first character the per-character verifier rejects.
	 */
	while (left > 0)
	{
		int			step = 1;

		if (IS_HIGHBIT_SET(*cur))
		{
			/* multibyte lead byte: defer to the per-character verifier */
			step = pg_gb18030_verifychar(cur, left);
			if (step == -1)
				break;
		}
		else if (*cur == '\0')
		{
			/* an embedded NUL ends the valid prefix */
			break;
		}

		cur += step;
		left -= step;
	}

	/* number of leading bytes that verified cleanly */
	return cur - s;
}
1699
static int
pg_utf8_verifychar(const unsigned char *s, int len)
{
	const unsigned char lead = *s;
	int			nbytes;

	/* Single-byte (ASCII) case: everything but NUL is acceptable. */
	if ((lead & 0x80) == 0)
		return (lead == '\0') ? -1 : 1;

	/* Derive the sequence length from the lead byte's high-order bits. */
	if ((lead & 0xe0) == 0xc0)
		nbytes = 2;
	else if ((lead & 0xf0) == 0xe0)
		nbytes = 3;
	else if ((lead & 0xf8) == 0xf0)
		nbytes = 4;
	else
		nbytes = 1;				/* not a lead byte; pg_utf8_islegal decides */

	/* The whole sequence must be present and well-formed. */
	if (nbytes > len || !pg_utf8_islegal(s, nbytes))
		return -1;

	return nbytes;
}
1728
1729/*
1730 * The fast path of the UTF-8 verifier uses a deterministic finite automaton
1731 * (DFA) for multibyte characters. In a traditional table-driven DFA, the
1732 * input byte and current state are used to compute an index into an array of
1733 * state transitions. Since the address of the next transition is dependent
1734 * on this computation, there is latency in executing the load instruction,
1735 * and the CPU is not kept busy.
1736 *
1737 * Instead, we use a "shift-based" DFA as described by Per Vognsen:
1738 *
1739 * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
1740 *
 * In a shift-based DFA, the input byte is an index into an array of integers
 * whose bit pattern encodes the state transitions. To compute the next
1743 * state, we simply right-shift the integer by the current state and apply a
1744 * mask. In this scheme, the address of the transition only depends on the
1745 * input byte, so there is better pipelining.
1746 *
1747 * The naming convention for states and transitions was adopted from a UTF-8
1748 * to UTF-16/32 transcoder, whose table is reproduced below:
1749 *
1750 * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
1751 *
1752 * ILL ASC CR1 CR2 CR3 L2A L3A L3B L3C L4A L4B L4C CLASS / STATE
1753 * ==========================================================================
1754 * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B, | BGN/END
1755 * err, err, err, err, err, err, err, err, err, err, err, err, | ERR
1756 * |
1757 * err, err, END, END, END, err, err, err, err, err, err, err, | CS1
1758 * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err, | CS2
1759 * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err, | CS3
1760 * |
1761 * err, err, err, err, CS1, err, err, err, err, err, err, err, | P3A
1762 * err, err, CS1, CS1, err, err, err, err, err, err, err, err, | P3B
1763 * |
1764 * err, err, err, CS2, CS2, err, err, err, err, err, err, err, | P4A
1765 * err, err, CS2, err, err, err, err, err, err, err, err, err, | P4B
1766 *
1767 * In the most straightforward implementation, a shift-based DFA for UTF-8
1768 * requires 64-bit integers to encode the transitions, but with an SMT solver
1769 * it's possible to find state numbers such that the transitions fit within
1770 * 32-bit integers, as Dougall Johnson demonstrated:
1771 *
1772 * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
1773 *
1774 * This packed representation is the reason for the seemingly odd choice of
1775 * state values below.
1776 */
1777
/*
 * State numbers.  A state's value doubles as the bit offset at which the
 * transition out of that state is stored inside a transition word: the
 * next state is (transition_word >> current_state) & 31.
 */

/* Error */
#define	ERR  0
/* Begin */
#define	BGN 11
/* Continuation states, expect 1/2/3 continuation bytes */
#define	CS1 16
#define	CS2  1
#define	CS3  5
/* Partial states, where the first continuation byte has a restricted range */
#define	P3A  6					/* Lead was E0, check for 3-byte overlong */
#define	P3B 20					/* Lead was ED, check for surrogate */
#define	P4A 25					/* Lead was F0, check for 4-byte overlong */
#define	P4B 30					/* Lead was F4, check for too-large */
/* Begin and End are the same state */
#define	END BGN

/*
 * The encoded state transitions for the lookup table.  Each term
 * (DEST << SRC) means "when in state SRC, go to state DEST"; every source
 * state not mentioned falls through to ERR (zero bits).
 *
 * Every expansion is fully parenthesized so the macros remain safe when
 * combined with other operators.
 */

/* ASCII */
#define ASC (END << BGN)
/* 2-byte lead */
#define L2A (CS1 << BGN)
/* 3-byte lead */
#define L3A (P3A << BGN)
#define L3B (CS2 << BGN)
#define L3C (P3B << BGN)
/* 4-byte lead */
#define L4A (P4A << BGN)
#define L4B (CS3 << BGN)
#define L4C (P4B << BGN)
/* continuation byte */
#define CR1 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B))
#define CR2 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A))
#define CR3 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A))
/* invalid byte */
#define ILL ERR
1814
1815static const uint32 Utf8Transition[256] =
1816{
1817 /* ASCII */
1818
1819 ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1820 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1821 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1822 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1823
1824 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1825 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1826 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1827 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1828
1829 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1830 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1831 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1832 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1833
1834 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1835 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1836 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1837 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1838
1839 /* continuation bytes */
1840
1841 /* 80..8F */
1842 CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1843 CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1844
1845 /* 90..9F */
1846 CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1847 CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1848
1849 /* A0..BF */
1850 CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1851 CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1852 CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1853 CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1854
1855 /* leading bytes */
1856
1857 /* C0..DF */
1858 ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
1859 L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1860 L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1861 L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1862
1863 /* E0..EF */
1864 L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
1865 L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
1866
1867 /* F0..FF */
1868 L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
1869 ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
1870};
1871
1872static void
1873utf8_advance(const unsigned char *s, uint32 *state, int len)
1874{
1875 /* Note: We deliberately don't check the state's value here. */
1876 while (len > 0)
1877 {
1878 /*
1879 * It's important that the mask value is 31: In most instruction sets,
1880 * a shift by a 32-bit operand is understood to be a shift by its mod
1881 * 32, so the compiler should elide the mask operation.
1882 */
1883 *state = Utf8Transition[*s++] >> (*state & 31);
1884 len--;
1885 }
1886
1887 *state &= 31;
1888}
1889
1890static int
1891pg_utf8_verifystr(const unsigned char *s, int len)
1892{
1893 const unsigned char *start = s;
1894 const int orig_len = len;
1895 uint32 state = BGN;
1896
1897/*
1898 * With a stride of two vector widths, gcc will unroll the loop. Even if
1899 * the compiler can unroll a longer loop, it's not worth it because we
1900 * must fall back to the byte-wise algorithm if we find any non-ASCII.
1901 */
1902#define STRIDE_LENGTH (2 * sizeof(Vector8))
1903
1904 if (len >= STRIDE_LENGTH)
1905 {
1906 while (len >= STRIDE_LENGTH)
1907 {
1908 /*
1909 * If the chunk is all ASCII, we can skip the full UTF-8 check,
1910 * but we must first check for a non-END state, which means the
1911 * previous chunk ended in the middle of a multibyte sequence.
1912 */
1913 if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
1915
1916 s += STRIDE_LENGTH;
1917 len -= STRIDE_LENGTH;
1918 }
1919
1920 /* The error state persists, so we only need to check for it here. */
1921 if (state == ERR)
1922 {
1923 /*
1924 * Start over from the beginning with the slow path so we can
1925 * count the valid bytes.
1926 */
1927 len = orig_len;
1928 s = start;
1929 }
1930 else if (state != END)
1931 {
1932 /*
1933 * The fast path exited in the middle of a multibyte sequence.
1934 * Walk backwards to find the leading byte so that the slow path
1935 * can resume checking from there. We must always backtrack at
1936 * least one byte, since the current byte could be e.g. an ASCII
1937 * byte after a 2-byte lead, which is invalid.
1938 */
1939 do
1940 {
1941 Assert(s > start);
1942 s--;
1943 len++;
1945 } while (pg_utf_mblen(s) <= 1);
1946 }
1947 }
1948
1949 /* check remaining bytes */
1950 while (len > 0)
1951 {
1952 int l;
1953
1954 /* fast path for ASCII-subset characters */
1955 if (!IS_HIGHBIT_SET(*s))
1956 {
1957 if (*s == '\0')
1958 break;
1959 l = 1;
1960 }
1961 else
1962 {
1963 l = pg_utf8_verifychar(s, len);
1964 if (l == -1)
1965 break;
1966 }
1967 s += l;
1968 len -= l;
1969 }
1970
1971 return s - start;
1972}
1973
1974/*
1975 * Check for validity of a single UTF-8 encoded character
1976 *
1977 * This directly implements the rules in RFC3629. The bizarre-looking
1978 * restrictions on the second byte are meant to ensure that there isn't
1979 * more than one encoding of a given Unicode character point; that is,
1980 * you may not use a longer-than-necessary byte sequence with high order
1981 * zero bits to represent a character that would fit in fewer bytes.
1982 * To do otherwise is to create security hazards (eg, create an apparent
1983 * non-ASCII character that decodes to plain ASCII).
1984 *
1985 * length is assumed to have been obtained by pg_utf_mblen(), and the
1986 * caller must have checked that that many bytes are present in the buffer.
1987 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
	unsigned char lead;
	int			i;

	/* only lengths 1..4 are accepted; 5 and 6 are rejected for now */
	if (length < 1 || length > 4)
		return false;

	/* third and fourth bytes, checked last-to-first: plain continuations */
	for (i = length - 1; i >= 2; i--)
	{
		if (source[i] < 0x80 || source[i] > 0xBF)
			return false;
	}

	lead = *source;

	if (length >= 2)
	{
		/* allowed range of the second byte depends on the lead byte */
		unsigned char lo = 0x80;
		unsigned char hi = 0xBF;

		if (lead == 0xE0)
			lo = 0xA0;
		else if (lead == 0xED)
			hi = 0x9F;
		else if (lead == 0xF0)
			lo = 0x90;
		else if (lead == 0xF4)
			hi = 0x8F;

		if (source[1] < lo || source[1] > hi)
			return false;
	}

	/* lead byte: no bare continuation bytes, no C0/C1, nothing above F4 */
	if (lead >= 0x80 && lead < 0xC2)
		return false;
	if (lead > 0xF4)
		return false;

	return true;
}
2044
2045
2046/*
2047 * Fills the provided buffer with two bytes such that:
2048 * pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
2049 */
2050void
2052{
2054
2055 dst[0] = (encoding == PG_UTF8 ? 0xc0 : NONUTF8_INVALID_BYTE0);
2056 dst[1] = NONUTF8_INVALID_BYTE1;
2057}
2058
2059/*
2060 *-------------------------------------------------------------------
2061 * encoding info table
2062 *-------------------------------------------------------------------
2063 */
2107};
2108
2109/*
2110 * Returns the byte length of a multibyte character.
2111 *
2112 * Choose "mblen" functions based on the input string characteristics.
2113 * pg_encoding_mblen() can be used when ANY of these conditions are met:
2114 *
2115 * - The input string is zero-terminated
2116 *
2117 * - The input string is known to be valid in the encoding (e.g., string
2118 * converted from database encoding)
2119 *
2120 * - The encoding is not GB18030 (e.g., when only database encodings are
2121 * passed to 'encoding' parameter)
2122 *
2123 * encoding==GB18030 requires examining up to two bytes to determine character
2124 * length. Therefore, callers satisfying none of those conditions must use
2125 * pg_encoding_mblen_or_incomplete() instead, as access to mbstr[1] cannot be
2126 * guaranteed to be within allocation bounds.
2127 *
2128 * When dealing with text that is not certainly valid in the specified
2129 * encoding, the result may exceed the actual remaining string length.
2130 * Callers that are not prepared to deal with that should use Min(remaining,
2131 * pg_encoding_mblen_or_incomplete()). For zero-terminated strings, that and
2132 * pg_encoding_mblen_bounded() are interchangeable.
2133 */
2134int
2135pg_encoding_mblen(int encoding, const char *mbstr)
2136{
2137 return (PG_VALID_ENCODING(encoding) ?
2138 pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
2139 pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
2140}
2141
2142/*
2143 * Returns the byte length of a multibyte character (possibly not
2144 * zero-terminated), or INT_MAX if too few bytes remain to determine a length.
2145 */
2146int
2148 size_t remaining)
2149{
2150 /*
2151 * Define zero remaining as too few, even for single-byte encodings.
2152 * pg_gb18030_mblen() reads one or two bytes; single-byte encodings read
2153 * zero; others read one.
2154 */
2155 if (remaining < 1 ||
2156 (encoding == PG_GB18030 && IS_HIGHBIT_SET(*mbstr) && remaining < 2))
2157 return INT_MAX;
2158 return pg_encoding_mblen(encoding, mbstr);
2159}
2160
2161/*
2162 * Returns the byte length of a multibyte character; but not more than the
2163 * distance to the terminating zero byte. For input that might lack a
2164 * terminating zero, use Min(remaining, pg_encoding_mblen_or_incomplete()).
2165 */
int
pg_encoding_mblen_bounded(int encoding, const char *mbstr)
{
	/* strnlen() stops at a NUL found before the nominal character length */
	return strnlen(mbstr, pg_encoding_mblen(encoding, mbstr));
}
2171
2172/*
2173 * Returns the display length of a multibyte character.
2174 */
2175int
2176pg_encoding_dsplen(int encoding, const char *mbstr)
2177{
2178 return (PG_VALID_ENCODING(encoding) ?
2179 pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
2180 pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
2181}
2182
2183/*
2184 * Verify the first multibyte character of the given string.
2185 * Return its byte length if good, -1 if bad. (See comments above for
2186 * full details of the mbverifychar API.)
2187 */
2188int
2189pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
2190{
2191 return (PG_VALID_ENCODING(encoding) ?
2192 pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :
2193 pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len));
2194}
2195
2196/*
2197 * Verify that a string is valid for the given encoding.
2198 * Returns the number of input bytes (<= len) that form a valid string.
2199 * (See comments above for full details of the mbverifystr API.)
2200 */
2201int
2202pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
2203{
2204 return (PG_VALID_ENCODING(encoding) ?
2205 pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
2206 pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));
2207}
2208
2209/*
2210 * fetch maximum length of a given encoding
2211 */
2212int
2214{
2216
2217 /*
2218 * Check for the encoding despite the assert, due to some mingw versions
2219 * otherwise issuing bogus warnings.
2220 */
2221 return PG_VALID_ENCODING(encoding) ?
2224}
static bool is_valid_ascii(const unsigned char *s, int len)
Definition: ascii.h:25
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1126
uint32_t uint32
Definition: c.h:502
Assert(PointerIsAligned(start, uint64))
return str start
int remaining
Definition: informix.c:692
int a
Definition: isn.c:73
static pg_wchar utf8_to_unicode(const unsigned char *c)
Definition: mbprint.c:53
unsigned int pg_wchar
Definition: mbprint.c:31
const void size_t len
int32 encoding
Definition: pg_database.h:41
static const struct lconv_member_info table[]
static rewind_source * source
Definition: pg_rewind.c:89
#define IS_LCPRV2(c)
Definition: pg_wchar.h:164
#define ISSJISTAIL(c)
Definition: pg_wchar.h:45
@ PG_WIN1254
Definition: pg_wchar.h:257
@ PG_LATIN4
Definition: pg_wchar.h:237
@ PG_LATIN9
Definition: pg_wchar.h:242
@ PG_JOHAB
Definition: pg_wchar.h:269
@ PG_GB18030
Definition: pg_wchar.h:268
@ PG_SQL_ASCII
Definition: pg_wchar.h:226
@ PG_KOI8R
Definition: pg_wchar.h:248
@ PG_ISO_8859_6
Definition: pg_wchar.h:252
@ PG_WIN1253
Definition: pg_wchar.h:256
@ PG_KOI8U
Definition: pg_wchar.h:260
@ PG_LATIN6
Definition: pg_wchar.h:239
@ PG_MULE_INTERNAL
Definition: pg_wchar.h:233
@ PG_LATIN5
Definition: pg_wchar.h:238
@ PG_EUC_CN
Definition: pg_wchar.h:228
@ PG_UHC
Definition: pg_wchar.h:267
@ PG_LATIN2
Definition: pg_wchar.h:235
@ PG_ISO_8859_5
Definition: pg_wchar.h:251
@ PG_LATIN10
Definition: pg_wchar.h:243
@ PG_WIN1250
Definition: pg_wchar.h:255
@ PG_ISO_8859_7
Definition: pg_wchar.h:253
@ PG_SJIS
Definition: pg_wchar.h:264
@ PG_LATIN8
Definition: pg_wchar.h:241
@ PG_EUC_JP
Definition: pg_wchar.h:227
@ PG_GBK
Definition: pg_wchar.h:266
@ PG_LATIN3
Definition: pg_wchar.h:236
@ PG_WIN1256
Definition: pg_wchar.h:244
@ PG_LATIN1
Definition: pg_wchar.h:234
@ PG_EUC_TW
Definition: pg_wchar.h:230
@ PG_WIN1258
Definition: pg_wchar.h:245
@ PG_SHIFT_JIS_2004
Definition: pg_wchar.h:270
@ PG_WIN1252
Definition: pg_wchar.h:250
@ PG_LATIN7
Definition: pg_wchar.h:240
@ PG_UTF8
Definition: pg_wchar.h:232
@ PG_WIN1255
Definition: pg_wchar.h:258
@ PG_WIN1257
Definition: pg_wchar.h:259
@ PG_WIN1251
Definition: pg_wchar.h:249
@ PG_EUC_KR
Definition: pg_wchar.h:229
@ PG_WIN866
Definition: pg_wchar.h:246
@ PG_ISO_8859_8
Definition: pg_wchar.h:254
@ PG_WIN874
Definition: pg_wchar.h:247
@ PG_EUC_JIS_2004
Definition: pg_wchar.h:231
@ PG_BIG5
Definition: pg_wchar.h:265
#define LCPRV1_A
Definition: pg_wchar.h:150
#define LCPRV1_B
Definition: pg_wchar.h:151
#define IS_LC2(c)
Definition: pg_wchar.h:144
#define IS_LCPRV1(c)
Definition: pg_wchar.h:152
#define LCPRV2_A
Definition: pg_wchar.h:162
static unsigned char * unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
Definition: pg_wchar.h:575
#define IS_LCPRV2_B_RANGE(c)
Definition: pg_wchar.h:167
#define SS2
Definition: pg_wchar.h:38
#define IS_LCPRV1_A_RANGE(c)
Definition: pg_wchar.h:153
#define PG_VALID_ENCODING(_enc)
Definition: pg_wchar.h:287
#define IS_LCPRV1_B_RANGE(c)
Definition: pg_wchar.h:155
#define ISSJISHEAD(c)
Definition: pg_wchar.h:44
#define IS_LC1(c)
Definition: pg_wchar.h:126
#define IS_LCPRV2_A_RANGE(c)
Definition: pg_wchar.h:165
#define SS3
Definition: pg_wchar.h:39
#define LCPRV2_B
Definition: pg_wchar.h:163
size_t strnlen(const char *str, size_t maxlen)
Definition: strnlen.c:26
char * c
unsigned int first
Definition: wchar.c:575
unsigned int last
Definition: wchar.c:576
int maxmblen
Definition: pg_wchar.h:386
Definition: regguts.h:323
static const struct mbinterval east_asian_fw[]
static const struct mbinterval nonspacing[]
static int pg_uhc_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1612
static int pg_latin1_dsplen(const unsigned char *s)
Definition: wchar.c:882
int pg_encoding_mblen_bounded(int encoding, const char *mbstr)
Definition: wchar.c:2167
static int pg_euctw_mblen(const unsigned char *s)
Definition: wchar.c:339
static int pg_euckr_dsplen(const unsigned char *s)
Definition: wchar.c:222
static const uint32 Utf8Transition[256]
Definition: wchar.c:1815
bool pg_utf8_islegal(const unsigned char *source, int length)
Definition: wchar.c:1989
static int pg_ascii_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1069
static int pg_latin1_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1410
static int pg_sjis_dsplen(const unsigned char *s)
Definition: wchar.c:905
#define CR3
Definition: wchar.c:1811
static int pg_sjis_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1427
static int pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:105
static int pg_eucjp_dsplen(const unsigned char *s)
Definition: wchar.c:191
static int pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:70
#define L3B
Definition: wchar.c:1802
static int pg_uhc_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1587
#define L2A
Definition: wchar.c:1799
static int pg_gbk_dsplen(const unsigned char *s)
Definition: wchar.c:961
static int pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:210
static int pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:299
#define END
Definition: wchar.c:1792
#define pg_euccn_verifychar
Definition: wchar.c:1224
#define L4C
Definition: wchar.c:1807
static int pg_sjis_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1450
static int pg_johab_mblen(const unsigned char *s)
Definition: wchar.c:423
static int pg_johab_dsplen(const unsigned char *s)
Definition: wchar.c:429
static int pg_big5_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1504
#define CR2
Definition: wchar.c:1810
static int pg_mule_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1360
static int pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:179
static int pg_latin1_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1416
static int pg_latin1_mblen(const unsigned char *s)
Definition: wchar.c:876
static int pg_ascii_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1063
static int pg_ascii_mblen(const unsigned char *s)
Definition: wchar.c:85
void pg_encoding_set_invalid(int encoding, char *dst)
Definition: wchar.c:2051
static int mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
Definition: wchar.c:581
static int pg_big5_dsplen(const unsigned char *s)
Definition: wchar.c:934
#define pg_euccn_verifystr
Definition: wchar.c:1225
int pg_utf_mblen(const unsigned char *s)
Definition: wchar.c:538
int pg_encoding_mblen_or_incomplete(int encoding, const char *mbstr, size_t remaining)
Definition: wchar.c:2147
#define NONUTF8_INVALID_BYTE0
Definition: wchar.c:36
static int pg_eucjp_mblen(const unsigned char *s)
Definition: wchar.c:185
static int pg_gbk_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1533
static int pg_big5_mblen(const unsigned char *s)
Definition: wchar.c:922
static int pg_euccn_dsplen(const unsigned char *s)
Definition: wchar.c:283
static int pg_euctw_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1228
static int pg_euckr_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1166
static int pg_euctw_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1278
static int pg_gbk_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1558
static int pg_gb18030_dsplen(const unsigned char *s)
Definition: wchar.c:1029
#define ERR
Definition: wchar.c:1779
static int pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:441
int pg_mule_mblen(const unsigned char *s)
Definition: wchar.c:793
static int pg_euccn_mblen(const unsigned char *s)
Definition: wchar.c:271
#define ASC
Definition: wchar.c:1797
static int pg_gbk_mblen(const unsigned char *s)
Definition: wchar.c:949
static int pg_eucjp_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1137
static int pg_johab_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1331
static int pg_euc_dsplen(const unsigned char *s)
Definition: wchar.c:160
static int pg_gb18030_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1672
static int pg_euckr_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1195
static int pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition: wchar.c:727
static int pg_sjis_mblen(const unsigned char *s)
Definition: wchar.c:891
#define IS_EUC_RANGE_VALID(c)
Definition: wchar.c:1079
static int pg_uhc_dsplen(const unsigned char *s)
Definition: wchar.c:988
static int pg_eucjp_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1082
static int pg_big5_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1479
static int pg_gb18030_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1641
static int pg_mule_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1381
static int pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition: wchar.c:377
#define L3C
Definition: wchar.c:1803
static int pg_utf8_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1701
#define CR1
Definition: wchar.c:1809
static int pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition: wchar.c:861
static int pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
Definition: wchar.c:507
static int pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:232
static int pg_gb18030_mblen(const unsigned char *s)
Definition: wchar.c:1015
int pg_encoding_dsplen(int encoding, const char *mbstr)
Definition: wchar.c:2176
static void utf8_advance(const unsigned char *s, uint32 *state, int len)
Definition: wchar.c:1873
static int pg_euctw_dsplen(const unsigned char *s)
Definition: wchar.c:355
static int pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:839
static int pg_uhc_mblen(const unsigned char *s)
Definition: wchar.c:976
static int pg_euc_mblen(const unsigned char *s)
Definition: wchar.c:144
static int pg_mule_dsplen(const unsigned char *s)
Definition: wchar.c:811
#define L3A
Definition: wchar.c:1801
#define L4B
Definition: wchar.c:1806
int pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
Definition: wchar.c:2202
#define NONUTF8_INVALID_BYTE1
Definition: wchar.c:37
static int pg_utf8_verifystr(const unsigned char *s, int len)
Definition: wchar.c:1891
static int pg_euckr_mblen(const unsigned char *s)
Definition: wchar.c:216
const pg_wchar_tbl pg_wchar_table[]
Definition: wchar.c:2064
static int pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
Definition: wchar.c:674
#define BGN
Definition: wchar.c:1781
int pg_encoding_max_length(int encoding)
Definition: wchar.c:2213
int pg_encoding_mblen(int encoding, const char *mbstr)
Definition: wchar.c:2135
static int pg_johab_verifychar(const unsigned char *s, int len)
Definition: wchar.c:1307
#define ILL
Definition: wchar.c:1813
#define STRIDE_LENGTH
#define L4A
Definition: wchar.c:1805
static int pg_ascii_dsplen(const unsigned char *s)
Definition: wchar.c:91
int pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
Definition: wchar.c:2189
static int ucs_wcwidth(pg_wchar ucs)
Definition: wchar.c:628
static int pg_utf_dsplen(const unsigned char *s)
Definition: wchar.c:662