Skip to content

Commit 9acae56

Browse files
committed
Inline basic UTF-8 functions.
Shows a measurable speedup when processing UTF-8 data, such as with the new builtin collation provider.

Discussion: https://postgr.es/m/163f4e2190cdf67f67016044e503c5004547e5a9.camel@j-davis.com
Reviewed-by: Peter Eisentraut
1 parent 2b52086 commit 9acae56

File tree

2 files changed

+61
-61
lines changed

2 files changed

+61
-61
lines changed

src/common/wchar.c

Lines changed: 0 additions & 61 deletions
Original file line number | Diff line number | Diff line change
@@ -476,39 +476,6 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
476476
}
477477

478478

479-
/*
480-
* Map a Unicode code point to UTF-8, writing 1 to 4 bytes and no terminator. utf8string must have at least
481-
* unicode_utf8len(c) bytes available. Returns utf8string itself.
482-
*/
483-
unsigned char *
484-
unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
485-
{
486-
if (c <= 0x7F) /* one byte: 0xxxxxxx */
487-
{
488-
utf8string[0] = c;
489-
}
490-
else if (c <= 0x7FF) /* two bytes: 110xxxxx 10xxxxxx */
491-
{
492-
utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
493-
utf8string[1] = 0x80 | (c & 0x3F);
494-
}
495-
else if (c <= 0xFFFF) /* three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
496-
{
497-
utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
498-
utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
499-
utf8string[2] = 0x80 | (c & 0x3F);
500-
}
501-
else /* four bytes; no range check, only the low 21 bits of c are encoded */
502-
{
503-
utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
504-
utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
505-
utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
506-
utf8string[3] = 0x80 | (c & 0x3F);
507-
}
508-
509-
return utf8string;
510-
}
511-
512479
/*
513480
* Trivial conversion from pg_wchar to UTF-8.
514481
* caller should allocate enough space for "to"
@@ -670,34 +637,6 @@ ucs_wcwidth(pg_wchar ucs)
670637
return 1;
671638
}
672639

673-
/*
674-
* Convert a UTF-8 character to a Unicode code point.
675-
* This is a one-character version of pg_utf2wchar_with_len.
676-
* The lead byte alone selects the 1- to 4-byte form; any other lead byte returns 0xffffffff.
677-
* No error checks here, c must point to a long-enough string; continuation bytes are masked with 0x3f, not validated.
678-
*/
679-
pg_wchar
680-
utf8_to_unicode(const unsigned char *c)
681-
{
682-
if ((*c & 0x80) == 0) /* 0xxxxxxx: one-byte sequence */
683-
return (pg_wchar) c[0];
684-
else if ((*c & 0xe0) == 0xc0) /* 110xxxxx: two-byte sequence */
685-
return (pg_wchar) (((c[0] & 0x1f) << 6) |
686-
(c[1] & 0x3f));
687-
else if ((*c & 0xf0) == 0xe0) /* 1110xxxx: three-byte sequence */
688-
return (pg_wchar) (((c[0] & 0x0f) << 12) |
689-
((c[1] & 0x3f) << 6) |
690-
(c[2] & 0x3f));
691-
else if ((*c & 0xf8) == 0xf0) /* 11110xxx: four-byte sequence */
692-
return (pg_wchar) (((c[0] & 0x07) << 18) |
693-
((c[1] & 0x3f) << 12) |
694-
((c[2] & 0x3f) << 6) |
695-
(c[3] & 0x3f));
696-
else
697-
/* lead byte matches none of the patterns above (e.g. 10xxxxxx or 11111xxx); that is an invalid code on purpose */
698-
return 0xffffffff;
699-
}
700-
701640
static int
702641
pg_utf_dsplen(const unsigned char *s)
703642
{

src/include/mb/pg_wchar.h

Lines changed: 61 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -555,6 +555,67 @@ surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
555555
return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF);
556556
}
557557

558+
/*
559+
* Convert a UTF-8 character to a Unicode code point.
560+
* This is a one-character version of pg_utf2wchar_with_len.
561+
* The lead byte alone selects the 1- to 4-byte form; any other lead byte returns 0xffffffff.
562+
* No error checks here, c must point to a long-enough string; continuation bytes are masked with 0x3f, not validated.
563+
*/
564+
static inline pg_wchar
565+
utf8_to_unicode(const unsigned char *c)
566+
{
567+
if ((*c & 0x80) == 0) /* 0xxxxxxx: one-byte sequence */
568+
return (pg_wchar) c[0];
569+
else if ((*c & 0xe0) == 0xc0) /* 110xxxxx: two-byte sequence */
570+
return (pg_wchar) (((c[0] & 0x1f) << 6) |
571+
(c[1] & 0x3f));
572+
else if ((*c & 0xf0) == 0xe0) /* 1110xxxx: three-byte sequence */
573+
return (pg_wchar) (((c[0] & 0x0f) << 12) |
574+
((c[1] & 0x3f) << 6) |
575+
(c[2] & 0x3f));
576+
else if ((*c & 0xf8) == 0xf0) /* 11110xxx: four-byte sequence */
577+
return (pg_wchar) (((c[0] & 0x07) << 18) |
578+
((c[1] & 0x3f) << 12) |
579+
((c[2] & 0x3f) << 6) |
580+
(c[3] & 0x3f));
581+
else
582+
/* lead byte matches none of the patterns above (e.g. 10xxxxxx or 11111xxx); that is an invalid code on purpose */
583+
return 0xffffffff;
584+
}
585+
586+
/*
587+
* Map a Unicode code point to UTF-8, writing 1 to 4 bytes and no terminator. utf8string must have at least
588+
* unicode_utf8len(c) bytes available. Returns utf8string itself.
589+
*/
590+
static inline unsigned char *
591+
unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
592+
{
593+
if (c <= 0x7F) /* one byte: 0xxxxxxx */
594+
{
595+
utf8string[0] = c;
596+
}
597+
else if (c <= 0x7FF) /* two bytes: 110xxxxx 10xxxxxx */
598+
{
599+
utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
600+
utf8string[1] = 0x80 | (c & 0x3F);
601+
}
602+
else if (c <= 0xFFFF) /* three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
603+
{
604+
utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
605+
utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
606+
utf8string[2] = 0x80 | (c & 0x3F);
607+
}
608+
else /* four bytes; no range check, only the low 21 bits of c are encoded */
609+
{
610+
utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
611+
utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
612+
utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
613+
utf8string[3] = 0x80 | (c & 0x3F);
614+
}
615+
616+
return utf8string;
617+
}
618+
558619
/*
559620
* Number of bytes needed to represent the given char in UTF8.
560621
*/

0 commit comments

Comments
 (0)