Skip to content

Commit e09c9c0

Browse files
author
Artur Zakirov
committed
Fix bug: 'Mac OS: invalid byte sequence for encoding UTF8'
1 parent cd8968c commit e09c9c0

File tree

1 file changed

+124
-13
lines changed

1 file changed

+124
-13
lines changed

src/backend/tsearch/spell.c

Lines changed: 124 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -519,12 +519,122 @@ NIAddAffix(IspellDict *Conf, int flag, char flagflags, const char *mask, const c
519519
}
520520

521521
#define PAE_WAIT_MASK 0
522-
#define PAE_INMASK 1
522+
#define PAE_INMASK 1
523523
#define PAE_WAIT_FIND 2
524-
#define PAE_INFIND 3
524+
#define PAE_INFIND 3
525525
#define PAE_WAIT_REPL 4
526-
#define PAE_INREPL 5
526+
#define PAE_INREPL 5
527+
#define PAE_WAIT_TYPE 6
528+
#define PAE_WAIT_FLAG 7
527529

530+
/*
531+
* Used in parse_ooaffentry() to parse an .affix file entry.
532+
*/
533+
static bool
534+
get_nextentry(char **str, char *next)
535+
{
536+
int state = PAE_WAIT_MASK;
537+
char *pnext = next;
538+
539+
*next = '\0';
540+
541+
while (**str)
542+
{
543+
if (state == PAE_WAIT_MASK)
544+
{
545+
if (t_iseq(*str, '#'))
546+
return false;
547+
else if (!t_isspace(*str))
548+
{
549+
COPYCHAR(pnext, *str);
550+
pnext += pg_mblen(*str);
551+
state = PAE_INMASK;
552+
}
553+
}
554+
else if (state == PAE_INMASK)
555+
{
556+
if (t_isspace(*str))
557+
{
558+
*pnext = '\0';
559+
return true;
560+
}
561+
else
562+
{
563+
COPYCHAR(pnext, *str);
564+
pnext += pg_mblen(*str);
565+
}
566+
}
567+
*str += pg_mblen(*str);
568+
}
569+
570+
*pnext ='\0';
571+
572+
return *next;
573+
}
574+
575+
/*
576+
* Parses entry of an .affix file of MySpell or Hunspell format.
577+
*
578+
* An .affix file entry has the following format:
579+
* - header
580+
* <type> <flag> <cross_flag> <flag_count>
581+
* - fields after header:
582+
* <type> <flag> <find> <replace> <mask>
583+
*/
584+
static int
585+
parse_ooaffentry(char *str, char *type, char *flag, char *find,
586+
char *repl, char *mask)
587+
{
588+
int state = PAE_WAIT_TYPE,
589+
next_state = PAE_WAIT_FLAG;
590+
int parse_read = 0;
591+
bool valid = true;
592+
593+
*type = *flag = *find = *repl = *mask = '\0';
594+
595+
while (*str && valid)
596+
{
597+
switch (state)
598+
{
599+
case PAE_WAIT_TYPE:
600+
valid = get_nextentry(&str, type);
601+
break;
602+
case PAE_WAIT_FLAG:
603+
valid = get_nextentry(&str, flag);
604+
next_state = PAE_WAIT_FIND;
605+
break;
606+
case PAE_WAIT_FIND:
607+
valid = get_nextentry(&str, find);
608+
next_state = PAE_WAIT_REPL;
609+
break;
610+
case PAE_WAIT_REPL:
611+
valid = get_nextentry(&str, repl);
612+
next_state = PAE_WAIT_MASK;
613+
break;
614+
case PAE_WAIT_MASK:
615+
get_nextentry(&str, mask);
616+
/* break loop */
617+
valid = false;
618+
break;
619+
default:
620+
elog(ERROR, "unrecognized state in parse_ooaffentry: %d", state);
621+
}
622+
state = next_state;
623+
if (*str)
624+
str += pg_mblen(str);
625+
626+
parse_read++;
627+
}
628+
629+
return parse_read;
630+
}
631+
632+
/*
633+
* Parses entry of an .affix file of Ispell format
634+
*
635+
* An .affix file entry has the following format:
636+
* <mask> > [-<find>,]<replace>
637+
*/
528638
static bool
529639
parse_affentry(char *str, char *mask, char *find, char *repl)
530640
{
@@ -731,8 +841,7 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
731841
sflaglen = 0;
732842
char flagflags = 0;
733843
tsearch_readline_state trst;
734-
int scanread = 0;
735-
char scanbuf[BUFSIZ];
844+
int parseread = 0;
736845
char *recoded;
737846

738847
/* read file to find any flag */
@@ -804,8 +913,6 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
804913
}
805914
tsearch_readline_end(&trst);
806915

807-
sprintf(scanbuf, "%%6s %%%ds %%%ds %%%ds %%%ds", BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5);
808-
809916
if (!tsearch_readline_begin(&trst, filename))
810917
ereport(ERROR,
811918
(errcode(ERRCODE_CONFIG_FILE_ERROR),
@@ -817,8 +924,7 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
817924
if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
818925
goto nextline;
819926

820-
*find = *repl = *mask = '\0';
821-
scanread = sscanf(recoded, scanbuf, type, sflag, find, repl, mask);
927+
parseread = parse_ooaffentry(recoded, type, sflag, find, repl, mask);
822928

823929
if (ptype)
824930
pfree(ptype);
@@ -859,7 +965,7 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
859965
goto nextline;
860966
}
861967
/* Else try to parse prefixes and suffixes */
862-
if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx")))
968+
if (parseread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx")))
863969
goto nextline;
864970

865971
sflaglen = strlen(sflag);
@@ -888,9 +994,13 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
888994
if (flag == 0)
889995
goto nextline;
890996
prepl = lowerstr_ctx(Conf, repl);
891-
/* affix flag */
997+
/* Find position of '/' in lowercased string "prepl" */
892998
if ((ptr = strchr(prepl, '/')) != NULL)
893999
{
1000+
/*
1001+
* Here we use non-lowercased string "repl". We need position of
1002+
* '/' in "repl".
1003+
*/
8941004
*ptr = '\0';
8951005
ptr = repl + (ptr - prepl) + 1;
8961006
aflg |= getFlagValues(Conf, getFlags(Conf, ptr));
@@ -964,11 +1074,12 @@ NIImportAffixes(IspellDict *Conf, const char *filename)
9641074

9651075
if (STRNCMP(pstr, "compoundwords") == 0)
9661076
{
1077+
/* Find position in lowercased string "pstr" */
9671078
s = findchar(pstr, 'l');
9681079
if (s)
9691080
{
970-
s = recoded + (s - pstr); /* we need non-lowercased
971-
* string */
1081+
/* Here we use non-lowercased string "recoded" */
1082+
s = recoded + (s - pstr);
9721083
while (*s && !t_isspace(s))
9731084
s += pg_mblen(s);
9741085
while (*s && t_isspace(s))

0 commit comments

Comments
 (0)