Skip to content

Commit 737ae3f

Browse files
committed
Make the locale comparison in pg_upgrade more lenient
If the locale names are not equal, try to canonicalize both of them by passing them to setlocale(). Before, we only canonicalized the old cluster's locale if upgrading from an 8.4-9.2 server, but we also need to canonicalize when upgrading from a pre-8.4 server. That was an oversight in the code. But we should also canonicalize on newer server versions, so that we cope if the canonical form changes from one release to another. I'm about to do just that to fix bug #11431, by mapping a locale name that contains non-ASCII characters to a pure-ASCII alias of the same locale. This is a partial backpatch of commit 33755e8 in master. Apply to 9.2, 9.3 and 9.4. The canonicalization code didn't exist before 9.2. In 9.2 and 9.3, this effectively also back-patches the changes from commit 5827472, to be more lax about the spelling of the encoding in the locale names.
1 parent f09369d commit 737ae3f

File tree

1 file changed

+77
-25
lines changed

1 file changed

+77
-25
lines changed

contrib/pg_upgrade/check.c

Lines changed: 77 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,16 @@
99

1010
#include "postgres.h"
1111

12+
#include "mb/pg_wchar.h"
1213
#include "pg_upgrade.h"
1314

1415

1516
static void set_locale_and_encoding(ClusterInfo *cluster);
1617
static void check_new_cluster_is_empty(void);
1718
static void check_locale_and_encoding(ControlData *oldctrl,
1819
ControlData *newctrl);
20+
static bool equivalent_locale(int category, const char *loca, const char *locb);
21+
static bool equivalent_encoding(const char *chara, const char *charb);
1922
static void check_is_super_user(ClusterInfo *cluster);
2023
static void check_for_prepared_transactions(ClusterInfo *cluster);
2124
static void check_for_isn_and_int8_passing_mismatch(ClusterInfo *cluster);
@@ -360,23 +363,8 @@ set_locale_and_encoding(ClusterInfo *cluster)
360363
i_datcollate = PQfnumber(res, "datcollate");
361364
i_datctype = PQfnumber(res, "datctype");
362365

363-
if (GET_MAJOR_VERSION(cluster->major_version) < 902)
364-
{
365-
/*
366-
* Pre-9.2 did not canonicalize the supplied locale names
367-
* to match what the system returns, while 9.2+ does, so
368-
* convert pre-9.2 to match.
369-
*/
370-
ctrl->lc_collate = get_canonical_locale_name(LC_COLLATE,
371-
pg_strdup(PQgetvalue(res, 0, i_datcollate)));
372-
ctrl->lc_ctype = get_canonical_locale_name(LC_CTYPE,
373-
pg_strdup(PQgetvalue(res, 0, i_datctype)));
374-
}
375-
else
376-
{
377-
ctrl->lc_collate = pg_strdup(PQgetvalue(res, 0, i_datcollate));
378-
ctrl->lc_ctype = pg_strdup(PQgetvalue(res, 0, i_datctype));
379-
}
366+
ctrl->lc_collate = pg_strdup(PQgetvalue(res, 0, i_datcollate));
367+
ctrl->lc_ctype = pg_strdup(PQgetvalue(res, 0, i_datctype));
380368

381369
PQclear(res);
382370
}
@@ -406,25 +394,89 @@ static void
406394
check_locale_and_encoding(ControlData *oldctrl,
407395
ControlData *newctrl)
408396
{
409-
/*
410-
* These are often defined with inconsistent case, so use pg_strcasecmp().
411-
* They also often use inconsistent hyphenation, which we cannot fix, e.g.
412-
* UTF-8 vs. UTF8, so at least we display the mismatching values.
413-
*/
414-
if (pg_strcasecmp(oldctrl->lc_collate, newctrl->lc_collate) != 0)
397+
if (!equivalent_locale(LC_COLLATE, oldctrl->lc_collate, newctrl->lc_collate))
415398
pg_log(PG_FATAL,
416399
"lc_collate cluster values do not match: old \"%s\", new \"%s\"\n",
417400
oldctrl->lc_collate, newctrl->lc_collate);
418-
if (pg_strcasecmp(oldctrl->lc_ctype, newctrl->lc_ctype) != 0)
401+
if (!equivalent_locale(LC_CTYPE, oldctrl->lc_ctype, newctrl->lc_ctype))
419402
pg_log(PG_FATAL,
420403
"lc_ctype cluster values do not match: old \"%s\", new \"%s\"\n",
421404
oldctrl->lc_ctype, newctrl->lc_ctype);
422-
if (pg_strcasecmp(oldctrl->encoding, newctrl->encoding) != 0)
405+
if (!equivalent_encoding(oldctrl->encoding, newctrl->encoding))
423406
pg_log(PG_FATAL,
424407
"encoding cluster values do not match: old \"%s\", new \"%s\"\n",
425408
oldctrl->encoding, newctrl->encoding);
426409
}
427410

411+
/*
 * equivalent_locale()
 *
 * Best effort locale-name comparison for the given locale category.
 * Return true only if the two names are found to be equivalent; return
 * false if we are not 100% sure.
 *
 * The names are first compared case-insensitively exactly as given.  If
 * that does not match, both names are run through
 * get_canonical_locale_name() and compared again, case-insensitively,
 * looking only at the part before the last '.' in each canonical name.
 *
 * Note: The encoding parts of the names are ignored.  This function is
 * currently used to compare locale names stored in pg_database, and
 * pg_database contains a separate encoding field.  That's compared directly
 * in check_locale_and_encoding().
 */
static bool
equivalent_locale(int category, const char *loca, const char *locb)
{
	const char *chara;
	const char *charb;
	char	   *canona;
	char	   *canonb;
	size_t		lena;
	size_t		lenb;
	bool		result;

	/*
	 * If the names are equal, the locales are equivalent. Checking this
	 * first avoids calling setlocale() in the common case that the names
	 * are equal. That's a good thing, if setlocale() is buggy, for example.
	 */
	if (pg_strcasecmp(loca, locb) == 0)
		return true;

	/*
	 * Not identical. Canonicalize both names, remove the encoding parts,
	 * and try again.  The length of each name is measured up to (but not
	 * including) its last '.', or to the end of the string if there is none.
	 */
	canona = get_canonical_locale_name(category, loca);
	chara = strrchr(canona, '.');
	lena = chara ? (size_t) (chara - canona) : strlen(canona);

	canonb = get_canonical_locale_name(category, locb);
	charb = strrchr(canonb, '.');
	lenb = charb ? (size_t) (charb - canonb) : strlen(canonb);

	result = (lena == lenb && pg_strncasecmp(canona, canonb, lena) == 0);

	/*
	 * Release the canonicalized copies so they are not leaked.
	 * NOTE(review): this assumes get_canonical_locale_name() hands back
	 * caller-owned storage allocated with pg_strdup() -- confirm against its
	 * definition.
	 */
	pg_free(canona);
	pg_free(canonb);

	return result;
}
457+
458+
/*
 * equivalent_encoding()
 *
 * Best effort encoding-name comparison.  Both names are looked up with
 * pg_valid_server_encoding(); the result is true only when neither lookup
 * yields a negative value and both lookups yield the same value.
 *
 * Because the lookup in pg_valid_server_encoding() does case folding and
 * ignores non-alphanumeric characters, this will recognize many popular
 * variant spellings as equivalent, e.g. "utf8" and "UTF-8" will match.
 */
static bool
equivalent_encoding(const char *chara, const char *charb)
{
	int			first_id;
	int			second_id;

	first_id = pg_valid_server_encoding(chara);
	second_id = pg_valid_server_encoding(charb);

	/* A negative id on either side means we cannot vouch for equivalence. */
	return first_id >= 0 && second_id >= 0 && first_id == second_id;
}
479+
428480

429481
static void
430482
check_new_cluster_is_empty(void)

0 commit comments

Comments
 (0)