Skip to content

Commit 59f9a0b

Browse files
committed
Implement a solution to the 'Turkish locale downcases I incorrectly' problem, per previous discussion.
Make some additional changes to centralize the knowledge of just how identifier downcasing is done, in hopes of simplifying any future tweaking in this area.
1 parent 1d567ae commit 59f9a0b

File tree

10 files changed

+158
-125
lines changed

10 files changed

+158
-125
lines changed

src/backend/commands/define.c

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
*
1010
*
1111
* IDENTIFICATION
12-
* $PostgreSQL: pgsql/src/backend/commands/define.c,v 1.85 2003/11/29 19:51:47 pgsql Exp $
12+
* $PostgreSQL: pgsql/src/backend/commands/define.c,v 1.86 2004/02/21 00:34:52 tgl Exp $
1313
*
1414
* DESCRIPTION
1515
* The "DefineFoo" routines take the parse tree and pick out the
@@ -38,24 +38,19 @@
3838
#include "catalog/namespace.h"
3939
#include "commands/defrem.h"
4040
#include "parser/parse_type.h"
41+
#include "parser/scansup.h"
4142
#include "utils/int8.h"
4243

4344

4445
/*
45-
* Translate the input language name to lower case.
46+
* Translate the input language name to lower case, and truncate if needed.
4647
*
47-
* Output buffer must be NAMEDATALEN long.
48+
* Returns a palloc'd string
4849
*/
49-
void
50-
case_translate_language_name(const char *input, char *output)
50+
char *
51+
case_translate_language_name(const char *input)
5152
{
52-
int i;
53-
54-
MemSet(output, 0, NAMEDATALEN); /* ensure result Name is
55-
* zero-filled */
56-
57-
for (i = 0; i < NAMEDATALEN - 1 && input[i]; ++i)
58-
output[i] = tolower((unsigned char) input[i]);
53+
return downcase_truncate_identifier(input, strlen(input), false);
5954
}
6055

6156

src/backend/commands/functioncmds.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
*
1111
*
1212
* IDENTIFICATION
13-
* $PostgreSQL: pgsql/src/backend/commands/functioncmds.c,v 1.43 2004/01/06 23:55:18 tgl Exp $
13+
* $PostgreSQL: pgsql/src/backend/commands/functioncmds.c,v 1.44 2004/02/21 00:34:52 tgl Exp $
1414
*
1515
* DESCRIPTION
1616
* These routines take the parse tree and pick out the
@@ -401,7 +401,7 @@ CreateFunction(CreateFunctionStmt *stmt)
401401
Oid prorettype;
402402
bool returnsSet;
403403
char *language;
404-
char languageName[NAMEDATALEN];
404+
char *languageName;
405405
Oid languageOid;
406406
Oid languageValidator;
407407
char *funcname;
@@ -437,7 +437,7 @@ CreateFunction(CreateFunctionStmt *stmt)
437437
&as_clause, &language, &volatility, &isStrict, &security);
438438

439439
/* Convert language name to canonical case */
440-
case_translate_language_name(language, languageName);
440+
languageName = case_translate_language_name(language);
441441

442442
/* Look up the language and validate permissions */
443443
languageTuple = SearchSysCache(LANGNAME,

src/backend/commands/proclang.c

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1994, Regents of the University of California
88
*
99
* IDENTIFICATION
10-
* $PostgreSQL: pgsql/src/backend/commands/proclang.c,v 1.52 2003/11/29 19:51:47 pgsql Exp $
10+
* $PostgreSQL: pgsql/src/backend/commands/proclang.c,v 1.53 2004/02/21 00:34:52 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -40,11 +40,12 @@
4040
void
4141
CreateProceduralLanguage(CreatePLangStmt *stmt)
4242
{
43-
char languageName[NAMEDATALEN];
43+
char *languageName;
4444
Oid procOid,
4545
valProcOid;
4646
Oid funcrettype;
4747
Oid typev[FUNC_MAX_ARGS];
48+
NameData langname;
4849
char nulls[Natts_pg_language];
4950
Datum values[Natts_pg_language];
5051
Relation rel;
@@ -66,7 +67,7 @@ CreateProceduralLanguage(CreatePLangStmt *stmt)
6667
* Translate the language name and check that this language doesn't
6768
* already exist
6869
*/
69-
case_translate_language_name(stmt->plname, languageName);
70+
languageName = case_translate_language_name(stmt->plname);
7071

7172
if (SearchSysCacheExists(LANGNAME,
7273
PointerGetDatum(languageName),
@@ -124,12 +125,13 @@ CreateProceduralLanguage(CreatePLangStmt *stmt)
124125
}
125126

126127
i = 0;
127-
values[i++] = PointerGetDatum(languageName);
128-
values[i++] = BoolGetDatum(true); /* lanispl */
129-
values[i++] = BoolGetDatum(stmt->pltrusted);
130-
values[i++] = ObjectIdGetDatum(procOid);
131-
values[i++] = ObjectIdGetDatum(valProcOid);
132-
nulls[i] = 'n'; /* lanacl */
128+
namestrcpy(&langname, languageName);
129+
values[i++] = NameGetDatum(&langname); /* lanname */
130+
values[i++] = BoolGetDatum(true); /* lanispl */
131+
values[i++] = BoolGetDatum(stmt->pltrusted); /* lanpltrusted */
132+
values[i++] = ObjectIdGetDatum(procOid); /* lanplcallfoid */
133+
values[i++] = ObjectIdGetDatum(valProcOid); /* lanvalidator */
134+
nulls[i] = 'n'; /* lanacl */
133135

134136
rel = heap_openr(LanguageRelationName, RowExclusiveLock);
135137

@@ -173,7 +175,7 @@ CreateProceduralLanguage(CreatePLangStmt *stmt)
173175
void
174176
DropProceduralLanguage(DropPLangStmt *stmt)
175177
{
176-
char languageName[NAMEDATALEN];
178+
char *languageName;
177179
HeapTuple langTup;
178180
ObjectAddress object;
179181

@@ -189,7 +191,7 @@ DropProceduralLanguage(DropPLangStmt *stmt)
189191
* Translate the language name, check that this language exists and is
190192
* a PL
191193
*/
192-
case_translate_language_name(stmt->plname, languageName);
194+
languageName = case_translate_language_name(stmt->plname);
193195

194196
langTup = SearchSysCache(LANGNAME,
195197
CStringGetDatum(languageName),

src/backend/parser/keywords.c

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/parser/keywords.c,v 1.144 2003/11/29 19:51:51 pgsql Exp $
11+
* $PostgreSQL: pgsql/src/backend/parser/keywords.c,v 1.145 2004/02/21 00:34:52 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -369,17 +369,13 @@ ScanKeywordLookup(const char *text)
369369

370370
/*
371371
* Apply an ASCII-only downcasing. We must not use tolower() since it
372-
* may produce the wrong translation in some locales (eg, Turkish),
373-
* and we don't trust isupper() very much either. In an ASCII-based
374-
* encoding the tests against A and Z are sufficient, but we also
375-
* check isupper() so that we will work correctly under EBCDIC. The
376-
* actual case conversion step should work for either ASCII or EBCDIC.
372+
* may produce the wrong translation in some locales (eg, Turkish).
377373
*/
378374
for (i = 0; i < len; i++)
379375
{
380376
char ch = text[i];
381377

382-
if (ch >= 'A' && ch <= 'Z' && isupper((unsigned char) ch))
378+
if (ch >= 'A' && ch <= 'Z')
383379
ch += 'a' - 'A';
384380
word[i] = ch;
385381
}

src/backend/parser/scan.l

Lines changed: 8 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* Portions Copyright (c) 1994, Regents of the University of California
1111
*
1212
* IDENTIFICATION
13-
* $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.113 2004/02/19 19:11:30 tgl Exp $
13+
* $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.114 2004/02/21 00:34:52 tgl Exp $
1414
*
1515
*-------------------------------------------------------------------------
1616
*/
@@ -27,6 +27,7 @@
2727
#include "parser/keywords.h"
2828
/* Not needed now that this file is compiled as part of gram.y */
2929
/* #include "parser/parse.h" */
30+
#include "parser/scansup.h"
3031
#include "utils/builtins.h"
3132
#include "mb/pg_wchar.h"
3233

@@ -395,23 +396,15 @@ other .
395396
startlit();
396397
}
397398
<xd>{xdstop} {
399+
char *ident;
400+
398401
BEGIN(INITIAL);
399402
if (literallen == 0)
400403
yyerror("zero-length delimited identifier");
404+
ident = litbufdup();
401405
if (literallen >= NAMEDATALEN)
402-
{
403-
int len;
404-
405-
len = pg_mbcliplen(literalbuf, literallen,
406-
NAMEDATALEN-1);
407-
ereport(NOTICE,
408-
(errcode(ERRCODE_NAME_TOO_LONG),
409-
errmsg("identifier \"%s\" will be truncated to \"%.*s\"",
410-
literalbuf, len, literalbuf)));
411-
literalbuf[len] = '\0';
412-
literallen = len;
413-
}
414-
yylval.str = litbufdup();
406+
truncate_identifier(ident, literallen, true);
407+
yylval.str = ident;
415408
return IDENT;
416409
}
417410
<xd>{xddouble} {
@@ -537,7 +530,6 @@ other .
537530
{identifier} {
538531
const ScanKeyword *keyword;
539532
char *ident;
540-
int i;
541533

542534
/* Is it a keyword? */
543535
keyword = ScanKeywordLookup(yytext);
@@ -550,28 +542,8 @@ other .
550542
/*
551543
* No. Convert the identifier to lower case, and truncate
552544
* if necessary.
553-
*
554-
* Note: here we use a locale-dependent case conversion,
555-
* which seems appropriate under standard SQL rules, whereas
556-
* the keyword comparison was NOT locale-dependent.
557545
*/
558-
ident = pstrdup(yytext);
559-
for (i = 0; ident[i]; i++)
560-
{
561-
if (isupper((unsigned char) ident[i]))
562-
ident[i] = tolower((unsigned char) ident[i]);
563-
}
564-
if (i >= NAMEDATALEN)
565-
{
566-
int len;
567-
568-
len = pg_mbcliplen(ident, i, NAMEDATALEN-1);
569-
ereport(NOTICE,
570-
(errcode(ERRCODE_NAME_TOO_LONG),
571-
errmsg("identifier \"%s\" will be truncated to \"%.*s\"",
572-
ident, len, ident)));
573-
ident[len] = '\0';
574-
}
546+
ident = downcase_truncate_identifier(yytext, yyleng, true);
575547
yylval.str = ident;
576548
return IDENT;
577549
}

src/backend/parser/scansup.c

Lines changed: 76 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
*
1010
*
1111
* IDENTIFICATION
12-
* $PostgreSQL: pgsql/src/backend/parser/scansup.c,v 1.25 2003/11/29 19:51:52 pgsql Exp $
12+
* $PostgreSQL: pgsql/src/backend/parser/scansup.c,v 1.26 2004/02/21 00:34:53 tgl Exp $
1313
*
1414
*-------------------------------------------------------------------------
1515
*/
@@ -19,6 +19,8 @@
1919

2020
#include "miscadmin.h"
2121
#include "parser/scansup.h"
22+
#include "mb/pg_wchar.h"
23+
2224

2325
/* ----------------
2426
* scanstr
@@ -32,7 +34,7 @@
3234
*/
3335

3436
char *
35-
scanstr(char *s)
37+
scanstr(const char *s)
3638
{
3739
char *newStr;
3840
int len,
@@ -109,3 +111,75 @@ scanstr(char *s)
109111
newStr[j] = '\0';
110112
return newStr;
111113
}
114+
115+
116+
/*
117+
* downcase_truncate_identifier() --- do appropriate downcasing and
118+
* truncation of an unquoted identifier. Optionally warn of truncation.
119+
* ident/len give the bytes to convert; warn is handed on to truncate_identifier().
120+
* Returns a palloc'd, null-terminated string (len + 1 bytes are allocated) containing the adjusted identifier; the input is not modified.
121+
*
122+
* Note: in some usages the passed string is not null-terminated, so exactly len bytes of ident are read and no terminator is looked for.
123+
*
124+
* Note: the API of this function is designed to allow for downcasing
125+
* transformations that increase the string length, but we don't yet
126+
* support that. If you want to implement it, you'll need to fix
127+
* SplitIdentifierString() in utils/adt/varlena.c.
128+
*/
129+
char *
130+
downcase_truncate_identifier(const char *ident, int len, bool warn)
131+
{
132+
char *result;
133+
int i;
134+
135+
result = palloc(len + 1);
136+
/*
137+
* SQL99 specifies Unicode-aware case normalization, which we don't yet
138+
* have the infrastructure for. Instead we use tolower() to provide a
139+
* locale-aware translation. However, there are some locales where this
140+
* is not right either (eg, Turkish may do strange things with 'i' and
141+
* 'I'). Our current compromise is to use tolower() for characters with
142+
* the high bit set, and use an ASCII-only downcasing for 7-bit
143+
* characters.
144+
*/
145+
for (i = 0; i < len; i++)
146+
{
147+
unsigned char ch = (unsigned char) ident[i];
148+
149+
if (ch >= 'A' && ch <= 'Z')
150+
ch += 'a' - 'A';
151+
else if (ch >= 0x80 && isupper(ch))
152+
ch = tolower(ch);
153+
result[i] = (char) ch;
154+
}
155+
result[i] = '\0';
156+
157+
if (i >= NAMEDATALEN)
158+
truncate_identifier(result, i, warn);
159+
160+
return result;
161+
}
162+
163+
/*
164+
* truncate_identifier() --- truncate an identifier to at most NAMEDATALEN-1 bytes.
165+
*
166+
* The given string is modified in-place, if necessary (that is, only when len >= NAMEDATALEN). A NOTICE
167+
* quoting both the full and the clipped identifier is issued if warn is true; it is raised before the string is cut so the full text can still be shown.
168+
* The cut position is whatever pg_mbcliplen() returns (presumably a multibyte character boundary --- confirm against pg_mbcliplen).
169+
* We require the caller to pass in the string length since this saves a
170+
* strlen() call in some common usages.
171+
*/
172+
void
173+
truncate_identifier(char *ident, int len, bool warn)
174+
{
175+
if (len >= NAMEDATALEN)
176+
{
177+
len = pg_mbcliplen(ident, len, NAMEDATALEN-1);
178+
if (warn)
179+
ereport(NOTICE,
180+
(errcode(ERRCODE_NAME_TOO_LONG),
181+
errmsg("identifier \"%s\" will be truncated to \"%.*s\"",
182+
ident, len, ident)));
183+
ident[len] = '\0';
184+
}
185+
}

0 commit comments

Comments
 (0)