Skip to content

Commit 3ad8b84

Browse files
committed
Add some tests for encoding conversion in COPY TO/FROM
This adds a couple of tests that trigger encoding conversion in COPY FROM/TO, which happens when the input and server encodings do not match or when need_transcoding is set to true in the COPY state data. These tests rely on UTF8 <-> LATIN1 for the valid cases, as LATIN1 accepts any bytes, and on UTF8 <-> EUC_JP for some of the invalid cases, where a character cannot be understood and causes a conversion failure. Both the ENCODING option and client_encoding are covered. Test suggested by Andres Freund. Author: Sutou Kouhei. Discussion: https://postgr.es/m/20240206222445.hzq22pb2nye7rm67@awork3.anarazel.de
1 parent bf9165b commit 3ad8b84

File tree

4 files changed

+108
-1
lines changed

4 files changed

+108
-1
lines changed
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
--
2+
-- Test cases for encoding with COPY commands
3+
--
4+
-- skip test if not UTF8 server encoding
5+
SELECT getdatabaseencoding() <> 'UTF8'
6+
AS skip_test \gset
7+
\if :skip_test
8+
\quit
9+
\endif
10+
-- directory paths are passed to us in environment variables
11+
\getenv abs_builddir PG_ABS_BUILDDIR
12+
\set utf8_csv :abs_builddir '/results/copyencoding_utf8.csv'
13+
CREATE TABLE copy_encoding_tab (t text);
14+
-- Valid cases
15+
-- Use ENCODING option
16+
-- U+3042 HIRAGANA LETTER A
17+
COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv, ENCODING 'UTF8');
18+
-- Read UTF8 data as LATIN1: no error
19+
COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv, ENCODING 'LATIN1');
20+
-- Use client_encoding
21+
SET client_encoding TO UTF8;
22+
-- U+3042 HIRAGANA LETTER A
23+
COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv);
24+
-- Read UTF8 data as LATIN1: no error
25+
SET client_encoding TO LATIN1;
26+
COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv);
27+
RESET client_encoding;
28+
-- Invalid cases
29+
-- Use ENCODING explicitly
30+
-- U+3042 HIRAGANA LETTER A
31+
COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv, ENCODING 'UTF8');
32+
-- Read UTF8 data as EUC_JP: error, invalid byte sequence
33+
COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv, ENCODING 'EUC_JP');
34+
ERROR: invalid byte sequence for encoding "EUC_JP": 0xe3 0x81
35+
CONTEXT: COPY copy_encoding_tab, line 1
36+
-- Use client_encoding
37+
SET client_encoding TO UTF8;
38+
-- U+3042 HIRAGANA LETTER A
39+
COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv);
40+
-- Read UTF8 data as EUC_JP: error, invalid byte sequence
41+
SET client_encoding TO EUC_JP;
42+
COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv);
43+
ERROR: invalid byte sequence for encoding "EUC_JP": 0xe3 0x81
44+
CONTEXT: COPY copy_encoding_tab, line 1
45+
RESET client_encoding;
46+
DROP TABLE copy_encoding_tab;
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
--
2+
-- Test cases for encoding with COPY commands
3+
--
4+
-- skip test if not UTF8 server encoding
5+
SELECT getdatabaseencoding() <> 'UTF8'
6+
AS skip_test \gset
7+
\if :skip_test
8+
\quit

src/test/regress/parallel_schedule

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comment
3636
# execute two copy tests in parallel, to check that copy itself
3737
# is concurrent safe.
3838
# ----------
39-
test: copy copyselect copydml insert insert_conflict
39+
test: copy copyselect copydml copyencoding insert insert_conflict
4040

4141
# ----------
4242
# More groups of parallel tests

src/test/regress/sql/copyencoding.sql

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
--
2+
-- Test cases for encoding with COPY commands
3+
--
4+
5+
-- skip test if not UTF8 server encoding
6+
SELECT getdatabaseencoding() <> 'UTF8'
7+
AS skip_test \gset
8+
\if :skip_test
9+
\quit
10+
\endif
11+
12+
-- directory paths are passed to us in environment variables
13+
\getenv abs_builddir PG_ABS_BUILDDIR
14+
15+
\set utf8_csv :abs_builddir '/results/copyencoding_utf8.csv'
16+
17+
CREATE TABLE copy_encoding_tab (t text);
18+
19+
-- Valid cases
20+
21+
-- Use ENCODING option
22+
-- U+3042 HIRAGANA LETTER A
23+
COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv, ENCODING 'UTF8');
24+
-- Read UTF8 data as LATIN1: no error
25+
COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv, ENCODING 'LATIN1');
26+
27+
-- Use client_encoding
28+
SET client_encoding TO UTF8;
29+
-- U+3042 HIRAGANA LETTER A
30+
COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv);
31+
-- Read UTF8 data as LATIN1: no error
32+
SET client_encoding TO LATIN1;
33+
COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv);
34+
RESET client_encoding;
35+
36+
-- Invalid cases
37+
38+
-- Use ENCODING explicitly
39+
-- U+3042 HIRAGANA LETTER A
40+
COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv, ENCODING 'UTF8');
41+
-- Read UTF8 data as EUC_JP: error, invalid byte sequence
42+
COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv, ENCODING 'EUC_JP');
43+
44+
-- Use client_encoding
45+
SET client_encoding TO UTF8;
46+
-- U+3042 HIRAGANA LETTER A
47+
COPY (SELECT E'\u3042') TO :'utf8_csv' WITH (FORMAT csv);
48+
-- Read UTF8 data as EUC_JP: error, invalid byte sequence
49+
SET client_encoding TO EUC_JP;
50+
COPY copy_encoding_tab FROM :'utf8_csv' WITH (FORMAT csv);
51+
RESET client_encoding;
52+
53+
DROP TABLE copy_encoding_tab;

0 commit comments

Comments
 (0)