hstore: Tighten key/value parsing check for whitespaces
author Michael Paquier <[email protected]>
Mon, 12 Jun 2023 00:14:03 +0000 (09:14 +0900)
committer Michael Paquier <[email protected]>
Mon, 12 Jun 2023 00:14:03 +0000 (09:14 +0900)
isspace() can be locale-sensitive depending on the platform, causing
hstore to treat as whitespace characters that it should not see as such.
For example, U+0105, being encoded as 0xC4 0x85 in UTF-8, would be
discarded from the input given.

This problem is similar to 9ae2661, though it was missed that hstore
can also manipulate non-ASCII inputs, so replace the existing isspace()
calls with scanner_isspace().

This problem has existed for a long time, so backpatch all the way down.

Author: Evan Jones
Discussion: https://postgr.es/m/CA+HWA9awUW0+RV_gO9r1ABZwGoZxPztcJxPy8vMFSTbTfi4jig@mail.gmail.com
Backpatch-through: 11

contrib/hstore/Makefile
contrib/hstore/expected/hstore_utf8.out [new file with mode: 0644]
contrib/hstore/expected/hstore_utf8_1.out [new file with mode: 0644]
contrib/hstore/hstore_io.c
contrib/hstore/meson.build
contrib/hstore/sql/hstore_utf8.sql [new file with mode: 0644]

diff --git a/contrib/hstore/Makefile b/contrib/hstore/Makefile
index c4e339b57c1c5d793228ff7445d5ded4ad3de420..48ee98f0d5c6f4ddb070586d458f84d4e22f6ae9 100644 (file)
@@ -22,7 +22,7 @@ PGFILEDESC = "hstore - key/value pair data type"
 
 HEADERS = hstore.h
 
-REGRESS = hstore
+REGRESS = hstore hstore_utf8
 
 ifdef USE_PGXS
 PG_CONFIG = pg_config
diff --git a/contrib/hstore/expected/hstore_utf8.out b/contrib/hstore/expected/hstore_utf8.out
new file mode 100644 (file)
index 0000000..4405824
--- /dev/null
@@ -0,0 +1,36 @@
+/*
+ * This test must be run in a database with UTF-8 encoding,
+ * because other encodings don't support all the characters used.
+ */
+SELECT getdatabaseencoding() <> 'UTF8'
+       AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+SET client_encoding = utf8;
+-- UTF-8 locale bug on macOS: isspace(0x85) returns true.  \u0105 encodes
+-- as 0xc4 0x85 in UTF-8; the 0x85 was interpreted here as a whitespace.
+SELECT E'key\u0105=>value\u0105'::hstore;
+      hstore      
+------------------
+ "keyą"=>"valueą"
+(1 row)
+
+SELECT 'keyą=>valueą'::hstore;
+      hstore      
+------------------
+ "keyą"=>"valueą"
+(1 row)
+
+SELECT 'ą=>ą'::hstore;
+  hstore  
+----------
+ "ą"=>"ą"
+(1 row)
+
+SELECT 'keyąfoo=>valueą'::hstore;
+       hstore        
+---------------------
+ "keyąfoo"=>"valueą"
+(1 row)
+
diff --git a/contrib/hstore/expected/hstore_utf8_1.out b/contrib/hstore/expected/hstore_utf8_1.out
new file mode 100644 (file)
index 0000000..37aead8
--- /dev/null
@@ -0,0 +1,8 @@
+/*
+ * This test must be run in a database with UTF-8 encoding,
+ * because other encodings don't support all the characters used.
+ */
+SELECT getdatabaseencoding() <> 'UTF8'
+       AS skip_test \gset
+\if :skip_test
+\quit
diff --git a/contrib/hstore/hstore_io.c b/contrib/hstore/hstore_io.c
index cec7df71a25ac0eef120ddc28c253f86e0223b17..999ddad76d977a0b4ec7d7f46172f56dd6d952b1 100644 (file)
@@ -13,6 +13,7 @@
 #include "lib/stringinfo.h"
 #include "libpq/pqformat.h"
 #include "nodes/miscnodes.h"
+#include "parser/scansup.h"
 #include "utils/builtins.h"
 #include "utils/json.h"
 #include "utils/jsonb.h"
@@ -118,7 +119,7 @@ get_val(HSParser *state, bool ignoreeq, bool *escaped)
            {
                st = GV_WAITESCIN;
            }
-           else if (!isspace((unsigned char) *(state->ptr)))
+           else if (!scanner_isspace((unsigned char) *(state->ptr)))
            {
                *(state->cur) = *(state->ptr);
                state->cur++;
@@ -141,7 +142,7 @@ get_val(HSParser *state, bool ignoreeq, bool *escaped)
                state->ptr--;
                return true;
            }
-           else if (isspace((unsigned char) *(state->ptr)))
+           else if (scanner_isspace((unsigned char) *(state->ptr)))
            {
                return true;
            }
@@ -255,7 +256,7 @@ parse_hstore(HSParser *state)
            {
                PRSEOF;
            }
-           else if (!isspace((unsigned char) *(state->ptr)))
+           else if (!scanner_isspace((unsigned char) *(state->ptr)))
            {
                PRSSYNTAXERROR;
            }
@@ -309,7 +310,7 @@ parse_hstore(HSParser *state)
            {
                return true;
            }
-           else if (!isspace((unsigned char) *(state->ptr)))
+           else if (!scanner_isspace((unsigned char) *(state->ptr)))
            {
                PRSSYNTAXERROR;
            }
diff --git a/contrib/hstore/meson.build b/contrib/hstore/meson.build
index 99c3a3160d7e27e3680d303efc45c2de52f84f3f..20acc45ad88e989e41cb058aa4048b1087c31636 100644 (file)
@@ -50,6 +50,7 @@ tests += {
   'regress': {
     'sql': [
       'hstore',
+      'hstore_utf8',
     ],
   },
 }
diff --git a/contrib/hstore/sql/hstore_utf8.sql b/contrib/hstore/sql/hstore_utf8.sql
new file mode 100644 (file)
index 0000000..face878
--- /dev/null
@@ -0,0 +1,19 @@
+/*
+ * This test must be run in a database with UTF-8 encoding,
+ * because other encodings don't support all the characters used.
+ */
+
+SELECT getdatabaseencoding() <> 'UTF8'
+       AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+
+SET client_encoding = utf8;
+
+-- UTF-8 locale bug on macOS: isspace(0x85) returns true.  \u0105 encodes
+-- as 0xc4 0x85 in UTF-8; the 0x85 was interpreted here as a whitespace.
+SELECT E'key\u0105=>value\u0105'::hstore;
+SELECT 'keyą=>valueą'::hstore;
+SELECT 'ą=>ą'::hstore;
+SELECT 'keyąfoo=>valueą'::hstore;