Fix comments that claimed that mblen() only looks at first byte.

author Heikki Linnakangas <[email protected]>

Fri, 25 Jan 2019 12:54:38 +0000 (14:54 +0200)

committer Heikki Linnakangas <[email protected]>

Fri, 25 Jan 2019 12:54:38 +0000 (14:54 +0200)
author Heikki Linnakangas <[email protected]>
Fri, 25 Jan 2019 12:54:38 +0000 (14:54 +0200)
committer Heikki Linnakangas <[email protected]>
Fri, 25 Jan 2019 12:54:38 +0000 (14:54 +0200)
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c

index 03745cca753f83656e3d6dfee9e8b15e86311857..1c90934d972cb1dceb06b0e596a51d1702dfa76e 100644 (file)
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -4121,9 +4121,14 @@ not_end_of_copy:
         {
             int         mblen;
  
+           /*
+            * In all our encodings, looking at the first byte is enough to get
+            * the length.  (GB18030 is a bit special, but still works for
+            * our purposes; see comment in pg_gb18030_mblen())
+            */
             mblen_str[0] = c;
-           /* All our encodings only read the first byte to get the length */
             mblen = pg_encoding_mblen(cstate->file_encoding, mblen_str);
+
             IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(mblen - 1);
             IF_NEED_REFILL_AND_EOF_BREAK(mblen - 1);
             raw_buf_ptr += mblen - 1;
diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c

index a5fdda456e64ddf598342140a5f8f6b5ee6139e9..8e5116dfc108afebd5d61b651011f3f831fce5c5 100644 (file)
--- a/src/backend/utils/mb/wchar.c
+++ b/src/backend/utils/mb/wchar.c
@@ -15,16 +15,23 @@
  
  
  /*
- * conversion to pg_wchar is done by "table driven."
- * to add an encoding support, define mb2wchar_with_len(), mblen(), dsplen()
- * for the particular encoding. Note that if the encoding is only
- * supported in the client, you don't need to define
- * mb2wchar_with_len() function (SJIS is the case).
+ * Operations on multi-byte encodings are driven by a table of helper
+ * functions.
+ *
+ * To add support for an encoding, define mblen(), dsplen() and verifier()
+ * for it.  For server encodings, also define the mb2wchar() and wchar2mb()
+ * conversion functions.
   *
   * These functions generally assume that their input is validly formed.
   * The "verifier" functions, further down in the file, have to be more
- * paranoid.  We expect that mblen() does not need to examine more than
- * the first byte of the character to discover the correct length.
+ * paranoid.
+ *
+ * We expect that mblen() does not need to examine more than the first byte
+ * of the character to discover the correct length.  GB18030 is an exception
+ * to that rule, though, as it also looks at the second byte.  But even that
+ * behaves in a predictable way, if you only pass the first byte: it will
+ * treat 4-byte encoded characters as two 2-byte encoded characters, which is
+ * good enough for all current uses.
   *
   * Note: for the display output of psql to work properly, the return values
   * of the dsplen functions must conform to the Unicode standard. In particular
@@ -1073,6 +1080,17 @@ pg_uhc_dsplen(const unsigned char *s)
   * GB18030
   * Added by Bill Huang <[email protected]>,<[email protected]>
   */
+
+/*
+ * Unlike all other mblen() functions, this also looks at the second byte of
+ * the input.  However, if you only pass the first byte of a multi-byte
+ * string, and \0 as the second byte, this still works in a predictable way:
+ * a 4-byte character will be reported as two 2-byte characters.  That's
+ * enough for all current uses, as a client-only encoding.  It works that
+ * way, because in any valid 4-byte GB18030-encoded character, the third and
+ * fourth bytes look like a 2-byte encoded character when looked at
+ * separately.
+ */
  static int
  pg_gb18030_mblen(const unsigned char *s)
  {
author	Heikki Linnakangas <[email protected]>
	Fri, 25 Jan 2019 12:54:38 +0000 (14:54 +0200)
committer	Heikki Linnakangas <[email protected]>
	Fri, 25 Jan 2019 12:54:38 +0000 (14:54 +0200)
src/backend/commands/copy.c		patch \| blob \| blame \| history
src/backend/utils/mb/wchar.c		patch \| blob \| blame \| history