{
int mblen;
+ /*
+ * In all our encodings, looking at the first byte is enough to
+ * determine the character's length. (GB18030 is a bit special, but it
+ * still works for our purposes; see the comment in pg_gb18030_mblen().)
+ */
mblen_str[0] = c;
- /* All our encodings only read the first byte to get the length */
mblen = pg_encoding_mblen(cstate->file_encoding, mblen_str);
+
IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(mblen - 1);
IF_NEED_REFILL_AND_EOF_BREAK(mblen - 1);
raw_buf_ptr += mblen - 1;
/*
- * conversion to pg_wchar is done by "table driven."
- * to add an encoding support, define mb2wchar_with_len(), mblen(), dsplen()
- * for the particular encoding. Note that if the encoding is only
- * supported in the client, you don't need to define
- * mb2wchar_with_len() function (SJIS is the case).
+ * Operations on multi-byte encodings are driven by a table of helper
+ * functions.
+ *
+ * To add support for an encoding, define mblen(), dsplen() and verifier()
+ * for the encoding. For server encodings, also define the mb2wchar() and
+ * wchar2mb() conversion functions.
*
* These functions generally assume that their input is validly formed.
* The "verifier" functions, further down in the file, have to be more
- * paranoid. We expect that mblen() does not need to examine more than
- * the first byte of the character to discover the correct length.
+ * paranoid.
+ *
+ * We expect that mblen() does not need to examine more than the first byte
+ * of the character to discover the correct length. GB18030 is an exception
+ * to that rule, though, as it also looks at the second byte. But even that
+ * behaves in a predictable way if you only pass the first byte: it will
+ * treat 4-byte encoded characters as two 2-byte encoded characters, which is
+ * good enough for all current uses.
*
* Note: for the display output of psql to work properly, the return values
* of the dsplen functions must conform to the Unicode standard. In particular
* GB18030
*/
+
+/*
+ * Unlike all other mblen() functions, this also looks at the second byte of
+ * the input. However, if you only pass the first byte of a multi-byte
+ * string, and \0 as the second byte, this still works in a predictable way:
+ * a 4-byte character will be reported as two 2-byte characters. That's
+ * enough for all current uses, as a client-only encoding. It works that
+ * way because, in any valid 4-byte GB18030-encoded character, the third and
+ * fourth bytes look like a 2-byte encoded character when looked at
+ * separately.
+ */
static int
pg_gb18030_mblen(const unsigned char *s)
{