Get pg_utf_mblen(), pg_utf2wchar_with_len(), and utf2ucs() all on the same page about the maximum UTF8 sequence length we support (3 bytes in this version).

author Tom Lane <[email protected]>

Wed, 24 Jan 2007 17:12:47 +0000 (17:12 +0000)

committer Tom Lane <[email protected]>

Wed, 24 Jan 2007 17:12:47 +0000 (17:12 +0000)
author Tom Lane <[email protected]>
Wed, 24 Jan 2007 17:12:47 +0000 (17:12 +0000)
committer Tom Lane <[email protected]>
Wed, 24 Jan 2007 17:12:47 +0000 (17:12 +0000)
diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c

index cefdaf0827801b3341c8ddb870f1709c2ebaf469..959c5348fc5a17345ae52c7759e8caff7a8641ba 100644 (file)
--- a/src/backend/utils/mb/wchar.c
+++ b/src/backend/utils/mb/wchar.c
@@ -266,18 +266,18 @@ pg_johab_mblen(const unsigned char *s)
  }
  
  /*
- * convert UTF-8 string to pg_wchar (UCS-2)
- * caller should allocate enough space for "to"
+ * convert UTF8 string to pg_wchar (UCS-4)
+ * caller must allocate enough space for "to", including a trailing zero!
   * len: length of from.
   * "from" not necessarily null terminated.
   */
  static int
  pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
  {
-       unsigned char c1,
+       int                     cnt = 0;
+       uint32          c1,
                                 c2,
                                 c3;
-       int                     cnt = 0;
  
         while (len > 0 && *from)
         {
@@ -286,26 +286,28 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
                         *to = *from++;
                         len--;
                 }
-               else if ((*from & 0xe0) == 0xc0 && len >= 2)
+               else if ((*from & 0xe0) == 0xc0)
                 {
+                       if (len < 2)
+                               break;                  /* drop trailing incomplete char */
                         c1 = *from++ & 0x1f;
                         c2 = *from++ & 0x3f;
-                       *to = c1 << 6;
-                       *to |= c2;
+                       *to = (c1 << 6) | c2;
                         len -= 2;
                 }
-               else if ((*from & 0xe0) == 0xe0 && len >= 3)
+               else if ((*from & 0xf0) == 0xe0)
                 {
+                       if (len < 3)
+                               break;                  /* drop trailing incomplete char */
                         c1 = *from++ & 0x0f;
                         c2 = *from++ & 0x3f;
                         c3 = *from++ & 0x3f;
-                       *to = c1 << 12;
-                       *to |= c2 << 6;
-                       *to |= c3;
+                       *to = (c1 << 12) | (c2 << 6) | c3;
                         len -= 3;
                 }
                 else
                 {
+                       /* treat a bogus char as length 1; not ours to raise error */
                         *to = *from++;
                         len--;
                 }
@@ -317,20 +319,38 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
  }
  
  /*
- * returns the byte length of a UTF-8 word pointed to by s
+ * Return the byte length of a UTF8 character pointed to by s
+ *
+ * Note: in the current implementation we do not support UTF8 sequences
+ * of more than 3 bytes; hence do NOT return a value larger than 3.
+ * We return "1" for any leading byte that is either flat-out illegal or
+ * indicates a length larger than we support.
+ *
+ * pg_utf2wchar_with_len(), utf2ucs(), pg_utf8_islegal(), and perhaps
+ * other places would need to be fixed to change this.
   */
  int
  pg_utf_mblen(const unsigned char *s)
  {
-       int                     len = 1;
+       int                     len;
  
         if ((*s & 0x80) == 0)
                 len = 1;
         else if ((*s & 0xe0) == 0xc0)
                 len = 2;
-       else if ((*s & 0xe0) == 0xe0)
+       else if ((*s & 0xf0) == 0xe0)
                 len = 3;
-       return (len);
+#ifdef NOT_USED
+       else if ((*s & 0xf8) == 0xf0)
+               len = 4;
+       else if ((*s & 0xfc) == 0xf8)
+               len = 5;
+       else if ((*s & 0xfe) == 0xfc)
+               len = 6;
+#endif
+       else
+               len = 1;
+       return len;
  }
  
  /*
author	Tom Lane <[email protected]>
	Wed, 24 Jan 2007 17:12:47 +0000 (17:12 +0000)
committer	Tom Lane <[email protected]>
	Wed, 24 Jan 2007 17:12:47 +0000 (17:12 +0000)