Add unistr function
author Peter Eisentraut <[email protected]>
Sun, 28 Mar 2021 06:16:15 +0000 (08:16 +0200)
committer Peter Eisentraut <[email protected]>
Mon, 29 Mar 2021 09:56:53 +0000 (11:56 +0200)
This allows decoding a string with Unicode escape sequences.  It is
similar to Unicode escape strings, but offers some more flexibility.

Author: Pavel Stehule <[email protected]>
Reviewed-by: Asif Rehman <[email protected]>
Discussion: https://www.postgresql.org/message-id/flat/CAFj8pRA5GnKT+gDVwbVRH2ep451H_myBt+NTz8RkYUARE9+qOQ@mail.gmail.com

doc/src/sgml/func.sgml
src/backend/utils/adt/varlena.c
src/include/catalog/catversion.h
src/include/catalog/pg_proc.dat
src/test/regress/expected/strings.out
src/test/regress/sql/strings.sql

index 19285ae1360199ea26610d772a2d35dc13e788ac..fbf6062d0a80aba1d85437861aa87f601ebbf840 100644 (file)
@@ -3551,6 +3551,52 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
        </para></entry>
       </row>
 
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>unistr</primary>
+        </indexterm>
+        <function>unistr</function> ( <type>text</type> )
+        <returnvalue>text</returnvalue>
+       </para>
+       <para>
+        Evaluate escaped Unicode characters in the argument.  Unicode characters
+        can be specified as
+        <literal>\<replaceable>XXXX</replaceable></literal> (4 hexadecimal
+        digits), <literal>\+<replaceable>XXXXXX</replaceable></literal> (6
+        hexadecimal digits),
+        <literal>\u<replaceable>XXXX</replaceable></literal> (4 hexadecimal
+        digits), or <literal>\U<replaceable>XXXXXXXX</replaceable></literal>
+        (8 hexadecimal digits).  To specify a backslash, write two
+        backslashes.  All other characters are taken literally.
+       </para>
+
+       <para>
+        If the server encoding is not UTF-8, the Unicode code point identified
+        by one of these escape sequences is converted to the actual server
+        encoding; an error is reported if that's not possible.
+       </para>
+
+       <para>
+        This function provides a (non-standard) alternative to string
+        constants with Unicode escapes (see <xref
+        linkend="sql-syntax-strings-uescape"/>).
+       </para>
+
+       <para>
+        <literal>unistr('\0441\043B\043E\043D')</literal>
+        <returnvalue>слон</returnvalue>
+       </para>
+       <para>
+        <literal>unistr('d\0061t\+000061')</literal>
+        <returnvalue>data</returnvalue>
+       </para>
+       <para>
+        <literal>unistr('d\u0061t\U00000061')</literal>
+        <returnvalue>data</returnvalue>
+       </para></entry>
+      </row>
+
      </tbody>
     </tgroup>
    </table>
index 640e3fd4c04b956cd18d56e5b3acdfdc37c579ab..efc74e8f2d70fb46ac57046eed88dfd3d8eb2003 100644 (file)
@@ -6380,3 +6380,213 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
 
    PG_RETURN_BOOL(result);
 }
+
+/*
+ * Check if first n chars are hexadecimal digits
+ *
+ * Returns true only if each of instr[0] .. instr[n-1] satisfies isxdigit();
+ * scanning stops at the first character that does not.  For n == 0 the loop
+ * body never runs and the result is true.  The function has no length
+ * information of its own, so the caller must make sure that n bytes are
+ * readable at instr.
+ */
+static bool
+isxdigits_n(const char *instr, size_t n)
+{
+   for (size_t i = 0; i < n; i++)
+       if (!isxdigit((unsigned char) instr[i]))    /* cast: never hand isxdigit() a negative char value */
+           return false;
+
+   return true;
+}
+
+static unsigned int
+hexval(unsigned char c)        /* numeric value (0..15) of one hexadecimal
+                                * digit; upper and lower case are accepted */
+{
+   if (c >= '0' && c <= '9')
+       return c - '0';
+   if (c >= 'a' && c <= 'f')
+       return c - 'a' + 0xA;
+   if (c >= 'A' && c <= 'F')
+       return c - 'A' + 0xA;
+   elog(ERROR, "invalid hexadecimal digit");   /* any other character is an error */
+   return 0;                   /* not reached */
+}
+
+/*
+ * Translate string with hexadecimal digits to number
+ *
+ * Interprets instr[0] .. instr[n-1] as a hexadecimal number with the most
+ * significant digit first and returns its value.  Every character goes
+ * through hexval(), which raises an error for a non-hexadecimal character.
+ * For n == 0 the result is 0.
+ *
+ * NOTE(review): the shift amount grows with n; this looks safe only while
+ * 4 * (n - 1) stays below the width of unsigned int (the calls visible in
+ * unistr() use n = 4, 6 and 8) -- confirm before using larger n.
+ */
+static unsigned int
+hexval_n(const char *instr, size_t n)
+{
+   unsigned int result = 0;
+
+   for (size_t i = 0; i < n; i++)
+       result += hexval(instr[i]) << (4 * (n - i - 1));    /* 4 bits for each digit that still follows */
+
+   return result;
+}
+
+/*
+ * Replaces Unicode escape sequences by Unicode characters
+ *
+ * Takes one text argument and returns text.  The input is scanned from left
+ * to right.  A backslash must introduce one of the following forms:
+ *   \\           a single literal backslash
+ *   \XXXX        code point given by 4 hexadecimal digits
+ *   \uXXXX       code point given by 4 hexadecimal digits
+ *   \+XXXXXX     code point given by 6 hexadecimal digits
+ *   \UXXXXXXXX   code point given by 8 hexadecimal digits
+ * Anything else after a backslash raises "invalid Unicode escape".  Bytes
+ * that are not part of an escape are copied to the output unchanged.
+ *
+ * In each branch the remaining length is tested before the digits are
+ * examined, so isxdigits_n() and hexval_n() are only applied to bytes that
+ * lie inside the input.
+ *
+ * Every decoded code point is checked with is_valid_unicode_codepoint() and
+ * then handed to pg_unicode_to_server(), whose output (in cbuf) is appended
+ * to the result.  A first UTF-16 surrogate is not output but remembered in
+ * pair_first; the very next item must then be an escape that yields a second
+ * surrogate, and the two are merged with surrogate_pair_to_codepoint().
+ * A first surrogate followed by anything else, a second surrogate without a
+ * preceding first one, or a first surrogate at the end of the input raises
+ * "invalid Unicode surrogate pair".
+ *
+ * NOTE(review): the three code-point branches repeat the same validation
+ * and surrogate handling; they differ only in the prefix and the number of
+ * digits consumed.
+ */
+Datum
+unistr(PG_FUNCTION_ARGS)
+{
+   text       *input_text = PG_GETARG_TEXT_PP(0);
+   char       *instr;          /* read position in the input */
+   int         len;            /* input bytes not yet consumed */
+   StringInfoData str;         /* output under construction */
+   text       *result;
+   pg_wchar    pair_first = 0; /* pending first surrogate, 0 if none */
+   char        cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];    /* filled by pg_unicode_to_server() */
+
+   instr = VARDATA_ANY(input_text);
+   len = VARSIZE_ANY_EXHDR(input_text);
+
+   initStringInfo(&str);
+
+   while (len > 0)
+   {
+       if (instr[0] == '\\')
+       {
+           if (len >= 2 &&
+               instr[1] == '\\')
+           {
+               if (pair_first) /* a pending surrogate needs a surrogate escape next */
+                   goto invalid_pair;
+               appendStringInfoChar(&str, '\\');
+               instr += 2;
+               len -= 2;
+           }
+           else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||
+                    (len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))
+           {
+               pg_wchar    unicode;
+               int         offset = instr[1] == 'u' ? 2 : 1;   /* digits start after the backslash and optional 'u' */
+
+               unicode = hexval_n(instr + offset, 4);
+
+               if (!is_valid_unicode_codepoint(unicode))
+                   ereport(ERROR,
+                           errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                           errmsg("invalid Unicode code point: %04X", unicode));
+
+               if (pair_first)
+               {
+                   if (is_utf16_surrogate_second(unicode))
+                   {
+                       unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+                       pair_first = 0; /* pair completed */
+                   }
+                   else
+                       goto invalid_pair;
+               }
+               else if (is_utf16_surrogate_second(unicode))    /* second surrogate with no first one pending */
+                   goto invalid_pair;
+
+               if (is_utf16_surrogate_first(unicode))
+                   pair_first = unicode;   /* hold back until the next escape is seen */
+               else
+               {
+                   pg_unicode_to_server(unicode, (unsigned char *) cbuf);
+                   appendStringInfoString(&str, cbuf);
+               }
+
+               instr += 4 + offset;    /* prefix plus 4 digits */
+               len -= 4 + offset;
+           }
+           else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
+           {
+               pg_wchar    unicode;
+
+               unicode = hexval_n(instr + 2, 6);
+
+               if (!is_valid_unicode_codepoint(unicode))
+                   ereport(ERROR,
+                           errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                           errmsg("invalid Unicode code point: %04X", unicode));
+
+               if (pair_first)
+               {
+                   if (is_utf16_surrogate_second(unicode))
+                   {
+                       unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+                       pair_first = 0;
+                   }
+                   else
+                       goto invalid_pair;
+               }
+               else if (is_utf16_surrogate_second(unicode))
+                   goto invalid_pair;
+
+               if (is_utf16_surrogate_first(unicode))
+                   pair_first = unicode;
+               else
+               {
+                   pg_unicode_to_server(unicode, (unsigned char *) cbuf);
+                   appendStringInfoString(&str, cbuf);
+               }
+
+               instr += 8;     /* backslash, '+', 6 digits */
+               len -= 8;
+           }
+           else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
+           {
+               pg_wchar    unicode;
+
+               unicode = hexval_n(instr + 2, 8);
+
+               if (!is_valid_unicode_codepoint(unicode))
+                   ereport(ERROR,
+                           errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                           errmsg("invalid Unicode code point: %04X", unicode));
+
+               if (pair_first)
+               {
+                   if (is_utf16_surrogate_second(unicode))
+                   {
+                       unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+                       pair_first = 0;
+                   }
+                   else
+                       goto invalid_pair;
+               }
+               else if (is_utf16_surrogate_second(unicode))
+                   goto invalid_pair;
+
+               if (is_utf16_surrogate_first(unicode))
+                   pair_first = unicode;
+               else
+               {
+                   pg_unicode_to_server(unicode, (unsigned char *) cbuf);
+                   appendStringInfoString(&str, cbuf);
+               }
+
+               instr += 10;    /* backslash, 'U', 8 digits */
+               len -= 10;
+           }
+           else
+               ereport(ERROR,
+                       (errcode(ERRCODE_SYNTAX_ERROR),
+                        errmsg("invalid Unicode escape"),
+                        errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX.")));
+       }
+       else
+       {
+           if (pair_first)     /* ordinary byte cannot complete a surrogate pair */
+               goto invalid_pair;
+
+           appendStringInfoChar(&str, *instr++);   /* copy the byte through unchanged */
+           len--;
+       }
+   }
+
+   /* unfinished surrogate pair? */
+   if (pair_first)
+       goto invalid_pair;
+
+   result = cstring_to_text_with_len(str.data, str.len);
+   pfree(str.data);
+
+   PG_RETURN_TEXT_P(result);
+
+invalid_pair:                  /* shared error exit for every surrogate problem */
+   ereport(ERROR,
+           (errcode(ERRCODE_SYNTAX_ERROR),
+            errmsg("invalid Unicode surrogate pair")));
+}
index 4a39da3c9d494d09e5e1d05bb722a3ebd3332f22..489f5be427fc4b3c5d6132339ea7b2a2990a49bd 100644 (file)
@@ -53,6 +53,6 @@
  */
 
 /*                         yyyymmddN */
-#define CATALOG_VERSION_NO 202103266
+#define CATALOG_VERSION_NO 202103291
 
 #endif
index cc7d90d2b0bd8c5ae126e3e1fdd9c567a7b7e58f..bfb89e0575dfeaa79e6a3fc5571b3c48fd484a74 100644 (file)
   proname => 'is_normalized', prorettype => 'bool', proargtypes => 'text text',
   prosrc => 'unicode_is_normalized' },
 
+{ oid => '9822', descr => 'unescape Unicode characters',
+  proname => 'unistr', prorettype => 'text', proargtypes => 'text',
+  prosrc => 'unistr' },
+
 { oid => '4596', descr => 'I/O',
   proname => 'brin_bloom_summary_in', prorettype => 'pg_brin_bloom_summary',
   proargtypes => 'cstring', prosrc => 'brin_bloom_summary_in' },
index afd84249c82d04b44c4cbe012681a47c4613414f..91aa8198045e72332dc4504862dddfd43b9f52a2 100644 (file)
@@ -2234,3 +2234,39 @@ SELECT bit_count('\x1234567890'::bytea);
         15
 (1 row)
 
+SELECT unistr('\0064at\+0000610');
+ unistr 
+--------
+ data0
+(1 row)
+
+SELECT unistr('d\u0061t\U000000610');
+ unistr 
+--------
+ data0
+(1 row)
+
+SELECT unistr('a\\b');
+ unistr 
+--------
+ a\b
+(1 row)
+
+-- errors:
+SELECT unistr('wrong: \db99');
+ERROR:  invalid Unicode surrogate pair
+SELECT unistr('wrong: \db99\0061');
+ERROR:  invalid Unicode surrogate pair
+SELECT unistr('wrong: \+00db99\+000061');
+ERROR:  invalid Unicode surrogate pair
+SELECT unistr('wrong: \+2FFFFF');
+ERROR:  invalid Unicode code point: 2FFFFF
+SELECT unistr('wrong: \udb99\u0061');
+ERROR:  invalid Unicode surrogate pair
+SELECT unistr('wrong: \U0000db99\U00000061');
+ERROR:  invalid Unicode surrogate pair
+SELECT unistr('wrong: \U002FFFFF');
+ERROR:  invalid Unicode code point: 2FFFFF
+SELECT unistr('wrong: \xyz');
+ERROR:  invalid Unicode escape
+HINT:  Unicode escapes must be \XXXX, \+XXXXXX, \uXXXX, or \UXXXXXXXX.
index 9aa1825f921b97e74c091d39298fc7da45c8807d..2c502534c2b679ebf8220e4374300a7cdc8b7e72 100644 (file)
@@ -746,3 +746,16 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8)
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape');
 
 SELECT bit_count('\x1234567890'::bytea);
+
+SELECT unistr('\0064at\+0000610');
+SELECT unistr('d\u0061t\U000000610');
+SELECT unistr('a\\b');
+-- errors:
+SELECT unistr('wrong: \db99');
+SELECT unistr('wrong: \db99\0061');
+SELECT unistr('wrong: \+00db99\+000061');
+SELECT unistr('wrong: \+2FFFFF');
+SELECT unistr('wrong: \udb99\u0061');
+SELECT unistr('wrong: \U0000db99\U00000061');
+SELECT unistr('wrong: \U002FFFFF');
+SELECT unistr('wrong: \xyz');