*** pgsql/src/backend/parser/scan.l 2009/07/14 20:24:10 1.157 --- pgsql/src/backend/parser/scan.l 2009/09/21 22:22:07 1.158 *************** *** 24,30 **** * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION ! * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.156 2009/07/13 03:11:12 tgl Exp $ * *------------------------------------------------------------------------- */ --- 24,30 ---- * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION ! * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.157 2009/07/14 20:24:10 tgl Exp $ * *------------------------------------------------------------------------- */ *************** check_unicode_value(pg_wchar c, char *lo *** 1097,1107 **** --- 1097,1126 ---- } } + static bool + is_utf16_surrogate_first(pg_wchar c) + { + return (c >= 0xD800 && c <= 0xDBFF); + } + + static bool + is_utf16_surrogate_second(pg_wchar c) + { + return (c >= 0xDC00 && c <= 0xDFFF); + } + + static pg_wchar + surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second) + { + return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF); + } + static char * litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner) { char *new; char *litbuf, *in, *out; + pg_wchar pair_first = 0; if (isxdigit(escape) || escape == '+' *************** litbuf_udeescape(unsigned char escape, b *** 1131,1136 **** --- 1150,1160 ---- { if (in[1] == escape) { + if (pair_first) + { + ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ + yyerror("invalid Unicode surrogate pair"); + } *out++ = escape; in += 2; } *************** litbuf_udeescape(unsigned char escape, b *** 1138,1146 **** { pg_wchar unicode = hexval(in[1]) * 16*16*16 + hexval(in[2]) * 16*16 + hexval(in[3]) * 16 + hexval(in[4]); check_unicode_value(unicode, in, yyscanner); ! unicode_to_utf8(unicode, (unsigned char *) out); in += 5; - out += pg_mblen(out); } else if (in[1] == '+' && isxdigit(in[2]) && isxdigit(in[3]) --- 1162,1188 ---- { pg_wchar unicode = hexval(in[1]) * 16*16*16 + hexval(in[2]) * 16*16 + hexval(in[3]) * 16 + hexval(in[4]); check_unicode_value(unicode, in, yyscanner); ! if (pair_first) ! { ! if (is_utf16_surrogate_second(unicode)) ! { ! unicode = surrogate_pair_to_codepoint(pair_first, unicode); ! pair_first = 0; ! } ! else ! { ! ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ ! yyerror("invalid Unicode surrogate pair"); ! } ! } ! if (is_utf16_surrogate_first(unicode)) ! pair_first = unicode; ! else ! { ! unicode_to_utf8(unicode, (unsigned char *) out); ! out += pg_mblen(out); ! } in += 5; } else if (in[1] == '+' && isxdigit(in[2]) && isxdigit(in[3]) *************** litbuf_udeescape(unsigned char escape, b *** 1150,1158 **** pg_wchar unicode = hexval(in[2]) * 16*16*16*16*16 + hexval(in[3]) * 16*16*16*16 + hexval(in[4]) * 16*16*16 + hexval(in[5]) * 16*16 + hexval(in[6]) * 16 + hexval(in[7]); check_unicode_value(unicode, in, yyscanner); ! unicode_to_utf8(unicode, (unsigned char *) out); in += 8; - out += pg_mblen(out); } else { --- 1192,1218 ---- pg_wchar unicode = hexval(in[2]) * 16*16*16*16*16 + hexval(in[3]) * 16*16*16*16 + hexval(in[4]) * 16*16*16 + hexval(in[5]) * 16*16 + hexval(in[6]) * 16 + hexval(in[7]); check_unicode_value(unicode, in, yyscanner); ! if (pair_first) ! { ! if (is_utf16_surrogate_second(unicode)) ! { ! unicode = surrogate_pair_to_codepoint(pair_first, unicode); ! pair_first = 0; ! } ! else ! { ! ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ ! yyerror("invalid Unicode surrogate pair"); ! } ! } ! if (is_utf16_surrogate_first(unicode)) ! pair_first = unicode; ! else ! { ! unicode_to_utf8(unicode, (unsigned char *) out); ! out += pg_mblen(out); ! } in += 8; } else { *************** litbuf_udeescape(unsigned char escape, b *** 1161,1167 **** --- 1221,1234 ---- } } else + { + if (pair_first) + { + ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ + yyerror("invalid Unicode surrogate pair"); + } *out++ = *in++; + } } *out = '\0';