*** pgsql/src/backend/parser/scan.l	2009/07/14 20:24:10	1.157
--- pgsql/src/backend/parser/scan.l	2009/09/21 22:22:07	1.158
***************
*** 24,30 ****
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
!  *	  $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.156 2009/07/13 03:11:12 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
--- 24,30 ----
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
!  *	  $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.157 2009/07/14 20:24:10 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
*************** check_unicode_value(pg_wchar c, char *lo
*** 1097,1107 ****
--- 1097,1126 ----
  	}
  }
  
+ static bool
+ is_utf16_surrogate_first(pg_wchar c)
+ {
+ 	return !(c < 0xD800 || c > 0xDBFF);		/* high (leading) surrogate? */
+ }
+ 
+ static bool
+ is_utf16_surrogate_second(pg_wchar c)
+ {
+ 	return !(c < 0xDC00 || c > 0xDFFF);		/* low (trailing) surrogate? */
+ }
+ 
+ static pg_wchar
+ surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
+ {
+ 	return 0x10000 + (((first & 0x3FF) << 10) | (second & 0x3FF));	/* 10 payload bits from each half */
+ }
+ 
  static char *
  litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
  {
  	char *new;
  	char *litbuf, *in, *out;
+ 	pg_wchar pair_first = 0;
  
  	if (isxdigit(escape)
  		|| escape == '+'
*************** litbuf_udeescape(unsigned char escape, b
*** 1131,1136 ****
--- 1150,1160 ----
  		{
  			if (in[1] == escape)
  			{
+ 				if (pair_first)
+ 				{
+ 					ADVANCE_YYLLOC(in - litbuf + 3);   /* 3 for U&" */
+ 					yyerror("invalid Unicode surrogate pair");
+ 				}
  				*out++ = escape;
  				in += 2;
  			}
*************** litbuf_udeescape(unsigned char escape, b
*** 1138,1146 ****
  			{
  				pg_wchar unicode = hexval(in[1]) * 16*16*16 + hexval(in[2]) * 16*16 + hexval(in[3]) * 16 + hexval(in[4]);
  				check_unicode_value(unicode, in, yyscanner);
! 				unicode_to_utf8(unicode, (unsigned char *) out);
  				in += 5;
- 				out += pg_mblen(out);
  			}
  			else if (in[1] == '+'
  					 && isxdigit(in[2]) && isxdigit(in[3])
--- 1162,1188 ----
  			{
  				pg_wchar unicode = hexval(in[1]) * 16*16*16 + hexval(in[2]) * 16*16 + hexval(in[3]) * 16 + hexval(in[4]);
  				check_unicode_value(unicode, in, yyscanner);
! 				if (pair_first)
! 				{
! 					if (is_utf16_surrogate_second(unicode))
! 					{
! 						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
! 						pair_first = 0;
! 					}
! 					else
! 					{
! 						ADVANCE_YYLLOC(in - litbuf + 3);   /* 3 for U&" */
! 						yyerror("invalid Unicode surrogate pair");
! 					}
! 				}
! 				if (is_utf16_surrogate_first(unicode))
! 					pair_first = unicode;
! 				else
! 				{
! 					unicode_to_utf8(unicode, (unsigned char *) out);
! 					out += pg_mblen(out);
! 				}
  				in += 5;
  			}
  			else if (in[1] == '+'
  					 && isxdigit(in[2]) && isxdigit(in[3])
*************** litbuf_udeescape(unsigned char escape, b
*** 1150,1158 ****
  				pg_wchar unicode = hexval(in[2]) * 16*16*16*16*16 + hexval(in[3]) * 16*16*16*16 + hexval(in[4]) * 16*16*16
  									+ hexval(in[5]) * 16*16 + hexval(in[6]) * 16 + hexval(in[7]);
  				check_unicode_value(unicode, in, yyscanner);
! 				unicode_to_utf8(unicode, (unsigned char *) out);
  				in += 8;
- 				out += pg_mblen(out);
  			}
  			else
  			{
--- 1192,1218 ----
  				pg_wchar unicode = hexval(in[2]) * 16*16*16*16*16 + hexval(in[3]) * 16*16*16*16 + hexval(in[4]) * 16*16*16
  									+ hexval(in[5]) * 16*16 + hexval(in[6]) * 16 + hexval(in[7]);
  				check_unicode_value(unicode, in, yyscanner);
! 				if (pair_first)
! 				{
! 					if (is_utf16_surrogate_second(unicode))
! 					{
! 						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
! 						pair_first = 0;
! 					}
! 					else
! 					{
! 						ADVANCE_YYLLOC(in - litbuf + 3);   /* 3 for U&" */
! 						yyerror("invalid Unicode surrogate pair");
! 					}
! 				}
! 				if (is_utf16_surrogate_first(unicode))
! 					pair_first = unicode;
! 				else
! 				{
! 					unicode_to_utf8(unicode, (unsigned char *) out);
! 					out += pg_mblen(out);
! 				}
  				in += 8;
  			}
  			else
  			{
*************** litbuf_udeescape(unsigned char escape, b
*** 1161,1167 ****
--- 1221,1234 ----
  			}
  		}
  		else
+ 		{
+ 			if (pair_first)
+ 			{
+ 				ADVANCE_YYLLOC(in - litbuf + 3);   /* 3 for U&" */
+ 				yyerror("invalid Unicode surrogate pair");
+ 			}
  			*out++ = *in++;
+ 		}
  	}
  
  	*out = '\0';