From 34e015f5ad3718d182139099b28917657df2c59e Mon Sep 17 00:00:00 2001 From: Steven Niu Date: Wed, 26 Mar 2025 14:43:43 +0800 Subject: [PATCH 1/2] Optimize function byteain() to avoid double scanning Optimized the function to eliminate the need for two scans, while preserving correctness and efficiency. Author: Steven Niu --- src/backend/utils/adt/varlena.c | 66 +++++++++++---------------------- 1 file changed, 22 insertions(+), 44 deletions(-) diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 3e4d5568bde8..f1f1efba0530 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -291,7 +291,6 @@ text_to_cstring_buffer(const text *src, char *dst, size_t dst_len) * ereport(ERROR, ...) if bad form. * * BUGS: - * The input is scanned twice. * The error checking of input is minimal. */ Datum @@ -302,6 +301,7 @@ byteain(PG_FUNCTION_ARGS) char *tp; char *rp; int bc; + size_t input_len; bytea *result; /* Recognize hex input */ @@ -318,45 +318,28 @@ byteain(PG_FUNCTION_ARGS) PG_RETURN_BYTEA_P(result); } - /* Else, it's the traditional escaped style */ - for (bc = 0, tp = inputText; *tp != '\0'; bc++) - { - if (tp[0] != '\\') - tp++; - else if ((tp[0] == '\\') && - (tp[1] >= '0' && tp[1] <= '3') && - (tp[2] >= '0' && tp[2] <= '7') && - (tp[3] >= '0' && tp[3] <= '7')) - tp += 4; - else if ((tp[0] == '\\') && - (tp[1] == '\\')) - tp += 2; - else - { - /* - * one backslash, not followed by another or ### valid octal - */ - ereturn(escontext, (Datum) 0, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", "bytea"))); - } - } - - bc += VARHDRSZ; - - result = (bytea *) palloc(bc); - SET_VARSIZE(result, bc); - - tp = inputText; + /* Handle traditional escaped style in a single pass */ + input_len = strlen(inputText); + result = palloc(input_len + VARHDRSZ); /* Allocate max possible size */ rp = VARDATA(result); + tp = inputText; + while (*tp != '\0') { if (tp[0] != '\\') + { *rp++ = *tp++; - else if ((tp[0] == '\\') && - (tp[1] >= '0' && tp[1] <= '3') && - (tp[2] >= '0' && tp[2] <= '7') && - (tp[3] >= '0' && tp[3] <= '7')) + continue; + } + + if (tp[1] == '\\') + { + *rp++ = '\\'; + tp += 2; + } + else if ((tp[1] >= '0' && tp[1] <= '3') && + (tp[2] >= '0' && tp[2] <= '7') && + (tp[3] >= '0' && tp[3] <= '7')) { bc = VAL(tp[1]); bc <<= 3; @@ -366,23 +349,18 @@ byteain(PG_FUNCTION_ARGS) tp += 4; } - else if ((tp[0] == '\\') && - (tp[1] == '\\')) - { - *rp++ = '\\'; - tp += 2; - } else { - /* - * We should never get here. The first pass should not allow it. - */ + /* Invalid escape sequence: report error */ ereturn(escontext, (Datum) 0, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("invalid input syntax for type %s", "bytea"))); } } + /* Set the actual size of the bytea */ + SET_VARSIZE(result, (rp - VARDATA(result)) + VARHDRSZ); + PG_RETURN_BYTEA_P(result); } From 12b320b6c544aa0936e5720d6d714b12bf715620 Mon Sep 17 00:00:00 2001 From: Stepan Neretin Date: Fri, 9 May 2025 17:36:28 +0700 Subject: [PATCH 2/2] Refactor byteain() to avoid double scanning and simplify logic This patch reworks the escaped input handling in byteain() by replacing manual buffer management with a StringInfo-based single-pass parse. It combines ideas from a previous proposal by Steven Niu with additional improvements to structure and readability. Also adds regression tests covering edge cases for both hex and escaped formats. Includes input from discussion with Kirill Reshke on pgsql-hackers. --- contrib/btree_gin/expected/bytea.out | 92 ++++++++++++++++++++++++ contrib/btree_gin/sql/bytea.sql | 37 ++++++++++ src/backend/utils/adt/varlena.c | 100 +++++++++++++-------------- 3 files changed, 178 insertions(+), 51 deletions(-) diff --git a/contrib/btree_gin/expected/bytea.out b/contrib/btree_gin/expected/bytea.out index b0ed7a53450a..d4ad28787759 100644 --- a/contrib/btree_gin/expected/bytea.out +++ b/contrib/btree_gin/expected/bytea.out @@ -44,3 +44,95 @@ SELECT * FROM test_bytea WHERE i>'abc'::bytea ORDER BY i; xyz (2 rows) +-- Simple ASCII strings +SELECT encode(bytea(E'a'), 'hex'); -- 61 + encode +-------- + 61 +(1 row) + +SELECT encode(bytea(E'ab'), 'hex'); -- 6162 + encode +-------- + 6162 +(1 row) + +-- Octal escapes +SELECT encode(bytea(E'\\000'), 'hex'); -- 00 + encode +-------- + 00 +(1 row) + +SELECT encode(bytea(E'\\001'), 'hex'); -- 01 + encode +-------- + 01 +(1 row) + +SELECT encode(bytea(E'\\001\\002\\003'), 'hex'); -- 010203 + encode +-------- + 010203 +(1 row) + +-- Mixed literal and escapes +SELECT encode(bytea(E'a\\000b\\134c'), 'hex'); -- 6100625c63 + encode +------------ + 6100625c63 +(1 row) + +-- Backslash literal +SELECT encode(bytea(E'\\\\'), 'hex'); -- 5c + encode +-------- + 5c +(1 row) + +-- Empty input +SELECT encode(bytea(E''), 'hex'); -- (empty string) + encode +-------- + +(1 row) + +-- Hex format +SELECT encode(bytea(E'\\x6869'), 'escape'); -- hi + encode +-------- + hi +(1 row) + +-- ===== Invalid bytea input tests ===== +-- Invalid octal escapes (less than 3 digits or out of range) +SELECT bytea(E'\\77'); -- ERROR +ERROR: invalid input syntax for type bytea +LINE 1: SELECT bytea(E'\\77'); + ^ +SELECT bytea(E'\\4'); -- ERROR +ERROR: invalid input syntax for type bytea +LINE 1: SELECT bytea(E'\\4'); + ^ +SELECT bytea(E'\\08'); -- ERROR +ERROR: invalid input syntax for type bytea +LINE 1: SELECT bytea(E'\\08'); + ^ +SELECT bytea(E'\\999'); -- ERROR +ERROR: invalid input syntax for type bytea +LINE 1: SELECT bytea(E'\\999'); + ^ +-- Invalid hex format +SELECT bytea(E'\\x1'); -- ERROR +ERROR: invalid hexadecimal data: odd number of digits +LINE 1: SELECT bytea(E'\\x1'); + ^ +SELECT bytea(E'\\xZZ'); -- ERROR +ERROR: invalid hexadecimal digit: "Z" +LINE 1: SELECT bytea(E'\\xZZ'); + ^ +-- Incomplete escape sequence +SELECT bytea(E'abc\\'); -- ERROR +ERROR: invalid input syntax for type bytea +LINE 1: SELECT bytea(E'abc\\'); + ^ diff --git a/contrib/btree_gin/sql/bytea.sql b/contrib/btree_gin/sql/bytea.sql index 5f3eb11b1698..cb8ee8eb2aae 100644 --- a/contrib/btree_gin/sql/bytea.sql +++ b/contrib/btree_gin/sql/bytea.sql @@ -15,3 +15,40 @@ SELECT * FROM test_bytea WHERE i<='abc'::bytea ORDER BY i; SELECT * FROM test_bytea WHERE i='abc'::bytea ORDER BY i; SELECT * FROM test_bytea WHERE i>='abc'::bytea ORDER BY i; SELECT * FROM test_bytea WHERE i>'abc'::bytea ORDER BY i; + + +-- Simple ASCII strings +SELECT encode(bytea(E'a'), 'hex'); -- 61 +SELECT encode(bytea(E'ab'), 'hex'); -- 6162 + +-- Octal escapes +SELECT encode(bytea(E'\\000'), 'hex'); -- 00 +SELECT encode(bytea(E'\\001'), 'hex'); -- 01 +SELECT encode(bytea(E'\\001\\002\\003'), 'hex'); -- 010203 + +-- Mixed literal and escapes +SELECT encode(bytea(E'a\\000b\\134c'), 'hex'); -- 6100625c63 + +-- Backslash literal +SELECT encode(bytea(E'\\\\'), 'hex'); -- 5c + +-- Empty input +SELECT encode(bytea(E''), 'hex'); -- (empty string) + +-- Hex format +SELECT encode(bytea(E'\\x6869'), 'escape'); -- hi + +-- ===== Invalid bytea input tests ===== + +-- Invalid octal escapes (less than 3 digits or out of range) +SELECT bytea(E'\\77'); -- ERROR +SELECT bytea(E'\\4'); -- ERROR +SELECT bytea(E'\\08'); -- ERROR +SELECT bytea(E'\\999'); -- ERROR + +-- Invalid hex format +SELECT bytea(E'\\x1'); -- ERROR +SELECT bytea(E'\\xZZ'); -- ERROR + +-- Incomplete escape sequence +SELECT bytea(E'abc\\'); -- ERROR \ No newline at end of file diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index f1f1efba0530..517965445feb 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -296,72 +296,70 @@ text_to_cstring_buffer(const text *src, char *dst, size_t dst_len) Datum byteain(PG_FUNCTION_ARGS) { - char *inputText = PG_GETARG_CSTRING(0); - Node *escontext = fcinfo->context; - char *tp; - char *rp; - int bc; - size_t input_len; - bytea *result; + char *inputText = PG_GETARG_CSTRING(0); + Node *escontext = fcinfo->context; - /* Recognize hex input */ + /* Hex format */ if (inputText[0] == '\\' && inputText[1] == 'x') { - size_t len = strlen(inputText); + size_t len; + int bc; + bytea *result; - bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */ + len = strlen(inputText); + bc = (len - 2) / 2 + VARHDRSZ; result = palloc(bc); - bc = hex_decode_safe(inputText + 2, len - 2, VARDATA(result), - escontext); - SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */ - + bc = hex_decode_safe(inputText + 2, len - 2, VARDATA(result), escontext); + SET_VARSIZE(result, bc + VARHDRSZ); PG_RETURN_BYTEA_P(result); } - /* Handle traditional escaped style in a single pass */ - input_len = strlen(inputText); - result = palloc(input_len + VARHDRSZ); /* Allocate max possible size */ - rp = VARDATA(result); - tp = inputText; - - while (*tp != '\0') + /* Escaped format */ { - if (tp[0] != '\\') - { - *rp++ = *tp++; - continue; - } + StringInfoData buf; + char *tp; + bytea *result; - if (tp[1] == '\\') - { - *rp++ = '\\'; - tp += 2; - } - else if ((tp[1] >= '0' && tp[1] <= '3') && - (tp[2] >= '0' && tp[2] <= '7') && - (tp[3] >= '0' && tp[3] <= '7')) - { - bc = VAL(tp[1]); - bc <<= 3; - bc += VAL(tp[2]); - bc <<= 3; - *rp++ = bc + VAL(tp[3]); + initStringInfo(&buf); + tp = inputText; - tp += 4; - } - else + while (*tp) { - /* Invalid escape sequence: report error */ - ereturn(escontext, (Datum) 0, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", "bytea"))); + if (*tp != '\\') + { + appendStringInfoChar(&buf, *tp++); + continue; + } + + if (tp[1] == '\\') + { + appendStringInfoChar(&buf, '\\'); + tp += 2; + } + else if ((tp[1] >= '0' && tp[1] <= '3') && + (tp[2] >= '0' && tp[2] <= '7') && + (tp[3] >= '0' && tp[3] <= '7')) + { + int byte_val = VAL(tp[1]); + byte_val = (byte_val << 3) + VAL(tp[2]); + byte_val = (byte_val << 3) + VAL(tp[3]); + appendStringInfoChar(&buf, byte_val); + tp += 4; + } + else + { + ereturn(escontext, (Datum) 0, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type %s", "bytea"))); + } } - } - /* Set the actual size of the bytea */ - SET_VARSIZE(result, (rp - VARDATA(result)) + VARHDRSZ); + result = palloc(buf.len + VARHDRSZ); + SET_VARSIZE(result, buf.len + VARHDRSZ); + memcpy(VARDATA(result), buf.data, buf.len); - PG_RETURN_BYTEA_P(result); + PG_RETURN_BYTEA_P(result); + } } /*