*/
typedef struct
{
+ pg_locale_t locale; /* collation used for substring matching */
bool is_multibyte_char_in_char; /* need to check char boundaries? */
+ bool greedy; /* find longest possible substring? */
char *str1; /* haystack string */
char *str2; /* needle string */
int skiptablemask; /* mask for ANDing with skiptable subscripts */
int skiptable[256]; /* skip distance for given mismatched char */
+ /*
+ * Note that with nondeterministic collations, the length of the last
+ * match is not necessarily equal to the length of the "needle" passed in.
+ */
char *last_match; /* pointer to last match in 'str1' */
+ int last_match_len; /* length of last match */
+ int last_match_len_tmp; /* same but for internal use */
/*
* Sometimes we need to convert the byte position of a match to a
TextPositionState state;
int result;
+ check_collation_set(collid);
+
/* Empty needle always matches at position 1 */
if (VARSIZE_ANY_EXHDR(t2) < 1)
return 1;
/* Otherwise, can't match if haystack is shorter than needle */
- if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2))
+ if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2) &&
+ pg_newlocale_from_collation(collid)->deterministic)
return 0;
text_position_setup(t1, t2, collid, &state);
+ /* don't need greedy mode here */
+ state.greedy = false;
+
if (!text_position_next(&state))
result = 0;
else
{
int len1 = VARSIZE_ANY_EXHDR(t1);
int len2 = VARSIZE_ANY_EXHDR(t2);
- pg_locale_t mylocale;
check_collation_set(collid);
- mylocale = pg_newlocale_from_collation(collid);
+ state->locale = pg_newlocale_from_collation(collid);
- if (!mylocale->deterministic)
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("nondeterministic collations are not supported for substring searches")));
+ /*
+ * Most callers need greedy mode, but some might want to unset this to
+ * optimize.
+ */
+ state->greedy = true;
- Assert(len1 > 0);
Assert(len2 > 0);
/*
* point in wasting cycles initializing the table. We also choose not to
* use B-M-H for needles of length 1, since the skip table can't possibly
* save anything in that case.
+ *
+ * (With nondeterministic collations, B-M-H is not used at all: the search
+ * walks the haystack one character at a time, so no skip table is needed.)
*/
- if (len1 >= len2 && len2 > 1)
+ if (len1 >= len2 && len2 > 1 && state->locale->deterministic)
{
int searchlength = len1 - len2;
int skiptablemask;
/* Start from the point right after the previous match. */
if (state->last_match)
- start_ptr = state->last_match + needle_len;
+ start_ptr = state->last_match + state->last_match_len;
else
start_ptr = state->str1;
* multi-byte character, we need to verify that the match was at a
* character boundary, not in the middle of a multi-byte character.
*/
- if (state->is_multibyte_char_in_char)
+ if (state->is_multibyte_char_in_char && state->locale->deterministic)
{
/* Walk one character at a time, until we reach the match. */
}
state->last_match = matchptr;
+ state->last_match_len = state->last_match_len_tmp;
return true;
}
Assert(start_ptr >= haystack && start_ptr <= haystack_end);
- if (needle_len == 1)
+ state->last_match_len_tmp = needle_len;
+
+ if (!state->locale->deterministic)
+ {
+     /*
+      * With a nondeterministic collation, we have to use an unoptimized
+      * route. We walk through the haystack and see if at each position
+      * there is a substring of the remaining string that is equal to the
+      * needle under the given collation.
+      *
+      * Note, the found substring could have a different length than the
+      * needle, including being empty. Callers that want to skip over the
+      * found string need to read the length of the found substring from
+      * last_match_len rather than just using the length of their needle.
+      *
+      * Most callers will require "greedy" semantics, meaning that we need
+      * to find the longest such substring, not the shortest. For callers
+      * that don't need greedy semantics, we can finish on the first match.
+      *
+      * Returns a pointer to the start of the match, or NULL if there is
+      * none. The match length is left in state->last_match_len_tmp.
+      */
+     const char *result_hptr = NULL;
+
+     hptr = start_ptr;
+     while (hptr < haystack_end)
+     {
+         const char *test_end;
+
+         /*
+          * First check the common case that there is a match in the
+          * haystack of exactly the length of the needle. On this path
+          * last_match_len_tmp still holds needle_len, as set before this
+          * branch was entered.
+          */
+         if (!state->greedy &&
+             haystack_end - hptr >= needle_len &&
+             pg_strncoll(hptr, needle_len, needle, needle_len, state->locale) == 0)
+             return (char *) hptr;
+
+         /*
+          * Else check if any of the possible substrings starting at hptr
+          * are equal to the needle.
+          *
+          * Every candidate end position must be tried, from hptr itself
+          * (the empty substring) up to AND INCLUDING haystack_end.
+          * Stopping the loop at "test_end < haystack_end" would never test
+          * the substring that reaches the end of the haystack, so a match
+          * at the very end would be missed, or truncated in greedy mode.
+          * The end-of-haystack test comes before the advance so that
+          * pg_mblen() is never called on the byte past the haystack.
+          */
+         test_end = hptr;
+         for (;;)
+         {
+             if (pg_strncoll(hptr, (test_end - hptr), needle, needle_len, state->locale) == 0)
+             {
+                 state->last_match_len_tmp = (test_end - hptr);
+                 result_hptr = hptr;
+                 /* non-greedy callers are happy with the shortest match */
+                 if (!state->greedy)
+                     break;
+             }
+
+             if (test_end >= haystack_end)
+                 break;
+             test_end += pg_mblen(test_end);
+         }
+         if (result_hptr)
+             break;
+
+         /* no match starting here; move to the next character */
+         hptr += pg_mblen(hptr);
+     }
+
+     return (char *) result_hptr;
+ }
+ else if (needle_len == 1)
{
/* No point in using B-M-H for a one-character needle */
char nchar = *needle;
appendStringInfoText(&str, to_sub_text);
- start_ptr = curr_ptr + from_sub_text_len;
+ start_ptr = curr_ptr + state.last_match_len;
found = text_position_next(&state);
if (found)
/* special case of last field does not require an extra pass */
if (fldnum == -1)
{
- start_ptr = text_position_get_match_ptr(&state) + fldsep_len;
+ start_ptr = text_position_get_match_ptr(&state) + state.last_match_len;
end_ptr = VARDATA_ANY(inputstring) + inputstring_len;
text_position_cleanup(&state);
PG_RETURN_TEXT_P(cstring_to_text_with_len(start_ptr,
while (found && --fldnum > 0)
{
/* identify bounds of next field */
- start_ptr = end_ptr + fldsep_len;
+ start_ptr = end_ptr + state.last_match_len;
found = text_position_next(&state);
if (found)
end_ptr = text_position_get_match_ptr(&state);
if (!found)
break;
- start_ptr = end_ptr + fldsep_len;
+ start_ptr = end_ptr + state.last_match_len;
}
text_position_cleanup(&state);
CREATE TABLE test6 (a int, b text);
-- same string in different normal forms
-INSERT INTO test6 VALUES (1, U&'\00E4bc');
-INSERT INTO test6 VALUES (2, U&'\0061\0308bc');
+INSERT INTO test6 VALUES (1, U&'zy\00E4bc');
+INSERT INTO test6 VALUES (2, U&'zy\0061\0308bc');
SELECT * FROM test6;
- a | b
----+-----
- 1 | äbc
- 2 | äbc
+ a | b
+---+-------
+ 1 | zyäbc
+ 2 | zyäbc
(2 rows)
-SELECT * FROM test6 WHERE b = 'äbc' COLLATE ctest_det;
- a | b
----+-----
- 1 | äbc
+SELECT * FROM test6 WHERE b = 'zyäbc' COLLATE ctest_det;
+ a | b
+---+-------
+ 1 | zyäbc
(1 row)
-SELECT * FROM test6 WHERE b = 'äbc' COLLATE ctest_nondet;
- a | b
----+-----
- 1 | äbc
- 2 | äbc
+SELECT * FROM test6 WHERE b = 'zyäbc' COLLATE ctest_nondet;
+ a | b
+---+-------
+ 1 | zyäbc
+ 2 | zyäbc
(2 rows)
-SELECT * FROM test6 WHERE b LIKE 'äbc' COLLATE ctest_det;
- a | b
----+-----
- 1 | äbc
+SELECT strpos(b COLLATE ctest_det, 'bc') FROM test6;
+ strpos
+--------
+ 4
+ 5
+(2 rows)
+
+SELECT strpos(b COLLATE ctest_nondet, 'bc') FROM test6;
+ strpos
+--------
+ 4
+ 5
+(2 rows)
+
+SELECT replace(b COLLATE ctest_det, U&'\00E4b', 'X') FROM test6;
+ replace
+---------
+ zyXc
+ zyäbc
+(2 rows)
+
+SELECT replace(b COLLATE ctest_nondet, U&'\00E4b', 'X') FROM test6;
+ replace
+---------
+ zyXc
+ zyXc
+(2 rows)
+
+SELECT a, split_part(b COLLATE ctest_det, U&'\00E4b', 2) FROM test6;
+ a | split_part
+---+------------
+ 1 | c
+ 2 |
+(2 rows)
+
+SELECT a, split_part(b COLLATE ctest_nondet, U&'\00E4b', 2) FROM test6;
+ a | split_part
+---+------------
+ 1 | c
+ 2 | c
+(2 rows)
+
+SELECT a, split_part(b COLLATE ctest_det, U&'\00E4b', -1) FROM test6;
+ a | split_part
+---+------------
+ 1 | c
+ 2 | zyäbc
+(2 rows)
+
+SELECT a, split_part(b COLLATE ctest_nondet, U&'\00E4b', -1) FROM test6;
+ a | split_part
+---+------------
+ 1 | c
+ 2 | c
+(2 rows)
+
+SELECT a, string_to_array(b COLLATE ctest_det, U&'\00E4b') FROM test6;
+ a | string_to_array
+---+-----------------
+ 1 | {zy,c}
+ 2 | {zyäbc}
+(2 rows)
+
+SELECT a, string_to_array(b COLLATE ctest_nondet, U&'\00E4b') FROM test6;
+ a | string_to_array
+---+-----------------
+ 1 | {zy,c}
+ 2 | {zy,c}
+(2 rows)
+
+SELECT * FROM test6 WHERE b LIKE 'zyäbc' COLLATE ctest_det;
+ a | b
+---+-------
+ 1 | zyäbc
(1 row)
-SELECT * FROM test6 WHERE b LIKE 'äbc' COLLATE ctest_nondet;
- a | b
----+-----
- 1 | äbc
- 2 | äbc
+SELECT * FROM test6 WHERE b LIKE 'zyäbc' COLLATE ctest_nondet;
+ a | b
+---+-------
+ 1 | zyäbc
+ 2 | zyäbc
(2 rows)
-- same with arrays
ERROR: could not create unique index "test3ci_x_idx"
DETAIL: Key (x)=(abc) is duplicated.
SELECT string_to_array('ABC,DEF,GHI' COLLATE case_insensitive, ',', 'abc');
-ERROR: nondeterministic collations are not supported for substring searches
+ string_to_array
+-----------------
+ {NULL,DEF,GHI}
+(1 row)
+
SELECT string_to_array('ABCDEFGHI' COLLATE case_insensitive, NULL, 'b');
string_to_array
------------------------
ERROR: could not create unique index "test3bpci_x_idx"
DETAIL: Key (x)=(abc) is duplicated.
SELECT string_to_array('ABC,DEF,GHI'::char(11) COLLATE case_insensitive, ',', 'abc');
-ERROR: nondeterministic collations are not supported for substring searches
+ string_to_array
+-----------------
+ {NULL,DEF,GHI}
+(1 row)
+
SELECT string_to_array('ABCDEFGHI'::char(9) COLLATE case_insensitive, NULL, 'b');
string_to_array
------------------------
1 | cote
(1 row)
+CREATE TABLE test4nfd (a int, b text);
+INSERT INTO test4nfd VALUES (1, 'cote'), (2, 'côte'), (3, 'coté'), (4, 'côté');
+UPDATE test4nfd SET b = normalize(b, nfd);
+-- This shows why replace should be greedy. Otherwise, in the NFD
+-- case, the match would stop before the decomposed accents, which
+-- would leave the accents in the results.
+SELECT a, b, replace(b COLLATE ignore_accents, 'co', 'ma') FROM test4;
+ a | b | replace
+---+------+---------
+ 1 | cote | mate
+ 2 | côte | mate
+ 3 | coté | maté
+ 4 | côté | maté
+(4 rows)
+
+SELECT a, b, replace(b COLLATE ignore_accents, 'co', 'ma') FROM test4nfd;
+ a | b | replace
+---+------+---------
+ 1 | cote | mate
+ 2 | côte | mate
+ 3 | coté | maté
+ 4 | côté | maté
+(4 rows)
+
-- This is a tricky one. A naive implementation would first test
-- \00E4 matches \0061, which is true under ignore_accents, but then
-- the rest of the string won't match anymore. Therefore, the
CREATE TABLE test6 (a int, b text);
-- same string in different normal forms
-INSERT INTO test6 VALUES (1, U&'\00E4bc');
-INSERT INTO test6 VALUES (2, U&'\0061\0308bc');
+INSERT INTO test6 VALUES (1, U&'zy\00E4bc');
+INSERT INTO test6 VALUES (2, U&'zy\0061\0308bc');
SELECT * FROM test6;
-SELECT * FROM test6 WHERE b = 'äbc' COLLATE ctest_det;
-SELECT * FROM test6 WHERE b = 'äbc' COLLATE ctest_nondet;
+SELECT * FROM test6 WHERE b = 'zyäbc' COLLATE ctest_det;
+SELECT * FROM test6 WHERE b = 'zyäbc' COLLATE ctest_nondet;
-SELECT * FROM test6 WHERE b LIKE 'äbc' COLLATE ctest_det;
-SELECT * FROM test6 WHERE b LIKE 'äbc' COLLATE ctest_nondet;
+SELECT strpos(b COLLATE ctest_det, 'bc') FROM test6;
+SELECT strpos(b COLLATE ctest_nondet, 'bc') FROM test6;
+
+SELECT replace(b COLLATE ctest_det, U&'\00E4b', 'X') FROM test6;
+SELECT replace(b COLLATE ctest_nondet, U&'\00E4b', 'X') FROM test6;
+
+SELECT a, split_part(b COLLATE ctest_det, U&'\00E4b', 2) FROM test6;
+SELECT a, split_part(b COLLATE ctest_nondet, U&'\00E4b', 2) FROM test6;
+SELECT a, split_part(b COLLATE ctest_det, U&'\00E4b', -1) FROM test6;
+SELECT a, split_part(b COLLATE ctest_nondet, U&'\00E4b', -1) FROM test6;
+
+SELECT a, string_to_array(b COLLATE ctest_det, U&'\00E4b') FROM test6;
+SELECT a, string_to_array(b COLLATE ctest_nondet, U&'\00E4b') FROM test6;
+
+SELECT * FROM test6 WHERE b LIKE 'zyäbc' COLLATE ctest_det;
+SELECT * FROM test6 WHERE b LIKE 'zyäbc' COLLATE ctest_nondet;
-- same with arrays
CREATE TABLE test6a (a int, b text[]);
SELECT * FROM test4 WHERE b = 'Cote' COLLATE ignore_accents; -- still case-sensitive
SELECT * FROM test4 WHERE b = 'Cote' COLLATE case_insensitive;
+CREATE TABLE test4nfd (a int, b text);
+INSERT INTO test4nfd VALUES (1, 'cote'), (2, 'côte'), (3, 'coté'), (4, 'côté');
+UPDATE test4nfd SET b = normalize(b, nfd);
+
+-- This shows why replace should be greedy. Otherwise, in the NFD
+-- case, the match would stop before the decomposed accents, which
+-- would leave the accents in the results.
+SELECT a, b, replace(b COLLATE ignore_accents, 'co', 'ma') FROM test4;
+SELECT a, b, replace(b COLLATE ignore_accents, 'co', 'ma') FROM test4nfd;
+
-- This is a tricky one. A naive implementation would first test
-- \00E4 matches \0061, which is true under ignore_accents, but then
-- the rest of the string won't match anymore. Therefore, the