diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000..5ad68edc33 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,11 @@ +# Don't send some content to the Docker host when building +img +travis +.git +.travis.yml + +*.gcno +*.gcda +*.gcov +*.so +*.o diff --git a/.gitignore b/.gitignore index dfc31f487a..a64cea1abf 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ results __pycache__ *.pyc +rum--*.sql tmp_install log diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000000..0c21a422c2 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,37 @@ +os: linux + +dist: jammy + +language: c + +services: + - docker + +before_install: + - cp travis/* . + +install: + - ./mk_dockerfile.sh + - docker-compose build + +script: + - docker-compose run $(bash <(curl -s https://codecov.io/env)) tests + +notifications: + email: + on_success: change + on_failure: always + +env: + - PG_VERSION=17 + - PG_VERSION=17 LEVEL=hardcore + - PG_VERSION=16 + - PG_VERSION=16 LEVEL=hardcore + - PG_VERSION=15 + - PG_VERSION=15 LEVEL=hardcore + - PG_VERSION=14 + - PG_VERSION=14 LEVEL=hardcore + - PG_VERSION=13 + - PG_VERSION=13 LEVEL=hardcore + - PG_VERSION=12 + - PG_VERSION=12 LEVEL=hardcore diff --git a/LICENSE b/LICENSE index d73dbdb454..a51596793f 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ RUM is released under the PostgreSQL License, a liberal Open Source license, similar to the BSD or MIT licenses. -Copyright (c) 2015-2017, Postgres Professional +Portions Copyright (c) 2015-2024, Postgres Professional Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group Portions Copyright (c) 1994, The Regents of the University of California @@ -8,4 +8,4 @@ Permission to use, copy, modify, and distribute this software and its documentat IN NO EVENT SHALL POSTGRES PROFESSIONAL BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF POSTGRES PROFESSIONAL HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -POSTGRES PROFESSIONAL SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND POSTGRES PROFESSIONAL HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. \ No newline at end of file +POSTGRES PROFESSIONAL SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND POSTGRES PROFESSIONAL HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. diff --git a/Makefile b/Makefile index 19476e53af..a8d510019d 100644 --- a/Makefile +++ b/Makefile @@ -11,28 +11,39 @@ OBJS = src/rumsort.o src/rum_ts_utils.o src/rumtsquery.o \ src/rumscan.o src/rumutil.o src/rumvacuum.o src/rumvalidate.o \ src/btree_rum.o src/rum_arr_utils.o $(WIN32RES) -DATA_first = rum--1.0.sql DATA_updates = rum--1.0--1.1.sql rum--1.1--1.2.sql \ rum--1.2--1.3.sql -DATA = $(DATA_first) rum--$(EXTVERSION).sql $(DATA_updates) - -# Do not use DATA_built. 
It removes built files if clean target was used -SQL_built = rum--$(EXTVERSION).sql $(DATA_updates) +DATA_built = $(EXTENSION)--$(EXTVERSION).sql INCLUDES = rum.h rumsort.h RELATIVE_INCLUDES = $(addprefix src/, $(INCLUDES)) LDFLAGS_SL += $(filter -lm, $(LIBS)) -REGRESS = rum rum_validate rum_hash ruminv timestamp orderby orderby_hash \ +REGRESS = security rum rum_validate rum_hash ruminv timestamp orderby orderby_hash \ altorder altorder_hash limits \ int2 int4 int8 float4 float8 money oid \ - time timetz date interval \ - macaddr inet cidr text varchar char bytea bit varbit \ - numeric + time timetz date interval \ + macaddr inet cidr text varchar char bytea bit varbit \ + numeric rum_weight expr + +TAP_TESTS = 1 + +ISOLATION = predicate-rum predicate-rum-2 +ISOLATION_OPTS = --load-extension=rum +EXTRA_CLEAN = pglist_tmp ifdef USE_PGXS + +# We cannot run isolation test for versions 12,13 in PGXS case +# because 'pg_isolation_regress' is not copied to install +# directory, see src/test/isolation/Makefile +ifeq ($(MAJORVERSION),$(filter 12% 13%,$(MAJORVERSION))) +undefine ISOLATION +undefine ISOLATION_OPTS +endif + PG_CONFIG = pg_config PGXS := $(shell $(PG_CONFIG) --pgxs) include $(PGXS) @@ -43,24 +54,27 @@ include $(top_builddir)/src/Makefile.global include $(top_srcdir)/contrib/contrib-global.mk endif +$(EXTENSION)--$(EXTVERSION).sql: rum_init.sql + cat $^ > $@ + ifeq ($(MAJORVERSION), 9.6) # arrays are not supported on 9.6 else REGRESS += array endif +# For 9.6-11 we have to make specific target with tap tests +ifeq ($(MAJORVERSION), $(filter 9.6% 10% 11%, $(MAJORVERSION))) wal-check: temp-install $(prove_check) -all: $(SQL_built) - -#9.6 requires 1.3 file but 10.0 could live with update files -rum--$(EXTVERSION).sql: $(DATA_first) $(DATA_updates) - cat $(DATA_first) $(DATA_updates) > rum--$(EXTVERSION).sql +check: wal-check +endif -# rule for updates, e.g. rum--1.0--1.1.sql -rum--%.sql: gen_rum_sql--%.pl - perl $< > $@ +# +# Make conditional targets to save backward compatibility with PG11, PG10 and PG9.6. +# +ifeq ($(MAJORVERSION), $(filter 9.6% 10% 11%, $(MAJORVERSION))) install: installincludes @@ -83,5 +97,6 @@ submake-rum: isolationcheck: | submake-isolation submake-rum temp-install $(pg_isolation_regress_check) \ - --temp-config $(top_srcdir)/contrib/rum/logical.conf \ + --temp-config $(top_srcdir)/contrib/rum/logical.conf \ $(ISOLATIONCHECKS) +endif \ No newline at end of file diff --git a/README.md b/README.md index f08c76e5f6..b6fb08420c 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,4 @@ +[![Build Status](https://api.travis-ci.com/postgrespro/rum.svg?branch=master)](https://travis-ci.com/postgrespro/rum) [![PGXN version](https://badge.fury.io/pg/rum.svg)](https://badge.fury.io/pg/rum) [![GitHub license](https://img.shields.io/badge/license-PostgreSQL-blue.svg)](https://raw.githubusercontent.com/postgrespro/rum/master/LICENSE) @@ -7,38 +8,38 @@ ## Introduction -The **rum** module provides access method to work with `RUM` index. It is based -on the `GIN` access methods code. +The **rum** module provides an access method to work with a `RUM` index. It is based +on the `GIN` access method's code. -`GIN` index allows to perform fast full text search using `tsvector` and -`tsquery` types. But full text search with GIN index has several problems: +A `GIN` index allows performing fast full-text search using `tsvector` and +`tsquery` types. 
But full-text search with a GIN index has several problems: -- Slow ranking. It is need position information about lexems to ranking. `GIN` -index doesn't store positions of lexems. So after index scan we need additional -heap scan to retreive lexems positions. -- Slow phrase search with `GIN` index. This problem relates with previous -problem. It is need position information to perform phrase search. -- Slow ordering by timestamp. `GIN` index can't store some related information -in index with lexemes. So it is necessary to perform additional heap scan. +- Slow ranking. It needs positional information about lexemes to do ranking. A `GIN` +index doesn't store positions of lexemes. So after index scanning, we need an +additional heap scan to retrieve lexeme positions. +- Slow phrase search with a `GIN` index. This problem relates to the previous +problem. It needs positional information to perform phrase search. +- Slow ordering by timestamp. A `GIN` index can't store some related information +in the index with lexemes. So it is necessary to perform an additional heap scan. -`RUM` solves this problems by storing additional information in posting tree. +`RUM` solves these problems by storing additional information in a posting tree. For example, positional information of lexemes or timestamps. You can get an -idea of `RUM` by the following picture: +idea of `RUM` with the following diagram: ![How RUM stores additional information](img/gin_rum.png) -Drawback of `RUM` is that it has slower build and insert time than `GIN`. -It is because we need to store additional information besides keys and because -`RUM` uses generic WAL records. +A drawback of `RUM` is that it has slower build and insert times than `GIN`. +This is because we need to store additional information besides keys and because +`RUM` uses generic Write-Ahead Log (WAL) records. ## License -This module available under the same license as +This module is available under a [license](LICENSE) similar to [PostgreSQL](http://www.postgresql.org/about/licence/). ## Installation -Before build and install **rum** you should ensure following: +Before building and installing **rum**, you should ensure the following: * PostgreSQL version is 9.6+. @@ -59,9 +60,29 @@ Typical installation procedure may look like this: > **Important:** Don't forget to set the `PG_CONFIG` variable in case you want to test `RUM` on a custom build of PostgreSQL. Read more [here](https://wiki.postgresql.org/wiki/Building_and_Installing_PostgreSQL_Extension_Modules). +## Tests + +$ make check + +This command runs: +- regression tests; +- isolation tests; +- TAP tests. + + One of the TAP tests downloads a 1GB archive and then unpacks it + into a file of almost 3GB. It is disabled by default. + + To run this test, you need to set an environment variable: + + $ export PG_TEST_EXTRA=big_values + + To disable it again: + + $ export -n PG_TEST_EXTRA + ## Common operators and functions -**rum** module provides next operators. +The **rum** module provides the following operators. | Operator | Returns | Description | -------------------- | ------- | ---------------------------------------------- | timestamp <=| timestamp | float8 | Returns distance only for left timestamps. | timestamp |=> timestamp | float8 | Returns distance only for right timestamps.
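+For example, given a table such as `tsts (id int, t tsvector, d timestamp)` from the
+`rum_tsvector_addon_ops` example below, the distance operators can be used directly in
+queries (a minimal sketch; the returned distances depend on the stored data):
+
+```sql
+-- Nearest timestamps to the given point in time, smallest distance first
+SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts
+    ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5;
+
+-- <=| considers only timestamps to the left of the argument, |=> only those to the right
+SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts
+    ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5;
+```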
-Last three operations also works for types timestamptz, int2, int4, int8, float4, float8, +The last three operators also work for types timestamptz, int2, int4, int8, float4, float8, money and oid. ## Operator classes -**rum** provides next operator classes. +**rum** provides the following operator classes. ### rum_tsvector_ops For type: `tsvector` -This operator class stores `tsvector` lexemes with positional information. Supports -ordering by `<=>` operator and prefix search. There is the example. +This operator class stores `tsvector` lexemes with positional information. It supports +ordering by the `<=>` operator and prefix search. See the example below. Let us assume we have the table: @@ -139,8 +160,8 @@ SELECT t, a <=> to_tsquery('english', 'place | situation') AS rank For type: `tsvector` -This operator class stores hash of `tsvector` lexemes with positional information. -Supports ordering by `<=>` operator. But **doesn't** support prefix search. +This operator class stores a hash of `tsvector` lexemes with positional information. +It supports ordering by the `<=>` operator. It **doesn't** support prefix search. ### rum_TYPE_ops For type: int2, int4, int8, float4, float8, money, oid, timestamp, timestamptz, time, timetz, date, interval, macaddr, inet, cidr, text, varchar, char, bytea, bit, varbit Supported operations: `<`, `<=`, `=`, `>=`, `>` for all types and `<=>`, `<=|` and `|=>` for int2, int4, int8, float4, float8, money, oid, timestamp and timestamptz types. -Supports ordering by `<=>`, `<=|` and `|=>` operators. Can be used with +This operator class supports ordering by the `<=>`, `<=|` and `|=>` operators. It can be used with `rum_tsvector_addon_ops`, `rum_tsvector_hash_addon_ops` and `rum_anyarray_addon_ops` operator classes. ### rum_tsvector_addon_ops For type: `tsvector` -This operator class stores `tsvector` lexems with any supported by module -field. There is the example. +This operator class stores `tsvector` lexemes with any module-supported +field. See the example below. Let us assume we have the table: + ```sql CREATE TABLE tsts (id int, t tsvector, d timestamp); @@ -195,20 +217,22 @@ SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY (5 rows) ``` +> **Warning:** Currently RUM does not allow creating an index that orders by a pass-by-reference additional-information column. This is because posting trees have a fixed-length right bound and fixed-length non-leaf posting items, so such indexes cannot be supported and attempts to create them are rejected. + ### rum_tsvector_hash_addon_ops For type: `tsvector` -This operator class stores hash of `tsvector` lexems with any supported by module +This operator class stores a hash of `tsvector` lexemes with any module-supported field. -**Doesn't** support prefix search. +It **doesn't** support prefix search. ### rum_tsquery_ops For type: `tsquery` -Stores branches of query tree in additional information. For example we have the table: +It stores branches of the query tree as additional information. For example, we have the table: ```sql CREATE TABLE query (q tsquery, tag text); @@ -236,9 +260,9 @@ SELECT * FROM query For type: `anyarray` -This operator class stores `anyarrray` elements with length of the array. -Supports operators `&&`, `@>`, `<@`, `=`, `%` operators. Supports ordering by `<=>` operator. -For example we have the table: +This operator class stores `anyarray` elements with the length of the array. +It supports the `&&`, `@>`, `<@`, `=`, `%` operators. It also supports ordering by the `<=>` operator.
+For example, we have the table: ```sql CREATE TABLE test_array (i int2[]); @@ -275,7 +299,7 @@ SELECT * FROM test_array WHERE i && '{1}' ORDER BY i <=> '{1}' ASC; For type: `anyarray` -This operator class stores `anyarrray` elements with any supported by module +This operator class stores `anyarray` elements with any supported by module field. ## Todo @@ -294,3 +318,7 @@ Oleg Bartunov Postgres Professional Ltd., Russia Teodor Sigaev Postgres Professional Ltd., Russia Arthur Zakirov Postgres Professional Ltd., Russia + +Pavel Borisov Postgres Professional Ltd., Russia + +Maxim Orlov Postgres Professional Ltd., Russia diff --git a/data/rum_weight.data b/data/rum_weight.data new file mode 100644 index 0000000000..5bce717c1b --- /dev/null +++ b/data/rum_weight.data @@ -0,0 +1,52 @@ +As a reward for your reformation I write to you on this precious sheet.|write +You see I have come to be wonderfully attached to Heidelberg, the|attached come see +beautiful, the quaint, the historically poetic, learned and picturesque| +old town on the Neckar. It seems like another home. So I could not show|seems show could +my appreciation of you in a more complimentary way than by sending this|sending +little series of pictures. Have you ever been here, I wonder? You did|did have been wonder +not say, but you wrote as if you knew it by sight as well as by heart.|wrote say knew +As I cannot know, I will venture an explanation. The panorama speaks for|know will speaks +itself. Put on your "specs" and look at the castle, half way up the|put look +_berg_, "the Jettenhuhl, a wooded spur of the Konigestuhl." Look at it|Look +from the "Terrasse." Thus you'll get something of an idea of it. The|get +Gesprente Thurm is the one that was blown up by the French. The|is blown was +thickness of the walls, twenty-one feet, and the solid masonry, held it|held +so well that only a fragment, as it were, gave way. It still hangs as if|were gave hangs +ready to be replaced. "Das Grosse Fass Gebaude," too, you will have no|be replaced will have +difficulty in making out. If you only had it with its 49,000 gallons of|making had +wine, but wouldn't you divide with your neighbors! The columns in the|wouldn't divide +portico that shows in the Schlosshof are the four brought from|shows are brought +Charlemagne's palace at Ingelheim by the Count Palatine Ludwig, some| +time between 1508-44. The Zum Ritter has nothing to do with the castle,|has do +but is an ancient structure (1592) in the Renaissance style, and one of|is +the few that escaped destruction in 1693. It is a beautiful, highly|escaped is +ornamental building, and I wish you could see it, if you have not seen|wish could see have seen +it.| +| +All the above information, I beg you to believe, I do not intend you|beg believe do intend +to think was evolved from my inner consciousness, but gathered from|think was evolved gathered +the--nearest guide-book!| +| +I am so much obliged to you for mapping out Switzerland to me. I have|am obliged have +been trying my best to get all those "passes" into my brain. Now, thanks|been trying get +to your letter, I have them all in the handiest kind of a bunch. Ariel|have +like, "I'll do my bidding gently," and as surely, if I get there. But|do bidding get +there are dreadful reports of floods and roads caved in and bridges|are caved +swept away and snows and--enough of such exciting items as sets one|swept sets +thinking--"to go or not to go?" We are this far on the way. Reached|thinking go go are Reached +here this afternoon. 
Have spent the evening sauntering in the gardens,|Have spent sauntering +the Conversationhaus, the bazaar, mingling with the throng, listening to|mingling listening +the band, and comparing what it is with what it was. It was a gay and|comparing was was +curious spectacle, but on the whole had "the banquet-hall deserted"|had deserted +look. The situation is most beautiful. It lies, you know, at the|is lies know +entrance of the Black Forest, among picturesque, thickly-wooded hills,| +in the valley of the Oos, and extends up the slope of some of the hills.|extends +The Oos is a most turbid, turbulent stream; dashes through part of the|is +town with angry, headlong speed. There is an avenue along its bank of|is +oaks, limes and maples, bordered with flower-beds and shrubberies, and| +adorned with fountains and handsome villas. We shall devote to-morrow to| +seeing all there is to be seen, and go to Strassburg to-morrow evening|seeing is be seen go +for two or three days. From there to Constance, and then hold _our_| +"Council" as to further movements.| +def fgr| +def xxx fgr| diff --git a/expected/altorder.out b/expected/altorder.out index f99f0b1e81..6c0bcae2ad 100644 --- a/expected/altorder.out +++ b/expected/altorder.out @@ -1,120 +1,183 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differenly + * ------------------------------------ + * + * altorder.out - test output for 64-bit systems and + * altorder_1.out - test output for 32-bit systems. + * + */ CREATE TABLE atsts (id int, t tsvector, d timestamp); \copy atsts from 'data/tsts.data' +-- PGPRO-2537: We need more data to test rumsort.c with logtape.c +\copy atsts from 'data/tsts.data' +\copy atsts from 'data/tsts.data' +\copy atsts from 'data/tsts.data' CREATE INDEX atsts_idx ON atsts USING rum (t rum_tsvector_addon_ops, d) WITH (attach = 'd', to = 't', order_by_attach='t'); -INSERT INTO atsts VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); -INSERT INTO atsts VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); +INSERT INTO atsts VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); +INSERT INTO atsts VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); SELECT count(*) FROM atsts WHERE t @@ 'wr|qh'; count ------- - 158 + 632 (1 row) SELECT count(*) FROM atsts WHERE t @@ 'wr&qh'; count ------- - 17 + 68 (1 row) SELECT count(*) FROM atsts WHERE t @@ 'eq&yt'; count ------- - 6 + 24 (1 row) SELECT count(*) FROM atsts WHERE t @@ 'eq|yt'; count ------- - 98 + 392 (1 row) SELECT count(*) FROM atsts WHERE t @@ '(eq&yt)|(wr&qh)'; count ------- - 23 + 92 (1 row) SELECT count(*) FROM atsts WHERE t @@ '(eq|yt)&(wr|qh)'; count ------- - 39 + 156 (1 row) SET enable_indexscan=OFF; SET enable_indexonlyscan=OFF; SET enable_bitmapscan=OFF; SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; - id | d | ?column? ------+---------------------------------+--------------- - 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 - 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 - 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 - 406 | Wed May 18 17:21:22.326724 2016 | 183597.326724 - 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 + id | d | ?column? 
+-----+---------------------------------+------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 (5 rows) SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; - id | d | ?column? ------+---------------------------------+--------------- - 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 - 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 - 252 | Thu May 12 07:21:22.326724 2016 | 370802.673276 - 232 | Wed May 11 11:21:22.326724 2016 | 442802.673276 - 168 | Sun May 08 19:21:22.326724 2016 | 673202.673276 + id | d | ?column? +-----+---------------------------------+------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 (5 rows) SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 406 | Wed May 18 17:21:22.326724 2016 | 183597.326724 - 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 - 428 | Thu May 19 15:21:22.326724 2016 | 262797.326724 - 457 | Fri May 20 20:21:22.326724 2016 | 367197.326724 (5 rows) SELECT count(*) FROM atsts WHERE d < '2016-05-16 14:21:25'; count ------- - 357 + 1422 (1 row) SELECT count(*) FROM atsts WHERE d > '2016-05-16 14:21:25'; count ------- - 153 + 612 (1 row) SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- 16 | Mon May 02 11:21:22.326724 2016 + 16 | Mon May 02 11:21:22.326724 2016 + 16 | Mon May 02 11:21:22.326724 2016 + 16 | Mon May 02 11:21:22.326724 2016 + 39 | Tue May 03 10:21:22.326724 2016 + 39 | Tue May 03 10:21:22.326724 2016 39 | Tue May 03 10:21:22.326724 2016 + 39 | Tue May 03 10:21:22.326724 2016 + 71 | Wed May 04 18:21:22.326724 2016 + 71 | Wed May 04 18:21:22.326724 2016 71 | Wed May 04 18:21:22.326724 2016 + 71 | Wed May 04 18:21:22.326724 2016 + 135 | Sat May 07 10:21:22.326724 2016 + 135 | Sat May 07 10:21:22.326724 2016 135 | Sat May 07 10:21:22.326724 2016 + 135 | Sat May 07 10:21:22.326724 2016 + 168 | Sun May 08 19:21:22.326724 2016 + 168 | Sun May 08 19:21:22.326724 2016 168 | Sun May 08 19:21:22.326724 2016 + 168 | Sun May 08 19:21:22.326724 2016 + 232 | Wed May 11 11:21:22.326724 2016 + 232 | Wed May 11 11:21:22.326724 2016 232 | Wed May 11 11:21:22.326724 2016 + 232 | Wed May 11 11:21:22.326724 2016 + 252 | Thu May 12 07:21:22.326724 2016 + 252 | Thu May 12 07:21:22.326724 2016 252 | Thu May 12 07:21:22.326724 2016 + 252 | Thu May 12 07:21:22.326724 2016 + 354 | Mon May 16 13:21:22.326724 2016 + 354 | Mon May 16 13:21:22.326724 2016 354 | Mon May 16 13:21:22.326724 2016 + 354 | Mon May 16 13:21:22.326724 2016 + 355 | Mon May 16 14:21:22.326724 2016 + 355 | Mon May 16 14:21:22.326724 2016 + 355 | Mon May 16 14:21:22.326724 2016 355 | Mon May 16 14:21:22.326724 2016 -(9 rows) +(36 rows) SELECT id, d FROM atsts 
WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- + 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 371 | Tue May 17 06:21:22.326724 2016 406 | Wed May 18 17:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 415 | Thu May 19 02:21:22.326724 2016 428 | Thu May 19 15:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 457 | Fri May 20 20:21:22.326724 2016 458 | Fri May 20 21:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 484 | Sat May 21 23:21:22.326724 2016 496 | Sun May 22 11:21:22.326724 2016 -(8 rows) + 496 | Sun May 22 11:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 +(32 rows) -RESET enable_indexscan; -RESET enable_indexonlyscan; -RESET enable_bitmapscan; +-- Test bitmap index scan +SET enable_bitmapscan=on; SET enable_seqscan = off; EXPLAIN (costs off) SELECT count(*) FROM atsts WHERE t @@ 'wr|qh'; @@ -130,37 +193,37 @@ SELECT count(*) FROM atsts WHERE t @@ 'wr|qh'; SELECT count(*) FROM atsts WHERE t @@ 'wr|qh'; count ------- - 158 + 632 (1 row) SELECT count(*) FROM atsts WHERE t @@ 'wr&qh'; count ------- - 17 + 68 (1 row) SELECT count(*) FROM atsts WHERE t @@ 'eq&yt'; count ------- - 6 + 24 (1 row) SELECT count(*) FROM atsts WHERE t @@ 'eq|yt'; count ------- - 98 + 392 (1 row) SELECT count(*) FROM atsts WHERE t @@ '(eq&yt)|(wr&qh)'; count ------- - 23 + 92 (1 row) SELECT count(*) FROM atsts WHERE t @@ '(eq|yt)&(wr|qh)'; count ------- - 39 + 156 (1 row) EXPLAIN (costs off) @@ -177,7 +240,7 @@ SELECT count(*) FROM atsts WHERE d < '2016-05-16 14:21:25'; SELECT count(*) FROM atsts WHERE d < '2016-05-16 14:21:25'; count ------- - 357 + 1422 (1 row) EXPLAIN (costs off) @@ -194,9 +257,13 @@ SELECT count(*) FROM atsts WHERE d > '2016-05-16 14:21:25'; SELECT count(*) FROM atsts WHERE d > '2016-05-16 14:21:25'; count ------- - 153 + 612 (1 row) +-- Test index scan +SET enable_indexscan=on; +SET enable_indexonlyscan=on; +SET enable_bitmapscan=off; EXPLAIN (costs off) SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; QUERY PLAN @@ -208,13 +275,13 @@ SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY (4 rows) SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; - id | d | ?column? ------+---------------------------------+--------------- - 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 - 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 - 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 - 406 | Wed May 18 17:21:22.326724 2016 | 183597.326724 - 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 + id | d | ?column? 
+-----+---------------------------------+------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 (5 rows) EXPLAIN (costs off) @@ -228,13 +295,13 @@ SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY (4 rows) SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; - id | d | ?column? ------+---------------------------------+--------------- - 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 - 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 - 252 | Thu May 12 07:21:22.326724 2016 | 370802.673276 - 232 | Wed May 11 11:21:22.326724 2016 | 442802.673276 - 168 | Sun May 08 19:21:22.326724 2016 | 673202.673276 + id | d | ?column? +-----+---------------------------------+------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 (5 rows) EXPLAIN (costs off) @@ -250,11 +317,11 @@ SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 406 | Wed May 18 17:21:22.326724 2016 | 183597.326724 - 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 - 428 | Thu May 19 15:21:22.326724 2016 | 262797.326724 - 457 | Fri May 20 20:21:22.326724 2016 | 367197.326724 (5 rows) EXPLAIN (costs off) @@ -269,11 +336,11 @@ SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts ORDER BY d <=> '2016-05-16 SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? 
-----+---------------------------------+------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 356 | Mon May 16 15:21:22.326724 2016 | 3597.326724 - 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 - 357 | Mon May 16 16:21:22.326724 2016 | 7197.326724 - 353 | Mon May 16 12:21:22.326724 2016 | 7202.673276 (5 rows) EXPLAIN (costs off) @@ -290,15 +357,42 @@ SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER id | d -----+--------------------------------- 16 | Mon May 02 11:21:22.326724 2016 + 16 | Mon May 02 11:21:22.326724 2016 + 16 | Mon May 02 11:21:22.326724 2016 + 16 | Mon May 02 11:21:22.326724 2016 + 39 | Tue May 03 10:21:22.326724 2016 39 | Tue May 03 10:21:22.326724 2016 + 39 | Tue May 03 10:21:22.326724 2016 + 39 | Tue May 03 10:21:22.326724 2016 + 71 | Wed May 04 18:21:22.326724 2016 71 | Wed May 04 18:21:22.326724 2016 + 71 | Wed May 04 18:21:22.326724 2016 + 71 | Wed May 04 18:21:22.326724 2016 + 135 | Sat May 07 10:21:22.326724 2016 135 | Sat May 07 10:21:22.326724 2016 + 135 | Sat May 07 10:21:22.326724 2016 + 135 | Sat May 07 10:21:22.326724 2016 + 168 | Sun May 08 19:21:22.326724 2016 168 | Sun May 08 19:21:22.326724 2016 + 168 | Sun May 08 19:21:22.326724 2016 + 168 | Sun May 08 19:21:22.326724 2016 + 232 | Wed May 11 11:21:22.326724 2016 232 | Wed May 11 11:21:22.326724 2016 + 232 | Wed May 11 11:21:22.326724 2016 + 232 | Wed May 11 11:21:22.326724 2016 + 252 | Thu May 12 07:21:22.326724 2016 252 | Thu May 12 07:21:22.326724 2016 + 252 | Thu May 12 07:21:22.326724 2016 + 252 | Thu May 12 07:21:22.326724 2016 + 354 | Mon May 16 13:21:22.326724 2016 354 | Mon May 16 13:21:22.326724 2016 + 354 | Mon May 16 13:21:22.326724 2016 + 354 | Mon May 16 13:21:22.326724 2016 + 355 | Mon May 16 14:21:22.326724 2016 355 | Mon May 16 14:21:22.326724 2016 -(9 rows) + 355 | Mon May 16 14:21:22.326724 2016 + 355 | Mon May 16 14:21:22.326724 2016 +(36 rows) EXPLAIN (costs off) SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; @@ -314,12 +408,163 @@ SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER id | d -----+--------------------------------- 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 + 496 | Sun May 22 
11:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 +(32 rows) + +EXPLAIN (costs off) +SELECT id, d FROM atsts WHERE t @@ 'wr&q:*' AND d >= '2016-05-16 14:21:25' ORDER BY d; + QUERY PLAN +----------------------------------------------------------------------------------------------------------------------------- + Sort + Sort Key: d + -> Index Scan using atsts_idx on atsts + Index Cond: ((t @@ '''wr'' & ''q'':*'::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) +(4 rows) + +SELECT id, d FROM atsts WHERE t @@ 'wr&q:*' AND d >= '2016-05-16 14:21:25' ORDER BY d; + id | d +-----+--------------------------------- + 361 | Mon May 16 20:21:22.326724 2016 + 361 | Mon May 16 20:21:22.326724 2016 + 361 | Mon May 16 20:21:22.326724 2016 + 361 | Mon May 16 20:21:22.326724 2016 + 369 | Tue May 17 04:21:22.326724 2016 + 369 | Tue May 17 04:21:22.326724 2016 + 369 | Tue May 17 04:21:22.326724 2016 + 369 | Tue May 17 04:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 372 | Tue May 17 07:21:22.326724 2016 + 372 | Tue May 17 07:21:22.326724 2016 + 372 | Tue May 17 07:21:22.326724 2016 + 372 | Tue May 17 07:21:22.326724 2016 + 375 | Tue May 17 10:21:22.326724 2016 + 375 | Tue May 17 10:21:22.326724 2016 + 375 | Tue May 17 10:21:22.326724 2016 + 375 | Tue May 17 10:21:22.326724 2016 + 388 | Tue May 17 23:21:22.326724 2016 + 388 | Tue May 17 23:21:22.326724 2016 + 388 | Tue May 17 23:21:22.326724 2016 + 388 | Tue May 17 23:21:22.326724 2016 + 405 | Wed May 18 16:21:22.326724 2016 + 405 | Wed May 18 16:21:22.326724 2016 + 405 | Wed May 18 16:21:22.326724 2016 + 405 | Wed May 18 16:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 406 | Wed May 18 17:21:22.326724 2016 415 | Thu May 19 02:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 422 | Thu May 19 09:21:22.326724 2016 + 422 | Thu May 19 09:21:22.326724 2016 + 422 | Thu May 19 09:21:22.326724 2016 + 422 | Thu May 19 09:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 428 | Thu May 19 15:21:22.326724 2016 + 441 | Fri May 20 04:21:22.326724 2016 + 441 | Fri May 20 04:21:22.326724 2016 + 441 | Fri May 20 04:21:22.326724 2016 + 441 | Fri May 20 04:21:22.326724 2016 + 444 | Fri May 20 07:21:22.326724 2016 + 444 | Fri May 20 07:21:22.326724 2016 + 444 | Fri May 20 07:21:22.326724 2016 + 444 | Fri May 20 07:21:22.326724 2016 457 | Fri May 20 20:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 458 | Fri May 20 21:21:22.326724 2016 + 463 | Sat May 21 02:21:22.326724 2016 + 463 | Sat May 21 02:21:22.326724 2016 + 463 | Sat May 21 02:21:22.326724 2016 + 463 | Sat May 21 02:21:22.326724 2016 + 465 | Sat May 21 04:21:22.326724 2016 + 465 | Sat May 21 04:21:22.326724 2016 + 465 | Sat May 21 04:21:22.326724 2016 + 465 | Sat May 21 04:21:22.326724 2016 + 466 | Sat May 21 05:21:22.326724 2016 + 466 | Sat May 21 05:21:22.326724 2016 + 466 | Sat May 21 05:21:22.326724 2016 + 466 | Sat May 21 05:21:22.326724 2016 + 468 | Sat May 21 
07:21:22.326724 2016 + 468 | Sat May 21 07:21:22.326724 2016 + 468 | Sat May 21 07:21:22.326724 2016 + 468 | Sat May 21 07:21:22.326724 2016 + 471 | Sat May 21 10:21:22.326724 2016 + 471 | Sat May 21 10:21:22.326724 2016 + 471 | Sat May 21 10:21:22.326724 2016 + 471 | Sat May 21 10:21:22.326724 2016 + 475 | Sat May 21 14:21:22.326724 2016 + 475 | Sat May 21 14:21:22.326724 2016 + 475 | Sat May 21 14:21:22.326724 2016 + 475 | Sat May 21 14:21:22.326724 2016 + 481 | Sat May 21 20:21:22.326724 2016 + 481 | Sat May 21 20:21:22.326724 2016 + 481 | Sat May 21 20:21:22.326724 2016 + 481 | Sat May 21 20:21:22.326724 2016 484 | Sat May 21 23:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 485 | Sun May 22 00:21:22.326724 2016 + 485 | Sun May 22 00:21:22.326724 2016 + 485 | Sun May 22 00:21:22.326724 2016 + 485 | Sun May 22 00:21:22.326724 2016 + 493 | Sun May 22 08:21:22.326724 2016 + 493 | Sun May 22 08:21:22.326724 2016 + 493 | Sun May 22 08:21:22.326724 2016 + 493 | Sun May 22 08:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 496 | Sun May 22 11:21:22.326724 2016 -(8 rows) + 499 | Sun May 22 14:21:22.326724 2016 + 499 | Sun May 22 14:21:22.326724 2016 + 499 | Sun May 22 14:21:22.326724 2016 + 499 | Sun May 22 14:21:22.326724 2016 + 506 | Sun May 22 21:21:22.326724 2016 + 506 | Sun May 22 21:21:22.326724 2016 + 506 | Sun May 22 21:21:22.326724 2016 + 506 | Sun May 22 21:21:22.326724 2016 +(112 rows) diff --git a/expected/altorder_1.out b/expected/altorder_1.out index b8c3141b94..980515f58e 100644 --- a/expected/altorder_1.out +++ b/expected/altorder_1.out @@ -1,293 +1,569 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differenly + * ------------------------------------ + * + * altorder.out - test output for 64-bit systems and + * altorder_1.out - test output for 32-bit systems. + * + */ CREATE TABLE atsts (id int, t tsvector, d timestamp); \copy atsts from 'data/tsts.data' +-- PGPRO-2537: We need more data to test rumsort.c with logtape.c +\copy atsts from 'data/tsts.data' +\copy atsts from 'data/tsts.data' +\copy atsts from 'data/tsts.data' CREATE INDEX atsts_idx ON atsts USING rum (t rum_tsvector_addon_ops, d) WITH (attach = 'd', to = 't', order_by_attach='t'); -INSERT INTO atsts VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); -INSERT INTO atsts VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); +ERROR: doesn't support order index over pass-by-reference column +INSERT INTO atsts VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); +INSERT INTO atsts VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); SELECT count(*) FROM atsts WHERE t @@ 'wr|qh'; count ------- - 158 + 632 (1 row) SELECT count(*) FROM atsts WHERE t @@ 'wr&qh'; count ------- - 17 + 68 (1 row) SELECT count(*) FROM atsts WHERE t @@ 'eq&yt'; count ------- - 6 + 24 (1 row) SELECT count(*) FROM atsts WHERE t @@ 'eq|yt'; count ------- - 98 + 392 (1 row) SELECT count(*) FROM atsts WHERE t @@ '(eq&yt)|(wr&qh)'; count ------- - 23 + 92 (1 row) SELECT count(*) FROM atsts WHERE t @@ '(eq|yt)&(wr|qh)'; count ------- - 39 + 156 (1 row) SET enable_indexscan=OFF; SET enable_indexonlyscan=OFF; SET enable_bitmapscan=OFF; SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; - id | d | ?column? 
------+---------------------------------+--------------- - 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 - 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 - 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 - 406 | Wed May 18 17:21:22.326724 2016 | 183597.326724 - 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 + id | d | ?column? +-----+---------------------------------+------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 (5 rows) SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; - id | d | ?column? ------+---------------------------------+--------------- - 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 - 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 - 252 | Thu May 12 07:21:22.326724 2016 | 370802.673276 - 232 | Wed May 11 11:21:22.326724 2016 | 442802.673276 - 168 | Sun May 08 19:21:22.326724 2016 | 673202.673276 + id | d | ?column? +-----+---------------------------------+------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 (5 rows) SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 406 | Wed May 18 17:21:22.326724 2016 | 183597.326724 - 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 - 428 | Thu May 19 15:21:22.326724 2016 | 262797.326724 - 457 | Fri May 20 20:21:22.326724 2016 | 367197.326724 (5 rows) SELECT count(*) FROM atsts WHERE d < '2016-05-16 14:21:25'; count ------- - 357 + 1422 (1 row) SELECT count(*) FROM atsts WHERE d > '2016-05-16 14:21:25'; count ------- - 153 + 612 (1 row) SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- 16 | Mon May 02 11:21:22.326724 2016 + 16 | Mon May 02 11:21:22.326724 2016 + 16 | Mon May 02 11:21:22.326724 2016 + 16 | Mon May 02 11:21:22.326724 2016 + 39 | Tue May 03 10:21:22.326724 2016 + 39 | Tue May 03 10:21:22.326724 2016 + 39 | Tue May 03 10:21:22.326724 2016 39 | Tue May 03 10:21:22.326724 2016 71 | Wed May 04 18:21:22.326724 2016 + 71 | Wed May 04 18:21:22.326724 2016 + 71 | Wed May 04 18:21:22.326724 2016 + 71 | Wed May 04 18:21:22.326724 2016 + 135 | Sat May 07 10:21:22.326724 2016 + 135 | Sat May 07 10:21:22.326724 2016 135 | Sat May 07 10:21:22.326724 2016 + 135 | Sat May 07 10:21:22.326724 2016 + 168 | Sun May 08 19:21:22.326724 2016 + 168 | Sun May 08 19:21:22.326724 2016 168 | Sun May 08 19:21:22.326724 2016 + 168 | Sun May 08 19:21:22.326724 2016 + 232 | Wed May 11 11:21:22.326724 2016 + 232 | Wed May 11 11:21:22.326724 2016 232 | Wed May 11 11:21:22.326724 2016 + 232 | Wed May 11 11:21:22.326724 2016 + 252 | Thu May 12 07:21:22.326724 2016 + 252 | Thu May 12 07:21:22.326724 2016 252 | Thu May 12 07:21:22.326724 2016 + 252 | Thu May 12 07:21:22.326724 2016 + 354 | Mon May 
16 13:21:22.326724 2016 + 354 | Mon May 16 13:21:22.326724 2016 354 | Mon May 16 13:21:22.326724 2016 + 354 | Mon May 16 13:21:22.326724 2016 + 355 | Mon May 16 14:21:22.326724 2016 + 355 | Mon May 16 14:21:22.326724 2016 355 | Mon May 16 14:21:22.326724 2016 -(9 rows) + 355 | Mon May 16 14:21:22.326724 2016 +(36 rows) SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 406 | Wed May 18 17:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 415 | Thu May 19 02:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 428 | Thu May 19 15:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 457 | Fri May 20 20:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 458 | Fri May 20 21:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 484 | Sat May 21 23:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 496 | Sun May 22 11:21:22.326724 2016 -(8 rows) + 496 | Sun May 22 11:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 +(32 rows) -RESET enable_indexscan; -RESET enable_indexonlyscan; -RESET enable_bitmapscan; +-- Test bitmap index scan +SET enable_bitmapscan=on; SET enable_seqscan = off; EXPLAIN (costs off) SELECT count(*) FROM atsts WHERE t @@ 'wr|qh'; - QUERY PLAN -------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------- Aggregate - -> Bitmap Heap Scan on atsts - Recheck Cond: (t @@ '''wr'' | ''qh'''::tsquery) - -> Bitmap Index Scan on atsts_idx - Index Cond: (t @@ '''wr'' | ''qh'''::tsquery) -(5 rows) + -> Seq Scan on atsts + Filter: (t @@ '''wr'' | ''qh'''::tsquery) +(3 rows) SELECT count(*) FROM atsts WHERE t @@ 'wr|qh'; count ------- - 158 + 632 (1 row) SELECT count(*) FROM atsts WHERE t @@ 'wr&qh'; count ------- - 17 + 68 (1 row) SELECT count(*) FROM atsts WHERE t @@ 'eq&yt'; count ------- - 6 + 24 (1 row) SELECT count(*) FROM atsts WHERE t @@ 'eq|yt'; count ------- - 98 + 392 (1 row) SELECT count(*) FROM atsts WHERE t @@ '(eq&yt)|(wr&qh)'; count ------- - 23 + 92 (1 row) SELECT count(*) FROM atsts WHERE t @@ '(eq|yt)&(wr|qh)'; count ------- - 39 + 156 (1 row) EXPLAIN (costs off) SELECT count(*) FROM atsts WHERE d < '2016-05-16 14:21:25'; - QUERY PLAN ------------------------------------------------------------------------------------------ + QUERY PLAN +------------------------------------------------------------------------------- Aggregate - -> Bitmap Heap Scan on atsts - Recheck Cond: (d < 'Mon May 16 14:21:25 2016'::timestamp without time zone) - -> Bitmap Index Scan on atsts_idx - Index Cond: (d < 'Mon May 16 14:21:25 2016'::timestamp without time zone) -(5 rows) + -> Seq Scan on atsts + Filter: (d < 'Mon May 16 14:21:25 2016'::timestamp without time zone) +(3 rows) SELECT count(*) FROM atsts WHERE d < '2016-05-16 14:21:25'; 
count ------- - 357 + 1422 (1 row) EXPLAIN (costs off) SELECT count(*) FROM atsts WHERE d > '2016-05-16 14:21:25'; - QUERY PLAN ------------------------------------------------------------------------------------------ + QUERY PLAN +------------------------------------------------------------------------------- Aggregate - -> Bitmap Heap Scan on atsts - Recheck Cond: (d > 'Mon May 16 14:21:25 2016'::timestamp without time zone) - -> Bitmap Index Scan on atsts_idx - Index Cond: (d > 'Mon May 16 14:21:25 2016'::timestamp without time zone) -(5 rows) + -> Seq Scan on atsts + Filter: (d > 'Mon May 16 14:21:25 2016'::timestamp without time zone) +(3 rows) SELECT count(*) FROM atsts WHERE d > '2016-05-16 14:21:25'; count ------- - 153 + 612 (1 row) +-- Test index scan +SET enable_indexscan=on; +SET enable_indexonlyscan=on; +SET enable_bitmapscan=off; EXPLAIN (costs off) SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; - QUERY PLAN ------------------------------------------------------------------------------------ + QUERY PLAN +------------------------------------------------------------------------------------- Limit - -> Index Scan using atsts_idx on atsts - Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) - Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) -(4 rows) + -> Sort + Sort Key: ((d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Seq Scan on atsts + Filter: (t @@ '''wr'' & ''qh'''::tsquery) +(5 rows) SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; -ERROR: doesn't support order by over pass-by-reference column + id | d | ?column? +-----+---------------------------------+------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 +(5 rows) + EXPLAIN (costs off) SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; - QUERY PLAN ------------------------------------------------------------------------------------ + QUERY PLAN +------------------------------------------------------------------------------------- Limit - -> Index Scan using atsts_idx on atsts - Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) - Order By: (d <=| 'Mon May 16 14:21:25 2016'::timestamp without time zone) -(4 rows) + -> Sort + Sort Key: ((d <=| 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Seq Scan on atsts + Filter: (t @@ '''wr'' & ''qh'''::tsquery) +(5 rows) SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; -ERROR: doesn't support order by over pass-by-reference column + id | d | ?column? 
+-----+---------------------------------+------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 +(5 rows) + EXPLAIN (costs off) SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; - QUERY PLAN ------------------------------------------------------------------------------------ + QUERY PLAN +------------------------------------------------------------------------------------- Limit - -> Index Scan using atsts_idx on atsts - Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) - Order By: (d |=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) -(4 rows) + -> Sort + Sort Key: ((d |=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Seq Scan on atsts + Filter: (t @@ '''wr'' & ''qh'''::tsquery) +(5 rows) SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; -ERROR: doesn't support order by over pass-by-reference column + id | d | ?column? +-----+---------------------------------+--------------- + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 + 406 | Wed May 18 17:21:22.326724 2016 | 183597.326724 +(5 rows) + EXPLAIN (costs off) SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; - QUERY PLAN ------------------------------------------------------------------------------------ + QUERY PLAN +------------------------------------------------------------------------------------- Limit - -> Index Scan using atsts_idx on atsts - Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) -(3 rows) + -> Sort + Sort Key: ((d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Seq Scan on atsts +(4 rows) SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; -ERROR: doesn't support order by over pass-by-reference column + id | d | ?column? 
+-----+---------------------------------+------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 356 | Mon May 16 15:21:22.326724 2016 | 3597.326724 +(5 rows) + EXPLAIN (costs off) SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; - QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------ Sort Sort Key: d - -> Index Scan using atsts_idx on atsts - Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Seq Scan on atsts + Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) (4 rows) SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- 16 | Mon May 02 11:21:22.326724 2016 + 16 | Mon May 02 11:21:22.326724 2016 + 16 | Mon May 02 11:21:22.326724 2016 + 16 | Mon May 02 11:21:22.326724 2016 + 39 | Tue May 03 10:21:22.326724 2016 + 39 | Tue May 03 10:21:22.326724 2016 + 39 | Tue May 03 10:21:22.326724 2016 39 | Tue May 03 10:21:22.326724 2016 71 | Wed May 04 18:21:22.326724 2016 + 71 | Wed May 04 18:21:22.326724 2016 + 71 | Wed May 04 18:21:22.326724 2016 + 71 | Wed May 04 18:21:22.326724 2016 + 135 | Sat May 07 10:21:22.326724 2016 + 135 | Sat May 07 10:21:22.326724 2016 + 135 | Sat May 07 10:21:22.326724 2016 135 | Sat May 07 10:21:22.326724 2016 168 | Sun May 08 19:21:22.326724 2016 + 168 | Sun May 08 19:21:22.326724 2016 + 168 | Sun May 08 19:21:22.326724 2016 + 168 | Sun May 08 19:21:22.326724 2016 + 232 | Wed May 11 11:21:22.326724 2016 + 232 | Wed May 11 11:21:22.326724 2016 + 232 | Wed May 11 11:21:22.326724 2016 232 | Wed May 11 11:21:22.326724 2016 252 | Thu May 12 07:21:22.326724 2016 + 252 | Thu May 12 07:21:22.326724 2016 + 252 | Thu May 12 07:21:22.326724 2016 + 252 | Thu May 12 07:21:22.326724 2016 + 354 | Mon May 16 13:21:22.326724 2016 + 354 | Mon May 16 13:21:22.326724 2016 354 | Mon May 16 13:21:22.326724 2016 + 354 | Mon May 16 13:21:22.326724 2016 + 355 | Mon May 16 14:21:22.326724 2016 + 355 | Mon May 16 14:21:22.326724 2016 355 | Mon May 16 14:21:22.326724 2016 -(9 rows) + 355 | Mon May 16 14:21:22.326724 2016 +(36 rows) EXPLAIN (costs off) SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; - QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------ Sort Sort Key: d - -> Index Scan using atsts_idx on atsts - Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Seq Scan on atsts + Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) (4 rows) SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue 
May 17 06:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 406 | Wed May 18 17:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 415 | Thu May 19 02:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 428 | Thu May 19 15:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 457 | Fri May 20 20:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 458 | Fri May 20 21:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 484 | Sat May 21 23:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 +(32 rows) + +EXPLAIN (costs off) +SELECT id, d FROM atsts WHERE t @@ 'wr&q:*' AND d >= '2016-05-16 14:21:25' ORDER BY d; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------- + Sort + Sort Key: d + -> Seq Scan on atsts + Filter: ((t @@ '''wr'' & ''q'':*'::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) +(4 rows) + +SELECT id, d FROM atsts WHERE t @@ 'wr&q:*' AND d >= '2016-05-16 14:21:25' ORDER BY d; + id | d +-----+--------------------------------- + 361 | Mon May 16 20:21:22.326724 2016 + 361 | Mon May 16 20:21:22.326724 2016 + 361 | Mon May 16 20:21:22.326724 2016 + 361 | Mon May 16 20:21:22.326724 2016 + 369 | Tue May 17 04:21:22.326724 2016 + 369 | Tue May 17 04:21:22.326724 2016 + 369 | Tue May 17 04:21:22.326724 2016 + 369 | Tue May 17 04:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 371 | Tue May 17 06:21:22.326724 2016 + 372 | Tue May 17 07:21:22.326724 2016 + 372 | Tue May 17 07:21:22.326724 2016 + 372 | Tue May 17 07:21:22.326724 2016 + 372 | Tue May 17 07:21:22.326724 2016 + 375 | Tue May 17 10:21:22.326724 2016 + 375 | Tue May 17 10:21:22.326724 2016 + 375 | Tue May 17 10:21:22.326724 2016 + 375 | Tue May 17 10:21:22.326724 2016 + 388 | Tue May 17 23:21:22.326724 2016 + 388 | Tue May 17 23:21:22.326724 2016 + 388 | Tue May 17 23:21:22.326724 2016 + 388 | Tue May 17 23:21:22.326724 2016 + 405 | Wed May 18 16:21:22.326724 2016 + 405 | Wed May 18 16:21:22.326724 2016 + 405 | Wed May 18 16:21:22.326724 2016 + 405 | Wed May 18 16:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 422 | Thu May 19 09:21:22.326724 2016 + 422 | Thu May 19 09:21:22.326724 2016 + 422 | Thu May 19 09:21:22.326724 2016 + 422 | Thu May 19 09:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 441 | Fri May 20 04:21:22.326724 2016 + 441 | Fri May 20 04:21:22.326724 2016 + 441 | Fri 
May 20 04:21:22.326724 2016 + 441 | Fri May 20 04:21:22.326724 2016 + 444 | Fri May 20 07:21:22.326724 2016 + 444 | Fri May 20 07:21:22.326724 2016 + 444 | Fri May 20 07:21:22.326724 2016 + 444 | Fri May 20 07:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 463 | Sat May 21 02:21:22.326724 2016 + 463 | Sat May 21 02:21:22.326724 2016 + 463 | Sat May 21 02:21:22.326724 2016 + 463 | Sat May 21 02:21:22.326724 2016 + 465 | Sat May 21 04:21:22.326724 2016 + 465 | Sat May 21 04:21:22.326724 2016 + 465 | Sat May 21 04:21:22.326724 2016 + 465 | Sat May 21 04:21:22.326724 2016 + 466 | Sat May 21 05:21:22.326724 2016 + 466 | Sat May 21 05:21:22.326724 2016 + 466 | Sat May 21 05:21:22.326724 2016 + 466 | Sat May 21 05:21:22.326724 2016 + 468 | Sat May 21 07:21:22.326724 2016 + 468 | Sat May 21 07:21:22.326724 2016 + 468 | Sat May 21 07:21:22.326724 2016 + 468 | Sat May 21 07:21:22.326724 2016 + 471 | Sat May 21 10:21:22.326724 2016 + 471 | Sat May 21 10:21:22.326724 2016 + 471 | Sat May 21 10:21:22.326724 2016 + 471 | Sat May 21 10:21:22.326724 2016 + 475 | Sat May 21 14:21:22.326724 2016 + 475 | Sat May 21 14:21:22.326724 2016 + 475 | Sat May 21 14:21:22.326724 2016 + 475 | Sat May 21 14:21:22.326724 2016 + 481 | Sat May 21 20:21:22.326724 2016 + 481 | Sat May 21 20:21:22.326724 2016 + 481 | Sat May 21 20:21:22.326724 2016 + 481 | Sat May 21 20:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 485 | Sun May 22 00:21:22.326724 2016 + 485 | Sun May 22 00:21:22.326724 2016 + 485 | Sun May 22 00:21:22.326724 2016 + 485 | Sun May 22 00:21:22.326724 2016 + 493 | Sun May 22 08:21:22.326724 2016 + 493 | Sun May 22 08:21:22.326724 2016 + 493 | Sun May 22 08:21:22.326724 2016 + 493 | Sun May 22 08:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 496 | Sun May 22 11:21:22.326724 2016 -(8 rows) + 499 | Sun May 22 14:21:22.326724 2016 + 499 | Sun May 22 14:21:22.326724 2016 + 499 | Sun May 22 14:21:22.326724 2016 + 499 | Sun May 22 14:21:22.326724 2016 + 506 | Sun May 22 21:21:22.326724 2016 + 506 | Sun May 22 21:21:22.326724 2016 + 506 | Sun May 22 21:21:22.326724 2016 + 506 | Sun May 22 21:21:22.326724 2016 +(112 rows) diff --git a/expected/altorder_hash.out b/expected/altorder_hash.out index a828287541..1011b90d0c 100644 --- a/expected/altorder_hash.out +++ b/expected/altorder_hash.out @@ -1,9 +1,18 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differenly + * ------------------------------------ + * + * altorder_hash.out - test output for 64-bit systems and + * altorder_hash_1.out - test output for 32-bit systems. 
+ * + */ CREATE TABLE atstsh (id int, t tsvector, d timestamp); \copy atstsh from 'data/tsts.data' CREATE INDEX atstsh_idx ON atstsh USING rum (t rum_tsvector_hash_addon_ops, d) WITH (attach = 'd', to = 't', order_by_attach='t'); -INSERT INTO atstsh VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); -INSERT INTO atstsh VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); +INSERT INTO atstsh VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); +INSERT INTO atstsh VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); SELECT count(*) FROM atstsh WHERE t @@ 'wr|qh'; count ------- @@ -112,9 +121,8 @@ SELECT id, d FROM atstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDE 496 | Sun May 22 11:21:22.326724 2016 (8 rows) -RESET enable_indexscan; -RESET enable_indexonlyscan; -RESET enable_bitmapscan; +-- Test bitmap index scan +SET enable_bitmapscan=on; SET enable_seqscan = off; EXPLAIN (costs off) SELECT count(*) FROM atstsh WHERE t @@ 'wr|qh'; @@ -197,6 +205,10 @@ SELECT count(*) FROM atstsh WHERE d > '2016-05-16 14:21:25'; 153 (1 row) +-- Test index scan +SET enable_indexscan=on; +SET enable_indexonlyscan=on; +SET enable_bitmapscan=off; EXPLAIN (costs off) SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; QUERY PLAN diff --git a/expected/altorder_hash_1.out b/expected/altorder_hash_1.out index ce969c44bd..e310fbdb89 100644 --- a/expected/altorder_hash_1.out +++ b/expected/altorder_hash_1.out @@ -1,9 +1,19 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differenly + * ------------------------------------ + * + * altorder_hash.out - test output for 64-bit systems and + * altorder_hash_1.out - test output for 32-bit systems. + * + */ CREATE TABLE atstsh (id int, t tsvector, d timestamp); \copy atstsh from 'data/tsts.data' CREATE INDEX atstsh_idx ON atstsh USING rum (t rum_tsvector_hash_addon_ops, d) WITH (attach = 'd', to = 't', order_by_attach='t'); -INSERT INTO atstsh VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); -INSERT INTO atstsh VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); +ERROR: doesn't support order index over pass-by-reference column +INSERT INTO atstsh VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); +INSERT INTO atstsh VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); SELECT count(*) FROM atstsh WHERE t @@ 'wr|qh'; count ------- @@ -112,20 +122,17 @@ SELECT id, d FROM atstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDE 496 | Sun May 22 11:21:22.326724 2016 (8 rows) -RESET enable_indexscan; -RESET enable_indexonlyscan; -RESET enable_bitmapscan; +-- Test bitmap index scan +SET enable_bitmapscan=on; SET enable_seqscan = off; EXPLAIN (costs off) SELECT count(*) FROM atstsh WHERE t @@ 'wr|qh'; - QUERY PLAN -------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------- Aggregate - -> Bitmap Heap Scan on atstsh - Recheck Cond: (t @@ '''wr'' | ''qh'''::tsquery) - -> Bitmap Index Scan on atstsh_idx - Index Cond: (t @@ '''wr'' | ''qh'''::tsquery) -(5 rows) + -> Seq Scan on atstsh + Filter: (t @@ '''wr'' | ''qh'''::tsquery) +(3 rows) SELECT count(*) FROM atstsh WHERE t @@ 'wr|qh'; count @@ -165,14 +172,12 @@ SELECT count(*) FROM atstsh WHERE t @@ '(eq|yt)&(wr|qh)'; EXPLAIN (costs off) SELECT count(*) FROM atstsh WHERE d < '2016-05-16 14:21:25'; - QUERY PLAN ------------------------------------------------------------------------------------------ + QUERY PLAN 
+------------------------------------------------------------------------------- Aggregate - -> Bitmap Heap Scan on atstsh - Recheck Cond: (d < 'Mon May 16 14:21:25 2016'::timestamp without time zone) - -> Bitmap Index Scan on atstsh_idx - Index Cond: (d < 'Mon May 16 14:21:25 2016'::timestamp without time zone) -(5 rows) + -> Seq Scan on atstsh + Filter: (d < 'Mon May 16 14:21:25 2016'::timestamp without time zone) +(3 rows) SELECT count(*) FROM atstsh WHERE d < '2016-05-16 14:21:25'; count @@ -182,14 +187,12 @@ SELECT count(*) FROM atstsh WHERE d < '2016-05-16 14:21:25'; EXPLAIN (costs off) SELECT count(*) FROM atstsh WHERE d > '2016-05-16 14:21:25'; - QUERY PLAN ------------------------------------------------------------------------------------------ + QUERY PLAN +------------------------------------------------------------------------------- Aggregate - -> Bitmap Heap Scan on atstsh - Recheck Cond: (d > 'Mon May 16 14:21:25 2016'::timestamp without time zone) - -> Bitmap Index Scan on atstsh_idx - Index Cond: (d > 'Mon May 16 14:21:25 2016'::timestamp without time zone) -(5 rows) + -> Seq Scan on atstsh + Filter: (d > 'Mon May 16 14:21:25 2016'::timestamp without time zone) +(3 rows) SELECT count(*) FROM atstsh WHERE d > '2016-05-16 14:21:25'; count @@ -197,61 +200,101 @@ SELECT count(*) FROM atstsh WHERE d > '2016-05-16 14:21:25'; 153 (1 row) +-- Test index scan +SET enable_indexscan=on; +SET enable_indexonlyscan=on; +SET enable_bitmapscan=off; EXPLAIN (costs off) SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; - QUERY PLAN ------------------------------------------------------------------------------------ + QUERY PLAN +------------------------------------------------------------------------------------- Limit - -> Index Scan using atstsh_idx on atstsh - Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) - Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) -(4 rows) + -> Sort + Sort Key: ((d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Seq Scan on atstsh + Filter: (t @@ '''wr'' & ''qh'''::tsquery) +(5 rows) SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; -ERROR: doesn't support order by over pass-by-reference column + id | d | ?column? 
+-----+---------------------------------+--------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 + 406 | Wed May 18 17:21:22.326724 2016 | 183597.326724 + 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 +(5 rows) + EXPLAIN (costs off) SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; - QUERY PLAN ------------------------------------------------------------------------------------ + QUERY PLAN +------------------------------------------------------------------------------------- Limit - -> Index Scan using atstsh_idx on atstsh - Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) - Order By: (d <=| 'Mon May 16 14:21:25 2016'::timestamp without time zone) -(4 rows) + -> Sort + Sort Key: ((d <=| 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Seq Scan on atstsh + Filter: (t @@ '''wr'' & ''qh'''::tsquery) +(5 rows) SELECT id, d, d <=| '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; -ERROR: doesn't support order by over pass-by-reference column + id | d | ?column? +-----+---------------------------------+--------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 + 252 | Thu May 12 07:21:22.326724 2016 | 370802.673276 + 232 | Wed May 11 11:21:22.326724 2016 | 442802.673276 + 168 | Sun May 08 19:21:22.326724 2016 | 673202.673276 +(5 rows) + EXPLAIN (costs off) SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; - QUERY PLAN ------------------------------------------------------------------------------------ + QUERY PLAN +------------------------------------------------------------------------------------- Limit - -> Index Scan using atstsh_idx on atstsh - Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) - Order By: (d |=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) -(4 rows) + -> Sort + Sort Key: ((d |=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Seq Scan on atstsh + Filter: (t @@ '''wr'' & ''qh'''::tsquery) +(5 rows) SELECT id, d, d |=> '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; -ERROR: doesn't support order by over pass-by-reference column + id | d | ?column? 
+-----+---------------------------------+--------------- + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 + 406 | Wed May 18 17:21:22.326724 2016 | 183597.326724 + 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 + 428 | Thu May 19 15:21:22.326724 2016 | 262797.326724 + 457 | Fri May 20 20:21:22.326724 2016 | 367197.326724 +(5 rows) + EXPLAIN (costs off) SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atstsh ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; - QUERY PLAN ------------------------------------------------------------------------------------ + QUERY PLAN +------------------------------------------------------------------------------------- Limit - -> Index Scan using atstsh_idx on atstsh - Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) -(3 rows) + -> Sort + Sort Key: ((d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Seq Scan on atstsh +(4 rows) SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atstsh ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; -ERROR: doesn't support order by over pass-by-reference column + id | d | ?column? +-----+---------------------------------+------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 356 | Mon May 16 15:21:22.326724 2016 | 3597.326724 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 + 357 | Mon May 16 16:21:22.326724 2016 | 7197.326724 + 353 | Mon May 16 12:21:22.326724 2016 | 7202.673276 +(5 rows) + EXPLAIN (costs off) SELECT id, d FROM atstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; - QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------ Sort Sort Key: d - -> Index Scan using atstsh_idx on atstsh - Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Seq Scan on atstsh + Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) (4 rows) SELECT id, d FROM atstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; @@ -270,12 +313,12 @@ SELECT id, d FROM atstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDE EXPLAIN (costs off) SELECT id, d FROM atstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; - QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------ Sort Sort Key: d - -> Index Scan using atstsh_idx on atstsh - Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Seq Scan on atstsh + Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) (4 rows) SELECT id, d FROM atstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; diff --git a/expected/array.out b/expected/array.out index 92864d95e4..a2fb3bb8df 100644 --- a/expected/array.out +++ b/expected/array.out @@ -1,3 +1,12 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differenly + * ------------------------------------ + * + * array.out - test output for 64-bit systems and + * array_1.out - test output for 32-bit systems. 
+ * + */ set enable_seqscan=off; set enable_sort=off; /* @@ -843,35 +852,71 @@ EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i % '{}'; DROP INDEX idx_array; /* * Check ordering using distance operator + * + * We want to check that index scan provides us correct ordering by distance + * operator. File 'data/rum_array.data' contains two arrays that statisfy + * i @> '{23,20}' and have finite distance i <=> '{51}', and a bunch of arrays + * that statisfy i @> '{23,20}' and have infinite distance i <=> '{51}'. + * + * When ordering by distance the order of this bunch of arrays with infinite + * distance is not determined and may depend of PostgreSQL version and system. + * We don't add another sort expression to ORDER BY because that might cause + * the planner to avoid using the index. Instead, we replace arrays that have + * infinite distance with {-1} to unambiguously determine the test output. + * + * 'Infinity' is printed differently in the output in different PostgreSQL + * versions, so we replace it with -1. */ CREATE TABLE test_array_order ( i int2[] ); \copy test_array_order(i) from 'data/rum_array.data'; CREATE INDEX idx_array_order ON test_array_order USING rum (i rum_anyarray_ops); +/* + * Check that plan of the query uses ordering provided by index scan + */ EXPLAIN (COSTS OFF) -SELECT *, i <=> '{51}' from test_array_order WHERE i @> '{23,20}' order by i <=> '{51}'; - QUERY PLAN ------------------------------------------------------- - Index Scan using idx_array_order on test_array_order - Index Cond: (i @> '{23,20}'::smallint[]) - Order By: (i <=> '{51}'::smallint[]) -(3 rows) +SELECT + CASE WHEN distance = 'Infinity' THEN '{-1}' + ELSE i + END i, + CASE WHEN distance = 'Infinity' THEN -1 + ELSE distance::numeric(18,14) + END distance + FROM + (SELECT *, (i <=> '{51}') AS distance + FROM test_array_order WHERE i @> '{23,20}' ORDER BY distance) t; + QUERY PLAN +------------------------------------------------------------ + Subquery Scan on t + -> Index Scan using idx_array_order on test_array_order + Index Cond: (i @> '{23,20}'::smallint[]) + Order By: (i <=> '{51}'::smallint[]) +(4 rows) -SELECT *, i <=> '{51}' from test_array_order WHERE i @> '{23,20}' order by i <=> '{51}'; - i | ?column? +SELECT + CASE WHEN distance = 'Infinity' THEN '{-1}' + ELSE i + END i, + CASE WHEN distance = 'Infinity' THEN -1 + ELSE distance::numeric(18,14) + END distance + FROM + (SELECT *, (i <=> '{51}') AS distance + FROM test_array_order WHERE i @> '{23,20}' ORDER BY distance) t; + i | distance ---------------------+------------------ {20,23,51} | 1.73205080756888 {33,51,20,77,23,65} | 2.44948974278318 - {23,76,34,23,2,20} | Infinity - {20,60,45,23,29} | Infinity - {23,89,38,20,40,95} | Infinity - {23,20,72} | Infinity - {73,23,20} | Infinity - {6,97,20,89,23} | Infinity - {20,98,30,23,1,66} | Infinity - {57,23,39,46,50,20} | Infinity - {81,20,26,22,23} | Infinity - {18,23,10,90,15,20} | Infinity + {-1} | -1 + {-1} | -1 + {-1} | -1 + {-1} | -1 + {-1} | -1 + {-1} | -1 + {-1} | -1 + {-1} | -1 + {-1} | -1 + {-1} | -1 (12 rows) diff --git a/expected/array_1.out b/expected/array_1.out index e88ae50589..cc5f93307c 100644 --- a/expected/array_1.out +++ b/expected/array_1.out @@ -1,3 +1,12 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differenly + * ------------------------------------ + * + * array.out - test output for 64-bit systems and + * array_1.out - test output for 32-bit systems. 
+ * + */ set enable_seqscan=off; set enable_sort=off; /* @@ -836,35 +845,71 @@ EXPLAIN (COSTS OFF) SELECT * FROM test_array WHERE i % '{}'; DROP INDEX idx_array; /* * Check ordering using distance operator + * + * We want to check that index scan provides us correct ordering by distance + * operator. File 'data/rum_array.data' contains two arrays that statisfy + * i @> '{23,20}' and have finite distance i <=> '{51}', and a bunch of arrays + * that statisfy i @> '{23,20}' and have infinite distance i <=> '{51}'. + * + * When ordering by distance the order of this bunch of arrays with infinite + * distance is not determined and may depend of PostgreSQL version and system. + * We don't add another sort expression to ORDER BY because that might cause + * the planner to avoid using the index. Instead, we replace arrays that have + * infinite distance with {-1} to unambiguously determine the test output. + * + * 'Infinity' is printed differently in the output in different PostgreSQL + * versions, so we replace it with -1. */ CREATE TABLE test_array_order ( i int2[] ); \copy test_array_order(i) from 'data/rum_array.data'; CREATE INDEX idx_array_order ON test_array_order USING rum (i rum_anyarray_ops); +/* + * Check that plan of the query uses ordering provided by index scan + */ EXPLAIN (COSTS OFF) -SELECT *, i <=> '{51}' from test_array_order WHERE i @> '{23,20}' order by i <=> '{51}'; - QUERY PLAN ------------------------------------------------------- - Index Scan using idx_array_order on test_array_order - Index Cond: (i @> '{23,20}'::smallint[]) - Order By: (i <=> '{51}'::smallint[]) -(3 rows) +SELECT + CASE WHEN distance = 'Infinity' THEN '{-1}' + ELSE i + END i, + CASE WHEN distance = 'Infinity' THEN -1 + ELSE distance::numeric(18,14) + END distance + FROM + (SELECT *, (i <=> '{51}') AS distance + FROM test_array_order WHERE i @> '{23,20}' ORDER BY distance) t; + QUERY PLAN +------------------------------------------------------------ + Subquery Scan on t + -> Index Scan using idx_array_order on test_array_order + Index Cond: (i @> '{23,20}'::smallint[]) + Order By: (i <=> '{51}'::smallint[]) +(4 rows) -SELECT *, i <=> '{51}' from test_array_order WHERE i @> '{23,20}' order by i <=> '{51}'; - i | ?column? 
+SELECT + CASE WHEN distance = 'Infinity' THEN '{-1}' + ELSE i + END i, + CASE WHEN distance = 'Infinity' THEN -1 + ELSE distance::numeric(18,14) + END distance + FROM + (SELECT *, (i <=> '{51}') AS distance + FROM test_array_order WHERE i @> '{23,20}' ORDER BY distance) t; + i | distance ---------------------+------------------ {20,23,51} | 1.73205080756888 {33,51,20,77,23,65} | 2.44948974278318 - {23,76,34,23,2,20} | Infinity - {20,60,45,23,29} | Infinity - {23,89,38,20,40,95} | Infinity - {23,20,72} | Infinity - {73,23,20} | Infinity - {6,97,20,89,23} | Infinity - {20,98,30,23,1,66} | Infinity - {57,23,39,46,50,20} | Infinity - {81,20,26,22,23} | Infinity - {18,23,10,90,15,20} | Infinity + {-1} | -1 + {-1} | -1 + {-1} | -1 + {-1} | -1 + {-1} | -1 + {-1} | -1 + {-1} | -1 + {-1} | -1 + {-1} | -1 + {-1} | -1 (12 rows) diff --git a/expected/expr.out b/expected/expr.out new file mode 100644 index 0000000000..b57de73ff4 --- /dev/null +++ b/expected/expr.out @@ -0,0 +1,26 @@ +CREATE TABLE documents ( + en text not null, + score float not null, + textsearch_index_en_col tsvector +); +INSERT INTO documents VALUES ('the pet cat is in the shed', 56, to_tsvector('english', 'the pet cat is in the shed')); +CREATE INDEX textsearch_index_en ON documents + USING rum (textsearch_index_en_col rum_tsvector_addon_ops, score) + WITH (attach = 'score', to = 'textsearch_index_en_col'); +SET enable_seqscan=off; +-- should be 1 row +SELECT * FROM documents WHERE textsearch_index_en_col @@ ('pet'::tsquery <-> ('dog'::tsquery || 'cat'::tsquery)); + en | score | textsearch_index_en_col +----------------------------+-------+-------------------------- + the pet cat is in the shed | 56 | 'cat':3 'pet':2 'shed':7 +(1 row) + +SET enable_seqscan=on; +-- 1 row +SELECT * FROM documents WHERE textsearch_index_en_col @@ ('pet'::tsquery <-> ('dog'::tsquery || 'cat'::tsquery)); + en | score | textsearch_index_en_col +----------------------------+-------+-------------------------- + the pet cat is in the shed | 56 | 'cat':3 'pet':2 'shed':7 +(1 row) + +DROP TABLE documents; diff --git a/expected/float8.out b/expected/float8.out index e96cb0ea54..fdca51343a 100644 --- a/expected/float8.out +++ b/expected/float8.out @@ -1,3 +1,12 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differenly + * ------------------------------------ + * + * float8.out - test output for 64-bit systems and + * float8_1.out - test output for 32-bit systems. + * + */ set enable_seqscan=off; CREATE TABLE test_float8 ( i float8 diff --git a/expected/float8_1.out b/expected/float8_1.out index dabdd51964..b421dcf311 100644 --- a/expected/float8_1.out +++ b/expected/float8_1.out @@ -1,3 +1,12 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differenly + * ------------------------------------ + * + * float8.out - test output for 64-bit systems and + * float8_1.out - test output for 32-bit systems. 
+ * + */ set enable_seqscan=off; CREATE TABLE test_float8 ( i float8 diff --git a/expected/int4.out b/expected/int4.out index 379dd6dea3..00b73e3432 100644 --- a/expected/int4.out +++ b/expected/int4.out @@ -145,7 +145,6 @@ SELECT id FROM test_int4_o WHERE t @@ 'wr&qh' AND id >= 400 ORDER BY id; RESET enable_indexscan; RESET enable_indexonlyscan; -RESET enable_bitmapscan; SET enable_seqscan = off; EXPLAIN (costs off) SELECT id, id <=> 400 FROM test_int4_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; @@ -258,7 +257,6 @@ CREATE TABLE test_int4_a AS SELECT id::int4, t FROM tsts; CREATE INDEX test_int4_a_idx ON test_int4_a USING rum (t rum_tsvector_addon_ops, id) WITH (attach = 'id', to = 't', order_by_attach='t'); -SET enable_bitmapscan=OFF; EXPLAIN (costs off) SELECT count(*) FROM test_int4_a WHERE id < 400; QUERY PLAN @@ -448,7 +446,6 @@ SELECT id FROM test_int4_h_o WHERE t @@ 'wr&qh' AND id >= 400 ORDER BY id; RESET enable_indexscan; RESET enable_indexonlyscan; -RESET enable_bitmapscan; SET enable_seqscan = off; EXPLAIN (costs off) SELECT id, id <=> 400 FROM test_int4_h_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; @@ -561,7 +558,6 @@ CREATE TABLE test_int4_h_a AS SELECT id::int4, t FROM tsts; CREATE INDEX test_int4_h_a_idx ON test_int4_h_a USING rum (t rum_tsvector_hash_addon_ops, id) WITH (attach = 'id', to = 't', order_by_attach='t'); -SET enable_bitmapscan=OFF; EXPLAIN (costs off) SELECT count(*) FROM test_int4_h_a WHERE id < 400; QUERY PLAN diff --git a/expected/int8.out b/expected/int8.out index 40b091cdda..663162a18e 100644 --- a/expected/int8.out +++ b/expected/int8.out @@ -1,3 +1,12 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differenly + * ------------------------------------ + * + * int8.out - test output for 64-bit systems and + * int8_1.out - test output for 32-bit systems. 
+ * + */ set enable_seqscan=off; CREATE TABLE test_int8 ( i int8 @@ -145,7 +154,6 @@ SELECT id FROM test_int8_o WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; RESET enable_indexscan; RESET enable_indexonlyscan; -RESET enable_bitmapscan; SET enable_seqscan = off; EXPLAIN (costs off) SELECT id, id <=> 400 FROM test_int8_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; @@ -258,7 +266,6 @@ CREATE TABLE test_int8_a AS SELECT id::int8, t FROM tsts; CREATE INDEX test_int8_a_idx ON test_int8_a USING rum (t rum_tsvector_addon_ops, id) WITH (attach = 'id', to = 't', order_by_attach='t'); -SET enable_bitmapscan=OFF; EXPLAIN (costs off) SELECT count(*) FROM test_int8_a WHERE id < 400::int8; QUERY PLAN @@ -448,7 +455,6 @@ SELECT id FROM test_int8_h_o WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id RESET enable_indexscan; RESET enable_indexonlyscan; -RESET enable_bitmapscan; SET enable_seqscan = off; EXPLAIN (costs off) SELECT id, id <=> 400 FROM test_int8_h_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; @@ -561,7 +567,6 @@ CREATE TABLE test_int8_h_a AS SELECT id::int8, t FROM tsts; CREATE INDEX test_int8_h_a_idx ON test_int8_h_a USING rum (t rum_tsvector_hash_addon_ops, id) WITH (attach = 'id', to = 't', order_by_attach='t'); -SET enable_bitmapscan=OFF; EXPLAIN (costs off) SELECT count(*) FROM test_int8_h_a WHERE id < 400::int8; QUERY PLAN diff --git a/expected/int8_1.out b/expected/int8_1.out index fe7a3151fb..ffced0aaf8 100644 --- a/expected/int8_1.out +++ b/expected/int8_1.out @@ -1,3 +1,12 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differenly + * ------------------------------------ + * + * int8.out - test output for 64-bit systems and + * int8_1.out - test output for 32-bit systems. + * + */ set enable_seqscan=off; CREATE TABLE test_int8 ( i int8 @@ -130,7 +139,6 @@ SELECT id FROM test_int8_o WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; RESET enable_indexscan; RESET enable_indexonlyscan; -RESET enable_bitmapscan; SET enable_seqscan = off; EXPLAIN (costs off) SELECT id, id <=> 400 FROM test_int8_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; @@ -219,14 +227,14 @@ CREATE TABLE test_int8_a AS SELECT id::int8, t FROM tsts; CREATE INDEX test_int8_a_idx ON test_int8_a USING rum (t rum_tsvector_addon_ops, id) WITH (attach = 'id', to = 't', order_by_attach='t'); -SET enable_bitmapscan=OFF; +ERROR: doesn't support order index over pass-by-reference column EXPLAIN (costs off) SELECT count(*) FROM test_int8_a WHERE id < 400::int8; - QUERY PLAN -------------------------------------------------------- + QUERY PLAN +-------------------------------------- Aggregate - -> Index Scan using test_int8_a_idx on test_int8_a - Index Cond: (id < '400'::bigint) + -> Seq Scan on test_int8_a + Filter: (id < '400'::bigint) (3 rows) SELECT count(*) FROM test_int8_a WHERE id < 400::int8; @@ -237,48 +245,75 @@ SELECT count(*) FROM test_int8_a WHERE id < 400::int8; EXPLAIN (costs off) SELECT id, id <=> 400 FROM test_int8_a WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; - QUERY PLAN -------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------- Limit - -> Index Scan using test_int8_a_idx on test_int8_a - Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) - Order By: (id <=> '400'::bigint) -(4 rows) + -> Sort + Sort Key: ((id <=> '400'::bigint)) + -> Seq Scan on test_int8_a + Filter: (t @@ '''wr'' & ''qh'''::tsquery) +(5 rows) SELECT id, id <=> 400 FROM test_int8_a WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; 
-ERROR: doesn't support order by over pass-by-reference column + id | ?column? +-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 371 | 29 + 355 | 45 +(5 rows) + EXPLAIN (costs off) SELECT id, id <=| 400 FROM test_int8_a WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; - QUERY PLAN -------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------- Limit - -> Index Scan using test_int8_a_idx on test_int8_a - Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) - Order By: (id <=| '400'::bigint) -(4 rows) + -> Sort + Sort Key: ((id <=| '400'::bigint)) + -> Seq Scan on test_int8_a + Filter: (t @@ '''wr'' & ''qh'''::tsquery) +(5 rows) SELECT id, id <=| 400 FROM test_int8_a WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; -ERROR: doesn't support order by over pass-by-reference column + id | ?column? +-----+---------- + 371 | 29 + 355 | 45 + 354 | 46 + 252 | 148 + 232 | 168 +(5 rows) + EXPLAIN (costs off) SELECT id, id |=> 400 FROM test_int8_a WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; - QUERY PLAN -------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------- Limit - -> Index Scan using test_int8_a_idx on test_int8_a - Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) - Order By: (id |=> '400'::bigint) -(4 rows) + -> Sort + Sort Key: ((id |=> '400'::bigint)) + -> Seq Scan on test_int8_a + Filter: (t @@ '''wr'' & ''qh'''::tsquery) +(5 rows) SELECT id, id |=> 400 FROM test_int8_a WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; -ERROR: doesn't support order by over pass-by-reference column + id | ?column? +-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 457 | 57 + 458 | 58 +(5 rows) + EXPLAIN (costs off) SELECT id FROM test_int8_a WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; - QUERY PLAN ------------------------------------------------------------------------------------ + QUERY PLAN +------------------------------------------------------------------------------- Sort Sort Key: id - -> Index Scan using test_int8_a_idx on test_int8_a - Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id <= '400'::bigint)) + -> Seq Scan on test_int8_a + Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id <= '400'::bigint)) (4 rows) SELECT id FROM test_int8_a WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; @@ -298,12 +333,12 @@ SELECT id FROM test_int8_a WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; EXPLAIN (costs off) SELECT id FROM test_int8_a WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; - QUERY PLAN ------------------------------------------------------------------------------------ + QUERY PLAN +------------------------------------------------------------------------------- Sort Sort Key: id - -> Index Scan using test_int8_a_idx on test_int8_a - Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id >= '400'::bigint)) + -> Seq Scan on test_int8_a + Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id >= '400'::bigint)) (4 rows) SELECT id FROM test_int8_a WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; @@ -385,7 +420,6 @@ SELECT id FROM test_int8_h_o WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id RESET enable_indexscan; RESET enable_indexonlyscan; -RESET enable_bitmapscan; SET enable_seqscan = off; EXPLAIN (costs off) SELECT id, id <=> 400 FROM test_int8_h_o WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; @@ -474,14 +508,14 @@ CREATE TABLE test_int8_h_a AS SELECT id::int8, t FROM tsts; CREATE INDEX test_int8_h_a_idx ON test_int8_h_a USING rum (t 
rum_tsvector_hash_addon_ops, id) WITH (attach = 'id', to = 't', order_by_attach='t'); -SET enable_bitmapscan=OFF; +ERROR: doesn't support order index over pass-by-reference column EXPLAIN (costs off) SELECT count(*) FROM test_int8_h_a WHERE id < 400::int8; - QUERY PLAN ------------------------------------------------------------ + QUERY PLAN +-------------------------------------- Aggregate - -> Index Scan using test_int8_h_a_idx on test_int8_h_a - Index Cond: (id < '400'::bigint) + -> Seq Scan on test_int8_h_a + Filter: (id < '400'::bigint) (3 rows) SELECT count(*) FROM test_int8_h_a WHERE id < 400::int8; @@ -492,48 +526,75 @@ SELECT count(*) FROM test_int8_h_a WHERE id < 400::int8; EXPLAIN (costs off) SELECT id, id <=> 400 FROM test_int8_h_a WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; - QUERY PLAN ------------------------------------------------------------ + QUERY PLAN +--------------------------------------------------------- Limit - -> Index Scan using test_int8_h_a_idx on test_int8_h_a - Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) - Order By: (id <=> '400'::bigint) -(4 rows) + -> Sort + Sort Key: ((id <=> '400'::bigint)) + -> Seq Scan on test_int8_h_a + Filter: (t @@ '''wr'' & ''qh'''::tsquery) +(5 rows) SELECT id, id <=> 400 FROM test_int8_h_a WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5; -ERROR: doesn't support order by over pass-by-reference column + id | ?column? +-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 371 | 29 + 355 | 45 +(5 rows) + EXPLAIN (costs off) SELECT id, id <=| 400 FROM test_int8_h_a WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; - QUERY PLAN ------------------------------------------------------------ + QUERY PLAN +--------------------------------------------------------- Limit - -> Index Scan using test_int8_h_a_idx on test_int8_h_a - Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) - Order By: (id <=| '400'::bigint) -(4 rows) + -> Sort + Sort Key: ((id <=| '400'::bigint)) + -> Seq Scan on test_int8_h_a + Filter: (t @@ '''wr'' & ''qh'''::tsquery) +(5 rows) SELECT id, id <=| 400 FROM test_int8_h_a WHERE t @@ 'wr&qh' ORDER BY id <=| 400 LIMIT 5; -ERROR: doesn't support order by over pass-by-reference column + id | ?column? +-----+---------- + 371 | 29 + 355 | 45 + 354 | 46 + 252 | 148 + 232 | 168 +(5 rows) + EXPLAIN (costs off) SELECT id, id |=> 400 FROM test_int8_h_a WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; - QUERY PLAN ------------------------------------------------------------ + QUERY PLAN +--------------------------------------------------------- Limit - -> Index Scan using test_int8_h_a_idx on test_int8_h_a - Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) - Order By: (id |=> '400'::bigint) -(4 rows) + -> Sort + Sort Key: ((id |=> '400'::bigint)) + -> Seq Scan on test_int8_h_a + Filter: (t @@ '''wr'' & ''qh'''::tsquery) +(5 rows) SELECT id, id |=> 400 FROM test_int8_h_a WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; -ERROR: doesn't support order by over pass-by-reference column + id | ?column? 
+-----+---------- + 406 | 6 + 415 | 15 + 428 | 28 + 457 | 57 + 458 | 58 +(5 rows) + EXPLAIN (costs off) SELECT id FROM test_int8_h_a WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; - QUERY PLAN ------------------------------------------------------------------------------------ + QUERY PLAN +------------------------------------------------------------------------------- Sort Sort Key: id - -> Index Scan using test_int8_h_a_idx on test_int8_h_a - Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id <= '400'::bigint)) + -> Seq Scan on test_int8_h_a + Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id <= '400'::bigint)) (4 rows) SELECT id FROM test_int8_h_a WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; @@ -553,12 +614,12 @@ SELECT id FROM test_int8_h_a WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id EXPLAIN (costs off) SELECT id FROM test_int8_h_a WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; - QUERY PLAN ------------------------------------------------------------------------------------ + QUERY PLAN +------------------------------------------------------------------------------- Sort Sort Key: id - -> Index Scan using test_int8_h_a_idx on test_int8_h_a - Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id >= '400'::bigint)) + -> Seq Scan on test_int8_h_a + Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id >= '400'::bigint)) (4 rows) SELECT id FROM test_int8_h_a WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; @@ -578,12 +639,26 @@ CREATE INDEX test_int8_id_t_idx ON test_int8_o USING rum (t rum_tsvector_ops, id); EXPLAIN (costs off) SELECT id FROM test_int8_h_a WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id <=> 400::int8; - QUERY PLAN ------------------------------------------------------------------------------ - Index Scan using test_int8_h_a_idx on test_int8_h_a - Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id <= '400'::bigint)) - Order By: (id <=> '400'::bigint) -(3 rows) + QUERY PLAN +------------------------------------------------------------------------------- + Sort + Sort Key: ((id <=> '400'::bigint)) + -> Seq Scan on test_int8_h_a + Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id <= '400'::bigint)) +(4 rows) SELECT id FROM test_int8_h_a WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id <=> 400::int8; -ERROR: doesn't support order by over pass-by-reference column + id +----- + 371 + 355 + 354 + 252 + 232 + 168 + 135 + 71 + 39 + 16 +(10 rows) + diff --git a/expected/money.out b/expected/money.out index 7b9b20580e..b2e9bac41d 100644 --- a/expected/money.out +++ b/expected/money.out @@ -1,3 +1,12 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differenly + * ------------------------------------ + * + * money.out - test output for 64-bit systems and + * money_1.out - test output for 32-bit systems. + * + */ set enable_seqscan=off; CREATE TABLE test_money ( i money diff --git a/expected/money_1.out b/expected/money_1.out index b8ec0ec5c7..6a3fa8c211 100644 --- a/expected/money_1.out +++ b/expected/money_1.out @@ -1,3 +1,12 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differenly + * ------------------------------------ + * + * money.out - test output for 64-bit systems and + * money_1.out - test output for 32-bit systems. 
+ * + */ set enable_seqscan=off; CREATE TABLE test_money ( i money diff --git a/expected/orderby.out b/expected/orderby.out index 38cda70f32..07ae7322ed 100644 --- a/expected/orderby.out +++ b/expected/orderby.out @@ -1,9 +1,92 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differenly + * ------------------------------------ + * + * orderby.out - test output for 64-bit systems and + * orderby_1.out - test output for 32-bit systems. + * + */ CREATE TABLE tsts (id int, t tsvector, d timestamp); \copy tsts from 'data/tsts.data' CREATE INDEX tsts_idx ON tsts USING rum (t rum_tsvector_addon_ops, d) WITH (attach = 'd', to = 't'); -INSERT INTO tsts VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); -INSERT INTO tsts VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); +INSERT INTO tsts VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); +INSERT INTO tsts VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); +SET enable_indexscan=OFF; +SET enable_indexonlyscan=OFF; +SET enable_bitmapscan=OFF; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + id | d | ?column? +-----+---------------------------------+--------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 + 406 | Wed May 18 17:21:22.326724 2016 | 183597.326724 + 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 +(5 rows) + +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; + id | d | ?column? +-----+---------------------------------+--------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 + 252 | Thu May 12 07:21:22.326724 2016 | 370802.673276 + 232 | Wed May 11 11:21:22.326724 2016 | 442802.673276 + 168 | Sun May 08 19:21:22.326724 2016 | 673202.673276 +(5 rows) + +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; + id | d | ?column? 
+-----+---------------------------------+--------------- + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 + 406 | Wed May 18 17:21:22.326724 2016 | 183597.326724 + 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 + 428 | Thu May 19 15:21:22.326724 2016 | 262797.326724 + 457 | Fri May 20 20:21:22.326724 2016 | 367197.326724 +(5 rows) + +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; + id | d +-----+--------------------------------- + 16 | Mon May 02 11:21:22.326724 2016 + 39 | Tue May 03 10:21:22.326724 2016 + 71 | Wed May 04 18:21:22.326724 2016 + 135 | Sat May 07 10:21:22.326724 2016 + 168 | Sun May 08 19:21:22.326724 2016 + 232 | Wed May 11 11:21:22.326724 2016 + 252 | Thu May 12 07:21:22.326724 2016 + 354 | Mon May 16 13:21:22.326724 2016 + 355 | Mon May 16 14:21:22.326724 2016 +(9 rows) + +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; + id | d +-----+--------------------------------- + 371 | Tue May 17 06:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 +(8 rows) + +-- Test bitmap index scan +RESET enable_bitmapscan; +SET enable_seqscan = off; +EXPLAIN (costs off) +SELECT count(*) FROM tsts WHERE t @@ 'wr|qh'; + QUERY PLAN +------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on tsts + Recheck Cond: (t @@ '''wr'' | ''qh'''::tsquery) + -> Bitmap Index Scan on tsts_idx + Index Cond: (t @@ '''wr'' | ''qh'''::tsquery) +(5 rows) + SELECT count(*) FROM tsts WHERE t @@ 'wr|qh'; count ------- @@ -40,9 +123,19 @@ SELECT count(*) FROM tsts WHERE t @@ '(eq|yt)&(wr|qh)'; 39 (1 row) -SET enable_indexscan=OFF; -SET enable_indexonlyscan=OFF; -SET enable_bitmapscan=OFF; +EXPLAIN (costs off) +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: ((d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Heap Scan on tsts + Recheck Cond: (t @@ '''wr'' & ''qh'''::tsquery) + -> Bitmap Index Scan on tsts_idx + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) +(7 rows) + SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- @@ -53,6 +146,19 @@ SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 (5 rows) +EXPLAIN (costs off) +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: ((d <=| 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Heap Scan on tsts + Recheck Cond: (t @@ '''wr'' & ''qh'''::tsquery) + -> Bitmap Index Scan on tsts_idx + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) +(7 rows) + SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? 
-----+---------------------------------+--------------- @@ -63,6 +169,19 @@ SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY 168 | Sun May 08 19:21:22.326724 2016 | 673202.673276 (5 rows) +EXPLAIN (costs off) +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: ((d |=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Heap Scan on tsts + Recheck Cond: (t @@ '''wr'' & ''qh'''::tsquery) + -> Bitmap Index Scan on tsts_idx + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) +(7 rows) + SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- @@ -73,6 +192,37 @@ SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY 457 | Fri May 20 20:21:22.326724 2016 | 367197.326724 (5 rows) +EXPLAIN (costs off) +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +----------------------------------------------------------------------------------- + Limit + -> Index Scan using tsts_idx on tsts + Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) +(3 rows) + +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + id | d | ?column? +-----+---------------------------------+------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 356 | Mon May 16 15:21:22.326724 2016 | 3597.326724 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 + 357 | Mon May 16 16:21:22.326724 2016 | 7197.326724 + 353 | Mon May 16 12:21:22.326724 2016 | 7202.673276 +(5 rows) + +EXPLAIN (costs off) +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------- + Sort + Sort Key: d + -> Bitmap Heap Scan on tsts + Recheck Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Index Scan on tsts_idx + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) +(6 rows) + SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- @@ -87,6 +237,18 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER 355 | Mon May 16 14:21:22.326724 2016 (9 rows) +EXPLAIN (costs off) +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------- + Sort + Sort Key: d + -> Bitmap Heap Scan on tsts + Recheck Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Index Scan on tsts_idx + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) +(6 rows) + SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- @@ -100,20 +262,18 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= 
'2016-05-16 14:21:25' ORDER 496 | Sun May 22 11:21:22.326724 2016 (8 rows) +-- Test index scan RESET enable_indexscan; RESET enable_indexonlyscan; -RESET enable_bitmapscan; -SET enable_seqscan = off; +SET enable_bitmapscan=OFF; EXPLAIN (costs off) SELECT count(*) FROM tsts WHERE t @@ 'wr|qh'; - QUERY PLAN -------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------- Aggregate - -> Bitmap Heap Scan on tsts - Recheck Cond: (t @@ '''wr'' | ''qh'''::tsquery) - -> Bitmap Index Scan on tsts_idx - Index Cond: (t @@ '''wr'' | ''qh'''::tsquery) -(5 rows) + -> Index Scan using tsts_idx on tsts + Index Cond: (t @@ '''wr'' | ''qh'''::tsquery) +(3 rows) SELECT count(*) FROM tsts WHERE t @@ 'wr|qh'; count @@ -277,54 +437,6 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER 496 | Sun May 22 11:21:22.326724 2016 (8 rows) -SET enable_bitmapscan=OFF; -EXPLAIN (costs off) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; - QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------- - Sort - Sort Key: d - -> Index Scan using tsts_idx on tsts - Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) -(4 rows) - -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; - id | d ------+--------------------------------- - 16 | Mon May 02 11:21:22.326724 2016 - 39 | Tue May 03 10:21:22.326724 2016 - 71 | Wed May 04 18:21:22.326724 2016 - 135 | Sat May 07 10:21:22.326724 2016 - 168 | Sun May 08 19:21:22.326724 2016 - 232 | Wed May 11 11:21:22.326724 2016 - 252 | Thu May 12 07:21:22.326724 2016 - 354 | Mon May 16 13:21:22.326724 2016 - 355 | Mon May 16 14:21:22.326724 2016 -(9 rows) - -EXPLAIN (costs off) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; - QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------- - Sort - Sort Key: d - -> Index Scan using tsts_idx on tsts - Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) -(4 rows) - -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; - id | d ------+--------------------------------- - 371 | Tue May 17 06:21:22.326724 2016 - 406 | Wed May 18 17:21:22.326724 2016 - 415 | Thu May 19 02:21:22.326724 2016 - 428 | Thu May 19 15:21:22.326724 2016 - 457 | Fri May 20 20:21:22.326724 2016 - 458 | Fri May 20 21:21:22.326724 2016 - 484 | Sat May 21 23:21:22.326724 2016 - 496 | Sun May 22 11:21:22.326724 2016 -(8 rows) - SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d ASC LIMIT 3; id | d ----+--------------------------------- @@ -357,6 +469,11 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER 458 | Fri May 20 21:21:22.326724 2016 (3 rows) +-- Test "ORDER BY" error message +DROP INDEX tsts_idx; +CREATE INDEX tsts_idx ON tsts USING rum (t rum_tsvector_addon_ops, d); +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +ERROR: cannot order without attribute 2 in ORDER BY clause -- Test multicolumn index RESET enable_indexscan; RESET enable_indexonlyscan; diff --git a/expected/orderby_1.out b/expected/orderby_1.out index 09ace4276c..cdd536ac9d 
100644 --- a/expected/orderby_1.out +++ b/expected/orderby_1.out @@ -1,9 +1,92 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differenly + * ------------------------------------ + * + * orderby.out - test output for 64-bit systems and + * orderby_1.out - test output for 32-bit systems. + * + */ CREATE TABLE tsts (id int, t tsvector, d timestamp); \copy tsts from 'data/tsts.data' CREATE INDEX tsts_idx ON tsts USING rum (t rum_tsvector_addon_ops, d) WITH (attach = 'd', to = 't'); -INSERT INTO tsts VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); -INSERT INTO tsts VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); +INSERT INTO tsts VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); +INSERT INTO tsts VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); +SET enable_indexscan=OFF; +SET enable_indexonlyscan=OFF; +SET enable_bitmapscan=OFF; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + id | d | ?column? +-----+---------------------------------+--------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 + 406 | Wed May 18 17:21:22.326724 2016 | 183597.326724 + 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 +(5 rows) + +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; + id | d | ?column? +-----+---------------------------------+--------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 + 252 | Thu May 12 07:21:22.326724 2016 | 370802.673276 + 232 | Wed May 11 11:21:22.326724 2016 | 442802.673276 + 168 | Sun May 08 19:21:22.326724 2016 | 673202.673276 +(5 rows) + +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; + id | d | ?column? 
+-----+---------------------------------+--------------- + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 + 406 | Wed May 18 17:21:22.326724 2016 | 183597.326724 + 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 + 428 | Thu May 19 15:21:22.326724 2016 | 262797.326724 + 457 | Fri May 20 20:21:22.326724 2016 | 367197.326724 +(5 rows) + +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; + id | d +-----+--------------------------------- + 16 | Mon May 02 11:21:22.326724 2016 + 39 | Tue May 03 10:21:22.326724 2016 + 71 | Wed May 04 18:21:22.326724 2016 + 135 | Sat May 07 10:21:22.326724 2016 + 168 | Sun May 08 19:21:22.326724 2016 + 232 | Wed May 11 11:21:22.326724 2016 + 252 | Thu May 12 07:21:22.326724 2016 + 354 | Mon May 16 13:21:22.326724 2016 + 355 | Mon May 16 14:21:22.326724 2016 +(9 rows) + +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; + id | d +-----+--------------------------------- + 371 | Tue May 17 06:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 +(8 rows) + +-- Test bitmap index scan +RESET enable_bitmapscan; +SET enable_seqscan = off; +EXPLAIN (costs off) +SELECT count(*) FROM tsts WHERE t @@ 'wr|qh'; + QUERY PLAN +------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on tsts + Recheck Cond: (t @@ '''wr'' | ''qh'''::tsquery) + -> Bitmap Index Scan on tsts_idx + Index Cond: (t @@ '''wr'' | ''qh'''::tsquery) +(5 rows) + SELECT count(*) FROM tsts WHERE t @@ 'wr|qh'; count ------- @@ -40,9 +123,19 @@ SELECT count(*) FROM tsts WHERE t @@ '(eq|yt)&(wr|qh)'; 39 (1 row) -SET enable_indexscan=OFF; -SET enable_indexonlyscan=OFF; -SET enable_bitmapscan=OFF; +EXPLAIN (costs off) +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: ((d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Heap Scan on tsts + Recheck Cond: (t @@ '''wr'' & ''qh'''::tsquery) + -> Bitmap Index Scan on tsts_idx + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) +(7 rows) + SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- @@ -53,6 +146,19 @@ SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 (5 rows) +EXPLAIN (costs off) +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: ((d <=| 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Heap Scan on tsts + Recheck Cond: (t @@ '''wr'' & ''qh'''::tsquery) + -> Bitmap Index Scan on tsts_idx + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) +(7 rows) + SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? 
-----+---------------------------------+--------------- @@ -63,6 +169,19 @@ SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY 168 | Sun May 08 19:21:22.326724 2016 | 673202.673276 (5 rows) +EXPLAIN (costs off) +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: ((d |=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Heap Scan on tsts + Recheck Cond: (t @@ '''wr'' & ''qh'''::tsquery) + -> Bitmap Index Scan on tsts_idx + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) +(7 rows) + SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- @@ -73,6 +192,29 @@ SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY 457 | Fri May 20 20:21:22.326724 2016 | 367197.326724 (5 rows) +EXPLAIN (costs off) +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +----------------------------------------------------------------------------------- + Limit + -> Index Scan using tsts_idx on tsts + Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) +(3 rows) + +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +ERROR: doesn't support order by over pass-by-reference column +EXPLAIN (costs off) +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------- + Sort + Sort Key: d + -> Bitmap Heap Scan on tsts + Recheck Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Index Scan on tsts_idx + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) +(6 rows) + SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- @@ -87,6 +229,18 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER 355 | Mon May 16 14:21:22.326724 2016 (9 rows) +EXPLAIN (costs off) +SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------- + Sort + Sort Key: d + -> Bitmap Heap Scan on tsts + Recheck Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Index Scan on tsts_idx + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) +(6 rows) + SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- @@ -100,20 +254,18 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER 496 | Sun May 22 11:21:22.326724 2016 (8 rows) +-- Test index scan RESET enable_indexscan; RESET enable_indexonlyscan; -RESET enable_bitmapscan; -SET enable_seqscan = off; +SET enable_bitmapscan=OFF; EXPLAIN (costs off) SELECT count(*) FROM tsts WHERE t @@ 'wr|qh'; - QUERY 
PLAN -------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------- Aggregate - -> Bitmap Heap Scan on tsts - Recheck Cond: (t @@ '''wr'' | ''qh'''::tsquery) - -> Bitmap Index Scan on tsts_idx - Index Cond: (t @@ '''wr'' | ''qh'''::tsquery) -(5 rows) + -> Index Scan using tsts_idx on tsts + Index Cond: (t @@ '''wr'' | ''qh'''::tsquery) +(3 rows) SELECT count(*) FROM tsts WHERE t @@ 'wr|qh'; count @@ -245,54 +397,6 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER 496 | Sun May 22 11:21:22.326724 2016 (8 rows) -SET enable_bitmapscan=OFF; -EXPLAIN (costs off) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; - QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------- - Sort - Sort Key: d - -> Index Scan using tsts_idx on tsts - Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) -(4 rows) - -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; - id | d ------+--------------------------------- - 16 | Mon May 02 11:21:22.326724 2016 - 39 | Tue May 03 10:21:22.326724 2016 - 71 | Wed May 04 18:21:22.326724 2016 - 135 | Sat May 07 10:21:22.326724 2016 - 168 | Sun May 08 19:21:22.326724 2016 - 232 | Wed May 11 11:21:22.326724 2016 - 252 | Thu May 12 07:21:22.326724 2016 - 354 | Mon May 16 13:21:22.326724 2016 - 355 | Mon May 16 14:21:22.326724 2016 -(9 rows) - -EXPLAIN (costs off) -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; - QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------- - Sort - Sort Key: d - -> Index Scan using tsts_idx on tsts - Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) -(4 rows) - -SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; - id | d ------+--------------------------------- - 371 | Tue May 17 06:21:22.326724 2016 - 406 | Wed May 18 17:21:22.326724 2016 - 415 | Thu May 19 02:21:22.326724 2016 - 428 | Thu May 19 15:21:22.326724 2016 - 457 | Fri May 20 20:21:22.326724 2016 - 458 | Fri May 20 21:21:22.326724 2016 - 484 | Sat May 21 23:21:22.326724 2016 - 496 | Sun May 22 11:21:22.326724 2016 -(8 rows) - SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d ASC LIMIT 3; id | d ----+--------------------------------- @@ -325,6 +429,11 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER 458 | Fri May 20 21:21:22.326724 2016 (3 rows) +-- Test "ORDER BY" error message +DROP INDEX tsts_idx; +CREATE INDEX tsts_idx ON tsts USING rum (t rum_tsvector_addon_ops, d); +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +ERROR: doesn't support order by over pass-by-reference column -- Test multicolumn index RESET enable_indexscan; RESET enable_indexonlyscan; diff --git a/expected/orderby_hash.out b/expected/orderby_hash.out index 1636088fdb..782ad5700e 100644 --- a/expected/orderby_hash.out +++ b/expected/orderby_hash.out @@ -1,9 +1,92 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differenly + * ------------------------------------ + * + * orderby_hash.out - test output for 64-bit systems and + * 
orderby_hash_1.out - test output for 32-bit systems. + * + */ CREATE TABLE tstsh (id int, t tsvector, d timestamp); \copy tstsh from 'data/tsts.data' CREATE INDEX tstsh_idx ON tstsh USING rum (t rum_tsvector_hash_addon_ops, d) WITH (attach = 'd', to = 't'); -INSERT INTO tstsh VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); -INSERT INTO tstsh VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); +INSERT INTO tstsh VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); +INSERT INTO tstsh VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); +SET enable_indexscan=OFF; +SET enable_indexonlyscan=OFF; +SET enable_bitmapscan=OFF; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + id | d | ?column? +-----+---------------------------------+--------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 + 406 | Wed May 18 17:21:22.326724 2016 | 183597.326724 + 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 +(5 rows) + +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; + id | d | ?column? +-----+---------------------------------+--------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 + 252 | Thu May 12 07:21:22.326724 2016 | 370802.673276 + 232 | Wed May 11 11:21:22.326724 2016 | 442802.673276 + 168 | Sun May 08 19:21:22.326724 2016 | 673202.673276 +(5 rows) + +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; + id | d | ?column? +-----+---------------------------------+--------------- + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 + 406 | Wed May 18 17:21:22.326724 2016 | 183597.326724 + 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 + 428 | Thu May 19 15:21:22.326724 2016 | 262797.326724 + 457 | Fri May 20 20:21:22.326724 2016 | 367197.326724 +(5 rows) + +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; + id | d +-----+--------------------------------- + 16 | Mon May 02 11:21:22.326724 2016 + 39 | Tue May 03 10:21:22.326724 2016 + 71 | Wed May 04 18:21:22.326724 2016 + 135 | Sat May 07 10:21:22.326724 2016 + 168 | Sun May 08 19:21:22.326724 2016 + 232 | Wed May 11 11:21:22.326724 2016 + 252 | Thu May 12 07:21:22.326724 2016 + 354 | Mon May 16 13:21:22.326724 2016 + 355 | Mon May 16 14:21:22.326724 2016 +(9 rows) + +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; + id | d +-----+--------------------------------- + 371 | Tue May 17 06:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 +(8 rows) + +-- Test bitmap index scan +RESET enable_bitmapscan; +SET enable_seqscan = off; +EXPLAIN (costs off) +SELECT count(*) FROM tstsh WHERE t @@ 'wr|qh'; + QUERY PLAN +------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on tstsh + Recheck Cond: (t @@ '''wr'' | ''qh'''::tsquery) + -> Bitmap Index Scan on tstsh_idx + Index Cond: (t @@ '''wr'' | ''qh'''::tsquery) +(5 rows) + SELECT count(*) FROM tstsh WHERE t @@ 'wr|qh'; count ------- @@ -40,9 +123,19 @@ 
SELECT count(*) FROM tstsh WHERE t @@ '(eq|yt)&(wr|qh)'; 39 (1 row) -SET enable_indexscan=OFF; -SET enable_indexonlyscan=OFF; -SET enable_bitmapscan=OFF; +EXPLAIN (costs off) +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: ((d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Heap Scan on tstsh + Recheck Cond: (t @@ '''wr'' & ''qh'''::tsquery) + -> Bitmap Index Scan on tstsh_idx + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) +(7 rows) + SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- @@ -53,6 +146,19 @@ SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 (5 rows) +EXPLAIN (costs off) +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: ((d <=| 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Heap Scan on tstsh + Recheck Cond: (t @@ '''wr'' & ''qh'''::tsquery) + -> Bitmap Index Scan on tstsh_idx + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) +(7 rows) + SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- @@ -63,6 +169,19 @@ SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY 168 | Sun May 08 19:21:22.326724 2016 | 673202.673276 (5 rows) +EXPLAIN (costs off) +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: ((d |=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Heap Scan on tstsh + Recheck Cond: (t @@ '''wr'' & ''qh'''::tsquery) + -> Bitmap Index Scan on tstsh_idx + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) +(7 rows) + SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- @@ -73,6 +192,37 @@ SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY 457 | Fri May 20 20:21:22.326724 2016 | 367197.326724 (5 rows) +EXPLAIN (costs off) +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +----------------------------------------------------------------------------------- + Limit + -> Index Scan using tstsh_idx on tstsh + Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) +(3 rows) + +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + id | d | ?column? 
+-----+---------------------------------+------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 356 | Mon May 16 15:21:22.326724 2016 | 3597.326724 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 + 357 | Mon May 16 16:21:22.326724 2016 | 7197.326724 + 353 | Mon May 16 12:21:22.326724 2016 | 7202.673276 +(5 rows) + +EXPLAIN (costs off) +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------- + Sort + Sort Key: d + -> Bitmap Heap Scan on tstsh + Recheck Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Index Scan on tstsh_idx + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) +(6 rows) + SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- @@ -87,6 +237,18 @@ SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER 355 | Mon May 16 14:21:22.326724 2016 (9 rows) +EXPLAIN (costs off) +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------- + Sort + Sort Key: d + -> Bitmap Heap Scan on tstsh + Recheck Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Index Scan on tstsh_idx + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) +(6 rows) + SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- @@ -100,20 +262,18 @@ SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER 496 | Sun May 22 11:21:22.326724 2016 (8 rows) +-- Test index scan RESET enable_indexscan; RESET enable_indexonlyscan; -RESET enable_bitmapscan; -SET enable_seqscan = off; +SET enable_bitmapscan=OFF; EXPLAIN (costs off) SELECT count(*) FROM tstsh WHERE t @@ 'wr|qh'; - QUERY PLAN -------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------- Aggregate - -> Bitmap Heap Scan on tstsh - Recheck Cond: (t @@ '''wr'' | ''qh'''::tsquery) - -> Bitmap Index Scan on tstsh_idx - Index Cond: (t @@ '''wr'' | ''qh'''::tsquery) -(5 rows) + -> Index Scan using tstsh_idx on tstsh + Index Cond: (t @@ '''wr'' | ''qh'''::tsquery) +(3 rows) SELECT count(*) FROM tstsh WHERE t @@ 'wr|qh'; count @@ -277,54 +437,6 @@ SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER 496 | Sun May 22 11:21:22.326724 2016 (8 rows) -SET enable_bitmapscan=OFF; -EXPLAIN (costs off) -SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; - QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------- - Sort - Sort Key: d - -> Index Scan using tstsh_idx on tstsh - Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) -(4 rows) - -SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; - id | d ------+--------------------------------- - 16 | Mon May 02 
11:21:22.326724 2016 - 39 | Tue May 03 10:21:22.326724 2016 - 71 | Wed May 04 18:21:22.326724 2016 - 135 | Sat May 07 10:21:22.326724 2016 - 168 | Sun May 08 19:21:22.326724 2016 - 232 | Wed May 11 11:21:22.326724 2016 - 252 | Thu May 12 07:21:22.326724 2016 - 354 | Mon May 16 13:21:22.326724 2016 - 355 | Mon May 16 14:21:22.326724 2016 -(9 rows) - -EXPLAIN (costs off) -SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; - QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------- - Sort - Sort Key: d - -> Index Scan using tstsh_idx on tstsh - Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) -(4 rows) - -SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; - id | d ------+--------------------------------- - 371 | Tue May 17 06:21:22.326724 2016 - 406 | Wed May 18 17:21:22.326724 2016 - 415 | Thu May 19 02:21:22.326724 2016 - 428 | Thu May 19 15:21:22.326724 2016 - 457 | Fri May 20 20:21:22.326724 2016 - 458 | Fri May 20 21:21:22.326724 2016 - 484 | Sat May 21 23:21:22.326724 2016 - 496 | Sun May 22 11:21:22.326724 2016 -(8 rows) - SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d ASC LIMIT 3; id | d ----+--------------------------------- diff --git a/expected/orderby_hash_1.out b/expected/orderby_hash_1.out index 8182aff567..f19e4507c7 100644 --- a/expected/orderby_hash_1.out +++ b/expected/orderby_hash_1.out @@ -1,9 +1,92 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differenly + * ------------------------------------ + * + * orderby_hash.out - test output for 64-bit systems and + * orderby_hash_1.out - test output for 32-bit systems. + * + */ CREATE TABLE tstsh (id int, t tsvector, d timestamp); \copy tstsh from 'data/tsts.data' CREATE INDEX tstsh_idx ON tstsh USING rum (t rum_tsvector_hash_addon_ops, d) WITH (attach = 'd', to = 't'); -INSERT INTO tstsh VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); -INSERT INTO tstsh VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); +INSERT INTO tstsh VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); +INSERT INTO tstsh VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); +SET enable_indexscan=OFF; +SET enable_indexonlyscan=OFF; +SET enable_bitmapscan=OFF; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + id | d | ?column? +-----+---------------------------------+--------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 + 406 | Wed May 18 17:21:22.326724 2016 | 183597.326724 + 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 +(5 rows) + +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; + id | d | ?column? +-----+---------------------------------+--------------- + 355 | Mon May 16 14:21:22.326724 2016 | 2.673276 + 354 | Mon May 16 13:21:22.326724 2016 | 3602.673276 + 252 | Thu May 12 07:21:22.326724 2016 | 370802.673276 + 232 | Wed May 11 11:21:22.326724 2016 | 442802.673276 + 168 | Sun May 08 19:21:22.326724 2016 | 673202.673276 +(5 rows) + +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; + id | d | ?column? 
+-----+---------------------------------+--------------- + 371 | Tue May 17 06:21:22.326724 2016 | 57597.326724 + 406 | Wed May 18 17:21:22.326724 2016 | 183597.326724 + 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 + 428 | Thu May 19 15:21:22.326724 2016 | 262797.326724 + 457 | Fri May 20 20:21:22.326724 2016 | 367197.326724 +(5 rows) + +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; + id | d +-----+--------------------------------- + 16 | Mon May 02 11:21:22.326724 2016 + 39 | Tue May 03 10:21:22.326724 2016 + 71 | Wed May 04 18:21:22.326724 2016 + 135 | Sat May 07 10:21:22.326724 2016 + 168 | Sun May 08 19:21:22.326724 2016 + 232 | Wed May 11 11:21:22.326724 2016 + 252 | Thu May 12 07:21:22.326724 2016 + 354 | Mon May 16 13:21:22.326724 2016 + 355 | Mon May 16 14:21:22.326724 2016 +(9 rows) + +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; + id | d +-----+--------------------------------- + 371 | Tue May 17 06:21:22.326724 2016 + 406 | Wed May 18 17:21:22.326724 2016 + 415 | Thu May 19 02:21:22.326724 2016 + 428 | Thu May 19 15:21:22.326724 2016 + 457 | Fri May 20 20:21:22.326724 2016 + 458 | Fri May 20 21:21:22.326724 2016 + 484 | Sat May 21 23:21:22.326724 2016 + 496 | Sun May 22 11:21:22.326724 2016 +(8 rows) + +-- Test bitmap index scan +RESET enable_bitmapscan; +SET enable_seqscan = off; +EXPLAIN (costs off) +SELECT count(*) FROM tstsh WHERE t @@ 'wr|qh'; + QUERY PLAN +------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on tstsh + Recheck Cond: (t @@ '''wr'' | ''qh'''::tsquery) + -> Bitmap Index Scan on tstsh_idx + Index Cond: (t @@ '''wr'' | ''qh'''::tsquery) +(5 rows) + SELECT count(*) FROM tstsh WHERE t @@ 'wr|qh'; count ------- @@ -40,9 +123,19 @@ SELECT count(*) FROM tstsh WHERE t @@ '(eq|yt)&(wr|qh)'; 39 (1 row) -SET enable_indexscan=OFF; -SET enable_indexonlyscan=OFF; -SET enable_bitmapscan=OFF; +EXPLAIN (costs off) +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: ((d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Heap Scan on tstsh + Recheck Cond: (t @@ '''wr'' & ''qh'''::tsquery) + -> Bitmap Index Scan on tstsh_idx + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) +(7 rows) + SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- @@ -53,6 +146,19 @@ SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY 415 | Thu May 19 02:21:22.326724 2016 | 215997.326724 (5 rows) +EXPLAIN (costs off) +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: ((d <=| 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Heap Scan on tstsh + Recheck Cond: (t @@ '''wr'' & ''qh'''::tsquery) + -> Bitmap Index Scan on tstsh_idx + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) +(7 rows) + SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? 
-----+---------------------------------+--------------- @@ -63,6 +169,19 @@ SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY 168 | Sun May 08 19:21:22.326724 2016 | 673202.673276 (5 rows) +EXPLAIN (costs off) +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +------------------------------------------------------------------------------------- + Limit + -> Sort + Sort Key: ((d |=> 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Heap Scan on tstsh + Recheck Cond: (t @@ '''wr'' & ''qh'''::tsquery) + -> Bitmap Index Scan on tstsh_idx + Index Cond: (t @@ '''wr'' & ''qh'''::tsquery) +(7 rows) + SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; id | d | ?column? -----+---------------------------------+--------------- @@ -73,6 +192,29 @@ SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY 457 | Fri May 20 20:21:22.326724 2016 | 367197.326724 (5 rows) +EXPLAIN (costs off) +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + QUERY PLAN +----------------------------------------------------------------------------------- + Limit + -> Index Scan using tstsh_idx on tstsh + Order By: (d <=> 'Mon May 16 14:21:25 2016'::timestamp without time zone) +(3 rows) + +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +ERROR: doesn't support order by over pass-by-reference column +EXPLAIN (costs off) +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------- + Sort + Sort Key: d + -> Bitmap Heap Scan on tstsh + Recheck Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Index Scan on tstsh_idx + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) +(6 rows) + SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- @@ -87,6 +229,18 @@ SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER 355 | Mon May 16 14:21:22.326724 2016 (9 rows) +EXPLAIN (costs off) +SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------- + Sort + Sort Key: d + -> Bitmap Heap Scan on tstsh + Recheck Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) + -> Bitmap Index Scan on tstsh_idx + Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) +(6 rows) + SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; id | d -----+--------------------------------- @@ -100,20 +254,18 @@ SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER 496 | Sun May 22 11:21:22.326724 2016 (8 rows) +-- Test index scan RESET enable_indexscan; RESET enable_indexonlyscan; -RESET enable_bitmapscan; -SET enable_seqscan = off; +SET enable_bitmapscan=OFF; EXPLAIN (costs off) SELECT count(*) FROM tstsh WHERE 
t @@ 'wr|qh'; - QUERY PLAN -------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------- Aggregate - -> Bitmap Heap Scan on tstsh - Recheck Cond: (t @@ '''wr'' | ''qh'''::tsquery) - -> Bitmap Index Scan on tstsh_idx - Index Cond: (t @@ '''wr'' | ''qh'''::tsquery) -(5 rows) + -> Index Scan using tstsh_idx on tstsh + Index Cond: (t @@ '''wr'' | ''qh'''::tsquery) +(3 rows) SELECT count(*) FROM tstsh WHERE t @@ 'wr|qh'; count @@ -245,54 +397,6 @@ SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER 496 | Sun May 22 11:21:22.326724 2016 (8 rows) -SET enable_bitmapscan=OFF; -EXPLAIN (costs off) -SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; - QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------- - Sort - Sort Key: d - -> Index Scan using tstsh_idx on tstsh - Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d <= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) -(4 rows) - -SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; - id | d ------+--------------------------------- - 16 | Mon May 02 11:21:22.326724 2016 - 39 | Tue May 03 10:21:22.326724 2016 - 71 | Wed May 04 18:21:22.326724 2016 - 135 | Sat May 07 10:21:22.326724 2016 - 168 | Sun May 08 19:21:22.326724 2016 - 232 | Wed May 11 11:21:22.326724 2016 - 252 | Thu May 12 07:21:22.326724 2016 - 354 | Mon May 16 13:21:22.326724 2016 - 355 | Mon May 16 14:21:22.326724 2016 -(9 rows) - -EXPLAIN (costs off) -SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; - QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------- - Sort - Sort Key: d - -> Index Scan using tstsh_idx on tstsh - Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (d >= 'Mon May 16 14:21:25 2016'::timestamp without time zone)) -(4 rows) - -SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; - id | d ------+--------------------------------- - 371 | Tue May 17 06:21:22.326724 2016 - 406 | Wed May 18 17:21:22.326724 2016 - 415 | Thu May 19 02:21:22.326724 2016 - 428 | Thu May 19 15:21:22.326724 2016 - 457 | Fri May 20 20:21:22.326724 2016 - 458 | Fri May 20 21:21:22.326724 2016 - 484 | Sat May 21 23:21:22.326724 2016 - 496 | Sun May 22 11:21:22.326724 2016 -(8 rows) - SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d ASC LIMIT 3; id | d ----+--------------------------------- diff --git a/expected/predicate-rum-2.out b/expected/predicate-rum-2.out index d8a731091d..cc4720c052 100644 --- a/expected/predicate-rum-2.out +++ b/expected/predicate-rum-2.out @@ -2,109 +2,129 @@ Parsed test spec with 2 sessions starting permutation: rxy1 wx1 c1 rxy2 wy2 c2 step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 
'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step wx1: INSERT INTO rum_tbl(tsv) values('ab'); step c1: COMMIT; step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step wy2: INSERT INTO rum_tbl(tsv) values('xz'); step c2: COMMIT; starting permutation: rxy1 wx1 rxy2 c1 wy2 c2 step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step wx1: INSERT INTO rum_tbl(tsv) values('ab'); step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step c1: COMMIT; step wy2: INSERT INTO rum_tbl(tsv) values('xz'); step c2: COMMIT; starting permutation: rxy1 wx1 rxy2 wy2 c1 c2 step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 
'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step wx1: INSERT INTO rum_tbl(tsv) values('ab'); step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step wy2: INSERT INTO rum_tbl(tsv) values('xz'); step c1: COMMIT; step c2: COMMIT; starting permutation: rxy1 wx1 rxy2 wy2 c2 c1 step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step wx1: INSERT INTO rum_tbl(tsv) values('ab'); step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step wy2: INSERT INTO rum_tbl(tsv) values('xz'); step c2: COMMIT; step c1: COMMIT; starting permutation: rxy1 rxy2 wx1 c1 wy2 c2 step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 
'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step wx1: INSERT INTO rum_tbl(tsv) values('ab'); step c1: COMMIT; step wy2: INSERT INTO rum_tbl(tsv) values('xz'); @@ -112,21 +132,25 @@ step c2: COMMIT; starting permutation: rxy1 rxy2 wx1 wy2 c1 c2 step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step wx1: INSERT INTO rum_tbl(tsv) values('ab'); step wy2: INSERT INTO rum_tbl(tsv) values('xz'); step c1: 
COMMIT; @@ -134,21 +158,25 @@ step c2: COMMIT; starting permutation: rxy1 rxy2 wx1 wy2 c2 c1 step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step wx1: INSERT INTO rum_tbl(tsv) values('ab'); step wy2: INSERT INTO rum_tbl(tsv) values('xz'); step c2: COMMIT; @@ -156,21 +184,25 @@ step c1: COMMIT; starting permutation: rxy1 rxy2 wy2 wx1 c1 c2 step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 
'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step wy2: INSERT INTO rum_tbl(tsv) values('xz'); step wx1: INSERT INTO rum_tbl(tsv) values('ab'); step c1: COMMIT; @@ -178,21 +210,25 @@ step c2: COMMIT; starting permutation: rxy1 rxy2 wy2 wx1 c2 c1 step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step wy2: INSERT INTO rum_tbl(tsv) values('xz'); step wx1: INSERT INTO rum_tbl(tsv) values('ab'); step c2: COMMIT; @@ -200,21 +236,25 @@ step c1: COMMIT; starting permutation: rxy1 rxy2 wy2 c2 wx1 c1 step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 
'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step wy2: INSERT INTO rum_tbl(tsv) values('xz'); step c2: COMMIT; step wx1: INSERT INTO rum_tbl(tsv) values('ab'); @@ -222,21 +262,25 @@ step c1: COMMIT; starting permutation: rxy2 rxy1 wx1 c1 wy2 c2 step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step wx1: INSERT INTO rum_tbl(tsv) values('ab'); step c1: COMMIT; step wy2: INSERT INTO rum_tbl(tsv) values('xz'); @@ -244,21 +288,25 @@ step c2: COMMIT; starting permutation: rxy2 rxy1 wx1 wy2 c1 c2 step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 
-119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step wx1: INSERT INTO rum_tbl(tsv) values('ab'); step wy2: INSERT INTO rum_tbl(tsv) values('xz'); step c1: COMMIT; @@ -266,21 +314,25 @@ step c2: COMMIT; starting permutation: rxy2 rxy1 wx1 wy2 c2 c1 step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step wx1: INSERT INTO rum_tbl(tsv) values('ab'); step wy2: INSERT INTO rum_tbl(tsv) values('xz'); step c2: COMMIT; @@ -288,21 +340,25 @@ step c1: COMMIT; starting permutation: rxy2 rxy1 wy2 wx1 c1 c2 step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step rxy1: SELECT id, 
tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step wy2: INSERT INTO rum_tbl(tsv) values('xz'); step wx1: INSERT INTO rum_tbl(tsv) values('ab'); step c1: COMMIT; @@ -310,21 +366,25 @@ step c2: COMMIT; starting permutation: rxy2 rxy1 wy2 wx1 c2 c1 step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step wy2: INSERT INTO rum_tbl(tsv) values('xz'); step wx1: INSERT INTO rum_tbl(tsv) values('ab'); step c2: COMMIT; @@ -332,21 +392,25 @@ step c1: COMMIT; starting permutation: rxy2 rxy1 wy2 c2 wx1 c1 step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 
'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step wy2: INSERT INTO rum_tbl(tsv) values('xz'); step c2: COMMIT; step wx1: INSERT INTO rum_tbl(tsv) values('ab'); @@ -354,88 +418,104 @@ step c1: COMMIT; starting permutation: rxy2 wy2 rxy1 wx1 c1 c2 step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step wy2: INSERT INTO rum_tbl(tsv) values('xz'); step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step wx1: INSERT INTO rum_tbl(tsv) values('ab'); step c1: COMMIT; step c2: COMMIT; starting permutation: rxy2 wy2 rxy1 wx1 c2 c1 step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 
'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step wy2: INSERT INTO rum_tbl(tsv) values('xz'); step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step wx1: INSERT INTO rum_tbl(tsv) values('ab'); step c2: COMMIT; step c1: COMMIT; starting permutation: rxy2 wy2 rxy1 c2 wx1 c1 step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step wy2: INSERT INTO rum_tbl(tsv) values('xz'); step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step c2: COMMIT; step wx1: INSERT INTO rum_tbl(tsv) values('ab'); step c1: COMMIT; starting permutation: rxy2 wy2 c2 rxy1 wx1 c1 step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id 
tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step wy2: INSERT INTO rum_tbl(tsv) values('xz'); step c2: COMMIT; step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step wx1: INSERT INTO rum_tbl(tsv) values('ab'); step c1: COMMIT; diff --git a/expected/predicate-rum.out b/expected/predicate-rum.out index c708c1267a..86071a3c7a 100644 --- a/expected/predicate-rum.out +++ b/expected/predicate-rum.out @@ -2,460 +2,522 @@ Parsed test spec with 2 sessions starting permutation: rxy1 wx1 c1 rxy2 wy2 c2 step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step wx1: INSERT INTO rum_tbl(tsv) values('qh'); step c1: COMMIT; step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 
'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +339|'qh' +(6 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 -677 'qh' step wy2: INSERT INTO rum_tbl(tsv) values('hx'); step c2: COMMIT; starting permutation: rxy1 wx1 rxy2 c1 wy2 c2 step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step wx1: INSERT INTO rum_tbl(tsv) values('qh'); step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step c1: COMMIT; step wy2: INSERT INTO rum_tbl(tsv) values('hx'); -ERROR: could not serialize access due to read/write dependencies among transactions step c2: COMMIT; starting permutation: rxy1 wx1 rxy2 wy2 c1 c2 step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step wx1: INSERT INTO rum_tbl(tsv) values('qh'); step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 
'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step wy2: INSERT INTO rum_tbl(tsv) values('hx'); step c1: COMMIT; step c2: COMMIT; -ERROR: could not serialize access due to read/write dependencies among transactions starting permutation: rxy1 wx1 rxy2 wy2 c2 c1 step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step wx1: INSERT INTO rum_tbl(tsv) values('qh'); step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step wy2: INSERT INTO rum_tbl(tsv) values('hx'); step c2: COMMIT; step c1: COMMIT; -ERROR: could not serialize access due to read/write dependencies among transactions starting permutation: rxy1 rxy2 wx1 c1 wy2 c2 step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 
'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step wx1: INSERT INTO rum_tbl(tsv) values('qh'); step c1: COMMIT; step wy2: INSERT INTO rum_tbl(tsv) values('hx'); -ERROR: could not serialize access due to read/write dependencies among transactions step c2: COMMIT; starting permutation: rxy1 rxy2 wx1 wy2 c1 c2 step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step wx1: INSERT INTO rum_tbl(tsv) values('qh'); step wy2: INSERT INTO rum_tbl(tsv) values('hx'); step c1: COMMIT; step c2: COMMIT; -ERROR: could not serialize access due to read/write dependencies among transactions starting permutation: rxy1 rxy2 wx1 wy2 c2 c1 step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 
'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step wx1: INSERT INTO rum_tbl(tsv) values('qh'); step wy2: INSERT INTO rum_tbl(tsv) values('hx'); step c2: COMMIT; step c1: COMMIT; -ERROR: could not serialize access due to read/write dependencies among transactions starting permutation: rxy1 rxy2 wy2 wx1 c1 c2 step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step wy2: INSERT INTO rum_tbl(tsv) values('hx'); step wx1: INSERT INTO rum_tbl(tsv) values('qh'); step c1: COMMIT; step c2: COMMIT; -ERROR: could not serialize access due to read/write dependencies among transactions starting permutation: rxy1 rxy2 wy2 wx1 c2 c1 step rxy1: SELECT id, tsv FROM 
rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step wy2: INSERT INTO rum_tbl(tsv) values('hx'); step wx1: INSERT INTO rum_tbl(tsv) values('qh'); step c2: COMMIT; step c1: COMMIT; -ERROR: could not serialize access due to read/write dependencies among transactions starting permutation: rxy1 rxy2 wy2 c2 wx1 c1 step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 
'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step wy2: INSERT INTO rum_tbl(tsv) values('hx'); step c2: COMMIT; step wx1: INSERT INTO rum_tbl(tsv) values('qh'); -ERROR: could not serialize access due to read/write dependencies among transactions step c1: COMMIT; starting permutation: rxy2 rxy1 wx1 c1 wy2 c2 step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step wx1: INSERT INTO rum_tbl(tsv) values('qh'); step c1: COMMIT; step wy2: INSERT INTO rum_tbl(tsv) values('hx'); -ERROR: could not serialize access due to read/write dependencies among transactions step c2: COMMIT; starting permutation: rxy2 rxy1 wx1 wy2 c1 c2 step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + 
id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step wx1: INSERT INTO rum_tbl(tsv) values('qh'); step wy2: INSERT INTO rum_tbl(tsv) values('hx'); step c1: COMMIT; step c2: COMMIT; -ERROR: could not serialize access due to read/write dependencies among transactions starting permutation: rxy2 rxy1 wx1 wy2 c2 c1 step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step wx1: INSERT INTO rum_tbl(tsv) values('qh'); step wy2: INSERT INTO rum_tbl(tsv) values('hx'); step c2: COMMIT; step c1: COMMIT; -ERROR: could not serialize access due to read/write dependencies among transactions starting permutation: rxy2 rxy1 wy2 wx1 c1 c2 step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 
'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step wy2: INSERT INTO rum_tbl(tsv) values('hx'); step wx1: INSERT INTO rum_tbl(tsv) values('qh'); step c1: COMMIT; step c2: COMMIT; -ERROR: could not serialize access due to read/write dependencies among transactions starting permutation: rxy2 rxy1 wy2 wx1 c2 c1 step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step wy2: INSERT INTO rum_tbl(tsv) values('hx'); step wx1: INSERT INTO rum_tbl(tsv) values('qh'); step c2: COMMIT; step c1: COMMIT; -ERROR: could not serialize access due to read/write dependencies among transactions starting permutation: rxy2 rxy1 wy2 c2 wx1 c1 step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 
+280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step wy2: INSERT INTO rum_tbl(tsv) values('hx'); step c2: COMMIT; step wx1: INSERT INTO rum_tbl(tsv) values('qh'); -ERROR: could not serialize access due to read/write dependencies among transactions step c1: COMMIT; starting permutation: rxy2 wy2 rxy1 wx1 c1 c2 step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step wy2: INSERT INTO rum_tbl(tsv) values('hx'); step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step wx1: INSERT INTO rum_tbl(tsv) values('qh'); step c1: COMMIT; step c2: COMMIT; -ERROR: could not serialize access due to read/write dependencies among transactions starting permutation: rxy2 wy2 rxy1 wx1 c2 c1 step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + 
id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step wy2: INSERT INTO rum_tbl(tsv) values('hx'); step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step wx1: INSERT INTO rum_tbl(tsv) values('qh'); step c2: COMMIT; step c1: COMMIT; -ERROR: could not serialize access due to read/write dependencies among transactions starting permutation: rxy2 wy2 rxy1 c2 wx1 c1 step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step wy2: INSERT INTO rum_tbl(tsv) values('hx'); step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 
'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +(5 rows) + step c2: COMMIT; step wx1: INSERT INTO rum_tbl(tsv) values('qh'); -ERROR: could not serialize access due to read/write dependencies among transactions step c1: COMMIT; starting permutation: rxy2 wy2 c2 rxy1 wx1 c1 step rxy2: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'qh'; -id tsv + id|tsv +---+---------------------------------------------------------------------- + 10|'af':8 'iy':3 'kg':5 'ln':10 'lq':1 'po':7 'pp':4 'qh':2 'sx':6 'yw':9 + 77|'da':3 'dr':4 'iy':9 'lq':7 'pp':10 'qh':8 'qj':2 'rs':1 'si':5 'uz':6 +145|'af':10 'iy':5 'kg':7 'lq':3 'po':9 'pp':6 'qh':4 'si':1 'sx':8 'uz':2 +212|'da':5 'dr':6 'hb':2 'kk':1 'lq':9 'qh':10 'qj':4 'rs':3 'si':7 'uz':8 +280|'da':1 'dr':2 'iy':7 'kg':9 'lq':5 'pp':8 'qh':6 'si':3 'sx':10 'uz':4 +(5 rows) -424 'qh':1 'su':2 'tu':3 'ww':4 -230 'iv':1 'lp':2 'mt':4 'qh':3 'ss':5 -248 'jn':1 'js':4 'mx':2 'ne':7 'nn':5 'nw':3 'qh':6 -50 'bx':1 'ca':5 'da':10 'dn':2 'eq':6 'fn':8 'gl':7 'hu':3 'ig':9 'mg':4 'qh':11 step wy2: INSERT INTO rum_tbl(tsv) values('hx'); step c2: COMMIT; step rxy1: SELECT id, tsv FROM rum_tbl WHERE tsv @@ 'hx'; -id tsv - -165 'gi':1 'gj':2 'gx':6 'hb':5 'hx':8 'ir':7 'sq':3 'yg':4 -74 'cv':1 'de':8 'ds':10 'eh':4 'fd':6 'gh':3 'gi':7 'hn':5 'hx':9 'lo':2 -116 'el':1 'er':9 'ez':6 'gr':3 'gt':4 'hx':7 'ie':5 'iv':2 'od':10 'zf':8 -119 'eo':1 'fc':5 'he':7 'ht':9 'hx':8 'it':2 'km':3 'so':4 'uj':6 -190 'hh':1 'hx':2 'id':5 'iv':3 'ld':7 'ob':6 'oy':4 -206 'hx':1 'it':9 'ji':10 'jl':5 'lq':3 'mh':8 'nq':6 'pc':7 'ub':4 'xi':2 -677 'hx' + id|tsv +---+---------------------------------------------------------------------- + 28|'aq':3 'eo':9 'ep':6 'fh':4 'hi':1 'hx':8 'jz':2 'pf':10 'xy':5 'zg':7 + 96|'an':8 'be':9 'eo':5 'ep':2 'hx':4 'nw':7 'pf':6 'pv':10 'xy':1 'zg':3 +163|'aq':5 'ep':8 'fh':6 'hi':3 'hx':10 'jz':4 'sa':1 'sr':2 'xy':7 'zg':9 +231|'an':10 'aq':1 'eo':7 'ep':4 'fh':2 'hx':6 'nw':9 'pf':8 'xy':3 'zg':5 +299|'an':6 'be':7 'eo':3 'hx':2 'jd':10 'nw':5 'pf':4 'pv':8 'sm':9 'zg':1 +339|'hx' +(6 rows) + step wx1: INSERT INTO rum_tbl(tsv) values('qh'); step c1: COMMIT; diff --git a/expected/rum.out b/expected/rum.out index ad960650d2..5966d196fe 100644 --- a/expected/rum.out +++ b/expected/rum.out @@ -132,8 +132,8 @@ SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 1 (1 row) -SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way')), - rum_ts_score(a, to_tsquery('pg_catalog.english', 'way')), +SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way'))::numeric(10,4), + rum_ts_score(a, to_tsquery('pg_catalog.english', 'way'))::numeric(10,7), * FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'way') @@ -146,33 +146,33 @@ SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way')), 16.4493 | 0.0607927 | thinking--"to go or not to go?" We are this far on the way. 
Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 (4 rows) -SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)')), - rum_ts_score(a, to_tsquery('pg_catalog.english', 'way & (go | half)')), +SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,4), + rum_ts_score(a, to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,6), * FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'way & (go | half)') ORDER BY a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'); rum_ts_distance | rum_ts_score | t | a -----------------+--------------+---------------------------------------------------------------------+--------------------------------------------------------- - 8.22467 | 0.121585 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 - 57.5727 | 0.0173693 | thinking--"to go or not to go?" We are this far on the way. Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 + 8.2247 | 0.121585 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 + 57.5727 | 0.017369 | thinking--"to go or not to go?" We are this far on the way. Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 (2 rows) SELECT - a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'), - rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)')), + (a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,4) AS distance, + rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,4), * FROM test_rum ORDER BY a <=> to_tsquery('pg_catalog.english', 'way & (go | half)') limit 2; - ?column? | rum_ts_distance | t | a + distance | rum_ts_distance | t | a ----------+-----------------+---------------------------------------------------------------------+--------------------------------------------------------- - 8.22467 | 8.22467 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 + 8.2247 | 8.2247 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 57.5727 | 57.5727 | thinking--"to go or not to go?" We are this far on the way. Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 (2 rows) -- Check ranking normalization -SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way'), 0), - rum_ts_score(a, to_tsquery('pg_catalog.english', 'way'), 0), +SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way'), 0)::numeric(10,4), + rum_ts_score(a, to_tsquery('pg_catalog.english', 'way'), 0)::numeric(10,7), * FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'way') @@ -185,16 +185,16 @@ SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way'), 0), 16.4493 | 0.0607927 | thinking--"to go or not to go?" We are this far on the way. 
Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 (4 rows) -SELECT rum_ts_distance(a, row(to_tsquery('pg_catalog.english', 'way & (go | half)'), 0)::rum_distance_query), - rum_ts_score(a, row(to_tsquery('pg_catalog.english', 'way & (go | half)'), 0)::rum_distance_query), +SELECT rum_ts_distance(a, row(to_tsquery('pg_catalog.english', 'way & (go | half)'), 0)::rum_distance_query)::numeric(10,4), + rum_ts_score(a, row(to_tsquery('pg_catalog.english', 'way & (go | half)'), 0)::rum_distance_query)::numeric(10,6), * FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'way & (go | half)') ORDER BY a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'); rum_ts_distance | rum_ts_score | t | a -----------------+--------------+---------------------------------------------------------------------+--------------------------------------------------------- - 8.22467 | 0.121585 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 - 57.5727 | 0.0173693 | thinking--"to go or not to go?" We are this far on the way. Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 + 8.2247 | 0.121585 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 + 57.5727 | 0.017369 | thinking--"to go or not to go?" We are this far on the way. Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 (2 rows) INSERT INTO test_rum (t) VALUES ('foo bar foo the over foo qq bar'); @@ -232,65 +232,71 @@ SELECT a FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'bar') ORDER (1 row) -- Check full-index scan with order by -SELECT a <=> to_tsquery('pg_catalog.english', 'ever|wrote') FROM test_rum ORDER BY a <=> to_tsquery('pg_catalog.english', 'ever|wrote'); - ?column? +SELECT + CASE WHEN distance = 'Infinity' THEN -1 + ELSE distance::numeric(10,4) + END distance + FROM + (SELECT a <=> to_tsquery('pg_catalog.english', 'ever|wrote') AS distance + FROM test_rum ORDER BY a <=> to_tsquery('pg_catalog.english', 'ever|wrote')) t; + distance ---------- 16.4493 16.4493 - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 (56 rows) CREATE TABLE tst (i int4, t tsvector); @@ -325,15 +331,15 @@ SELECT a <=> to_tsquery('pg_catalog.english', 'w:*'), * Order By: (a <=> '''w'':*'::tsquery) (3 rows) -SELECT a <=> to_tsquery('pg_catalog.english', 'w:*'), * +SELECT (a <=> to_tsquery('pg_catalog.english', 'w:*'))::numeric(10,4) AS distance, * FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'w:*') ORDER BY a <=> to_tsquery('pg_catalog.english', 'w:*'); - ?column? 
| t | a + distance | t | a ----------+--------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------- - 8.22467 | so well that only a fragment, as it were, gave way. It still hangs as if | 'fragment':6 'gave':10 'hang':14 'still':13 'way':11 'well':2 - 8.22467 | wine, but wouldn't you divide with your neighbors! The columns in the | 'column':11 'divid':6 'neighbor':9 'wine':1 'wouldn':3 - 8.22467 | not say, but you wrote as if you knew it by sight as well as by heart. | 'heart':17 'knew':9 'say':2 'sight':12 'well':14 'wrote':5 + 8.2247 | so well that only a fragment, as it were, gave way. It still hangs as if | 'fragment':6 'gave':10 'hang':14 'still':13 'way':11 'well':2 + 8.2247 | wine, but wouldn't you divide with your neighbors! The columns in the | 'column':11 'divid':6 'neighbor':9 'wine':1 'wouldn':3 + 8.2247 | not say, but you wrote as if you knew it by sight as well as by heart. | 'heart':17 'knew':9 'say':2 'sight':12 'well':14 'wrote':5 16.4493 | little series of pictures. Have you ever been here, I wonder? You did | 'ever':7 'littl':1 'pictur':4 'seri':2 'wonder':11 16.4493 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 16.4493 | _berg_, "the Jettenhuhl, a wooded spur of the Konigestuhl." Look at it | 'berg':1 'jettenhuhl':3 'konigestuhl':9 'look':10 'spur':6 'wood':5 @@ -347,16 +353,16 @@ SELECT a <=> to_tsquery('pg_catalog.english', 'w:*'), * 16.4493 | my appreciation of you in a more complimentary way than by sending this | 'appreci':2 'complimentari':8 'send':12 'way':9 (14 rows) -SELECT a <=> to_tsquery('pg_catalog.english', 'b:*'), * +SELECT (a <=> to_tsquery('pg_catalog.english', 'b:*'))::numeric(10,4) AS distance, * FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'b:*') ORDER BY a <=> to_tsquery('pg_catalog.english', 'b:*'); - ?column? | t | a + distance | t | a ----------+--------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------- - 8.22467 | been trying my best to get all those "passes" into my brain. Now, thanks | 'best':4 'brain':12 'get':6 'pass':9 'thank':14 'tri':2 - 8.22467 | All the above information, I beg you to believe, I do not intend you | 'beg':6 'believ':9 'inform':4 'intend':13 - 8.22467 | curious spectacle, but on the whole had "the banquet-hall deserted" | 'banquet':10 'banquet-hal':9 'curious':1 'desert':12 'hall':11 'spectacl':2 'whole':6 - 8.22467 | oaks, limes and maples, bordered with flower-beds and shrubberies, and | 'bed':9 'border':5 'flower':8 'flower-b':7 'lime':2 'mapl':4 'oak':1 'shrubberi':11 + 8.2247 | been trying my best to get all those "passes" into my brain. 
Now, thanks | 'best':4 'brain':12 'get':6 'pass':9 'thank':14 'tri':2 + 8.2247 | All the above information, I beg you to believe, I do not intend you | 'beg':6 'believ':9 'inform':4 'intend':13 + 8.2247 | curious spectacle, but on the whole had "the banquet-hall deserted" | 'banquet':10 'banquet-hal':9 'curious':1 'desert':12 'hall':11 'spectacl':2 'whole':6 + 8.2247 | oaks, limes and maples, bordered with flower-beds and shrubberies, and | 'bed':9 'border':5 'flower':8 'flower-b':7 'lime':2 'mapl':4 'oak':1 'shrubberi':11 13.1595 | foo bar foo the over foo qq bar | 'bar':2,8 'foo':1,3,6 'qq':7 16.4493 | ornamental building, and I wish you could see it, if you have not seen | 'build':2 'could':7 'ornament':1 'see':8 'seen':14 'wish':5 16.4493 | the--nearest guide-book! | 'book':5 'guid':4 'guide-book':3 'nearest':2 @@ -375,14 +381,32 @@ SELECT a <=> to_tsquery('pg_catalog.english', 'b:*'), * 16.4493 | the few that escaped destruction in 1693. It is a beautiful, highly | '1693':7 'beauti':11 'destruct':5 'escap':4 'high':12 (20 rows) -select 'bjarn:6237 stroustrup:6238'::tsvector <=> 'bjarn <-> stroustrup'::tsquery; - ?column? +-- Test correct work of phrase operator when position information is not in index. +create table test_rum_addon as table test_rum; +alter table test_rum_addon add column id serial; +create index on test_rum_addon using rum (a rum_tsvector_addon_ops, id) with (attach = 'id', to='a'); +select * from test_rum_addon where a @@ to_tsquery('pg_catalog.english', 'half <-> way'); + t | a | id +---------------------------------------------------------------------+---------------------------------------------------------+---- + itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 | 9 +(1 row) + +explain (costs off) select * from test_rum_addon where a @@ to_tsquery('pg_catalog.english', 'half <-> way'); + QUERY PLAN +------------------------------------------------------------ + Index Scan using test_rum_addon_a_id_idx on test_rum_addon + Index Cond: (a @@ '''half'' <-> ''way'''::tsquery) +(2 rows) + +-- +select ('bjarn:6237 stroustrup:6238'::tsvector <=> 'bjarn <-> stroustrup'::tsquery)::numeric(10,5) AS distance; + distance ---------- 8.22467 (1 row) -SELECT 'stroustrup:5508B,6233B,6238B bjarn:6235B,6237B' <=> 'bjarn <-> stroustrup'::tsquery; - ?column? +SELECT ('stroustrup:5508B,6233B,6238B bjarn:6235B,6237B' <=> 'bjarn <-> stroustrup'::tsquery)::numeric(10,5) AS distance; + distance ---------- 2.05617 (1 row) diff --git a/expected/rum_hash.out b/expected/rum_hash.out index 4838be4e93..43a9760a28 100644 --- a/expected/rum_hash.out +++ b/expected/rum_hash.out @@ -118,8 +118,8 @@ SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 1 (1 row) -SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way')), - rum_ts_score(a, to_tsquery('pg_catalog.english', 'way')), +SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way'))::numeric(10,4), + rum_ts_score(a, to_tsquery('pg_catalog.english', 'way'))::numeric(10,7), * FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'way') @@ -132,34 +132,34 @@ SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way')), 16.4493 | 0.0607927 | thinking--"to go or not to go?" We are this far on the way. 
Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 (4 rows) -SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)')), - rum_ts_score(a, to_tsquery('pg_catalog.english', 'way & (go | half)')), +SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,4), + rum_ts_score(a, to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,6), * FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'way & (go | half)') ORDER BY a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'); rum_ts_distance | rum_ts_score | t | a -----------------+--------------+---------------------------------------------------------------------+--------------------------------------------------------- - 8.22467 | 0.121585 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 - 57.5727 | 0.0173693 | thinking--"to go or not to go?" We are this far on the way. Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 + 8.2247 | 0.121585 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 + 57.5727 | 0.017369 | thinking--"to go or not to go?" We are this far on the way. Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 (2 rows) SELECT - a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'), - rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)')), - rum_ts_score(a, to_tsquery('pg_catalog.english', 'way & (go | half)')), + (a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,4) AS distance, + rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,4), + rum_ts_score(a, to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,6), * FROM test_rum_hash ORDER BY a <=> to_tsquery('pg_catalog.english', 'way & (go | half)') limit 2; - ?column? | rum_ts_distance | rum_ts_score | t | a + distance | rum_ts_distance | rum_ts_score | t | a ----------+-----------------+--------------+---------------------------------------------------------------------+--------------------------------------------------------- - 8.22467 | 8.22467 | 0.121585 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 - 57.5727 | 57.5727 | 0.0173693 | thinking--"to go or not to go?" We are this far on the way. Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 + 8.2247 | 8.2247 | 0.121585 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 + 57.5727 | 57.5727 | 0.017369 | thinking--"to go or not to go?" We are this far on the way. Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 (2 rows) -- Check ranking normalization -SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way'), 0), - rum_ts_score(a, to_tsquery('pg_catalog.english', 'way'), 0), +SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way'), 0)::numeric(10,4), + rum_ts_score(a, to_tsquery('pg_catalog.english', 'way'), 0)::numeric(10,7), * FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'way') @@ -172,16 +172,16 @@ SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way'), 0), 16.4493 | 0.0607927 | thinking--"to go or not to go?" We are this far on the way. 
Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 (4 rows) -SELECT rum_ts_distance(a, row(to_tsquery('pg_catalog.english', 'way & (go | half)'), 0)::rum_distance_query), - rum_ts_score(a, row(to_tsquery('pg_catalog.english', 'way & (go | half)'), 0)::rum_distance_query), +SELECT rum_ts_distance(a, row(to_tsquery('pg_catalog.english', 'way & (go | half)'), 0)::rum_distance_query)::numeric(10,4), + rum_ts_score(a, row(to_tsquery('pg_catalog.english', 'way & (go | half)'), 0)::rum_distance_query)::numeric(10,6), * FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'way & (go | half)') ORDER BY a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'); rum_ts_distance | rum_ts_score | t | a -----------------+--------------+---------------------------------------------------------------------+--------------------------------------------------------- - 8.22467 | 0.121585 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 - 57.5727 | 0.0173693 | thinking--"to go or not to go?" We are this far on the way. Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 + 8.2247 | 0.121585 | itself. Put on your "specs" and look at the castle, half way up the | 'castl':10 'half':11 'look':7 'put':2 'spec':5 'way':12 + 57.5727 | 0.017369 | thinking--"to go or not to go?" We are this far on the way. Reached | 'far':11 'go':3,7 'reach':15 'think':1 'way':14 (2 rows) INSERT INTO test_rum_hash (t) VALUES ('foo bar foo the over foo qq bar'); @@ -219,65 +219,71 @@ SELECT a FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'bar') O (1 row) -- Check full-index scan with order by -SELECT a <=> to_tsquery('pg_catalog.english', 'ever|wrote') FROM test_rum_hash ORDER BY a <=> to_tsquery('pg_catalog.english', 'ever|wrote'); - ?column? +SELECT + CASE WHEN distance = 'Infinity' THEN -1 + ELSE distance::numeric(10,4) + END distance + FROM + (SELECT a <=> to_tsquery('pg_catalog.english', 'ever|wrote') AS distance + FROM test_rum_hash ORDER BY a <=> to_tsquery('pg_catalog.english', 'ever|wrote')) t; + distance ---------- 16.4493 16.4493 - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity - Infinity + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 + -1 (56 rows) CREATE TABLE tst_hash (i int4, t tsvector); diff --git a/expected/rum_validate.out b/expected/rum_validate.out index 83a312c0ed..22000a1ee5 100644 --- a/expected/rum_validate.out +++ b/expected/rum_validate.out @@ -90,7 +90,7 @@ FROM unnest(array['asc','desc','nulls_first','nulls_last','orderable','distance_ -- Check incorrect operator class -- DROP INDEX rumidx; --- Check incorrect operator class +-- PGPRO-1175: Check incorrect operator class, i.e. 
it shouldn't work correctly CREATE OPERATOR CLASS rum_tsvector_norm_ops FOR TYPE tsvector USING rum AS @@ -110,12 +110,23 @@ CREATE INDEX rum_norm_idx ON test_rum USING rum(a rum_tsvector_norm_ops); SET enable_seqscan=off; SET enable_bitmapscan=off; SET enable_indexscan=on; +-- PGPRO-1175: Select using incorrect operator class SELECT a FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'bar') - ORDER BY a <=> (to_tsquery('pg_catalog.english', 'bar'),0) + ORDER BY a <=> (to_tsquery('pg_catalog.english', 'bar'),0); a ------------------------------ 'bar':2,8 'foo':1,3,6 'qq':7 (1 row) +-- PGPRO-9026: column and attached column cannot be the same +CREATE TABLE test_array (i int2[]); +CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_addon_ops) WITH (attach = 'i', to = 'i'); +ERROR: column "i" and attached column cannot be the same +SELECT * FROM test_array WHERE i && '{1}'; + i +--- +(0 rows) + +DROP TABLE test_array; diff --git a/expected/rum_weight.out b/expected/rum_weight.out new file mode 100644 index 0000000000..0c1565d1ce --- /dev/null +++ b/expected/rum_weight.out @@ -0,0 +1,136 @@ +CREATE TABLE testweight_rum( t text, a tsvector, r text ); +CREATE FUNCTION fill_weight_trigger() RETURNS trigger AS $$ +begin + new.a := + setweight(to_tsvector('pg_catalog.english', coalesce(new.r,'')), 'A') || + setweight(to_tsvector('pg_catalog.english', coalesce(new.t,'')), 'D'); + return new; +end +$$ LANGUAGE plpgsql; +CREATE TRIGGER tsvectorweightupdate +BEFORE INSERT OR UPDATE ON testweight_rum +FOR EACH ROW EXECUTE PROCEDURE fill_weight_trigger(); +CREATE INDEX rumidx_weight ON testweight_rum USING rum (a rum_tsvector_ops); +\copy testweight_rum(t,r) from 'data/rum_weight.data' DELIMITER '|' ; +SET enable_seqscan=off; +SET enable_indexscan=off; +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'ever:A|wrote'); + count +------- + 1 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'have:A&wish:DAC'); + count +------- + 1 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'have:A&wish:DAC'); + count +------- + 1 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'among:ABC'); + count +------- + 0 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'structure:D&ancient:BCD'); + count +------- + 1 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', '(complimentary:DC|sight)&(sending:ABC|heart)'); + count +------- + 2 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', '!gave:D & way'); + count +------- + 3 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', '(go<->go:a)&(think:d<->go)'); + count +------- + 0 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', '(go<->go:a)&(think:d<2>go)'); + count +------- + 1 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'go & (!reach:a | way<->reach)'); + count +------- + 2 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'go & (!reach:a & way<->reach)'); + count +------- + 0 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'reach:d & go & !way:a'); + count +------- + 1 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'show:d 
& seem & !town:a'); + count +------- + 1 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', '!way:a'); + count +------- + 52 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'go & !way:a'); + count +------- + 2 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'reach:d & !way:a'); + count +------- + 1 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'reach:d & go'); + count +------- + 1 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'think<->go:d | go<->see'); + count +------- + 1 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'reach:d<->think'); + count +------- + 0 +(1 row) + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'reach<->think'); + count +------- + 1 +(1 row) + diff --git a/expected/security.out b/expected/security.out new file mode 100644 index 0000000000..86fcbf81da --- /dev/null +++ b/expected/security.out @@ -0,0 +1,5 @@ +-- Check security CVE-2020-14350 +CREATE FUNCTION rum_anyarray_similar(anyarray,anyarray) RETURNS bool AS $$ SELECT false $$ LANGUAGE SQL; +CREATE EXTENSION rum; +ERROR: function "rum_anyarray_similar" already exists with same argument types +DROP FUNCTION rum_anyarray_similar(anyarray,anyarray); diff --git a/expected/text.out b/expected/text.out index d8df82168b..9cf9310a77 100644 --- a/expected/text.out +++ b/expected/text.out @@ -125,16 +125,18 @@ SELECT id FROM test_text_o WHERE t @@ 'wr&qh' AND id >= '400' ORDER BY id; (8 rows) CREATE TABLE test_text_a AS SELECT id::text, t FROM tsts; +-- Should fail, temporarly it isn't allowed to order an index over pass-by-reference column CREATE INDEX test_text_a_idx ON test_text_a USING rum (t rum_tsvector_addon_ops, id) WITH (attach = 'id', to = 't', order_by_attach='t'); +ERROR: doesn't support order index over pass-by-reference column EXPLAIN (costs off) SELECT count(*) FROM test_text_a WHERE id < '400'; - QUERY PLAN -------------------------------------------------------- + QUERY PLAN +------------------------------------ Aggregate - -> Index Scan using test_text_a_idx on test_text_a - Index Cond: (id < '400'::text) + -> Seq Scan on test_text_a + Filter: (id < '400'::text) (3 rows) SELECT count(*) FROM test_text_a WHERE id < '400'; @@ -145,12 +147,12 @@ SELECT count(*) FROM test_text_a WHERE id < '400'; EXPLAIN (costs off) SELECT id FROM test_text_a WHERE t @@ 'wr&qh' AND id <= '400' ORDER BY id; - QUERY PLAN ---------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------------- Sort Sort Key: id - -> Index Scan using test_text_a_idx on test_text_a - Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id <= '400'::text)) + -> Seq Scan on test_text_a + Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id <= '400'::text)) (4 rows) SELECT id FROM test_text_a WHERE t @@ 'wr&qh' AND id <= '400' ORDER BY id; @@ -169,12 +171,12 @@ SELECT id FROM test_text_a WHERE t @@ 'wr&qh' AND id <= '400' ORDER BY id; EXPLAIN (costs off) SELECT id FROM test_text_a WHERE t @@ 'wr&qh' AND id >= '400' ORDER BY id; - QUERY PLAN ---------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------------- Sort Sort Key: id - -> Index Scan using 
test_text_a_idx on test_text_a - Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id >= '400'::text)) + -> Seq Scan on test_text_a + Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id >= '400'::text)) (4 rows) SELECT id FROM test_text_a WHERE t @@ 'wr&qh' AND id >= '400' ORDER BY id; @@ -242,16 +244,18 @@ SELECT id FROM test_text_h_o WHERE t @@ 'wr&qh' AND id >= '400' ORDER BY id; (8 rows) CREATE TABLE test_text_h_a AS SELECT id::text, t FROM tsts; +-- Should fail, temporarly it isn't allowed to order an index over pass-by-reference column CREATE INDEX test_text_h_a_idx ON test_text_h_a USING rum (t rum_tsvector_hash_addon_ops, id) WITH (attach = 'id', to = 't', order_by_attach='t'); +ERROR: doesn't support order index over pass-by-reference column EXPLAIN (costs off) SELECT count(*) FROM test_text_h_a WHERE id < '400'; - QUERY PLAN ------------------------------------------------------------ + QUERY PLAN +------------------------------------ Aggregate - -> Index Scan using test_text_h_a_idx on test_text_h_a - Index Cond: (id < '400'::text) + -> Seq Scan on test_text_h_a + Filter: (id < '400'::text) (3 rows) SELECT count(*) FROM test_text_h_a WHERE id < '400'; @@ -262,12 +266,12 @@ SELECT count(*) FROM test_text_h_a WHERE id < '400'; EXPLAIN (costs off) SELECT id FROM test_text_h_a WHERE t @@ 'wr&qh' AND id <= '400' ORDER BY id; - QUERY PLAN ---------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------------- Sort Sort Key: id - -> Index Scan using test_text_h_a_idx on test_text_h_a - Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id <= '400'::text)) + -> Seq Scan on test_text_h_a + Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id <= '400'::text)) (4 rows) SELECT id FROM test_text_h_a WHERE t @@ 'wr&qh' AND id <= '400' ORDER BY id; @@ -286,12 +290,12 @@ SELECT id FROM test_text_h_a WHERE t @@ 'wr&qh' AND id <= '400' ORDER BY id; EXPLAIN (costs off) SELECT id FROM test_text_h_a WHERE t @@ 'wr&qh' AND id >= '400' ORDER BY id; - QUERY PLAN ---------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------------- Sort Sort Key: id - -> Index Scan using test_text_h_a_idx on test_text_h_a - Index Cond: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id >= '400'::text)) + -> Seq Scan on test_text_h_a + Filter: ((t @@ '''wr'' & ''qh'''::tsquery) AND (id >= '400'::text)) (4 rows) SELECT id FROM test_text_h_a WHERE t @@ 'wr&qh' AND id >= '400' ORDER BY id; diff --git a/expected/timestamp.out b/expected/timestamp.out index 37f26f073f..00969a7534 100644 --- a/expected/timestamp.out +++ b/expected/timestamp.out @@ -1,3 +1,12 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differenly + * ------------------------------------ + * + * timestamp.out - test output for 64-bit systems and + * timestamp_1.out - test output for 32-bit systems. + * + */ CREATE TABLE test_timestamp ( i timestamp ); diff --git a/expected/timestamp_1.out b/expected/timestamp_1.out index e15bcc7584..a8641a3232 100644 --- a/expected/timestamp_1.out +++ b/expected/timestamp_1.out @@ -1,3 +1,12 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differenly + * ------------------------------------ + * + * timestamp.out - test output for 64-bit systems and + * timestamp_1.out - test output for 32-bit systems. 
+ * + */ CREATE TABLE test_timestamp ( i timestamp ); diff --git a/gen_rum_sql--1.0--1.1.pl b/gen_rum_sql--1.0--1.1.pl deleted file mode 100644 index 7296f6c023..0000000000 --- a/gen_rum_sql--1.0--1.1.pl +++ /dev/null @@ -1,335 +0,0 @@ -use strict; -use warnings; - -my $func_base_template=< ( - PROCEDURE = rum_TYPEIDENT_distance, - LEFTARG = TYPENAME, - RIGHTARG = TYPENAME, - COMMUTATOR = <=> -); - -CREATE FUNCTION rum_TYPEIDENT_left_distance(TYPENAME, TYPENAME) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=| ( - PROCEDURE = rum_TYPEIDENT_left_distance, - LEFTARG = TYPENAME, - RIGHTARG = TYPENAME, - COMMUTATOR = |=> -); - -CREATE FUNCTION rum_TYPEIDENT_right_distance(TYPENAME, TYPENAME) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR |=> ( - PROCEDURE = rum_TYPEIDENT_right_distance, - LEFTARG = TYPENAME, - RIGHTARG = TYPENAME, - COMMUTATOR = <=| -); - -CREATE FUNCTION rum_TYPEIDENT_outer_distance(TYPENAME, TYPENAME, smallint) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_TYPEIDENT_config(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - - -EOT - -my $opclass_base_template=<= TYPESOPARG, - OPERATOR 5 > TYPESOPARG, - FUNCTION 1 TYPECMPFUNC(TYPECMPTYPE,TYPECMPTYPE), - FUNCTION 2 rum_TYPESUBIDENT_extract_value(TYPESUBNAME, internal), - FUNCTION 3 rum_TYPESUBIDENT_extract_query(TYPESUBNAME, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_TYPESUBIDENT_compare_prefix(TYPESUBNAME,TYPESUBNAME,int2, internal), -STORAGE TYPENAME; - -EOT - -my $opclass_distance_template=<= TYPESOPARG, - OPERATOR 5 > TYPESOPARG, - OPERATOR 20 <=> (TYPENAME,TYPENAME) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 21 <=| (TYPENAME,TYPENAME) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 22 |=> (TYPENAME,TYPENAME) FOR ORDER BY pg_catalog.float_ops, - FUNCTION 1 TYPECMPFUNC(TYPECMPTYPE,TYPECMPTYPE), - FUNCTION 2 rum_TYPESUBIDENT_extract_value(TYPESUBNAME, internal), - FUNCTION 3 rum_TYPESUBIDENT_extract_query(TYPESUBNAME, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_TYPESUBIDENT_compare_prefix(TYPESUBNAME,TYPESUBNAME,int2, internal), - -- support to TYPEIDENT distance in rum_tsvector_addon_ops - FUNCTION 6 rum_TYPEIDENT_config(internal), - FUNCTION 9 rum_TYPEIDENT_outer_distance(TYPENAME, TYPENAME, smallint), -STORAGE TYPENAME; - -EOT - -my @opinfo = map { - $_->{TYPEIDENT} = $_->{TYPENAME} if ! exists $_->{TYPEIDENT}; - $_->{TYPECMPTYPE} = $_->{TYPENAME} if !exists $_->{TYPECMPTYPE}; - $_->{TYPESUBNAME} = $_->{TYPENAME} if !exists $_->{TYPESUBNAME}; - $_->{TYPESUBIDENT}= $_->{TYPEIDENT} if ! exists $_->{TYPESUBIDENT}; - $_->{TYPESOPARG}= '' if ! 
exists $_->{TYPESOPARG}; - $_ - } ( - # timestamp/tz aren't here: they are in rum--1.0.sql - - { - TYPENAME => 'int2', - TYPECMPFUNC => 'btint2cmp', - func_tmpl => \$func_distance_template, - opclass_tmpl=> \$opclass_distance_template, - }, - { - TYPENAME => 'int4', - TYPECMPFUNC => 'btint4cmp', - func_tmpl => \$func_distance_template, - opclass_tmpl=> \$opclass_distance_template, - }, - { - TYPENAME => 'int8', - TYPECMPFUNC => 'btint8cmp', - func_tmpl => \$func_distance_template, - opclass_tmpl=> \$opclass_distance_template, - }, - { - TYPENAME => 'float4', - TYPECMPFUNC => 'btfloat4cmp', - func_tmpl => \$func_distance_template, - opclass_tmpl=> \$opclass_distance_template, - }, - { - TYPENAME => 'float8', - TYPECMPFUNC => 'btfloat8cmp', - func_tmpl => \$func_distance_template, - opclass_tmpl=> \$opclass_distance_template, - }, - { - TYPENAME => 'money', - TYPECMPFUNC => 'cash_cmp', - func_tmpl => \$func_distance_template, - opclass_tmpl=> \$opclass_distance_template, - }, - { - TYPENAME => 'oid', - TYPECMPFUNC => 'btoidcmp', - func_tmpl => \$func_distance_template, - opclass_tmpl=> \$opclass_distance_template, - }, - { - TYPENAME => 'time', - TYPECMPFUNC => 'time_cmp', - func_tmpl => \$func_base_template, - opclass_tmpl=> \$opclass_base_template, - }, - { - TYPENAME => 'timetz', - TYPECMPFUNC => 'timetz_cmp', - func_tmpl => \$func_base_template, - opclass_tmpl=> \$opclass_base_template, - }, - { - TYPENAME => 'date', - TYPECMPFUNC => 'date_cmp', - func_tmpl => \$func_base_template, - opclass_tmpl=> \$opclass_base_template, - }, - { - TYPENAME => 'interval', - TYPECMPFUNC => 'interval_cmp', - func_tmpl => \$func_base_template, - opclass_tmpl=> \$opclass_base_template, - }, - { - TYPENAME => 'macaddr', - TYPECMPFUNC => 'macaddr_cmp', - func_tmpl => \$func_base_template, - opclass_tmpl=> \$opclass_base_template, - }, - { - TYPENAME => 'inet', - TYPECMPFUNC => 'network_cmp', - func_tmpl => \$func_base_template, - opclass_tmpl=> \$opclass_base_template, - }, - { - TYPENAME => 'cidr', - TYPECMPFUNC => 'network_cmp', - TYPECMPTYPE => 'inet', - TYPESOPARG => '(inet, inet)', - func_tmpl => \$func_base_template, - opclass_tmpl=> \$opclass_base_template, - }, - { - TYPENAME => 'text', - TYPECMPFUNC => 'bttextcmp', - func_tmpl => \$func_base_template, - opclass_tmpl=> \$opclass_base_template, - }, - { - TYPENAME => 'varchar', - TYPECMPFUNC => 'bttextcmp', - TYPECMPTYPE => 'text', - TYPESUBIDENT=> 'text', - TYPESUBNAME => 'text', - TYPESOPARG => '(text, text)', - opclass_tmpl=> \$opclass_base_template, - }, - { - TYPENAME => '"char"', - TYPEIDENT => 'char', - TYPECMPFUNC => 'btcharcmp', - func_tmpl => \$func_base_template, - opclass_tmpl=> \$opclass_base_template, - }, - { - TYPENAME => 'bytea', - TYPECMPFUNC => 'byteacmp', - func_tmpl => \$func_base_template, - opclass_tmpl=> \$opclass_base_template, - }, - { - TYPENAME => 'bit', - TYPECMPFUNC => 'bitcmp', - func_tmpl => \$func_base_template, - opclass_tmpl=> \$opclass_base_template, - }, - { - TYPENAME => 'varbit', - TYPECMPFUNC => 'varbitcmp', - func_tmpl => \$func_base_template, - opclass_tmpl=> \$opclass_base_template, - }, - { - TYPENAME => 'numeric', - TYPECMPFUNC => 'rum_numeric_cmp', - func_tmpl => \$func_base_template, - opclass_tmpl=> \$opclass_base_template, - }, -); - -##############Generate!!! 
- -print <{TYPENAME}-----------------------*/\n\n"; - - for my $v (qw(func_tmpl opclass_tmpl)) - { - next if !exists $t->{$v}; - - my $x = ${$t->{$v}}; - - for my $k (grep {uc($_) eq $_} keys %$t) - { - $x=~s/$k/$t->{$k}/g; - } - - print $x; - } -} - -# Drop doesn't work -#print <{TYPEIDENT} = $_->{TYPENAME} if !exists $_->{TYPEIDENT}; - $_ - } ( - { - TYPENAME => 'int2', - func_tmpl => \$func_distance_template, - opclass_tmpl=> \$opclass_distance_template, - }, - { - TYPENAME => 'int4', - func_tmpl => \$func_distance_template, - opclass_tmpl=> \$opclass_distance_template, - }, - { - TYPENAME => 'int8', - func_tmpl => \$func_distance_template, - opclass_tmpl=> \$opclass_distance_template, - }, - { - TYPENAME => 'float4', - func_tmpl => \$func_distance_template, - opclass_tmpl=> \$opclass_distance_template, - }, - { - TYPENAME => 'float8', - func_tmpl => \$func_distance_template, - opclass_tmpl=> \$opclass_distance_template, - }, - { - TYPENAME => 'money', - func_tmpl => \$func_distance_template, - opclass_tmpl=> \$opclass_distance_template, - }, - { - TYPENAME => 'oid', - func_tmpl => \$func_distance_template, - opclass_tmpl=> \$opclass_distance_template, - }, - { - TYPENAME => 'timestamp', - func_tmpl => \$func_distance_template, - opclass_tmpl=> \$opclass_distance_template, - }, - { - TYPENAME => 'timestamptz', - func_tmpl => \$func_distance_template, - opclass_tmpl=> \$opclass_distance_template, - }, -); - -##############Generate!!! - -print < ( - PROCEDURE = rum_anyarray_distance, - LEFTARG = anyarray, - RIGHTARG = anyarray, - COMMUTATOR = '<=>' -); - - -CREATE FUNCTION rum_extract_anyarray(anyarray,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_extract_anyarray_query(anyarray,internal,smallint,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_anyarray_consistent(internal, smallint, anyarray, integer, internal, internal, internal, internal) -RETURNS bool -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_anyarray_ordering(internal,smallint,anyarray,int,internal,internal,internal,internal,internal) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - - -CREATE OPERATOR CLASS rum_anyarray_ops -DEFAULT FOR TYPE anyarray USING rum -AS - OPERATOR 1 && (anyarray, anyarray), - OPERATOR 2 @> (anyarray, anyarray), - OPERATOR 3 <@ (anyarray, anyarray), - OPERATOR 4 = (anyarray, anyarray), - OPERATOR 5 % (anyarray, anyarray), - OPERATOR 20 <=> (anyarray, anyarray) FOR ORDER BY pg_catalog.float_ops, - --dispatch function 1 for concrete type - FUNCTION 2 rum_extract_anyarray(anyarray,internal,internal,internal,internal), - FUNCTION 3 rum_extract_anyarray_query(anyarray,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_anyarray_consistent(internal,smallint,anyarray,integer,internal,internal,internal,internal), - FUNCTION 6 rum_anyarray_config(internal), - FUNCTION 8 rum_anyarray_ordering(internal,smallint,anyarray,int,internal,internal,internal,internal,internal), - STORAGE anyelement; - -CREATE OPERATOR CLASS rum_anyarray_addon_ops -FOR TYPE anyarray USING rum -AS - OPERATOR 1 && (anyarray, anyarray), - OPERATOR 2 @> (anyarray, anyarray), - OPERATOR 3 <@ (anyarray, anyarray), - OPERATOR 4 = (anyarray, anyarray), - --dispatch function 1 for concrete type - FUNCTION 2 ginarrayextract(anyarray,internal,internal), - FUNCTION 3 
ginqueryarrayextract(anyarray,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 ginarrayconsistent(internal,smallint,anyarray,integer,internal,internal,internal,internal), - STORAGE anyelement; - -EOT - -foreach my $t (@opinfo) -{ - print "/*--------------------$t->{TYPENAME}-----------------------*/\n\n"; - - for my $v (qw(func_tmpl opclass_tmpl)) - { - next if !exists $t->{$v}; - - my $x = ${$t->{$v}}; - - for my $k (grep {uc($_) eq $_} keys %$t) - { - $x=~s/$k/$t->{$k}/g; - } - - print $x; - } -} diff --git a/meson.build b/meson.build new file mode 100644 index 0000000000..b4336f0668 --- /dev/null +++ b/meson.build @@ -0,0 +1,118 @@ +# Copyright (c) 2025, Postgres Professional + +# Does not support the PGXS infrastructure at this time. Please, compile as part +# of the contrib source tree. + +extension = 'rum' +extversion = '1.3' + +rum_sources = files( + 'src/btree_rum.c', + 'src/rum_arr_utils.c', + 'src/rum_ts_utils.c', + 'src/rumbtree.c', + 'src/rumbulk.c', + 'src/rumdatapage.c', + 'src/rumentrypage.c', + 'src/rumget.c', + 'src/ruminsert.c', + 'src/rumscan.c', + 'src/rumsort.c', + 'src/rumtsquery.c', + 'src/rumutil.c', + 'src/rumvacuum.c', + 'src/rumvalidate.c', +) + +if host_system == 'windows' + rum_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'rum', + '--FILEDESC', 'RUM index access method',]) +endif + +rum = shared_module('rum', + rum_sources, + kwargs: contrib_mod_args, +) +contrib_targets += rum + +configure_file( + input: 'rum_init.sql', + output: extension + '--' + extversion + '.sql', + copy: true, + install: true, + install_dir: contrib_data_args['install_dir'], +) + +install_data( + 'rum.control', + 'rum--1.0--1.1.sql', + 'rum--1.1--1.2.sql', + 'rum--1.2--1.3.sql', + kwargs: contrib_data_args, +) + +tests += { + 'name': 'rum', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'security', + 'rum', + 'rum_validate', + 'rum_hash', + 'ruminv', + 'timestamp', + 'orderby', + 'orderby_hash', + 'altorder', + 'altorder_hash', + 'limits', + 'int2', + 'int4', + 'int8', + 'float4', + 'float8', + 'money', + 'oid', + 'time', + 'timetz', + 'date', + 'interval', + 'macaddr', + 'inet', + 'cidr', + 'text', + 'varchar', + 'char', + 'bytea', + 'bit', + 'varbit', + 'numeric', + 'rum_weight', + 'expr', + 'array', + ], + 'regress_args': [ + '--temp-config', files('logical.conf') + ], + }, + 'tap': { + 'tests': [ + 't/001_wal.pl', + 't/002_pglist.pl', + ], + 'test_kwargs': {'timeout': 3000}, + }, + 'isolation': { + 'specs': [ + 'predicate-rum', + 'predicate-rum-2', + ], + 'regress_args': [ + '--temp-config', files('logical.conf'), + '--load-extension=rum', + ], + }, +} diff --git a/rum--1.0.sql b/rum--1.0.sql deleted file mode 100644 index fc83eed11a..0000000000 --- a/rum--1.0.sql +++ /dev/null @@ -1,411 +0,0 @@ -CREATE OR REPLACE FUNCTION rumhandler(internal) -RETURNS index_am_handler -AS 'MODULE_PATHNAME' -LANGUAGE C; - -/* - * RUM access method - */ - -CREATE ACCESS METHOD rum TYPE INDEX HANDLER rumhandler; - -/* - * RUM built-in types, operators and functions - */ - --- Type used in distance calculations with normalization argument -CREATE TYPE rum_distance_query AS (query tsquery, method int); - -CREATE FUNCTION tsquery_to_distance_query(tsquery) -RETURNS rum_distance_query -AS 'MODULE_PATHNAME', 'tsquery_to_distance_query' -LANGUAGE C IMMUTABLE STRICT; - -CREATE CAST (tsquery AS rum_distance_query) - WITH FUNCTION tsquery_to_distance_query(tsquery) AS IMPLICIT; - -CREATE FUNCTION 
rum_ts_distance(tsvector,tsquery) -RETURNS float4 -AS 'MODULE_PATHNAME', 'rum_ts_distance_tt' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_ts_distance(tsvector,tsquery,int) -RETURNS float4 -AS 'MODULE_PATHNAME', 'rum_ts_distance_ttf' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_ts_distance(tsvector,rum_distance_query) -RETURNS float4 -AS 'MODULE_PATHNAME', 'rum_ts_distance_td' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=> ( - LEFTARG = tsvector, - RIGHTARG = tsquery, - PROCEDURE = rum_ts_distance -); - -CREATE OPERATOR <=> ( - LEFTARG = tsvector, - RIGHTARG = rum_distance_query, - PROCEDURE = rum_ts_distance -); - -CREATE FUNCTION rum_timestamp_distance(timestamp, timestamp) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=> ( - PROCEDURE = rum_timestamp_distance, - LEFTARG = timestamp, - RIGHTARG = timestamp, - COMMUTATOR = <=> -); - -CREATE FUNCTION rum_timestamp_left_distance(timestamp, timestamp) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=| ( - PROCEDURE = rum_timestamp_left_distance, - LEFTARG = timestamp, - RIGHTARG = timestamp, - COMMUTATOR = |=> -); - -CREATE FUNCTION rum_timestamp_right_distance(timestamp, timestamp) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR |=> ( - PROCEDURE = rum_timestamp_right_distance, - LEFTARG = timestamp, - RIGHTARG = timestamp, - COMMUTATOR = <=| -); - -/* - * rum_tsvector_ops operator class - */ - -CREATE FUNCTION rum_extract_tsvector(tsvector,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_tsvector_config(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal) -RETURNS bool -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_tsquery_consistent(internal, smallint, tsvector, integer, internal, internal, internal, internal) -RETURNS bool -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_tsquery_distance(internal,smallint,tsvector,int,internal,internal,internal,internal,internal) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - --- To prevent calling from SQL -CREATE FUNCTION rum_ts_join_pos(internal, internal) -RETURNS bytea -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR CLASS rum_tsvector_ops -DEFAULT FOR TYPE tsvector USING rum -AS - OPERATOR 1 @@ (tsvector, tsquery), - OPERATOR 2 <=> (tsvector, tsquery) FOR ORDER BY pg_catalog.float_ops, - FUNCTION 1 gin_cmp_tslexeme(text, text), - FUNCTION 2 rum_extract_tsvector(tsvector,internal,internal,internal,internal), - FUNCTION 3 rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_tsquery_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 5 gin_cmp_prefix(text,text,smallint,internal), - FUNCTION 6 rum_tsvector_config(internal), - FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 8 rum_tsquery_distance(internal,smallint,tsvector,int,internal,internal,internal,internal,internal), - FUNCTION 10 rum_ts_join_pos(internal, internal), - 
STORAGE text; - -/* - * rum_tsvector_hash_ops operator class. - * - * Stores hash of entries as keys in index. - */ - -CREATE FUNCTION rum_extract_tsvector_hash(tsvector,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_extract_tsquery_hash(tsquery,internal,smallint,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR CLASS rum_tsvector_hash_ops -FOR TYPE tsvector USING rum -AS - OPERATOR 1 @@ (tsvector, tsquery), - OPERATOR 2 <=> (tsvector, tsquery) FOR ORDER BY pg_catalog.float_ops, - FUNCTION 1 btint4cmp(integer, integer), - FUNCTION 2 rum_extract_tsvector_hash(tsvector,internal,internal,internal,internal), - FUNCTION 3 rum_extract_tsquery_hash(tsquery,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_tsquery_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 6 rum_tsvector_config(internal), - FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 8 rum_tsquery_distance(internal,smallint,tsvector,int,internal,internal,internal,internal,internal), - FUNCTION 10 rum_ts_join_pos(internal, internal), - STORAGE integer; - -/* - * rum_timestamp_ops operator class - */ - --- timestamp operator class - -CREATE FUNCTION rum_timestamp_extract_value(timestamp,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_timestamp_compare_prefix(timestamp,timestamp,smallint,internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_timestamp_config(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_timestamp_extract_query(timestamp,internal,smallint,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_timestamp_consistent(internal,smallint,timestamp,int,internal,internal,internal,internal) -RETURNS bool -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_timestamp_outer_distance(timestamp, timestamp, smallint) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE OPERATOR CLASS rum_timestamp_ops -DEFAULT FOR TYPE timestamp USING rum -AS - OPERATOR 1 <, - OPERATOR 2 <=, - OPERATOR 3 =, - OPERATOR 4 >=, - OPERATOR 5 >, - --support - FUNCTION 1 timestamp_cmp(timestamp,timestamp), - FUNCTION 2 rum_timestamp_extract_value(timestamp,internal,internal,internal,internal), - FUNCTION 3 rum_timestamp_extract_query(timestamp,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_timestamp_consistent(internal,smallint,timestamp,int,internal,internal,internal,internal), - FUNCTION 5 rum_timestamp_compare_prefix(timestamp,timestamp,smallint,internal), - FUNCTION 6 rum_timestamp_config(internal), - -- support to timestamp distance in rum_tsvector_timestamp_ops - FUNCTION 9 rum_timestamp_outer_distance(timestamp, timestamp, smallint), - OPERATOR 20 <=> (timestamp,timestamp) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 21 <=| (timestamp,timestamp) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 22 |=> (timestamp,timestamp) FOR ORDER BY pg_catalog.float_ops, -STORAGE timestamp; - -/* - * rum_tsvector_timestamp_ops operator class. - * - * Stores timestamp with tsvector. 
- */ - -CREATE FUNCTION rum_tsquery_timestamp_consistent(internal, smallint, tsvector, integer, internal, internal, internal, internal) -RETURNS bool -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -/* - * !!!deprecated, use rum_tsvector_addon_ops!!! - */ -CREATE OPERATOR CLASS rum_tsvector_timestamp_ops -FOR TYPE tsvector USING rum -AS - OPERATOR 1 @@ (tsvector, tsquery), - --support function - FUNCTION 1 gin_cmp_tslexeme(text, text), - FUNCTION 2 rum_extract_tsvector(tsvector,internal,internal,internal,internal), - FUNCTION 3 rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_tsquery_timestamp_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 5 gin_cmp_prefix(text,text,smallint,internal), - FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - STORAGE text; - -/* - * rum_tsvector_hash_timestamp_ops operator class - * !!!deprecated, use rum_tsvector_hash_addon_ops!!! - */ - -CREATE OPERATOR CLASS rum_tsvector_hash_timestamp_ops -FOR TYPE tsvector USING rum -AS - OPERATOR 1 @@ (tsvector, tsquery), - --support function - FUNCTION 1 btint4cmp(integer, integer), - FUNCTION 2 rum_extract_tsvector_hash(tsvector,internal,internal,internal,internal), - FUNCTION 3 rum_extract_tsquery_hash(tsquery,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_tsquery_timestamp_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - STORAGE integer; - -/* - * rum_timestamptz_ops operator class - */ - -CREATE FUNCTION rum_timestamptz_distance(timestamptz, timestamptz) -RETURNS float8 -AS 'MODULE_PATHNAME', 'rum_timestamp_distance' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=> ( - PROCEDURE = rum_timestamptz_distance, - LEFTARG = timestamptz, - RIGHTARG = timestamptz, - COMMUTATOR = <=> -); - -CREATE FUNCTION rum_timestamptz_left_distance(timestamptz, timestamptz) -RETURNS float8 -AS 'MODULE_PATHNAME', 'rum_timestamp_left_distance' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=| ( - PROCEDURE = rum_timestamptz_left_distance, - LEFTARG = timestamptz, - RIGHTARG = timestamptz, - COMMUTATOR = |=> -); - -CREATE FUNCTION rum_timestamptz_right_distance(timestamptz, timestamptz) -RETURNS float8 -AS 'MODULE_PATHNAME', 'rum_timestamp_right_distance' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR |=> ( - PROCEDURE = rum_timestamptz_right_distance, - LEFTARG = timestamptz, - RIGHTARG = timestamptz, - COMMUTATOR = <=| -); - -CREATE OPERATOR CLASS rum_timestamptz_ops -DEFAULT FOR TYPE timestamptz USING rum -AS - OPERATOR 1 <, - OPERATOR 2 <=, - OPERATOR 3 =, - OPERATOR 4 >=, - OPERATOR 5 >, - --support - FUNCTION 1 timestamptz_cmp(timestamptz,timestamptz), - FUNCTION 2 rum_timestamp_extract_value(timestamp,internal,internal,internal,internal), - FUNCTION 3 rum_timestamp_extract_query(timestamp,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_timestamp_consistent(internal,smallint,timestamp,int,internal,internal,internal,internal), - FUNCTION 5 rum_timestamp_compare_prefix(timestamp,timestamp,smallint,internal), - FUNCTION 6 rum_timestamp_config(internal), - -- support to timestamptz distance in rum_tsvector_timestamptz_ops - FUNCTION 9 rum_timestamp_outer_distance(timestamp, timestamp, smallint), - OPERATOR 20 <=> (timestamptz,timestamptz) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 
21 <=| (timestamptz,timestamptz) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 22 |=> (timestamptz,timestamptz) FOR ORDER BY pg_catalog.float_ops, -STORAGE timestamptz; - -/* - * rum_tsvector_timestamptz_ops operator class. - * - * Stores tsvector with timestamptz. - */ - -CREATE OPERATOR CLASS rum_tsvector_timestamptz_ops -FOR TYPE tsvector USING rum -AS - OPERATOR 1 @@ (tsvector, tsquery), - --support function - FUNCTION 1 gin_cmp_tslexeme(text, text), - FUNCTION 2 rum_extract_tsvector(tsvector,internal,internal,internal,internal), - FUNCTION 3 rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_tsquery_timestamp_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 5 gin_cmp_prefix(text,text,smallint,internal), - FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - STORAGE text; - -/* - * rum_tsvector_hash_timestamptz_ops operator class - */ - -CREATE OPERATOR CLASS rum_tsvector_hash_timestamptz_ops -FOR TYPE tsvector USING rum -AS - OPERATOR 1 @@ (tsvector, tsquery), - --support function - FUNCTION 1 btint4cmp(integer, integer), - FUNCTION 2 rum_extract_tsvector_hash(tsvector,internal,internal,internal,internal), - FUNCTION 3 rum_extract_tsquery_hash(tsquery,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_tsquery_timestamp_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - STORAGE integer; - -/* - * rum_tsquery_ops operator class. - * - * Used for inversed text search. - */ - -CREATE FUNCTION ruminv_extract_tsquery(tsquery,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION ruminv_extract_tsvector(tsvector,internal,smallint,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION ruminv_tsvector_consistent(internal, smallint, tsvector, integer, internal, internal, internal, internal) -RETURNS bool -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION ruminv_tsquery_config(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR CLASS rum_tsquery_ops -DEFAULT FOR TYPE tsquery USING rum -AS - OPERATOR 1 @@ (tsquery, tsvector), - FUNCTION 1 gin_cmp_tslexeme(text, text), - FUNCTION 2 ruminv_extract_tsquery(tsquery,internal,internal,internal,internal), - FUNCTION 3 ruminv_extract_tsvector(tsvector,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 ruminv_tsvector_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 6 ruminv_tsquery_config(internal), - STORAGE text; diff --git a/rum--1.1--1.2.sql b/rum--1.1--1.2.sql index fad0250c87..f1ea81bc1f 100644 --- a/rum--1.1--1.2.sql +++ b/rum--1.1--1.2.sql @@ -10,7 +10,7 @@ AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT; -CREATE OR REPLACE FUNCTION rum_anyarray_similar(anyarray,anyarray) +CREATE FUNCTION rum_anyarray_similar(anyarray,anyarray) RETURNS bool AS 'MODULE_PATHNAME' LANGUAGE C STRICT STABLE; @@ -25,7 +25,7 @@ CREATE OPERATOR % ( ); -CREATE OR REPLACE FUNCTION rum_anyarray_distance(anyarray,anyarray) +CREATE FUNCTION rum_anyarray_distance(anyarray,anyarray) RETURNS float8 AS 'MODULE_PATHNAME' LANGUAGE C STRICT STABLE; diff --git a/rum--1.1.sql b/rum--1.1.sql deleted file mode 100644 
index 15b8ebae1c..0000000000 --- a/rum--1.1.sql +++ /dev/null @@ -1,1513 +0,0 @@ -CREATE OR REPLACE FUNCTION rumhandler(internal) -RETURNS index_am_handler -AS 'MODULE_PATHNAME' -LANGUAGE C; - -/* - * RUM access method - */ - -CREATE ACCESS METHOD rum TYPE INDEX HANDLER rumhandler; - -/* - * RUM built-in types, operators and functions - */ - --- Type used in distance calculations with normalization argument -CREATE TYPE rum_distance_query AS (query tsquery, method int); - -CREATE FUNCTION tsquery_to_distance_query(tsquery) -RETURNS rum_distance_query -AS 'MODULE_PATHNAME', 'tsquery_to_distance_query' -LANGUAGE C IMMUTABLE STRICT; - -CREATE CAST (tsquery AS rum_distance_query) - WITH FUNCTION tsquery_to_distance_query(tsquery) AS IMPLICIT; - -CREATE FUNCTION rum_ts_distance(tsvector,tsquery) -RETURNS float4 -AS 'MODULE_PATHNAME', 'rum_ts_distance_tt' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_ts_distance(tsvector,tsquery,int) -RETURNS float4 -AS 'MODULE_PATHNAME', 'rum_ts_distance_ttf' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_ts_distance(tsvector,rum_distance_query) -RETURNS float4 -AS 'MODULE_PATHNAME', 'rum_ts_distance_td' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=> ( - LEFTARG = tsvector, - RIGHTARG = tsquery, - PROCEDURE = rum_ts_distance -); - -CREATE OPERATOR <=> ( - LEFTARG = tsvector, - RIGHTARG = rum_distance_query, - PROCEDURE = rum_ts_distance -); - -CREATE FUNCTION rum_timestamp_distance(timestamp, timestamp) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=> ( - PROCEDURE = rum_timestamp_distance, - LEFTARG = timestamp, - RIGHTARG = timestamp, - COMMUTATOR = <=> -); - -CREATE FUNCTION rum_timestamp_left_distance(timestamp, timestamp) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=| ( - PROCEDURE = rum_timestamp_left_distance, - LEFTARG = timestamp, - RIGHTARG = timestamp, - COMMUTATOR = |=> -); - -CREATE FUNCTION rum_timestamp_right_distance(timestamp, timestamp) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR |=> ( - PROCEDURE = rum_timestamp_right_distance, - LEFTARG = timestamp, - RIGHTARG = timestamp, - COMMUTATOR = <=| -); - -/* - * rum_tsvector_ops operator class - */ - -CREATE FUNCTION rum_extract_tsvector(tsvector,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_tsvector_config(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal) -RETURNS bool -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_tsquery_consistent(internal, smallint, tsvector, integer, internal, internal, internal, internal) -RETURNS bool -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_tsquery_distance(internal,smallint,tsvector,int,internal,internal,internal,internal,internal) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - --- To prevent calling from SQL -CREATE FUNCTION rum_ts_join_pos(internal, internal) -RETURNS bytea -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR CLASS rum_tsvector_ops -DEFAULT FOR TYPE tsvector USING rum -AS - OPERATOR 1 @@ (tsvector, tsquery), - OPERATOR 
2 <=> (tsvector, tsquery) FOR ORDER BY pg_catalog.float_ops, - FUNCTION 1 gin_cmp_tslexeme(text, text), - FUNCTION 2 rum_extract_tsvector(tsvector,internal,internal,internal,internal), - FUNCTION 3 rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_tsquery_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 5 gin_cmp_prefix(text,text,smallint,internal), - FUNCTION 6 rum_tsvector_config(internal), - FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 8 rum_tsquery_distance(internal,smallint,tsvector,int,internal,internal,internal,internal,internal), - FUNCTION 10 rum_ts_join_pos(internal, internal), - STORAGE text; - -/* - * rum_tsvector_hash_ops operator class. - * - * Stores hash of entries as keys in index. - */ - -CREATE FUNCTION rum_extract_tsvector_hash(tsvector,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_extract_tsquery_hash(tsquery,internal,smallint,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR CLASS rum_tsvector_hash_ops -FOR TYPE tsvector USING rum -AS - OPERATOR 1 @@ (tsvector, tsquery), - OPERATOR 2 <=> (tsvector, tsquery) FOR ORDER BY pg_catalog.float_ops, - FUNCTION 1 btint4cmp(integer, integer), - FUNCTION 2 rum_extract_tsvector_hash(tsvector,internal,internal,internal,internal), - FUNCTION 3 rum_extract_tsquery_hash(tsquery,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_tsquery_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 6 rum_tsvector_config(internal), - FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 8 rum_tsquery_distance(internal,smallint,tsvector,int,internal,internal,internal,internal,internal), - FUNCTION 10 rum_ts_join_pos(internal, internal), - STORAGE integer; - -/* - * rum_timestamp_ops operator class - */ - --- timestamp operator class - -CREATE FUNCTION rum_timestamp_extract_value(timestamp,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_timestamp_compare_prefix(timestamp,timestamp,smallint,internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_timestamp_config(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_timestamp_extract_query(timestamp,internal,smallint,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_timestamp_consistent(internal,smallint,timestamp,int,internal,internal,internal,internal) -RETURNS bool -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_timestamp_outer_distance(timestamp, timestamp, smallint) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE OPERATOR CLASS rum_timestamp_ops -DEFAULT FOR TYPE timestamp USING rum -AS - OPERATOR 1 <, - OPERATOR 2 <=, - OPERATOR 3 =, - OPERATOR 4 >=, - OPERATOR 5 >, - --support - FUNCTION 1 timestamp_cmp(timestamp,timestamp), - FUNCTION 2 rum_timestamp_extract_value(timestamp,internal,internal,internal,internal), - FUNCTION 3 rum_timestamp_extract_query(timestamp,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 
rum_timestamp_consistent(internal,smallint,timestamp,int,internal,internal,internal,internal), - FUNCTION 5 rum_timestamp_compare_prefix(timestamp,timestamp,smallint,internal), - FUNCTION 6 rum_timestamp_config(internal), - -- support to timestamp disttance in rum_tsvector_timestamp_ops - FUNCTION 9 rum_timestamp_outer_distance(timestamp, timestamp, smallint), - OPERATOR 20 <=> (timestamp,timestamp) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 21 <=| (timestamp,timestamp) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 22 |=> (timestamp,timestamp) FOR ORDER BY pg_catalog.float_ops, -STORAGE timestamp; - -/* - * rum_tsvector_timestamp_ops operator class. - * - * Stores timestamp with tsvector. - */ - -CREATE FUNCTION rum_tsquery_timestamp_consistent(internal, smallint, tsvector, integer, internal, internal, internal, internal) -RETURNS bool -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -/* - * !!!deprecated, use rum_tsvector_hash_addon_ops!!! - */ -CREATE OPERATOR CLASS rum_tsvector_timestamp_ops -FOR TYPE tsvector USING rum -AS - OPERATOR 1 @@ (tsvector, tsquery), - --support function - FUNCTION 1 gin_cmp_tslexeme(text, text), - FUNCTION 2 rum_extract_tsvector(tsvector,internal,internal,internal,internal), - FUNCTION 3 rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_tsquery_timestamp_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 5 gin_cmp_prefix(text,text,smallint,internal), - FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - STORAGE text; - -/* - * rum_tsvector_hash_timestamp_ops operator class - * !!!deprecated, use rum_tsvector_hash_addon_ops!!! - */ - -CREATE OPERATOR CLASS rum_tsvector_hash_timestamp_ops -FOR TYPE tsvector USING rum -AS - OPERATOR 1 @@ (tsvector, tsquery), - --support function - FUNCTION 1 btint4cmp(integer, integer), - FUNCTION 2 rum_extract_tsvector_hash(tsvector,internal,internal,internal,internal), - FUNCTION 3 rum_extract_tsquery_hash(tsquery,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_tsquery_timestamp_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - STORAGE integer; - -/* - * rum_timestamptz_ops operator class - */ - -CREATE FUNCTION rum_timestamptz_distance(timestamptz, timestamptz) -RETURNS float8 -AS 'MODULE_PATHNAME', 'rum_timestamp_distance' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=> ( - PROCEDURE = rum_timestamptz_distance, - LEFTARG = timestamptz, - RIGHTARG = timestamptz, - COMMUTATOR = <=> -); - -CREATE FUNCTION rum_timestamptz_left_distance(timestamptz, timestamptz) -RETURNS float8 -AS 'MODULE_PATHNAME', 'rum_timestamp_left_distance' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=| ( - PROCEDURE = rum_timestamptz_left_distance, - LEFTARG = timestamptz, - RIGHTARG = timestamptz, - COMMUTATOR = |=> -); - -CREATE FUNCTION rum_timestamptz_right_distance(timestamptz, timestamptz) -RETURNS float8 -AS 'MODULE_PATHNAME', 'rum_timestamp_right_distance' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR |=> ( - PROCEDURE = rum_timestamptz_right_distance, - LEFTARG = timestamptz, - RIGHTARG = timestamptz, - COMMUTATOR = <=| -); - -CREATE OPERATOR CLASS rum_timestamptz_ops -DEFAULT FOR TYPE timestamptz USING rum -AS - OPERATOR 1 <, - OPERATOR 2 <=, - OPERATOR 3 =, - OPERATOR 4 >=, - OPERATOR 5 >, - --support - FUNCTION 1 
timestamptz_cmp(timestamptz,timestamptz), - FUNCTION 2 rum_timestamp_extract_value(timestamp,internal,internal,internal,internal), - FUNCTION 3 rum_timestamp_extract_query(timestamp,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_timestamp_consistent(internal,smallint,timestamp,int,internal,internal,internal,internal), - FUNCTION 5 rum_timestamp_compare_prefix(timestamp,timestamp,smallint,internal), - FUNCTION 6 rum_timestamp_config(internal), - -- support to timestamptz distance in rum_tsvector_timestamptz_ops - FUNCTION 9 rum_timestamp_outer_distance(timestamp, timestamp, smallint), - OPERATOR 20 <=> (timestamptz,timestamptz) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 21 <=| (timestamptz,timestamptz) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 22 |=> (timestamptz,timestamptz) FOR ORDER BY pg_catalog.float_ops, -STORAGE timestamptz; - -/* - * rum_tsvector_timestamptz_ops operator class. - * - * Stores tsvector with timestamptz. - */ - -CREATE OPERATOR CLASS rum_tsvector_timestamptz_ops -FOR TYPE tsvector USING rum -AS - OPERATOR 1 @@ (tsvector, tsquery), - --support function - FUNCTION 1 gin_cmp_tslexeme(text, text), - FUNCTION 2 rum_extract_tsvector(tsvector,internal,internal,internal,internal), - FUNCTION 3 rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_tsquery_timestamp_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 5 gin_cmp_prefix(text,text,smallint,internal), - FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - STORAGE text; - -/* - * rum_tsvector_hash_timestamptz_ops operator class - */ - -CREATE OPERATOR CLASS rum_tsvector_hash_timestamptz_ops -FOR TYPE tsvector USING rum -AS - OPERATOR 1 @@ (tsvector, tsquery), - --support function - FUNCTION 1 btint4cmp(integer, integer), - FUNCTION 2 rum_extract_tsvector_hash(tsvector,internal,internal,internal,internal), - FUNCTION 3 rum_extract_tsquery_hash(tsquery,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_tsquery_timestamp_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - STORAGE integer; - -/* - * rum_tsquery_ops operator class. - * - * Used for inversed text search. 
- */ - -CREATE FUNCTION ruminv_extract_tsquery(tsquery,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION ruminv_extract_tsvector(tsvector,internal,smallint,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION ruminv_tsvector_consistent(internal, smallint, tsvector, integer, internal, internal, internal, internal) -RETURNS bool -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION ruminv_tsquery_config(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR CLASS rum_tsquery_ops -DEFAULT FOR TYPE tsquery USING rum -AS - OPERATOR 1 @@ (tsquery, tsvector), - FUNCTION 1 gin_cmp_tslexeme(text, text), - FUNCTION 2 ruminv_extract_tsquery(tsquery,internal,internal,internal,internal), - FUNCTION 3 ruminv_extract_tsvector(tsvector,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 ruminv_tsvector_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 6 ruminv_tsquery_config(internal), - STORAGE text; -CREATE FUNCTION rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal) -RETURNS bool -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -ALTER FUNCTION - rum_tsquery_timestamp_consistent (internal,smallint,tsvector,int,internal,internal,internal,internal) - RENAME TO rum_tsquery_addon_consistent; - -CREATE FUNCTION rum_numeric_cmp(numeric, numeric) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE OPERATOR CLASS rum_tsvector_addon_ops -FOR TYPE tsvector USING rum -AS - OPERATOR 1 @@ (tsvector, tsquery), - --support function - FUNCTION 1 gin_cmp_tslexeme(text, text), - FUNCTION 2 rum_extract_tsvector(tsvector,internal,internal,internal,internal), - FUNCTION 3 rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_tsquery_addon_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 5 gin_cmp_prefix(text,text,smallint,internal), - FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - STORAGE text; - -CREATE OPERATOR CLASS rum_tsvector_hash_addon_ops -FOR TYPE tsvector USING rum -AS - OPERATOR 1 @@ (tsvector, tsquery), - --support function - FUNCTION 1 btint4cmp(integer, integer), - FUNCTION 2 rum_extract_tsvector_hash(tsvector,internal,internal,internal,internal), - FUNCTION 3 rum_extract_tsquery_hash(tsquery,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_tsquery_addon_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - STORAGE integer; - -/*--------------------int2-----------------------*/ - -CREATE FUNCTION rum_int2_extract_value(int2, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_int2_compare_prefix(int2, int2, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_int2_extract_query(int2, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - - -CREATE FUNCTION rum_int2_distance(int2, int2) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=> ( - PROCEDURE = rum_int2_distance, - LEFTARG = 
int2, - RIGHTARG = int2, - COMMUTATOR = <=> -); - -CREATE FUNCTION rum_int2_left_distance(int2, int2) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=| ( - PROCEDURE = rum_int2_left_distance, - LEFTARG = int2, - RIGHTARG = int2, - COMMUTATOR = |=> -); - -CREATE FUNCTION rum_int2_right_distance(int2, int2) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR |=> ( - PROCEDURE = rum_int2_right_distance, - LEFTARG = int2, - RIGHTARG = int2, - COMMUTATOR = <=| -); - -CREATE FUNCTION rum_int2_outer_distance(int2, int2, smallint) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_int2_config(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - - - -CREATE OPERATOR CLASS rum_int2_ops -DEFAULT FOR TYPE int2 USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - OPERATOR 20 <=> (int2,int2) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 21 <=| (int2,int2) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 22 |=> (int2,int2) FOR ORDER BY pg_catalog.float_ops, - FUNCTION 1 btint2cmp(int2,int2), - FUNCTION 2 rum_int2_extract_value(int2, internal), - FUNCTION 3 rum_int2_extract_query(int2, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_int2_compare_prefix(int2,int2,int2, internal), - -- support to int2 distance in rum_tsvector_addon_ops - FUNCTION 6 rum_int2_config(internal), - FUNCTION 9 rum_int2_outer_distance(int2, int2, smallint), -STORAGE int2; - -/*--------------------int4-----------------------*/ - -CREATE FUNCTION rum_int4_extract_value(int4, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_int4_compare_prefix(int4, int4, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_int4_extract_query(int4, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - - -CREATE FUNCTION rum_int4_distance(int4, int4) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=> ( - PROCEDURE = rum_int4_distance, - LEFTARG = int4, - RIGHTARG = int4, - COMMUTATOR = <=> -); - -CREATE FUNCTION rum_int4_left_distance(int4, int4) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=| ( - PROCEDURE = rum_int4_left_distance, - LEFTARG = int4, - RIGHTARG = int4, - COMMUTATOR = |=> -); - -CREATE FUNCTION rum_int4_right_distance(int4, int4) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR |=> ( - PROCEDURE = rum_int4_right_distance, - LEFTARG = int4, - RIGHTARG = int4, - COMMUTATOR = <=| -); - -CREATE FUNCTION rum_int4_outer_distance(int4, int4, smallint) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_int4_config(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - - - -CREATE OPERATOR CLASS rum_int4_ops -DEFAULT FOR TYPE int4 USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - OPERATOR 20 <=> (int4,int4) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 21 <=| (int4,int4) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 22 |=> (int4,int4) FOR ORDER BY pg_catalog.float_ops, - FUNCTION 1 btint4cmp(int4,int4), - FUNCTION 2 rum_int4_extract_value(int4, internal), - 
FUNCTION 3 rum_int4_extract_query(int4, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_int4_compare_prefix(int4,int4,int2, internal), - -- support to int4 distance in rum_tsvector_addon_ops - FUNCTION 6 rum_int4_config(internal), - FUNCTION 9 rum_int4_outer_distance(int4, int4, smallint), -STORAGE int4; - -/*--------------------int8-----------------------*/ - -CREATE FUNCTION rum_int8_extract_value(int8, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_int8_compare_prefix(int8, int8, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_int8_extract_query(int8, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - - -CREATE FUNCTION rum_int8_distance(int8, int8) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=> ( - PROCEDURE = rum_int8_distance, - LEFTARG = int8, - RIGHTARG = int8, - COMMUTATOR = <=> -); - -CREATE FUNCTION rum_int8_left_distance(int8, int8) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=| ( - PROCEDURE = rum_int8_left_distance, - LEFTARG = int8, - RIGHTARG = int8, - COMMUTATOR = |=> -); - -CREATE FUNCTION rum_int8_right_distance(int8, int8) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR |=> ( - PROCEDURE = rum_int8_right_distance, - LEFTARG = int8, - RIGHTARG = int8, - COMMUTATOR = <=| -); - -CREATE FUNCTION rum_int8_outer_distance(int8, int8, smallint) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_int8_config(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - - - -CREATE OPERATOR CLASS rum_int8_ops -DEFAULT FOR TYPE int8 USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - OPERATOR 20 <=> (int8,int8) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 21 <=| (int8,int8) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 22 |=> (int8,int8) FOR ORDER BY pg_catalog.float_ops, - FUNCTION 1 btint8cmp(int8,int8), - FUNCTION 2 rum_int8_extract_value(int8, internal), - FUNCTION 3 rum_int8_extract_query(int8, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_int8_compare_prefix(int8,int8,int2, internal), - -- support to int8 distance in rum_tsvector_addon_ops - FUNCTION 6 rum_int8_config(internal), - FUNCTION 9 rum_int8_outer_distance(int8, int8, smallint), -STORAGE int8; - -/*--------------------float4-----------------------*/ - -CREATE FUNCTION rum_float4_extract_value(float4, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_float4_compare_prefix(float4, float4, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_float4_extract_query(float4, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - - -CREATE FUNCTION rum_float4_distance(float4, float4) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=> ( - PROCEDURE = rum_float4_distance, - LEFTARG = float4, - RIGHTARG = float4, - COMMUTATOR = <=> -); - -CREATE FUNCTION rum_float4_left_distance(float4, float4) -RETURNS float8 -AS 
'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=| ( - PROCEDURE = rum_float4_left_distance, - LEFTARG = float4, - RIGHTARG = float4, - COMMUTATOR = |=> -); - -CREATE FUNCTION rum_float4_right_distance(float4, float4) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR |=> ( - PROCEDURE = rum_float4_right_distance, - LEFTARG = float4, - RIGHTARG = float4, - COMMUTATOR = <=| -); - -CREATE FUNCTION rum_float4_outer_distance(float4, float4, smallint) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_float4_config(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - - - -CREATE OPERATOR CLASS rum_float4_ops -DEFAULT FOR TYPE float4 USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - OPERATOR 20 <=> (float4,float4) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 21 <=| (float4,float4) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 22 |=> (float4,float4) FOR ORDER BY pg_catalog.float_ops, - FUNCTION 1 btfloat4cmp(float4,float4), - FUNCTION 2 rum_float4_extract_value(float4, internal), - FUNCTION 3 rum_float4_extract_query(float4, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_float4_compare_prefix(float4,float4,int2, internal), - -- support to float4 distance in rum_tsvector_addon_ops - FUNCTION 6 rum_float4_config(internal), - FUNCTION 9 rum_float4_outer_distance(float4, float4, smallint), -STORAGE float4; - -/*--------------------float8-----------------------*/ - -CREATE FUNCTION rum_float8_extract_value(float8, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_float8_compare_prefix(float8, float8, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_float8_extract_query(float8, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - - -CREATE FUNCTION rum_float8_distance(float8, float8) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=> ( - PROCEDURE = rum_float8_distance, - LEFTARG = float8, - RIGHTARG = float8, - COMMUTATOR = <=> -); - -CREATE FUNCTION rum_float8_left_distance(float8, float8) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=| ( - PROCEDURE = rum_float8_left_distance, - LEFTARG = float8, - RIGHTARG = float8, - COMMUTATOR = |=> -); - -CREATE FUNCTION rum_float8_right_distance(float8, float8) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR |=> ( - PROCEDURE = rum_float8_right_distance, - LEFTARG = float8, - RIGHTARG = float8, - COMMUTATOR = <=| -); - -CREATE FUNCTION rum_float8_outer_distance(float8, float8, smallint) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_float8_config(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - - - -CREATE OPERATOR CLASS rum_float8_ops -DEFAULT FOR TYPE float8 USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - OPERATOR 20 <=> (float8,float8) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 21 <=| (float8,float8) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 22 |=> (float8,float8) FOR ORDER BY pg_catalog.float_ops, - FUNCTION 1 btfloat8cmp(float8,float8), - FUNCTION 2 
rum_float8_extract_value(float8, internal), - FUNCTION 3 rum_float8_extract_query(float8, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_float8_compare_prefix(float8,float8,int2, internal), - -- support to float8 distance in rum_tsvector_addon_ops - FUNCTION 6 rum_float8_config(internal), - FUNCTION 9 rum_float8_outer_distance(float8, float8, smallint), -STORAGE float8; - -/*--------------------money-----------------------*/ - -CREATE FUNCTION rum_money_extract_value(money, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_money_compare_prefix(money, money, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_money_extract_query(money, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - - -CREATE FUNCTION rum_money_distance(money, money) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=> ( - PROCEDURE = rum_money_distance, - LEFTARG = money, - RIGHTARG = money, - COMMUTATOR = <=> -); - -CREATE FUNCTION rum_money_left_distance(money, money) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=| ( - PROCEDURE = rum_money_left_distance, - LEFTARG = money, - RIGHTARG = money, - COMMUTATOR = |=> -); - -CREATE FUNCTION rum_money_right_distance(money, money) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR |=> ( - PROCEDURE = rum_money_right_distance, - LEFTARG = money, - RIGHTARG = money, - COMMUTATOR = <=| -); - -CREATE FUNCTION rum_money_outer_distance(money, money, smallint) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_money_config(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - - - -CREATE OPERATOR CLASS rum_money_ops -DEFAULT FOR TYPE money USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - OPERATOR 20 <=> (money,money) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 21 <=| (money,money) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 22 |=> (money,money) FOR ORDER BY pg_catalog.float_ops, - FUNCTION 1 cash_cmp(money,money), - FUNCTION 2 rum_money_extract_value(money, internal), - FUNCTION 3 rum_money_extract_query(money, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_money_compare_prefix(money,money,int2, internal), - -- support to money distance in rum_tsvector_addon_ops - FUNCTION 6 rum_money_config(internal), - FUNCTION 9 rum_money_outer_distance(money, money, smallint), -STORAGE money; - -/*--------------------oid-----------------------*/ - -CREATE FUNCTION rum_oid_extract_value(oid, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_oid_compare_prefix(oid, oid, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_oid_extract_query(oid, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - - -CREATE FUNCTION rum_oid_distance(oid, oid) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=> ( - PROCEDURE = rum_oid_distance, - LEFTARG = oid, - RIGHTARG = oid, - COMMUTATOR = <=> -); - 
-CREATE FUNCTION rum_oid_left_distance(oid, oid) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=| ( - PROCEDURE = rum_oid_left_distance, - LEFTARG = oid, - RIGHTARG = oid, - COMMUTATOR = |=> -); - -CREATE FUNCTION rum_oid_right_distance(oid, oid) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR |=> ( - PROCEDURE = rum_oid_right_distance, - LEFTARG = oid, - RIGHTARG = oid, - COMMUTATOR = <=| -); - -CREATE FUNCTION rum_oid_outer_distance(oid, oid, smallint) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_oid_config(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - - - -CREATE OPERATOR CLASS rum_oid_ops -DEFAULT FOR TYPE oid USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - OPERATOR 20 <=> (oid,oid) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 21 <=| (oid,oid) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 22 |=> (oid,oid) FOR ORDER BY pg_catalog.float_ops, - FUNCTION 1 btoidcmp(oid,oid), - FUNCTION 2 rum_oid_extract_value(oid, internal), - FUNCTION 3 rum_oid_extract_query(oid, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_oid_compare_prefix(oid,oid,int2, internal), - -- support to oid distance in rum_tsvector_addon_ops - FUNCTION 6 rum_oid_config(internal), - FUNCTION 9 rum_oid_outer_distance(oid, oid, smallint), -STORAGE oid; - -/*--------------------time-----------------------*/ - -CREATE FUNCTION rum_time_extract_value(time, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_time_compare_prefix(time, time, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_time_extract_query(time, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - -CREATE OPERATOR CLASS rum_time_ops -DEFAULT FOR TYPE time USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - FUNCTION 1 time_cmp(time,time), - FUNCTION 2 rum_time_extract_value(time, internal), - FUNCTION 3 rum_time_extract_query(time, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_time_compare_prefix(time,time,int2, internal), -STORAGE time; - -/*--------------------timetz-----------------------*/ - -CREATE FUNCTION rum_timetz_extract_value(timetz, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_timetz_compare_prefix(timetz, timetz, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_timetz_extract_query(timetz, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - -CREATE OPERATOR CLASS rum_timetz_ops -DEFAULT FOR TYPE timetz USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - FUNCTION 1 timetz_cmp(timetz,timetz), - FUNCTION 2 rum_timetz_extract_value(timetz, internal), - FUNCTION 3 rum_timetz_extract_query(timetz, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_timetz_compare_prefix(timetz,timetz,int2, 
internal), -STORAGE timetz; - -/*--------------------date-----------------------*/ - -CREATE FUNCTION rum_date_extract_value(date, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_date_compare_prefix(date, date, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_date_extract_query(date, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - -CREATE OPERATOR CLASS rum_date_ops -DEFAULT FOR TYPE date USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - FUNCTION 1 date_cmp(date,date), - FUNCTION 2 rum_date_extract_value(date, internal), - FUNCTION 3 rum_date_extract_query(date, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_date_compare_prefix(date,date,int2, internal), -STORAGE date; - -/*--------------------interval-----------------------*/ - -CREATE FUNCTION rum_interval_extract_value(interval, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_interval_compare_prefix(interval, interval, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_interval_extract_query(interval, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - -CREATE OPERATOR CLASS rum_interval_ops -DEFAULT FOR TYPE interval USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - FUNCTION 1 interval_cmp(interval,interval), - FUNCTION 2 rum_interval_extract_value(interval, internal), - FUNCTION 3 rum_interval_extract_query(interval, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_interval_compare_prefix(interval,interval,int2, internal), -STORAGE interval; - -/*--------------------macaddr-----------------------*/ - -CREATE FUNCTION rum_macaddr_extract_value(macaddr, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_macaddr_compare_prefix(macaddr, macaddr, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_macaddr_extract_query(macaddr, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - -CREATE OPERATOR CLASS rum_macaddr_ops -DEFAULT FOR TYPE macaddr USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - FUNCTION 1 macaddr_cmp(macaddr,macaddr), - FUNCTION 2 rum_macaddr_extract_value(macaddr, internal), - FUNCTION 3 rum_macaddr_extract_query(macaddr, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_macaddr_compare_prefix(macaddr,macaddr,int2, internal), -STORAGE macaddr; - -/*--------------------inet-----------------------*/ - -CREATE FUNCTION rum_inet_extract_value(inet, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_inet_compare_prefix(inet, inet, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_inet_extract_query(inet, internal, int2, internal, internal) -RETURNS internal 
-AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - -CREATE OPERATOR CLASS rum_inet_ops -DEFAULT FOR TYPE inet USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - FUNCTION 1 network_cmp(inet,inet), - FUNCTION 2 rum_inet_extract_value(inet, internal), - FUNCTION 3 rum_inet_extract_query(inet, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_inet_compare_prefix(inet,inet,int2, internal), -STORAGE inet; - -/*--------------------cidr-----------------------*/ - -CREATE FUNCTION rum_cidr_extract_value(cidr, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_cidr_compare_prefix(cidr, cidr, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_cidr_extract_query(cidr, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - -CREATE OPERATOR CLASS rum_cidr_ops -DEFAULT FOR TYPE cidr USING rum -AS - OPERATOR 1 < (inet, inet), - OPERATOR 2 <= (inet, inet), - OPERATOR 3 = (inet, inet), - OPERATOR 4 >= (inet, inet), - OPERATOR 5 > (inet, inet), - FUNCTION 1 network_cmp(inet,inet), - FUNCTION 2 rum_cidr_extract_value(cidr, internal), - FUNCTION 3 rum_cidr_extract_query(cidr, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_cidr_compare_prefix(cidr,cidr,int2, internal), -STORAGE cidr; - -/*--------------------text-----------------------*/ - -CREATE FUNCTION rum_text_extract_value(text, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_text_compare_prefix(text, text, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_text_extract_query(text, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - -CREATE OPERATOR CLASS rum_text_ops -DEFAULT FOR TYPE text USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - FUNCTION 1 bttextcmp(text,text), - FUNCTION 2 rum_text_extract_value(text, internal), - FUNCTION 3 rum_text_extract_query(text, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_text_compare_prefix(text,text,int2, internal), -STORAGE text; - -/*--------------------varchar-----------------------*/ - - -CREATE OPERATOR CLASS rum_varchar_ops -DEFAULT FOR TYPE varchar USING rum -AS - OPERATOR 1 < (text, text), - OPERATOR 2 <= (text, text), - OPERATOR 3 = (text, text), - OPERATOR 4 >= (text, text), - OPERATOR 5 > (text, text), - FUNCTION 1 bttextcmp(text,text), - FUNCTION 2 rum_text_extract_value(text, internal), - FUNCTION 3 rum_text_extract_query(text, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_text_compare_prefix(text,text,int2, internal), -STORAGE varchar; - -/*--------------------"char"-----------------------*/ - -CREATE FUNCTION rum_char_extract_value("char", internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_char_compare_prefix("char", "char", int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C 
STRICT IMMUTABLE; - -CREATE FUNCTION rum_char_extract_query("char", internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - -CREATE OPERATOR CLASS rum_char_ops -DEFAULT FOR TYPE "char" USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - FUNCTION 1 btcharcmp("char","char"), - FUNCTION 2 rum_char_extract_value("char", internal), - FUNCTION 3 rum_char_extract_query("char", internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_char_compare_prefix("char","char",int2, internal), -STORAGE "char"; - -/*--------------------bytea-----------------------*/ - -CREATE FUNCTION rum_bytea_extract_value(bytea, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_bytea_compare_prefix(bytea, bytea, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_bytea_extract_query(bytea, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - -CREATE OPERATOR CLASS rum_bytea_ops -DEFAULT FOR TYPE bytea USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - FUNCTION 1 byteacmp(bytea,bytea), - FUNCTION 2 rum_bytea_extract_value(bytea, internal), - FUNCTION 3 rum_bytea_extract_query(bytea, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_bytea_compare_prefix(bytea,bytea,int2, internal), -STORAGE bytea; - -/*--------------------bit-----------------------*/ - -CREATE FUNCTION rum_bit_extract_value(bit, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_bit_compare_prefix(bit, bit, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_bit_extract_query(bit, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - -CREATE OPERATOR CLASS rum_bit_ops -DEFAULT FOR TYPE bit USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - FUNCTION 1 bitcmp(bit,bit), - FUNCTION 2 rum_bit_extract_value(bit, internal), - FUNCTION 3 rum_bit_extract_query(bit, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_bit_compare_prefix(bit,bit,int2, internal), -STORAGE bit; - -/*--------------------varbit-----------------------*/ - -CREATE FUNCTION rum_varbit_extract_value(varbit, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_varbit_compare_prefix(varbit, varbit, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_varbit_extract_query(varbit, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - -CREATE OPERATOR CLASS rum_varbit_ops -DEFAULT FOR TYPE varbit USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - FUNCTION 1 varbitcmp(varbit,varbit), - FUNCTION 2 rum_varbit_extract_value(varbit, internal), - FUNCTION 3 rum_varbit_extract_query(varbit, internal, int2, internal, internal), - FUNCTION 4 
rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_varbit_compare_prefix(varbit,varbit,int2, internal), -STORAGE varbit; - -/*--------------------numeric-----------------------*/ - -CREATE FUNCTION rum_numeric_extract_value(numeric, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_numeric_compare_prefix(numeric, numeric, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_numeric_extract_query(numeric, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - -CREATE OPERATOR CLASS rum_numeric_ops -DEFAULT FOR TYPE numeric USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - FUNCTION 1 rum_numeric_cmp(numeric,numeric), - FUNCTION 2 rum_numeric_extract_value(numeric, internal), - FUNCTION 3 rum_numeric_extract_query(numeric, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_numeric_compare_prefix(numeric,numeric,int2, internal), -STORAGE numeric; - diff --git a/rum--1.2.sql b/rum--1.2.sql deleted file mode 100644 index 74237fc990..0000000000 --- a/rum--1.2.sql +++ /dev/null @@ -1,1707 +0,0 @@ -CREATE OR REPLACE FUNCTION rumhandler(internal) -RETURNS index_am_handler -AS 'MODULE_PATHNAME' -LANGUAGE C; - -/* - * RUM access method - */ - -CREATE ACCESS METHOD rum TYPE INDEX HANDLER rumhandler; - -/* - * RUM built-in types, operators and functions - */ - --- Type used in distance calculations with normalization argument -CREATE TYPE rum_distance_query AS (query tsquery, method int); - -CREATE FUNCTION tsquery_to_distance_query(tsquery) -RETURNS rum_distance_query -AS 'MODULE_PATHNAME', 'tsquery_to_distance_query' -LANGUAGE C IMMUTABLE STRICT; - -CREATE CAST (tsquery AS rum_distance_query) - WITH FUNCTION tsquery_to_distance_query(tsquery) AS IMPLICIT; - -CREATE FUNCTION rum_ts_distance(tsvector,tsquery) -RETURNS float4 -AS 'MODULE_PATHNAME', 'rum_ts_distance_tt' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_ts_distance(tsvector,tsquery,int) -RETURNS float4 -AS 'MODULE_PATHNAME', 'rum_ts_distance_ttf' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_ts_distance(tsvector,rum_distance_query) -RETURNS float4 -AS 'MODULE_PATHNAME', 'rum_ts_distance_td' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=> ( - LEFTARG = tsvector, - RIGHTARG = tsquery, - PROCEDURE = rum_ts_distance -); - -CREATE OPERATOR <=> ( - LEFTARG = tsvector, - RIGHTARG = rum_distance_query, - PROCEDURE = rum_ts_distance -); - -CREATE FUNCTION rum_timestamp_distance(timestamp, timestamp) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=> ( - PROCEDURE = rum_timestamp_distance, - LEFTARG = timestamp, - RIGHTARG = timestamp, - COMMUTATOR = <=> -); - -CREATE FUNCTION rum_timestamp_left_distance(timestamp, timestamp) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=| ( - PROCEDURE = rum_timestamp_left_distance, - LEFTARG = timestamp, - RIGHTARG = timestamp, - COMMUTATOR = |=> -); - -CREATE FUNCTION rum_timestamp_right_distance(timestamp, timestamp) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR |=> ( - PROCEDURE = rum_timestamp_right_distance, - LEFTARG = timestamp, - RIGHTARG = timestamp, - COMMUTATOR = <=| -); - -/* - * rum_tsvector_ops 
operator class - */ - -CREATE FUNCTION rum_extract_tsvector(tsvector,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_tsvector_config(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal) -RETURNS bool -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_tsquery_consistent(internal, smallint, tsvector, integer, internal, internal, internal, internal) -RETURNS bool -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_tsquery_distance(internal,smallint,tsvector,int,internal,internal,internal,internal,internal) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - --- To prevent calling from SQL -CREATE FUNCTION rum_ts_join_pos(internal, internal) -RETURNS bytea -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR CLASS rum_tsvector_ops -DEFAULT FOR TYPE tsvector USING rum -AS - OPERATOR 1 @@ (tsvector, tsquery), - OPERATOR 2 <=> (tsvector, tsquery) FOR ORDER BY pg_catalog.float_ops, - FUNCTION 1 gin_cmp_tslexeme(text, text), - FUNCTION 2 rum_extract_tsvector(tsvector,internal,internal,internal,internal), - FUNCTION 3 rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_tsquery_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 5 gin_cmp_prefix(text,text,smallint,internal), - FUNCTION 6 rum_tsvector_config(internal), - FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 8 rum_tsquery_distance(internal,smallint,tsvector,int,internal,internal,internal,internal,internal), - FUNCTION 10 rum_ts_join_pos(internal, internal), - STORAGE text; - -/* - * rum_tsvector_hash_ops operator class. - * - * Stores hash of entries as keys in index. 
- */ - -CREATE FUNCTION rum_extract_tsvector_hash(tsvector,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_extract_tsquery_hash(tsquery,internal,smallint,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR CLASS rum_tsvector_hash_ops -FOR TYPE tsvector USING rum -AS - OPERATOR 1 @@ (tsvector, tsquery), - OPERATOR 2 <=> (tsvector, tsquery) FOR ORDER BY pg_catalog.float_ops, - FUNCTION 1 btint4cmp(integer, integer), - FUNCTION 2 rum_extract_tsvector_hash(tsvector,internal,internal,internal,internal), - FUNCTION 3 rum_extract_tsquery_hash(tsquery,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_tsquery_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 6 rum_tsvector_config(internal), - FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 8 rum_tsquery_distance(internal,smallint,tsvector,int,internal,internal,internal,internal,internal), - FUNCTION 10 rum_ts_join_pos(internal, internal), - STORAGE integer; - -/* - * rum_timestamp_ops operator class - */ - --- timestamp operator class - -CREATE FUNCTION rum_timestamp_extract_value(timestamp,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_timestamp_compare_prefix(timestamp,timestamp,smallint,internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_timestamp_config(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_timestamp_extract_query(timestamp,internal,smallint,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_timestamp_consistent(internal,smallint,timestamp,int,internal,internal,internal,internal) -RETURNS bool -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_timestamp_outer_distance(timestamp, timestamp, smallint) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE OPERATOR CLASS rum_timestamp_ops -DEFAULT FOR TYPE timestamp USING rum -AS - OPERATOR 1 <, - OPERATOR 2 <=, - OPERATOR 3 =, - OPERATOR 4 >=, - OPERATOR 5 >, - --support - FUNCTION 1 timestamp_cmp(timestamp,timestamp), - FUNCTION 2 rum_timestamp_extract_value(timestamp,internal,internal,internal,internal), - FUNCTION 3 rum_timestamp_extract_query(timestamp,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_timestamp_consistent(internal,smallint,timestamp,int,internal,internal,internal,internal), - FUNCTION 5 rum_timestamp_compare_prefix(timestamp,timestamp,smallint,internal), - FUNCTION 6 rum_timestamp_config(internal), - -- support to timestamp distance in rum_tsvector_timestamp_ops - FUNCTION 9 rum_timestamp_outer_distance(timestamp, timestamp, smallint), - OPERATOR 20 <=> (timestamp,timestamp) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 21 <=| (timestamp,timestamp) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 22 |=> (timestamp,timestamp) FOR ORDER BY pg_catalog.float_ops, -STORAGE timestamp; - -/* - * rum_tsvector_timestamp_ops operator class. - * - * Stores timestamp with tsvector. 
- */ - -CREATE FUNCTION rum_tsquery_timestamp_consistent(internal, smallint, tsvector, integer, internal, internal, internal, internal) -RETURNS bool -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -/* - * !!!deprecated, use rum_tsvector_addon_ops!!! - */ -CREATE OPERATOR CLASS rum_tsvector_timestamp_ops -FOR TYPE tsvector USING rum -AS - OPERATOR 1 @@ (tsvector, tsquery), - --support function - FUNCTION 1 gin_cmp_tslexeme(text, text), - FUNCTION 2 rum_extract_tsvector(tsvector,internal,internal,internal,internal), - FUNCTION 3 rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_tsquery_timestamp_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 5 gin_cmp_prefix(text,text,smallint,internal), - FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - STORAGE text; - -/* - * rum_tsvector_hash_timestamp_ops operator class - * !!!deprecated, use rum_tsvector_hash_addon_ops!!! - */ - -CREATE OPERATOR CLASS rum_tsvector_hash_timestamp_ops -FOR TYPE tsvector USING rum -AS - OPERATOR 1 @@ (tsvector, tsquery), - --support function - FUNCTION 1 btint4cmp(integer, integer), - FUNCTION 2 rum_extract_tsvector_hash(tsvector,internal,internal,internal,internal), - FUNCTION 3 rum_extract_tsquery_hash(tsquery,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_tsquery_timestamp_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - STORAGE integer; - -/* - * rum_timestamptz_ops operator class - */ - -CREATE FUNCTION rum_timestamptz_distance(timestamptz, timestamptz) -RETURNS float8 -AS 'MODULE_PATHNAME', 'rum_timestamp_distance' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=> ( - PROCEDURE = rum_timestamptz_distance, - LEFTARG = timestamptz, - RIGHTARG = timestamptz, - COMMUTATOR = <=> -); - -CREATE FUNCTION rum_timestamptz_left_distance(timestamptz, timestamptz) -RETURNS float8 -AS 'MODULE_PATHNAME', 'rum_timestamp_left_distance' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=| ( - PROCEDURE = rum_timestamptz_left_distance, - LEFTARG = timestamptz, - RIGHTARG = timestamptz, - COMMUTATOR = |=> -); - -CREATE FUNCTION rum_timestamptz_right_distance(timestamptz, timestamptz) -RETURNS float8 -AS 'MODULE_PATHNAME', 'rum_timestamp_right_distance' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR |=> ( - PROCEDURE = rum_timestamptz_right_distance, - LEFTARG = timestamptz, - RIGHTARG = timestamptz, - COMMUTATOR = <=| -); - -CREATE OPERATOR CLASS rum_timestamptz_ops -DEFAULT FOR TYPE timestamptz USING rum -AS - OPERATOR 1 <, - OPERATOR 2 <=, - OPERATOR 3 =, - OPERATOR 4 >=, - OPERATOR 5 >, - --support - FUNCTION 1 timestamptz_cmp(timestamptz,timestamptz), - FUNCTION 2 rum_timestamp_extract_value(timestamp,internal,internal,internal,internal), - FUNCTION 3 rum_timestamp_extract_query(timestamp,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_timestamp_consistent(internal,smallint,timestamp,int,internal,internal,internal,internal), - FUNCTION 5 rum_timestamp_compare_prefix(timestamp,timestamp,smallint,internal), - FUNCTION 6 rum_timestamp_config(internal), - -- support to timestamptz distance in rum_tsvector_timestamptz_ops - FUNCTION 9 rum_timestamp_outer_distance(timestamp, timestamp, smallint), - OPERATOR 20 <=> (timestamptz,timestamptz) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 
21 <=| (timestamptz,timestamptz) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 22 |=> (timestamptz,timestamptz) FOR ORDER BY pg_catalog.float_ops, -STORAGE timestamptz; - -/* - * rum_tsvector_timestamptz_ops operator class. - * - * Stores tsvector with timestamptz. - */ - -CREATE OPERATOR CLASS rum_tsvector_timestamptz_ops -FOR TYPE tsvector USING rum -AS - OPERATOR 1 @@ (tsvector, tsquery), - --support function - FUNCTION 1 gin_cmp_tslexeme(text, text), - FUNCTION 2 rum_extract_tsvector(tsvector,internal,internal,internal,internal), - FUNCTION 3 rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_tsquery_timestamp_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 5 gin_cmp_prefix(text,text,smallint,internal), - FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - STORAGE text; - -/* - * rum_tsvector_hash_timestamptz_ops operator class - */ - -CREATE OPERATOR CLASS rum_tsvector_hash_timestamptz_ops -FOR TYPE tsvector USING rum -AS - OPERATOR 1 @@ (tsvector, tsquery), - --support function - FUNCTION 1 btint4cmp(integer, integer), - FUNCTION 2 rum_extract_tsvector_hash(tsvector,internal,internal,internal,internal), - FUNCTION 3 rum_extract_tsquery_hash(tsquery,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_tsquery_timestamp_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - STORAGE integer; - -/* - * rum_tsquery_ops operator class. - * - * Used for inversed text search. - */ - -CREATE FUNCTION ruminv_extract_tsquery(tsquery,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION ruminv_extract_tsvector(tsvector,internal,smallint,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION ruminv_tsvector_consistent(internal, smallint, tsvector, integer, internal, internal, internal, internal) -RETURNS bool -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION ruminv_tsquery_config(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR CLASS rum_tsquery_ops -DEFAULT FOR TYPE tsquery USING rum -AS - OPERATOR 1 @@ (tsquery, tsvector), - FUNCTION 1 gin_cmp_tslexeme(text, text), - FUNCTION 2 ruminv_extract_tsquery(tsquery,internal,internal,internal,internal), - FUNCTION 3 ruminv_extract_tsvector(tsvector,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 ruminv_tsvector_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 6 ruminv_tsquery_config(internal), - STORAGE text; -/* - * RUM version 1.1 - */ - -CREATE FUNCTION rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal) -RETURNS bool -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -ALTER FUNCTION - rum_tsquery_timestamp_consistent (internal,smallint,tsvector,int,internal,internal,internal,internal) - RENAME TO rum_tsquery_addon_consistent; - -CREATE FUNCTION rum_numeric_cmp(numeric, numeric) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE OPERATOR CLASS rum_tsvector_addon_ops -FOR TYPE tsvector USING rum -AS - OPERATOR 1 @@ (tsvector, tsquery), - --support function - FUNCTION 1 gin_cmp_tslexeme(text, text), - FUNCTION 2 
rum_extract_tsvector(tsvector,internal,internal,internal,internal), - FUNCTION 3 rum_extract_tsquery(tsquery,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_tsquery_addon_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 5 gin_cmp_prefix(text,text,smallint,internal), - FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - STORAGE text; - -CREATE OPERATOR CLASS rum_tsvector_hash_addon_ops -FOR TYPE tsvector USING rum -AS - OPERATOR 1 @@ (tsvector, tsquery), - --support function - FUNCTION 1 btint4cmp(integer, integer), - FUNCTION 2 rum_extract_tsvector_hash(tsvector,internal,internal,internal,internal), - FUNCTION 3 rum_extract_tsquery_hash(tsquery,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_tsquery_addon_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - FUNCTION 7 rum_tsquery_pre_consistent(internal,smallint,tsvector,int,internal,internal,internal,internal), - STORAGE integer; - -/*--------------------int2-----------------------*/ - -CREATE FUNCTION rum_int2_extract_value(int2, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_int2_compare_prefix(int2, int2, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_int2_extract_query(int2, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - - -CREATE FUNCTION rum_int2_distance(int2, int2) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=> ( - PROCEDURE = rum_int2_distance, - LEFTARG = int2, - RIGHTARG = int2, - COMMUTATOR = <=> -); - -CREATE FUNCTION rum_int2_left_distance(int2, int2) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=| ( - PROCEDURE = rum_int2_left_distance, - LEFTARG = int2, - RIGHTARG = int2, - COMMUTATOR = |=> -); - -CREATE FUNCTION rum_int2_right_distance(int2, int2) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR |=> ( - PROCEDURE = rum_int2_right_distance, - LEFTARG = int2, - RIGHTARG = int2, - COMMUTATOR = <=| -); - -CREATE FUNCTION rum_int2_outer_distance(int2, int2, smallint) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_int2_config(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - - - -CREATE OPERATOR CLASS rum_int2_ops -DEFAULT FOR TYPE int2 USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - OPERATOR 20 <=> (int2,int2) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 21 <=| (int2,int2) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 22 |=> (int2,int2) FOR ORDER BY pg_catalog.float_ops, - FUNCTION 1 btint2cmp(int2,int2), - FUNCTION 2 rum_int2_extract_value(int2, internal), - FUNCTION 3 rum_int2_extract_query(int2, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_int2_compare_prefix(int2,int2,int2, internal), - -- support to int2 distance in rum_tsvector_addon_ops - FUNCTION 6 rum_int2_config(internal), - FUNCTION 9 rum_int2_outer_distance(int2, int2, smallint), -STORAGE int2; - -/*--------------------int4-----------------------*/ - -CREATE FUNCTION rum_int4_extract_value(int4, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C 
STRICT IMMUTABLE; - -CREATE FUNCTION rum_int4_compare_prefix(int4, int4, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_int4_extract_query(int4, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - - -CREATE FUNCTION rum_int4_distance(int4, int4) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=> ( - PROCEDURE = rum_int4_distance, - LEFTARG = int4, - RIGHTARG = int4, - COMMUTATOR = <=> -); - -CREATE FUNCTION rum_int4_left_distance(int4, int4) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=| ( - PROCEDURE = rum_int4_left_distance, - LEFTARG = int4, - RIGHTARG = int4, - COMMUTATOR = |=> -); - -CREATE FUNCTION rum_int4_right_distance(int4, int4) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR |=> ( - PROCEDURE = rum_int4_right_distance, - LEFTARG = int4, - RIGHTARG = int4, - COMMUTATOR = <=| -); - -CREATE FUNCTION rum_int4_outer_distance(int4, int4, smallint) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_int4_config(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - - - -CREATE OPERATOR CLASS rum_int4_ops -DEFAULT FOR TYPE int4 USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - OPERATOR 20 <=> (int4,int4) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 21 <=| (int4,int4) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 22 |=> (int4,int4) FOR ORDER BY pg_catalog.float_ops, - FUNCTION 1 btint4cmp(int4,int4), - FUNCTION 2 rum_int4_extract_value(int4, internal), - FUNCTION 3 rum_int4_extract_query(int4, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_int4_compare_prefix(int4,int4,int2, internal), - -- support to int4 distance in rum_tsvector_addon_ops - FUNCTION 6 rum_int4_config(internal), - FUNCTION 9 rum_int4_outer_distance(int4, int4, smallint), -STORAGE int4; - -/*--------------------int8-----------------------*/ - -CREATE FUNCTION rum_int8_extract_value(int8, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_int8_compare_prefix(int8, int8, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_int8_extract_query(int8, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - - -CREATE FUNCTION rum_int8_distance(int8, int8) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=> ( - PROCEDURE = rum_int8_distance, - LEFTARG = int8, - RIGHTARG = int8, - COMMUTATOR = <=> -); - -CREATE FUNCTION rum_int8_left_distance(int8, int8) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=| ( - PROCEDURE = rum_int8_left_distance, - LEFTARG = int8, - RIGHTARG = int8, - COMMUTATOR = |=> -); - -CREATE FUNCTION rum_int8_right_distance(int8, int8) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR |=> ( - PROCEDURE = rum_int8_right_distance, - LEFTARG = int8, - RIGHTARG = int8, - COMMUTATOR = <=| -); - -CREATE FUNCTION rum_int8_outer_distance(int8, int8, smallint) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_int8_config(internal) -RETURNS void -AS 
'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - - - -CREATE OPERATOR CLASS rum_int8_ops -DEFAULT FOR TYPE int8 USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - OPERATOR 20 <=> (int8,int8) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 21 <=| (int8,int8) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 22 |=> (int8,int8) FOR ORDER BY pg_catalog.float_ops, - FUNCTION 1 btint8cmp(int8,int8), - FUNCTION 2 rum_int8_extract_value(int8, internal), - FUNCTION 3 rum_int8_extract_query(int8, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_int8_compare_prefix(int8,int8,int2, internal), - -- support to int8 distance in rum_tsvector_addon_ops - FUNCTION 6 rum_int8_config(internal), - FUNCTION 9 rum_int8_outer_distance(int8, int8, smallint), -STORAGE int8; - -/*--------------------float4-----------------------*/ - -CREATE FUNCTION rum_float4_extract_value(float4, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_float4_compare_prefix(float4, float4, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_float4_extract_query(float4, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - - -CREATE FUNCTION rum_float4_distance(float4, float4) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=> ( - PROCEDURE = rum_float4_distance, - LEFTARG = float4, - RIGHTARG = float4, - COMMUTATOR = <=> -); - -CREATE FUNCTION rum_float4_left_distance(float4, float4) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=| ( - PROCEDURE = rum_float4_left_distance, - LEFTARG = float4, - RIGHTARG = float4, - COMMUTATOR = |=> -); - -CREATE FUNCTION rum_float4_right_distance(float4, float4) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR |=> ( - PROCEDURE = rum_float4_right_distance, - LEFTARG = float4, - RIGHTARG = float4, - COMMUTATOR = <=| -); - -CREATE FUNCTION rum_float4_outer_distance(float4, float4, smallint) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_float4_config(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - - - -CREATE OPERATOR CLASS rum_float4_ops -DEFAULT FOR TYPE float4 USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - OPERATOR 20 <=> (float4,float4) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 21 <=| (float4,float4) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 22 |=> (float4,float4) FOR ORDER BY pg_catalog.float_ops, - FUNCTION 1 btfloat4cmp(float4,float4), - FUNCTION 2 rum_float4_extract_value(float4, internal), - FUNCTION 3 rum_float4_extract_query(float4, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_float4_compare_prefix(float4,float4,int2, internal), - -- support to float4 distance in rum_tsvector_addon_ops - FUNCTION 6 rum_float4_config(internal), - FUNCTION 9 rum_float4_outer_distance(float4, float4, smallint), -STORAGE float4; - -/*--------------------float8-----------------------*/ - -CREATE FUNCTION rum_float8_extract_value(float8, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION 
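Each of the scalar opclasses in this block follows the same template: a btree comparison function, extract/consistent support functions, and the ordering operators <=> (two-sided distance), <=| and |=> (one-sided distances) backed by the corresponding *_distance functions. A minimal usage sketch, assuming a hypothetical table and mirroring the int8 regression queries added later in this patch:

CREATE TABLE items (id int8, t tsvector);
CREATE INDEX items_idx ON items USING rum (t rum_tsvector_addon_ops, id)
    WITH (attach = 'id', to = 't', order_by_attach = 't');

SELECT id, id <=> 400 FROM items WHERE t @@ 'wr&qh' ORDER BY id <=> 400 LIMIT 5;
SELECT id, id |=> 400 FROM items WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5;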
rum_float8_compare_prefix(float8, float8, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_float8_extract_query(float8, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - - -CREATE FUNCTION rum_float8_distance(float8, float8) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=> ( - PROCEDURE = rum_float8_distance, - LEFTARG = float8, - RIGHTARG = float8, - COMMUTATOR = <=> -); - -CREATE FUNCTION rum_float8_left_distance(float8, float8) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=| ( - PROCEDURE = rum_float8_left_distance, - LEFTARG = float8, - RIGHTARG = float8, - COMMUTATOR = |=> -); - -CREATE FUNCTION rum_float8_right_distance(float8, float8) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR |=> ( - PROCEDURE = rum_float8_right_distance, - LEFTARG = float8, - RIGHTARG = float8, - COMMUTATOR = <=| -); - -CREATE FUNCTION rum_float8_outer_distance(float8, float8, smallint) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_float8_config(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - - - -CREATE OPERATOR CLASS rum_float8_ops -DEFAULT FOR TYPE float8 USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - OPERATOR 20 <=> (float8,float8) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 21 <=| (float8,float8) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 22 |=> (float8,float8) FOR ORDER BY pg_catalog.float_ops, - FUNCTION 1 btfloat8cmp(float8,float8), - FUNCTION 2 rum_float8_extract_value(float8, internal), - FUNCTION 3 rum_float8_extract_query(float8, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_float8_compare_prefix(float8,float8,int2, internal), - -- support to float8 distance in rum_tsvector_addon_ops - FUNCTION 6 rum_float8_config(internal), - FUNCTION 9 rum_float8_outer_distance(float8, float8, smallint), -STORAGE float8; - -/*--------------------money-----------------------*/ - -CREATE FUNCTION rum_money_extract_value(money, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_money_compare_prefix(money, money, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_money_extract_query(money, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - - -CREATE FUNCTION rum_money_distance(money, money) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=> ( - PROCEDURE = rum_money_distance, - LEFTARG = money, - RIGHTARG = money, - COMMUTATOR = <=> -); - -CREATE FUNCTION rum_money_left_distance(money, money) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=| ( - PROCEDURE = rum_money_left_distance, - LEFTARG = money, - RIGHTARG = money, - COMMUTATOR = |=> -); - -CREATE FUNCTION rum_money_right_distance(money, money) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR |=> ( - PROCEDURE = rum_money_right_distance, - LEFTARG = money, - RIGHTARG = money, - COMMUTATOR = <=| -); - -CREATE FUNCTION rum_money_outer_distance(money, money, smallint) -RETURNS float8 -AS 'MODULE_PATHNAME' 
-LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_money_config(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - - - -CREATE OPERATOR CLASS rum_money_ops -DEFAULT FOR TYPE money USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - OPERATOR 20 <=> (money,money) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 21 <=| (money,money) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 22 |=> (money,money) FOR ORDER BY pg_catalog.float_ops, - FUNCTION 1 cash_cmp(money,money), - FUNCTION 2 rum_money_extract_value(money, internal), - FUNCTION 3 rum_money_extract_query(money, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_money_compare_prefix(money,money,int2, internal), - -- support to money distance in rum_tsvector_addon_ops - FUNCTION 6 rum_money_config(internal), - FUNCTION 9 rum_money_outer_distance(money, money, smallint), -STORAGE money; - -/*--------------------oid-----------------------*/ - -CREATE FUNCTION rum_oid_extract_value(oid, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_oid_compare_prefix(oid, oid, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_oid_extract_query(oid, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - - -CREATE FUNCTION rum_oid_distance(oid, oid) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=> ( - PROCEDURE = rum_oid_distance, - LEFTARG = oid, - RIGHTARG = oid, - COMMUTATOR = <=> -); - -CREATE FUNCTION rum_oid_left_distance(oid, oid) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR <=| ( - PROCEDURE = rum_oid_left_distance, - LEFTARG = oid, - RIGHTARG = oid, - COMMUTATOR = |=> -); - -CREATE FUNCTION rum_oid_right_distance(oid, oid) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR |=> ( - PROCEDURE = rum_oid_right_distance, - LEFTARG = oid, - RIGHTARG = oid, - COMMUTATOR = <=| -); - -CREATE FUNCTION rum_oid_outer_distance(oid, oid, smallint) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_oid_config(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - - - -CREATE OPERATOR CLASS rum_oid_ops -DEFAULT FOR TYPE oid USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - OPERATOR 20 <=> (oid,oid) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 21 <=| (oid,oid) FOR ORDER BY pg_catalog.float_ops, - OPERATOR 22 |=> (oid,oid) FOR ORDER BY pg_catalog.float_ops, - FUNCTION 1 btoidcmp(oid,oid), - FUNCTION 2 rum_oid_extract_value(oid, internal), - FUNCTION 3 rum_oid_extract_query(oid, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_oid_compare_prefix(oid,oid,int2, internal), - -- support to oid distance in rum_tsvector_addon_ops - FUNCTION 6 rum_oid_config(internal), - FUNCTION 9 rum_oid_outer_distance(oid, oid, smallint), -STORAGE oid; - -/*--------------------time-----------------------*/ - -CREATE FUNCTION rum_time_extract_value(time, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_time_compare_prefix(time, time, int2, internal) -RETURNS 
int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_time_extract_query(time, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - -CREATE OPERATOR CLASS rum_time_ops -DEFAULT FOR TYPE time USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - FUNCTION 1 time_cmp(time,time), - FUNCTION 2 rum_time_extract_value(time, internal), - FUNCTION 3 rum_time_extract_query(time, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_time_compare_prefix(time,time,int2, internal), -STORAGE time; - -/*--------------------timetz-----------------------*/ - -CREATE FUNCTION rum_timetz_extract_value(timetz, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_timetz_compare_prefix(timetz, timetz, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_timetz_extract_query(timetz, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - -CREATE OPERATOR CLASS rum_timetz_ops -DEFAULT FOR TYPE timetz USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - FUNCTION 1 timetz_cmp(timetz,timetz), - FUNCTION 2 rum_timetz_extract_value(timetz, internal), - FUNCTION 3 rum_timetz_extract_query(timetz, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_timetz_compare_prefix(timetz,timetz,int2, internal), -STORAGE timetz; - -/*--------------------date-----------------------*/ - -CREATE FUNCTION rum_date_extract_value(date, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_date_compare_prefix(date, date, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_date_extract_query(date, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - -CREATE OPERATOR CLASS rum_date_ops -DEFAULT FOR TYPE date USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - FUNCTION 1 date_cmp(date,date), - FUNCTION 2 rum_date_extract_value(date, internal), - FUNCTION 3 rum_date_extract_query(date, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_date_compare_prefix(date,date,int2, internal), -STORAGE date; - -/*--------------------interval-----------------------*/ - -CREATE FUNCTION rum_interval_extract_value(interval, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_interval_compare_prefix(interval, interval, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_interval_extract_query(interval, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - -CREATE OPERATOR CLASS rum_interval_ops -DEFAULT FOR TYPE interval USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - FUNCTION 1 interval_cmp(interval,interval), - FUNCTION 2 rum_interval_extract_value(interval, internal), - FUNCTION 3 
rum_interval_extract_query(interval, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_interval_compare_prefix(interval,interval,int2, internal), -STORAGE interval; - -/*--------------------macaddr-----------------------*/ - -CREATE FUNCTION rum_macaddr_extract_value(macaddr, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_macaddr_compare_prefix(macaddr, macaddr, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_macaddr_extract_query(macaddr, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - -CREATE OPERATOR CLASS rum_macaddr_ops -DEFAULT FOR TYPE macaddr USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - FUNCTION 1 macaddr_cmp(macaddr,macaddr), - FUNCTION 2 rum_macaddr_extract_value(macaddr, internal), - FUNCTION 3 rum_macaddr_extract_query(macaddr, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_macaddr_compare_prefix(macaddr,macaddr,int2, internal), -STORAGE macaddr; - -/*--------------------inet-----------------------*/ - -CREATE FUNCTION rum_inet_extract_value(inet, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_inet_compare_prefix(inet, inet, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_inet_extract_query(inet, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - -CREATE OPERATOR CLASS rum_inet_ops -DEFAULT FOR TYPE inet USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - FUNCTION 1 network_cmp(inet,inet), - FUNCTION 2 rum_inet_extract_value(inet, internal), - FUNCTION 3 rum_inet_extract_query(inet, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_inet_compare_prefix(inet,inet,int2, internal), -STORAGE inet; - -/*--------------------cidr-----------------------*/ - -CREATE FUNCTION rum_cidr_extract_value(cidr, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_cidr_compare_prefix(cidr, cidr, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_cidr_extract_query(cidr, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - -CREATE OPERATOR CLASS rum_cidr_ops -DEFAULT FOR TYPE cidr USING rum -AS - OPERATOR 1 < (inet, inet), - OPERATOR 2 <= (inet, inet), - OPERATOR 3 = (inet, inet), - OPERATOR 4 >= (inet, inet), - OPERATOR 5 > (inet, inet), - FUNCTION 1 network_cmp(inet,inet), - FUNCTION 2 rum_cidr_extract_value(cidr, internal), - FUNCTION 3 rum_cidr_extract_query(cidr, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_cidr_compare_prefix(cidr,cidr,int2, internal), -STORAGE cidr; - -/*--------------------text-----------------------*/ - -CREATE FUNCTION rum_text_extract_value(text, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE 
FUNCTION rum_text_compare_prefix(text, text, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_text_extract_query(text, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - -CREATE OPERATOR CLASS rum_text_ops -DEFAULT FOR TYPE text USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - FUNCTION 1 bttextcmp(text,text), - FUNCTION 2 rum_text_extract_value(text, internal), - FUNCTION 3 rum_text_extract_query(text, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_text_compare_prefix(text,text,int2, internal), -STORAGE text; - -/*--------------------varchar-----------------------*/ - - -CREATE OPERATOR CLASS rum_varchar_ops -DEFAULT FOR TYPE varchar USING rum -AS - OPERATOR 1 < (text, text), - OPERATOR 2 <= (text, text), - OPERATOR 3 = (text, text), - OPERATOR 4 >= (text, text), - OPERATOR 5 > (text, text), - FUNCTION 1 bttextcmp(text,text), - FUNCTION 2 rum_text_extract_value(text, internal), - FUNCTION 3 rum_text_extract_query(text, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_text_compare_prefix(text,text,int2, internal), -STORAGE varchar; - -/*--------------------"char"-----------------------*/ - -CREATE FUNCTION rum_char_extract_value("char", internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_char_compare_prefix("char", "char", int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_char_extract_query("char", internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - -CREATE OPERATOR CLASS rum_char_ops -DEFAULT FOR TYPE "char" USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - FUNCTION 1 btcharcmp("char","char"), - FUNCTION 2 rum_char_extract_value("char", internal), - FUNCTION 3 rum_char_extract_query("char", internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_char_compare_prefix("char","char",int2, internal), -STORAGE "char"; - -/*--------------------bytea-----------------------*/ - -CREATE FUNCTION rum_bytea_extract_value(bytea, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_bytea_compare_prefix(bytea, bytea, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_bytea_extract_query(bytea, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - -CREATE OPERATOR CLASS rum_bytea_ops -DEFAULT FOR TYPE bytea USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - FUNCTION 1 byteacmp(bytea,bytea), - FUNCTION 2 rum_bytea_extract_value(bytea, internal), - FUNCTION 3 rum_bytea_extract_query(bytea, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_bytea_compare_prefix(bytea,bytea,int2, internal), -STORAGE bytea; - -/*--------------------bit-----------------------*/ - -CREATE FUNCTION rum_bit_extract_value(bit, 
internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_bit_compare_prefix(bit, bit, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_bit_extract_query(bit, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - -CREATE OPERATOR CLASS rum_bit_ops -DEFAULT FOR TYPE bit USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - FUNCTION 1 bitcmp(bit,bit), - FUNCTION 2 rum_bit_extract_value(bit, internal), - FUNCTION 3 rum_bit_extract_query(bit, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_bit_compare_prefix(bit,bit,int2, internal), -STORAGE bit; - -/*--------------------varbit-----------------------*/ - -CREATE FUNCTION rum_varbit_extract_value(varbit, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_varbit_compare_prefix(varbit, varbit, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_varbit_extract_query(varbit, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - -CREATE OPERATOR CLASS rum_varbit_ops -DEFAULT FOR TYPE varbit USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - FUNCTION 1 varbitcmp(varbit,varbit), - FUNCTION 2 rum_varbit_extract_value(varbit, internal), - FUNCTION 3 rum_varbit_extract_query(varbit, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_varbit_compare_prefix(varbit,varbit,int2, internal), -STORAGE varbit; - -/*--------------------numeric-----------------------*/ - -CREATE FUNCTION rum_numeric_extract_value(numeric, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_numeric_compare_prefix(numeric, numeric, int2, internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - -CREATE FUNCTION rum_numeric_extract_query(numeric, internal, int2, internal, internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT IMMUTABLE; - - -CREATE OPERATOR CLASS rum_numeric_ops -DEFAULT FOR TYPE numeric USING rum -AS - OPERATOR 1 < , - OPERATOR 2 <= , - OPERATOR 3 = , - OPERATOR 4 >= , - OPERATOR 5 > , - FUNCTION 1 rum_numeric_cmp(numeric,numeric), - FUNCTION 2 rum_numeric_extract_value(numeric, internal), - FUNCTION 3 rum_numeric_extract_query(numeric, internal, int2, internal, internal), - FUNCTION 4 rum_btree_consistent(internal,smallint,internal,int,internal,internal,internal,internal), - FUNCTION 5 rum_numeric_compare_prefix(numeric,numeric,int2, internal), -STORAGE numeric; - -/* - * RUM version 1.2 - */ - -/*--------------------anyarray-----------------------*/ - -CREATE FUNCTION rum_anyarray_config(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - - -CREATE OR REPLACE FUNCTION rum_anyarray_similar(anyarray,anyarray) -RETURNS bool -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT STABLE; - -CREATE OPERATOR % ( - PROCEDURE = rum_anyarray_similar, - LEFTARG = anyarray, - RIGHTARG = anyarray, - COMMUTATOR = '%', - RESTRICT = contsel, - JOIN = contjoinsel -); - - -CREATE OR REPLACE FUNCTION rum_anyarray_distance(anyarray,anyarray) -RETURNS float8 -AS 
'MODULE_PATHNAME' -LANGUAGE C STRICT STABLE; - -CREATE OPERATOR <=> ( - PROCEDURE = rum_anyarray_distance, - LEFTARG = anyarray, - RIGHTARG = anyarray, - COMMUTATOR = '<=>' -); - - -CREATE FUNCTION rum_extract_anyarray(anyarray,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_extract_anyarray_query(anyarray,internal,smallint,internal,internal,internal,internal) -RETURNS internal -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_anyarray_consistent(internal, smallint, anyarray, integer, internal, internal, internal, internal) -RETURNS bool -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - -CREATE FUNCTION rum_anyarray_ordering(internal,smallint,anyarray,int,internal,internal,internal,internal,internal) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - - -CREATE OPERATOR CLASS rum_anyarray_ops -DEFAULT FOR TYPE anyarray USING rum -AS - OPERATOR 1 && (anyarray, anyarray), - OPERATOR 2 @> (anyarray, anyarray), - OPERATOR 3 <@ (anyarray, anyarray), - OPERATOR 4 = (anyarray, anyarray), - OPERATOR 5 % (anyarray, anyarray), - OPERATOR 20 <=> (anyarray, anyarray) FOR ORDER BY pg_catalog.float_ops, - --dispatch function 1 for concrete type - FUNCTION 2 rum_extract_anyarray(anyarray,internal,internal,internal,internal), - FUNCTION 3 rum_extract_anyarray_query(anyarray,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 rum_anyarray_consistent(internal,smallint,anyarray,integer,internal,internal,internal,internal), - FUNCTION 6 rum_anyarray_config(internal), - FUNCTION 8 rum_anyarray_ordering(internal,smallint,anyarray,int,internal,internal,internal,internal,internal), - STORAGE anyelement; - -CREATE OPERATOR CLASS rum_anyarray_addon_ops -FOR TYPE anyarray USING rum -AS - OPERATOR 1 && (anyarray, anyarray), - OPERATOR 2 @> (anyarray, anyarray), - OPERATOR 3 <@ (anyarray, anyarray), - OPERATOR 4 = (anyarray, anyarray), - --dispatch function 1 for concrete type - FUNCTION 2 ginarrayextract(anyarray,internal,internal), - FUNCTION 3 ginqueryarrayextract(anyarray,internal,smallint,internal,internal,internal,internal), - FUNCTION 4 ginarrayconsistent(internal,smallint,anyarray,integer,internal,internal,internal,internal), - STORAGE anyelement; - -/*--------------------int2-----------------------*/ - -CREATE FUNCTION rum_int2_key_distance(int2, int2, smallint) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - - -ALTER OPERATOR FAMILY rum_int2_ops USING rum ADD - FUNCTION 8 (int2,int2) rum_int2_key_distance(int2, int2, smallint); - -/*--------------------int4-----------------------*/ - -CREATE FUNCTION rum_int4_key_distance(int4, int4, smallint) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - - -ALTER OPERATOR FAMILY rum_int4_ops USING rum ADD - FUNCTION 8 (int4,int4) rum_int4_key_distance(int4, int4, smallint); - -/*--------------------int8-----------------------*/ - -CREATE FUNCTION rum_int8_key_distance(int8, int8, smallint) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - - -ALTER OPERATOR FAMILY rum_int8_ops USING rum ADD - FUNCTION 8 (int8,int8) rum_int8_key_distance(int8, int8, smallint); - -/*--------------------float4-----------------------*/ - -CREATE FUNCTION rum_float4_key_distance(float4, float4, smallint) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - - -ALTER OPERATOR FAMILY rum_float4_ops USING rum ADD - FUNCTION 8 (float4,float4) rum_float4_key_distance(float4, 
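rum_anyarray_ops above indexes the array overlap and containment operators and adds the % similarity operator plus the <=> distance used for ordering, while rum_anyarray_addon_ops reuses the GIN array support functions for use with attached columns. A short sketch of the intended usage (table and literals follow the array tests in this patch; the finite distance values depend on the similarity measure rum is configured with):

CREATE TABLE test_array (i int2[]);
CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_ops);

SELECT * FROM test_array WHERE i && '{1}';
SELECT * FROM test_array WHERE i @> '{23,20}' ORDER BY i <=> '{51}';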
float4, smallint); - -/*--------------------float8-----------------------*/ - -CREATE FUNCTION rum_float8_key_distance(float8, float8, smallint) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - - -ALTER OPERATOR FAMILY rum_float8_ops USING rum ADD - FUNCTION 8 (float8,float8) rum_float8_key_distance(float8, float8, smallint); - -/*--------------------money-----------------------*/ - -CREATE FUNCTION rum_money_key_distance(money, money, smallint) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - - -ALTER OPERATOR FAMILY rum_money_ops USING rum ADD - FUNCTION 8 (money,money) rum_money_key_distance(money, money, smallint); - -/*--------------------oid-----------------------*/ - -CREATE FUNCTION rum_oid_key_distance(oid, oid, smallint) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - - -ALTER OPERATOR FAMILY rum_oid_ops USING rum ADD - FUNCTION 8 (oid,oid) rum_oid_key_distance(oid, oid, smallint); - -/*--------------------timestamp-----------------------*/ - -CREATE FUNCTION rum_timestamp_key_distance(timestamp, timestamp, smallint) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - - -ALTER OPERATOR FAMILY rum_timestamp_ops USING rum ADD - FUNCTION 8 (timestamp,timestamp) rum_timestamp_key_distance(timestamp, timestamp, smallint); - -/*--------------------timestamptz-----------------------*/ - -CREATE FUNCTION rum_timestamptz_key_distance(timestamptz, timestamptz, smallint) -RETURNS float8 -AS 'MODULE_PATHNAME' -LANGUAGE C IMMUTABLE STRICT; - - -ALTER OPERATOR FAMILY rum_timestamptz_ops USING rum ADD - FUNCTION 8 (timestamptz,timestamptz) rum_timestamptz_key_distance(timestamptz, timestamptz, smallint); - diff --git a/rum--1.3.sql b/rum_init.sql similarity index 99% rename from rum--1.3.sql rename to rum_init.sql index 40d9418c68..621c4d2b9f 100644 --- a/rum--1.3.sql +++ b/rum_init.sql @@ -1,4 +1,4 @@ -CREATE OR REPLACE FUNCTION rumhandler(internal) +CREATE FUNCTION rumhandler(internal) RETURNS index_am_handler AS 'MODULE_PATHNAME' LANGUAGE C; @@ -1527,7 +1527,7 @@ AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT; -CREATE OR REPLACE FUNCTION rum_anyarray_similar(anyarray,anyarray) +CREATE FUNCTION rum_anyarray_similar(anyarray,anyarray) RETURNS bool AS 'MODULE_PATHNAME' LANGUAGE C STRICT STABLE; @@ -1542,7 +1542,7 @@ CREATE OPERATOR % ( ); -CREATE OR REPLACE FUNCTION rum_anyarray_distance(anyarray,anyarray) +CREATE FUNCTION rum_anyarray_distance(anyarray,anyarray) RETURNS float8 AS 'MODULE_PATHNAME' LANGUAGE C STRICT STABLE; diff --git a/specs/predicate-rum-2.spec b/specs/predicate-rum-2.spec index 0d0278ed77..c88383caee 100644 --- a/specs/predicate-rum-2.spec +++ b/specs/predicate-rum-2.spec @@ -6,24 +6,29 @@ setup { - CREATE EXTENSION rum; - CREATE TABLE rum_tbl (id serial, tsv tsvector); CREATE TABLE text_table (id1 serial, t text[]); - SELECT SETSEED(0.5); - INSERT INTO text_table(t) SELECT array[chr(i) || chr(j)] FROM generate_series(65,90) i, generate_series(65,90) j ; - INSERT INTO rum_tbl(tsv) SELECT to_tsvector('simple', t[1] ) FROM text_table; - + -- We need to use pseudorandom to generate values for test table + -- In this case we use linear congruential generator because random() + -- function may generate different outputs with different systems DO $$ + DECLARE + c integer := 17; + a integer := 261; + m integer := 6760; + Xi integer := 228; BEGIN - FOR j in 1..10 LOOP - UPDATE rum_tbl SET tsv = tsv || q.t1 FROM (SELECT id1,to_tsvector('simple', t[1] ) - as t1 FROM text_table) as q WHERE id = 
(random()*q.id1)::integer; + FOR i in 1..338 LOOP + INSERT INTO rum_tbl(tsv) VALUES (''); + FOR j in 1..10 LOOP + UPDATE rum_tbl SET tsv = tsv || (SELECT to_tsvector('simple', t[1]) FROM text_table WHERE id1 = Xi % 676 + 1) WHERE id = i; + Xi = (a * Xi + c) % m; + END LOOP; END LOOP; END; $$; @@ -35,7 +40,6 @@ teardown { DROP TABLE text_table; DROP TABLE rum_tbl; - DROP EXTENSION rum; } session "s1" diff --git a/specs/predicate-rum.spec b/specs/predicate-rum.spec index 2d87194d40..4d324b9ef2 100644 --- a/specs/predicate-rum.spec +++ b/specs/predicate-rum.spec @@ -6,24 +6,29 @@ setup { - CREATE EXTENSION rum; - CREATE TABLE rum_tbl (id serial, tsv tsvector); CREATE TABLE text_table (id1 serial, t text[]); - SELECT SETSEED(0.5); - INSERT INTO text_table(t) SELECT array[chr(i) || chr(j)] FROM generate_series(65,90) i, generate_series(65,90) j ; - INSERT INTO rum_tbl(tsv) SELECT to_tsvector('simple', t[1] ) FROM text_table; - + -- We need to use pseudorandom to generate values for test table + -- In this case we use linear congruential generator because random() + -- function may generate different outputs with different systems DO $$ + DECLARE + c integer := 17; + a integer := 261; + m integer := 6760; + Xi integer := 228; BEGIN - FOR j in 1..10 LOOP - UPDATE rum_tbl SET tsv = tsv || q.t1 FROM (SELECT id1,to_tsvector('simple', t[1] ) - as t1 FROM text_table) as q WHERE id = (random()*q.id1)::integer; + FOR i in 1..338 LOOP + INSERT INTO rum_tbl(tsv) VALUES (''); + FOR j in 1..10 LOOP + UPDATE rum_tbl SET tsv = tsv || (SELECT to_tsvector('simple', t[1]) FROM text_table WHERE id1 = Xi % 676 + 1) WHERE id = i; + Xi = (a * Xi + c) % m; + END LOOP; END LOOP; END; $$; @@ -35,7 +40,6 @@ teardown { DROP TABLE text_table; DROP TABLE rum_tbl; - DROP EXTENSION rum; } session "s1" diff --git a/sql/altorder.sql b/sql/altorder.sql index 85c6cdf630..850e252325 100644 --- a/sql/altorder.sql +++ b/sql/altorder.sql @@ -1,13 +1,28 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differenly + * ------------------------------------ + * + * altorder.out - test output for 64-bit systems and + * altorder_1.out - test output for 32-bit systems. 
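The generator in both spec setups is a plain linear congruential generator: starting from Xi = 228 it iterates Xi := (261 * Xi + 17) % 6760, and each value selects a source row through id1 = Xi % 676 + 1, so every run concatenates the same tsvectors in the same order regardless of platform. The first step, spelled out:

SELECT (261 * 228 + 17) % 6760 AS next_xi,   -- 5445
       228 % 676 + 1           AS first_id1; -- row 229 is updated first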
+ * + */ + + CREATE TABLE atsts (id int, t tsvector, d timestamp); +\copy atsts from 'data/tsts.data' +-- PGPRO-2537: We need more data to test rumsort.c with logtape.c +\copy atsts from 'data/tsts.data' +\copy atsts from 'data/tsts.data' \copy atsts from 'data/tsts.data' CREATE INDEX atsts_idx ON atsts USING rum (t rum_tsvector_addon_ops, d) WITH (attach = 'd', to = 't', order_by_attach='t'); -INSERT INTO atsts VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); -INSERT INTO atsts VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); +INSERT INTO atsts VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); +INSERT INTO atsts VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); SELECT count(*) FROM atsts WHERE t @@ 'wr|qh'; @@ -30,9 +45,8 @@ SELECT count(*) FROM atsts WHERE d > '2016-05-16 14:21:25'; SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; -RESET enable_indexscan; -RESET enable_indexonlyscan; -RESET enable_bitmapscan; +-- Test bitmap index scan +SET enable_bitmapscan=on; SET enable_seqscan = off; EXPLAIN (costs off) @@ -52,6 +66,11 @@ EXPLAIN (costs off) SELECT count(*) FROM atsts WHERE d > '2016-05-16 14:21:25'; SELECT count(*) FROM atsts WHERE d > '2016-05-16 14:21:25'; +-- Test index scan +SET enable_indexscan=on; +SET enable_indexonlyscan=on; +SET enable_bitmapscan=off; + EXPLAIN (costs off) SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; @@ -73,3 +92,6 @@ EXPLAIN (costs off) SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; SELECT id, d FROM atsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; +EXPLAIN (costs off) +SELECT id, d FROM atsts WHERE t @@ 'wr&q:*' AND d >= '2016-05-16 14:21:25' ORDER BY d; +SELECT id, d FROM atsts WHERE t @@ 'wr&q:*' AND d >= '2016-05-16 14:21:25' ORDER BY d; diff --git a/sql/altorder_hash.sql b/sql/altorder_hash.sql index ff07b4f769..148407c661 100644 --- a/sql/altorder_hash.sql +++ b/sql/altorder_hash.sql @@ -1,3 +1,14 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differenly + * ------------------------------------ + * + * altorder_hash.out - test output for 64-bit systems and + * altorder_hash_1.out - test output for 32-bit systems. 
+ * + */ + + CREATE TABLE atstsh (id int, t tsvector, d timestamp); \copy atstsh from 'data/tsts.data' @@ -6,8 +17,8 @@ CREATE INDEX atstsh_idx ON atstsh USING rum (t rum_tsvector_hash_addon_ops, d) WITH (attach = 'd', to = 't', order_by_attach='t'); -INSERT INTO atstsh VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); -INSERT INTO atstsh VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); +INSERT INTO atstsh VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); +INSERT INTO atstsh VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); SELECT count(*) FROM atstsh WHERE t @@ 'wr|qh'; @@ -30,9 +41,8 @@ SELECT count(*) FROM atstsh WHERE d > '2016-05-16 14:21:25'; SELECT id, d FROM atstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; SELECT id, d FROM atstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; -RESET enable_indexscan; -RESET enable_indexonlyscan; -RESET enable_bitmapscan; +-- Test bitmap index scan +SET enable_bitmapscan=on; SET enable_seqscan = off; EXPLAIN (costs off) @@ -52,6 +62,11 @@ EXPLAIN (costs off) SELECT count(*) FROM atstsh WHERE d > '2016-05-16 14:21:25'; SELECT count(*) FROM atstsh WHERE d > '2016-05-16 14:21:25'; +-- Test index scan +SET enable_indexscan=on; +SET enable_indexonlyscan=on; +SET enable_bitmapscan=off; + EXPLAIN (costs off) SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; SELECT id, d, d <=> '2016-05-16 14:21:25' FROM atstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; diff --git a/sql/array.sql b/sql/array.sql index e3869b06d3..9eba800bcf 100644 --- a/sql/array.sql +++ b/sql/array.sql @@ -1,7 +1,17 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differenly + * ------------------------------------ + * + * array.out - test output for 64-bit systems and + * array_1.out - test output for 32-bit systems. + * + */ + + set enable_seqscan=off; set enable_sort=off; - /* * Complete checks for int2[]. */ @@ -199,6 +209,20 @@ DROP INDEX idx_array; /* * Check ordering using distance operator + * + * We want to check that index scan provides us correct ordering by distance + * operator. File 'data/rum_array.data' contains two arrays that statisfy + * i @> '{23,20}' and have finite distance i <=> '{51}', and a bunch of arrays + * that statisfy i @> '{23,20}' and have infinite distance i <=> '{51}'. + * + * When ordering by distance the order of this bunch of arrays with infinite + * distance is not determined and may depend of PostgreSQL version and system. + * We don't add another sort expression to ORDER BY because that might cause + * the planner to avoid using the index. Instead, we replace arrays that have + * infinite distance with {-1} to unambiguously determine the test output. + * + * 'Infinity' is printed differently in the output in different PostgreSQL + * versions, so we replace it with -1. 
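 * For illustration (values here are only indicative, since the finite
 * distances depend on the similarity measure rum is configured with):
 *   '{23,20,51}'::int[] <=> '{51}'  -- finite: the arrays share an element
 *   '{23,20}'::int[]    <=> '{51}'  -- Infinity: no common element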
*/ CREATE TABLE test_array_order ( @@ -208,6 +232,29 @@ CREATE TABLE test_array_order ( CREATE INDEX idx_array_order ON test_array_order USING rum (i rum_anyarray_ops); +/* + * Check that plan of the query uses ordering provided by index scan + */ + EXPLAIN (COSTS OFF) -SELECT *, i <=> '{51}' from test_array_order WHERE i @> '{23,20}' order by i <=> '{51}'; -SELECT *, i <=> '{51}' from test_array_order WHERE i @> '{23,20}' order by i <=> '{51}'; +SELECT + CASE WHEN distance = 'Infinity' THEN '{-1}' + ELSE i + END i, + CASE WHEN distance = 'Infinity' THEN -1 + ELSE distance::numeric(18,14) + END distance + FROM + (SELECT *, (i <=> '{51}') AS distance + FROM test_array_order WHERE i @> '{23,20}' ORDER BY distance) t; + +SELECT + CASE WHEN distance = 'Infinity' THEN '{-1}' + ELSE i + END i, + CASE WHEN distance = 'Infinity' THEN -1 + ELSE distance::numeric(18,14) + END distance + FROM + (SELECT *, (i <=> '{51}') AS distance + FROM test_array_order WHERE i @> '{23,20}' ORDER BY distance) t; diff --git a/sql/expr.sql b/sql/expr.sql new file mode 100644 index 0000000000..d7b7ee3d24 --- /dev/null +++ b/sql/expr.sql @@ -0,0 +1,21 @@ +CREATE TABLE documents ( + en text not null, + score float not null, + textsearch_index_en_col tsvector +); + +INSERT INTO documents VALUES ('the pet cat is in the shed', 56, to_tsvector('english', 'the pet cat is in the shed')); + +CREATE INDEX textsearch_index_en ON documents + USING rum (textsearch_index_en_col rum_tsvector_addon_ops, score) + WITH (attach = 'score', to = 'textsearch_index_en_col'); + +SET enable_seqscan=off; +-- should be 1 row +SELECT * FROM documents WHERE textsearch_index_en_col @@ ('pet'::tsquery <-> ('dog'::tsquery || 'cat'::tsquery)); + +SET enable_seqscan=on; +-- 1 row +SELECT * FROM documents WHERE textsearch_index_en_col @@ ('pet'::tsquery <-> ('dog'::tsquery || 'cat'::tsquery)); + +DROP TABLE documents; diff --git a/sql/float8.sql b/sql/float8.sql index 2de5b9ea19..b61cbfb0da 100644 --- a/sql/float8.sql +++ b/sql/float8.sql @@ -1,3 +1,14 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differenly + * ------------------------------------ + * + * float8.out - test output for 64-bit systems and + * float8_1.out - test output for 32-bit systems. 
+ * + */ + + set enable_seqscan=off; CREATE TABLE test_float8 ( diff --git a/sql/int4.sql b/sql/int4.sql index fa7357b6e6..2fa0e8afec 100644 --- a/sql/int4.sql +++ b/sql/int4.sql @@ -40,7 +40,6 @@ SELECT id FROM test_int4_o WHERE t @@ 'wr&qh' AND id >= 400 ORDER BY id; RESET enable_indexscan; RESET enable_indexonlyscan; -RESET enable_bitmapscan; SET enable_seqscan = off; EXPLAIN (costs off) @@ -66,8 +65,6 @@ CREATE INDEX test_int4_a_idx ON test_int4_a USING rum (t rum_tsvector_addon_ops, id) WITH (attach = 'id', to = 't', order_by_attach='t'); -SET enable_bitmapscan=OFF; - EXPLAIN (costs off) SELECT count(*) FROM test_int4_a WHERE id < 400; SELECT count(*) FROM test_int4_a WHERE id < 400; @@ -107,7 +104,6 @@ SELECT id FROM test_int4_h_o WHERE t @@ 'wr&qh' AND id >= 400 ORDER BY id; RESET enable_indexscan; RESET enable_indexonlyscan; -RESET enable_bitmapscan; SET enable_seqscan = off; EXPLAIN (costs off) @@ -133,8 +129,6 @@ CREATE INDEX test_int4_h_a_idx ON test_int4_h_a USING rum (t rum_tsvector_hash_addon_ops, id) WITH (attach = 'id', to = 't', order_by_attach='t'); -SET enable_bitmapscan=OFF; - EXPLAIN (costs off) SELECT count(*) FROM test_int4_h_a WHERE id < 400; SELECT count(*) FROM test_int4_h_a WHERE id < 400; diff --git a/sql/int8.sql b/sql/int8.sql index 540f2b7dbb..c51705e62b 100644 --- a/sql/int8.sql +++ b/sql/int8.sql @@ -1,3 +1,14 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differenly + * ------------------------------------ + * + * int8.out - test output for 64-bit systems and + * int8_1.out - test output for 32-bit systems. + * + */ + + set enable_seqscan=off; CREATE TABLE test_int8 ( @@ -40,7 +51,6 @@ SELECT id FROM test_int8_o WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id; RESET enable_indexscan; RESET enable_indexonlyscan; -RESET enable_bitmapscan; SET enable_seqscan = off; EXPLAIN (costs off) @@ -66,8 +76,6 @@ CREATE INDEX test_int8_a_idx ON test_int8_a USING rum (t rum_tsvector_addon_ops, id) WITH (attach = 'id', to = 't', order_by_attach='t'); -SET enable_bitmapscan=OFF; - EXPLAIN (costs off) SELECT count(*) FROM test_int8_a WHERE id < 400::int8; SELECT count(*) FROM test_int8_a WHERE id < 400::int8; @@ -107,7 +115,6 @@ SELECT id FROM test_int8_h_o WHERE t @@ 'wr&qh' AND id >= 400::int8 ORDER BY id RESET enable_indexscan; RESET enable_indexonlyscan; -RESET enable_bitmapscan; SET enable_seqscan = off; EXPLAIN (costs off) @@ -120,6 +127,7 @@ EXPLAIN (costs off) SELECT id, id |=> 400 FROM test_int8_h_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; SELECT id, id |=> 400 FROM test_int8_h_o WHERE t @@ 'wr&qh' ORDER BY id |=> 400 LIMIT 5; + EXPLAIN (costs off) SELECT id FROM test_int8_h_o WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; SELECT id FROM test_int8_h_o WHERE t @@ 'wr&qh' AND id <= 400::int8 ORDER BY id; @@ -133,8 +141,6 @@ CREATE INDEX test_int8_h_a_idx ON test_int8_h_a USING rum (t rum_tsvector_hash_addon_ops, id) WITH (attach = 'id', to = 't', order_by_attach='t'); -SET enable_bitmapscan=OFF; - EXPLAIN (costs off) SELECT count(*) FROM test_int8_h_a WHERE id < 400::int8; SELECT count(*) FROM test_int8_h_a WHERE id < 400::int8; diff --git a/sql/money.sql b/sql/money.sql index 952d2bc8fe..13df5ed260 100644 --- a/sql/money.sql +++ b/sql/money.sql @@ -1,3 +1,14 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differenly + * ------------------------------------ + * + * money.out - test output for 64-bit systems and + * money_1.out - test output for 32-bit systems. 
+ * + */ + + set enable_seqscan=off; CREATE TABLE test_money ( diff --git a/sql/orderby.sql b/sql/orderby.sql index f254483ae0..a2bd227873 100644 --- a/sql/orderby.sql +++ b/sql/orderby.sql @@ -1,3 +1,14 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differenly + * ------------------------------------ + * + * orderby.out - test output for 64-bit systems and + * orderby_1.out - test output for 32-bit systems. + * + */ + + CREATE TABLE tsts (id int, t tsvector, d timestamp); \copy tsts from 'data/tsts.data' @@ -6,17 +17,10 @@ CREATE INDEX tsts_idx ON tsts USING rum (t rum_tsvector_addon_ops, d) WITH (attach = 'd', to = 't'); -INSERT INTO tsts VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); -INSERT INTO tsts VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); +INSERT INTO tsts VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); +INSERT INTO tsts VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); -SELECT count(*) FROM tsts WHERE t @@ 'wr|qh'; -SELECT count(*) FROM tsts WHERE t @@ 'wr&qh'; -SELECT count(*) FROM tsts WHERE t @@ 'eq&yt'; -SELECT count(*) FROM tsts WHERE t @@ 'eq|yt'; -SELECT count(*) FROM tsts WHERE t @@ '(eq&yt)|(wr&qh)'; -SELECT count(*) FROM tsts WHERE t @@ '(eq|yt)&(wr|qh)'; - SET enable_indexscan=OFF; SET enable_indexonlyscan=OFF; SET enable_bitmapscan=OFF; @@ -27,8 +31,7 @@ SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; -RESET enable_indexscan; -RESET enable_indexonlyscan; +-- Test bitmap index scan RESET enable_bitmapscan; SET enable_seqscan = off; @@ -62,8 +65,34 @@ EXPLAIN (costs off) SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; +-- Test index scan +RESET enable_indexscan; +RESET enable_indexonlyscan; SET enable_bitmapscan=OFF; +EXPLAIN (costs off) +SELECT count(*) FROM tsts WHERE t @@ 'wr|qh'; +SELECT count(*) FROM tsts WHERE t @@ 'wr|qh'; +SELECT count(*) FROM tsts WHERE t @@ 'wr&qh'; +SELECT count(*) FROM tsts WHERE t @@ 'eq&yt'; +SELECT count(*) FROM tsts WHERE t @@ 'eq|yt'; +SELECT count(*) FROM tsts WHERE t @@ '(eq&yt)|(wr&qh)'; +SELECT count(*) FROM tsts WHERE t @@ '(eq|yt)&(wr|qh)'; + +EXPLAIN (costs off) +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +EXPLAIN (costs off) +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; +EXPLAIN (costs off) +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; + +EXPLAIN (costs off) +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + EXPLAIN (costs off) SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; @@ -77,6 +106,13 @@ SELECT id, d FROM 
tsts WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d ASC LIMIT 3; SELECT id, d FROM tsts WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d DESC LIMIT 3; +-- Test "ORDER BY" error message +DROP INDEX tsts_idx; + +CREATE INDEX tsts_idx ON tsts USING rum (t rum_tsvector_addon_ops, d); + +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tsts WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + -- Test multicolumn index RESET enable_indexscan; diff --git a/sql/orderby_hash.sql b/sql/orderby_hash.sql index 66a45268ca..dba8f17ca1 100644 --- a/sql/orderby_hash.sql +++ b/sql/orderby_hash.sql @@ -1,3 +1,14 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differenly + * ------------------------------------ + * + * orderby_hash.out - test output for 64-bit systems and + * orderby_hash_1.out - test output for 32-bit systems. + * + */ + + CREATE TABLE tstsh (id int, t tsvector, d timestamp); \copy tstsh from 'data/tsts.data' @@ -6,16 +17,9 @@ CREATE INDEX tstsh_idx ON tstsh USING rum (t rum_tsvector_hash_addon_ops, d) WITH (attach = 'd', to = 't'); -INSERT INTO tstsh VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); -INSERT INTO tstsh VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); - +INSERT INTO tstsh VALUES (-1, 't1 t2', '2016-05-02 02:24:22.326724'); +INSERT INTO tstsh VALUES (-2, 't1 t2 t3', '2016-05-02 02:26:22.326724'); -SELECT count(*) FROM tstsh WHERE t @@ 'wr|qh'; -SELECT count(*) FROM tstsh WHERE t @@ 'wr&qh'; -SELECT count(*) FROM tstsh WHERE t @@ 'eq&yt'; -SELECT count(*) FROM tstsh WHERE t @@ 'eq|yt'; -SELECT count(*) FROM tstsh WHERE t @@ '(eq&yt)|(wr&qh)'; -SELECT count(*) FROM tstsh WHERE t @@ '(eq|yt)&(wr|qh)'; SET enable_indexscan=OFF; SET enable_indexonlyscan=OFF; @@ -27,8 +31,7 @@ SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; -RESET enable_indexscan; -RESET enable_indexonlyscan; +-- Test bitmap index scan RESET enable_bitmapscan; SET enable_seqscan = off; @@ -62,8 +65,34 @@ EXPLAIN (costs off) SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d >= '2016-05-16 14:21:25' ORDER BY d; +-- Test index scan +RESET enable_indexscan; +RESET enable_indexonlyscan; SET enable_bitmapscan=OFF; +EXPLAIN (costs off) +SELECT count(*) FROM tstsh WHERE t @@ 'wr|qh'; +SELECT count(*) FROM tstsh WHERE t @@ 'wr|qh'; +SELECT count(*) FROM tstsh WHERE t @@ 'wr&qh'; +SELECT count(*) FROM tstsh WHERE t @@ 'eq&yt'; +SELECT count(*) FROM tstsh WHERE t @@ 'eq|yt'; +SELECT count(*) FROM tstsh WHERE t @@ '(eq&yt)|(wr&qh)'; +SELECT count(*) FROM tstsh WHERE t @@ '(eq|yt)&(wr|qh)'; + +EXPLAIN (costs off) +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +EXPLAIN (costs off) +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=| '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d <=| '2016-05-16 14:21:25' LIMIT 5; +EXPLAIN (costs off) +SELECT id, d, d |=> '2016-05-16 14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d |=> '2016-05-16 
14:21:25' FROM tstsh WHERE t @@ 'wr&qh' ORDER BY d |=> '2016-05-16 14:21:25' LIMIT 5; + +EXPLAIN (costs off) +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; +SELECT id, d, d <=> '2016-05-16 14:21:25' FROM tstsh ORDER BY d <=> '2016-05-16 14:21:25' LIMIT 5; + EXPLAIN (costs off) SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; SELECT id, d FROM tstsh WHERE t @@ 'wr&qh' AND d <= '2016-05-16 14:21:25' ORDER BY d; diff --git a/sql/rum.sql b/sql/rum.sql index 8b8607faa6..8414bb95c5 100644 --- a/sql/rum.sql +++ b/sql/rum.sql @@ -47,34 +47,34 @@ SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'def <-> fgr'); SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'def <2> fgr'); -SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way')), - rum_ts_score(a, to_tsquery('pg_catalog.english', 'way')), +SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way'))::numeric(10,4), + rum_ts_score(a, to_tsquery('pg_catalog.english', 'way'))::numeric(10,7), * FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'way') ORDER BY a <=> to_tsquery('pg_catalog.english', 'way'); -SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)')), - rum_ts_score(a, to_tsquery('pg_catalog.english', 'way & (go | half)')), +SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,4), + rum_ts_score(a, to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,6), * FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'way & (go | half)') ORDER BY a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'); SELECT - a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'), - rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)')), + (a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,4) AS distance, + rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,4), * FROM test_rum ORDER BY a <=> to_tsquery('pg_catalog.english', 'way & (go | half)') limit 2; -- Check ranking normalization -SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way'), 0), - rum_ts_score(a, to_tsquery('pg_catalog.english', 'way'), 0), +SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way'), 0)::numeric(10,4), + rum_ts_score(a, to_tsquery('pg_catalog.english', 'way'), 0)::numeric(10,7), * FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'way') ORDER BY a <=> to_tsquery('pg_catalog.english', 'way'); -SELECT rum_ts_distance(a, row(to_tsquery('pg_catalog.english', 'way & (go | half)'), 0)::rum_distance_query), - rum_ts_score(a, row(to_tsquery('pg_catalog.english', 'way & (go | half)'), 0)::rum_distance_query), +SELECT rum_ts_distance(a, row(to_tsquery('pg_catalog.english', 'way & (go | half)'), 0)::rum_distance_query)::numeric(10,4), + rum_ts_score(a, row(to_tsquery('pg_catalog.english', 'way & (go | half)'), 0)::rum_distance_query)::numeric(10,6), * FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'way & (go | half)') @@ -93,7 +93,13 @@ SELECT count(*) FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'rat') SELECT a FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'bar') ORDER BY a; -- Check full-index scan with order by -SELECT a <=> to_tsquery('pg_catalog.english', 'ever|wrote') FROM test_rum ORDER BY a <=> to_tsquery('pg_catalog.english', 'ever|wrote'); +SELECT + CASE WHEN distance = 'Infinity' THEN -1 + 
ELSE distance::numeric(10,4) + END distance + FROM + (SELECT a <=> to_tsquery('pg_catalog.english', 'ever|wrote') AS distance + FROM test_rum ORDER BY a <=> to_tsquery('pg_catalog.english', 'ever|wrote')) t; CREATE TABLE tst (i int4, t tsvector); INSERT INTO tst SELECT i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM generate_series(1,100000) i; @@ -126,14 +132,23 @@ SELECT a <=> to_tsquery('pg_catalog.english', 'w:*'), * FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'w:*') ORDER BY a <=> to_tsquery('pg_catalog.english', 'w:*'); -SELECT a <=> to_tsquery('pg_catalog.english', 'w:*'), * +SELECT (a <=> to_tsquery('pg_catalog.english', 'w:*'))::numeric(10,4) AS distance, * FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'w:*') ORDER BY a <=> to_tsquery('pg_catalog.english', 'w:*'); -SELECT a <=> to_tsquery('pg_catalog.english', 'b:*'), * +SELECT (a <=> to_tsquery('pg_catalog.english', 'b:*'))::numeric(10,4) AS distance, * FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'b:*') ORDER BY a <=> to_tsquery('pg_catalog.english', 'b:*'); -select 'bjarn:6237 stroustrup:6238'::tsvector <=> 'bjarn <-> stroustrup'::tsquery; -SELECT 'stroustrup:5508B,6233B,6238B bjarn:6235B,6237B' <=> 'bjarn <-> stroustrup'::tsquery; +-- Test correct work of phrase operator when position information is not in index. +create table test_rum_addon as table test_rum; +alter table test_rum_addon add column id serial; +create index on test_rum_addon using rum (a rum_tsvector_addon_ops, id) with (attach = 'id', to='a'); + +select * from test_rum_addon where a @@ to_tsquery('pg_catalog.english', 'half <-> way'); +explain (costs off) select * from test_rum_addon where a @@ to_tsquery('pg_catalog.english', 'half <-> way'); +-- + +select ('bjarn:6237 stroustrup:6238'::tsvector <=> 'bjarn <-> stroustrup'::tsquery)::numeric(10,5) AS distance; +SELECT ('stroustrup:5508B,6233B,6238B bjarn:6235B,6237B' <=> 'bjarn <-> stroustrup'::tsquery)::numeric(10,5) AS distance; diff --git a/sql/rum_hash.sql b/sql/rum_hash.sql index 511e772da5..a33b8fde31 100644 --- a/sql/rum_hash.sql +++ b/sql/rum_hash.sql @@ -35,35 +35,35 @@ SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'def <-> fgr'); SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'def <2> fgr'); -SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way')), - rum_ts_score(a, to_tsquery('pg_catalog.english', 'way')), +SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way'))::numeric(10,4), + rum_ts_score(a, to_tsquery('pg_catalog.english', 'way'))::numeric(10,7), * FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'way') ORDER BY a <=> to_tsquery('pg_catalog.english', 'way'); -SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)')), - rum_ts_score(a, to_tsquery('pg_catalog.english', 'way & (go | half)')), +SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,4), + rum_ts_score(a, to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,6), * FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'way & (go | half)') ORDER BY a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'); SELECT - a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'), - rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)')), - rum_ts_score(a, to_tsquery('pg_catalog.english', 'way & (go | half)')), + (a <=> to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,4) 
AS distance, + rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,4), + rum_ts_score(a, to_tsquery('pg_catalog.english', 'way & (go | half)'))::numeric(10,6), * FROM test_rum_hash ORDER BY a <=> to_tsquery('pg_catalog.english', 'way & (go | half)') limit 2; -- Check ranking normalization -SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way'), 0), - rum_ts_score(a, to_tsquery('pg_catalog.english', 'way'), 0), +SELECT rum_ts_distance(a, to_tsquery('pg_catalog.english', 'way'), 0)::numeric(10,4), + rum_ts_score(a, to_tsquery('pg_catalog.english', 'way'), 0)::numeric(10,7), * FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'way') ORDER BY a <=> to_tsquery('pg_catalog.english', 'way'); -SELECT rum_ts_distance(a, row(to_tsquery('pg_catalog.english', 'way & (go | half)'), 0)::rum_distance_query), - rum_ts_score(a, row(to_tsquery('pg_catalog.english', 'way & (go | half)'), 0)::rum_distance_query), +SELECT rum_ts_distance(a, row(to_tsquery('pg_catalog.english', 'way & (go | half)'), 0)::rum_distance_query)::numeric(10,4), + rum_ts_score(a, row(to_tsquery('pg_catalog.english', 'way & (go | half)'), 0)::rum_distance_query)::numeric(10,6), * FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'way & (go | half)') @@ -82,7 +82,13 @@ SELECT count(*) FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', ' SELECT a FROM test_rum_hash WHERE a @@ to_tsquery('pg_catalog.english', 'bar') ORDER BY a; -- Check full-index scan with order by -SELECT a <=> to_tsquery('pg_catalog.english', 'ever|wrote') FROM test_rum_hash ORDER BY a <=> to_tsquery('pg_catalog.english', 'ever|wrote'); +SELECT + CASE WHEN distance = 'Infinity' THEN -1 + ELSE distance::numeric(10,4) + END distance + FROM + (SELECT a <=> to_tsquery('pg_catalog.english', 'ever|wrote') AS distance + FROM test_rum_hash ORDER BY a <=> to_tsquery('pg_catalog.english', 'ever|wrote')) t; CREATE TABLE tst_hash (i int4, t tsvector); INSERT INTO tst_hash SELECT i%10, to_tsvector('simple', substr(md5(i::text), 1, 1)) FROM generate_series(1,100000) i; diff --git a/sql/rum_validate.sql b/sql/rum_validate.sql index 24bc4aa7c3..455db5db56 100644 --- a/sql/rum_validate.sql +++ b/sql/rum_validate.sql @@ -31,7 +31,7 @@ FROM unnest(array['asc','desc','nulls_first','nulls_last','orderable','distance_ DROP INDEX rumidx; --- Check incorrect operator class +-- PGPRO-1175: Check incorrect operator class, i.e. 
it shouldn't work correctly CREATE OPERATOR CLASS rum_tsvector_norm_ops FOR TYPE tsvector USING rum AS @@ -54,7 +54,14 @@ SET enable_seqscan=off; SET enable_bitmapscan=off; SET enable_indexscan=on; +-- PGPRO-1175: Select using incorrect operator class SELECT a FROM test_rum WHERE a @@ to_tsquery('pg_catalog.english', 'bar') - ORDER BY a <=> (to_tsquery('pg_catalog.english', 'bar'),0) \ No newline at end of file + ORDER BY a <=> (to_tsquery('pg_catalog.english', 'bar'),0); + +-- PGPRO-9026: column and attached column cannot be the same +CREATE TABLE test_array (i int2[]); +CREATE INDEX idx_array ON test_array USING rum (i rum_anyarray_addon_ops) WITH (attach = 'i', to = 'i'); +SELECT * FROM test_array WHERE i && '{1}'; +DROP TABLE test_array; diff --git a/sql/rum_weight.sql b/sql/rum_weight.sql new file mode 100644 index 0000000000..3fcee8b06e --- /dev/null +++ b/sql/rum_weight.sql @@ -0,0 +1,44 @@ +CREATE TABLE testweight_rum( t text, a tsvector, r text ); + +CREATE FUNCTION fill_weight_trigger() RETURNS trigger AS $$ +begin + new.a := + setweight(to_tsvector('pg_catalog.english', coalesce(new.r,'')), 'A') || + setweight(to_tsvector('pg_catalog.english', coalesce(new.t,'')), 'D'); + return new; +end +$$ LANGUAGE plpgsql; + +CREATE TRIGGER tsvectorweightupdate +BEFORE INSERT OR UPDATE ON testweight_rum +FOR EACH ROW EXECUTE PROCEDURE fill_weight_trigger(); + +CREATE INDEX rumidx_weight ON testweight_rum USING rum (a rum_tsvector_ops); + +\copy testweight_rum(t,r) from 'data/rum_weight.data' DELIMITER '|' ; + +SET enable_seqscan=off; +SET enable_indexscan=off; + +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'ever:A|wrote'); +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'have:A&wish:DAC'); +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'have:A&wish:DAC'); +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'among:ABC'); +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'structure:D&ancient:BCD'); +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', '(complimentary:DC|sight)&(sending:ABC|heart)'); +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', '!gave:D & way'); +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', '(go<->go:a)&(think:d<->go)'); +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', '(go<->go:a)&(think:d<2>go)'); +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'go & (!reach:a | way<->reach)'); +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'go & (!reach:a & way<->reach)'); +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'reach:d & go & !way:a'); +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'show:d & seem & !town:a'); +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', '!way:a'); +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'go & !way:a'); +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'reach:d & !way:a'); +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'reach:d & go'); +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'think<->go:d | go<->see'); +SELECT count(*) FROM testweight_rum WHERE a @@ 
to_tsquery('pg_catalog.english', 'reach:d<->think'); +SELECT count(*) FROM testweight_rum WHERE a @@ to_tsquery('pg_catalog.english', 'reach<->think'); + + diff --git a/sql/security.sql b/sql/security.sql new file mode 100644 index 0000000000..da7b83957b --- /dev/null +++ b/sql/security.sql @@ -0,0 +1,5 @@ +-- Check security CVE-2020-14350 +CREATE FUNCTION rum_anyarray_similar(anyarray,anyarray) RETURNS bool AS $$ SELECT false $$ LANGUAGE SQL; +CREATE EXTENSION rum; +DROP FUNCTION rum_anyarray_similar(anyarray,anyarray); + diff --git a/sql/text.sql b/sql/text.sql index 5b7fbab485..1f340b7109 100644 --- a/sql/text.sql +++ b/sql/text.sql @@ -37,6 +37,7 @@ SELECT id FROM test_text_o WHERE t @@ 'wr&qh' AND id >= '400' ORDER BY id; CREATE TABLE test_text_a AS SELECT id::text, t FROM tsts; +-- Should fail, temporarly it isn't allowed to order an index over pass-by-reference column CREATE INDEX test_text_a_idx ON test_text_a USING rum (t rum_tsvector_addon_ops, id) WITH (attach = 'id', to = 't', order_by_attach='t'); @@ -67,6 +68,7 @@ SELECT id FROM test_text_h_o WHERE t @@ 'wr&qh' AND id >= '400' ORDER BY id; CREATE TABLE test_text_h_a AS SELECT id::text, t FROM tsts; +-- Should fail, temporarly it isn't allowed to order an index over pass-by-reference column CREATE INDEX test_text_h_a_idx ON test_text_h_a USING rum (t rum_tsvector_hash_addon_ops, id) WITH (attach = 'id', to = 't', order_by_attach='t'); diff --git a/sql/timestamp.sql b/sql/timestamp.sql index 8025774b82..3386229ddc 100644 --- a/sql/timestamp.sql +++ b/sql/timestamp.sql @@ -1,3 +1,13 @@ +/* + * ------------------------------------ + * NOTE: This test behaves differenly + * ------------------------------------ + * + * timestamp.out - test output for 64-bit systems and + * timestamp_1.out - test output for 32-bit systems. + * + */ + CREATE TABLE test_timestamp ( i timestamp diff --git a/src/btree_rum.c b/src/btree_rum.c index 170ace6aba..dd43a3c037 100644 --- a/src/btree_rum.c +++ b/src/btree_rum.c @@ -112,6 +112,7 @@ rum_btree_extract_query(FunctionCallInfo fcinfo, case BTGreaterEqualStrategyNumber: case BTGreaterStrategyNumber: *ptr_partialmatch = true; + /*FALLTHROUGH*/ case BTEqualStrategyNumber: case RUM_DISTANCE: case RUM_LEFT_DISTANCE: diff --git a/src/disable_core_macro.h b/src/disable_core_macro.h new file mode 100644 index 0000000000..0d6c4a8a3b --- /dev/null +++ b/src/disable_core_macro.h @@ -0,0 +1,33 @@ +/*------------------------------------------------------------------------- + * + * disable_core_macro.h + * Support including tuplesort.c from postgresql core code. + * + * Copyright (c) 2022-2024, Postgres Professional + * + *------------------------------------------------------------------------- + */ + +#ifndef __DISABLE_CORE_MACRO_H__ +#define __DISABLE_CORE_MACRO_H__ + +#undef TRACE_SORT +#undef DEBUG_BOUNDED_SORT +#undef TRACE_POSTGRESQL_SORT_START +#undef TRACE_POSTGRESQL_SORT_DONE + +#if PG_VERSION_NUM >= 110000 +#define TRACE_POSTGRESQL_SORT_START(arg1, arg2, arg3, arg4, arg5, arg6) \ + do {} while(0) +#else +#define TRACE_POSTGRESQL_SORT_START(arg1, arg2, arg3, arg4, arg5) \ + do {} while(0) +#endif + + +#define TRACE_POSTGRESQL_SORT_DONE(arg1, arg2) \ + do {} while(0) + + + +#endif /* __DISABLE_CORE_MACRO_H__ */ diff --git a/src/qsort_tuple.c b/src/qsort_tuple.c new file mode 100644 index 0000000000..0cb46e1416 --- /dev/null +++ b/src/qsort_tuple.c @@ -0,0 +1,332 @@ +/* + * autogenerated by src/backend/utils/sort/gen_qsort_tuple.pl, do not edit! 
+ * + * This file is included by tuplesort.c, rather than compiled separately. + */ + +/* $NetBSD: qsort.c,v 1.13 2003/08/07 16:43:42 agc Exp $ */ + +/*- + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Qsort routine based on J. L. Bentley and M. D. McIlroy, + * "Engineering a sort function", + * Software--Practice and Experience 23 (1993) 1249-1265. + * + * We have modified their original by adding a check for already-sorted input, + * which seems to be a win per discussions on pgsql-hackers around 2006-03-21. + * + * Also, we recurse on the smaller partition and iterate on the larger one, + * which ensures we cannot recurse more than log(N) levels (since the + * partition recursed to is surely no more than half of the input). Bentley + * and McIlroy explicitly rejected doing this on the grounds that it's "not + * worth the effort", but we have seen crashes in the field due to stack + * overrun, so that judgment seems wrong. + */ + +static void +swapfunc(SortTuple *a, SortTuple *b, size_t n) +{ + do + { + SortTuple t = *a; + *a++ = *b; + *b++ = t; + } while (--n > 0); +} + +#define swap(a, b) \ + do { \ + SortTuple t = *(a); \ + *(a) = *(b); \ + *(b) = t; \ + } while (0) + +#define vecswap(a, b, n) if ((n) > 0) swapfunc(a, b, n) + +static SortTuple * +med3_tuple(SortTuple *a, SortTuple *b, SortTuple *c, SortTupleComparator cmp_tuple, Tuplesortstate *state) +{ + return cmp_tuple(a, b, state) < 0 ? + (cmp_tuple(b, c, state) < 0 ? b : + (cmp_tuple(a, c, state) < 0 ? c : a)) + : (cmp_tuple(b, c, state) > 0 ? b : + (cmp_tuple(a, c, state) < 0 ? 
a : c)); +} + +static void +qsort_tuple(SortTuple *a, size_t n, SortTupleComparator cmp_tuple, Tuplesortstate *state) +{ + SortTuple *pa, + *pb, + *pc, + *pd, + *pl, + *pm, + *pn; + size_t d1, + d2; + int r, + presorted; + +loop: + CHECK_FOR_INTERRUPTS(); + if (n < 7) + { + for (pm = a + 1; pm < a + n; pm++) + for (pl = pm; pl > a && cmp_tuple(pl - 1, pl, state) > 0; pl--) + swap(pl, pl - 1); + return; + } + presorted = 1; + for (pm = a + 1; pm < a + n; pm++) + { + CHECK_FOR_INTERRUPTS(); + if (cmp_tuple(pm - 1, pm, state) > 0) + { + presorted = 0; + break; + } + } + if (presorted) + return; + pm = a + (n / 2); + if (n > 7) + { + pl = a; + pn = a + (n - 1); + if (n > 40) + { + size_t d = (n / 8); + + pl = med3_tuple(pl, pl + d, pl + 2 * d, cmp_tuple, state); + pm = med3_tuple(pm - d, pm, pm + d, cmp_tuple, state); + pn = med3_tuple(pn - 2 * d, pn - d, pn, cmp_tuple, state); + } + pm = med3_tuple(pl, pm, pn, cmp_tuple, state); + } + swap(a, pm); + pa = pb = a + 1; + pc = pd = a + (n - 1); + for (;;) + { + while (pb <= pc && (r = cmp_tuple(pb, a, state)) <= 0) + { + if (r == 0) + { + swap(pa, pb); + pa++; + } + pb++; + CHECK_FOR_INTERRUPTS(); + } + while (pb <= pc && (r = cmp_tuple(pc, a, state)) >= 0) + { + if (r == 0) + { + swap(pc, pd); + pd--; + } + pc--; + CHECK_FOR_INTERRUPTS(); + } + if (pb > pc) + break; + swap(pb, pc); + pb++; + pc--; + } + pn = a + n; + d1 = Min(pa - a, pb - pa); + vecswap(a, pb - d1, d1); + d1 = Min(pd - pc, pn - pd - 1); + vecswap(pb, pn - d1, d1); + d1 = pb - pa; + d2 = pd - pc; + if (d1 <= d2) + { + /* Recurse on left partition, then iterate on right partition */ + if (d1 > 1) + qsort_tuple(a, d1, cmp_tuple, state); + if (d2 > 1) + { + /* Iterate rather than recurse to save stack space */ + /* qsort_tuple(pn - d2, d2, cmp_tuple, state); */ + a = pn - d2; + n = d2; + goto loop; + } + } + else + { + /* Recurse on right partition, then iterate on left partition */ + if (d2 > 1) + qsort_tuple(pn - d2, d2, cmp_tuple, state); + if (d1 > 1) + { + /* Iterate rather than recurse to save stack space */ + /* qsort_tuple(a, d1, cmp_tuple, state); */ + n = d1; + goto loop; + } + } +} + +#define cmp_ssup(a, b, ssup) \ + ApplySortComparator((a)->datum1, (a)->isnull1, \ + (b)->datum1, (b)->isnull1, ssup) + +static SortTuple * +med3_ssup(SortTuple *a, SortTuple *b, SortTuple *c, SortSupport ssup) +{ + return cmp_ssup(a, b, ssup) < 0 ? + (cmp_ssup(b, c, ssup) < 0 ? b : + (cmp_ssup(a, c, ssup) < 0 ? c : a)) + : (cmp_ssup(b, c, ssup) > 0 ? b : + (cmp_ssup(a, c, ssup) < 0 ? 
a : c)); +} + +static void +qsort_ssup(SortTuple *a, size_t n, SortSupport ssup) +{ + SortTuple *pa, + *pb, + *pc, + *pd, + *pl, + *pm, + *pn; + size_t d1, + d2; + int r, + presorted; + +loop: + CHECK_FOR_INTERRUPTS(); + if (n < 7) + { + for (pm = a + 1; pm < a + n; pm++) + for (pl = pm; pl > a && cmp_ssup(pl - 1, pl, ssup) > 0; pl--) + swap(pl, pl - 1); + return; + } + presorted = 1; + for (pm = a + 1; pm < a + n; pm++) + { + CHECK_FOR_INTERRUPTS(); + if (cmp_ssup(pm - 1, pm, ssup) > 0) + { + presorted = 0; + break; + } + } + if (presorted) + return; + pm = a + (n / 2); + if (n > 7) + { + pl = a; + pn = a + (n - 1); + if (n > 40) + { + size_t d = (n / 8); + + pl = med3_ssup(pl, pl + d, pl + 2 * d, ssup); + pm = med3_ssup(pm - d, pm, pm + d, ssup); + pn = med3_ssup(pn - 2 * d, pn - d, pn, ssup); + } + pm = med3_ssup(pl, pm, pn, ssup); + } + swap(a, pm); + pa = pb = a + 1; + pc = pd = a + (n - 1); + for (;;) + { + while (pb <= pc && (r = cmp_ssup(pb, a, ssup)) <= 0) + { + if (r == 0) + { + swap(pa, pb); + pa++; + } + pb++; + CHECK_FOR_INTERRUPTS(); + } + while (pb <= pc && (r = cmp_ssup(pc, a, ssup)) >= 0) + { + if (r == 0) + { + swap(pc, pd); + pd--; + } + pc--; + CHECK_FOR_INTERRUPTS(); + } + if (pb > pc) + break; + swap(pb, pc); + pb++; + pc--; + } + pn = a + n; + d1 = Min(pa - a, pb - pa); + vecswap(a, pb - d1, d1); + d1 = Min(pd - pc, pn - pd - 1); + vecswap(pb, pn - d1, d1); + d1 = pb - pa; + d2 = pd - pc; + if (d1 <= d2) + { + /* Recurse on left partition, then iterate on right partition */ + if (d1 > 1) + qsort_ssup(a, d1, ssup); + if (d2 > 1) + { + /* Iterate rather than recurse to save stack space */ + /* qsort_ssup(pn - d2, d2, ssup); */ + a = pn - d2; + n = d2; + goto loop; + } + } + else + { + /* Recurse on right partition, then iterate on left partition */ + if (d2 > 1) + qsort_ssup(pn - d2, d2, ssup); + if (d1 > 1) + { + /* Iterate rather than recurse to save stack space */ + /* qsort_ssup(a, d1, ssup); */ + n = d1; + goto loop; + } + } +} diff --git a/src/rum.h b/src/rum.h index ffce9b2aaa..2139774d08 100644 --- a/src/rum.h +++ b/src/rum.h @@ -3,8 +3,8 @@ * rum.h * Exported definitions for RUM index. * - * Portions Copyright (c) 2015-2016, Postgres Professional - * Portions Copyright (c) 2006-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 2015-2024, Postgres Professional + * Portions Copyright (c) 2006-2022, PostgreSQL Global Development Group * *------------------------------------------------------------------------- */ @@ -19,6 +19,8 @@ #include "access/sdir.h" #include "lib/rbtree.h" #include "storage/bufmgr.h" +#include "utils/datum.h" +#include "utils/memutils.h" #include "rumsort.h" @@ -262,6 +264,16 @@ typedef signed char RumNullCategory; /* * Data (posting tree) pages */ +/* + * FIXME -- Currently RumItem is placed as a pages right bound and PostingItem + * is placed as a non-leaf pages item. Both RumItem and PostingItem stores + * AddInfo as a raw Datum, which is bogus. It is fine for pass-by-value + * attributes, but it isn't for pass-by-reference, which may have variable + * length of data. This AddInfo is used only by order_by_attach indexes, so it + * isn't allowed to create index using ordering over pass-by-reference AddInfo, + * see initRumState(). This can be solved by having non-fixed length right bound + * and non-fixed non-leaf posting tree item. 
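The distinction drawn in the FIXME above is between pass-by-value attributes, whose whole value fits inside a Datum and can safely be stashed in a page bound or posting item, and pass-by-reference attributes, where the Datum is only a pointer into storage of variable length that may not survive. A minimal stand-alone sketch of that difference, using plain C stand-ins rather than the real Datum/datumCopy machinery:

/* Illustrative only: simplified stand-ins for Datum semantics, not RUM code. */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

typedef uintptr_t Datum;                 /* stand-in for PostgreSQL's Datum */

/* pass-by-value: the value itself is the Datum, safe to keep anywhere */
static Datum store_by_value(intptr_t ts) { return (Datum) ts; }

/* pass-by-reference: the Datum is only a pointer; a private copy is needed
 * if the source buffer (e.g. an unlocked page) may change or disappear */
static Datum store_by_reference(const char *data, size_t len)
{
    char *copy = malloc(len);            /* analogue of datumCopy() */
    memcpy(copy, data, len);
    return (Datum) copy;
}

int main(void)
{
    Datum ts  = store_by_value(1716212485);
    Datum txt = store_by_reference("abc", 4);
    printf("%ld %s\n", (long) ts, (const char *) txt);
    free((void *) txt);
    return 0;
}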
+ */ #define RumDataPageGetRightBound(page) ((RumItem*) PageGetContents(page)) #define RumDataPageGetData(page) \ (PageGetContents(page) + MAXALIGN(sizeof(RumItem))) @@ -403,7 +415,7 @@ extern bytea *rumoptions(Datum reloptions, bool validate); extern bool rumproperty(Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull); -extern Datum rumhandler(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum rumhandler(PG_FUNCTION_ARGS); extern void initRumState(RumState * state, Relation index); extern Buffer RumNewBuffer(Relation index); extern void RumInitBuffer(GenericXLogState *state, Buffer buffer, uint32 flags, @@ -437,6 +449,9 @@ extern void rumbuildempty(Relation index); extern bool ruminsert(Relation index, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique +#if PG_VERSION_NUM >= 140000 + , bool indexUnchanged +#endif #if PG_VERSION_NUM >= 100000 , struct IndexInfo *indexInfo #endif @@ -519,7 +534,7 @@ extern void rumEntryFillRoot(RumBtree btree, Buffer root, Buffer lbuf, Buffer rb Page page, Page lpage, Page rpage); extern IndexTuple rumPageGetLinkItup(RumBtree btree, Buffer buf, Page page); extern void rumReadTuple(RumState * rumstate, OffsetNumber attnum, - IndexTuple itup, RumItem * items); + IndexTuple itup, RumItem * items, bool copyAddInfo); extern void rumReadTuplePointers(RumState * rumstate, OffsetNumber attnum, IndexTuple itup, ItemPointerData *ipd); extern void updateItemIndexes(Page page, OffsetNumber attnum, RumState * rumstate); @@ -773,9 +788,13 @@ extern IndexBulkDeleteResult *rumvacuumcleanup(IndexVacuumInfo *info, extern bool rumvalidate(Oid opclassoid); /* rumbulk.c */ +#if PG_VERSION_NUM <= 100006 || PG_VERSION_NUM == 110000 +typedef RBNode RBTNode; +#endif + typedef struct RumEntryAccumulator { - RBNode rbnode; + RBTNode rbnode; Datum key; RumNullCategory category; OffsetNumber attnum; @@ -817,16 +836,16 @@ extern RumItem *rumGetBAEntry(BuildAccumulator *accum, #define RUM_ADDINFO_JOIN 10 #define RUMNProcs 10 -extern Datum rum_extract_tsvector(PG_FUNCTION_ARGS); -extern Datum rum_extract_tsquery(PG_FUNCTION_ARGS); -extern Datum rum_tsvector_config(PG_FUNCTION_ARGS); -extern Datum rum_tsquery_pre_consistent(PG_FUNCTION_ARGS); -extern Datum rum_tsquery_distance(PG_FUNCTION_ARGS); -extern Datum rum_ts_distance_tt(PG_FUNCTION_ARGS); -extern Datum rum_ts_distance_ttf(PG_FUNCTION_ARGS); -extern Datum rum_ts_distance_td(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum rum_extract_tsvector(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum rum_extract_tsquery(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum rum_tsvector_config(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum rum_tsquery_pre_consistent(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum rum_tsquery_distance(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum rum_ts_distance_tt(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum rum_ts_distance_ttf(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum rum_ts_distance_td(PG_FUNCTION_ARGS); -extern Datum tsquery_to_distance_query(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum tsquery_to_distance_query(PG_FUNCTION_ARGS); /* rum_arr_utils.c */ typedef enum SimilarityType @@ -839,19 +858,19 @@ typedef enum SimilarityType #define RUM_SIMILARITY_FUNCTION_DEFAULT SMT_COSINE #define RUM_SIMILARITY_THRESHOLD_DEFAULT 0.5 -extern Datum rum_anyarray_config(PG_FUNCTION_ARGS); -extern Datum rum_extract_anyarray(PG_FUNCTION_ARGS); -extern Datum rum_extract_anyarray_query(PG_FUNCTION_ARGS); -extern Datum rum_anyarray_consistent(PG_FUNCTION_ARGS); 
-extern Datum rum_anyarray_ordering(PG_FUNCTION_ARGS); -extern Datum rum_anyarray_similar(PG_FUNCTION_ARGS); -extern Datum rum_anyarray_distance(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum rum_anyarray_config(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum rum_extract_anyarray(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum rum_extract_anyarray_query(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum rum_anyarray_consistent(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum rum_anyarray_ordering(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum rum_anyarray_similar(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum rum_anyarray_distance(PG_FUNCTION_ARGS); /* GUC parameters */ -extern PGDLLIMPORT int RumFuzzySearchLimit; -extern PGDLLIMPORT float8 RumArraySimilarityThreshold; -extern PGDLLIMPORT int RumArraySimilarityFunction; +extern int RumFuzzySearchLimit; +extern float8 RumArraySimilarityThreshold; +extern int RumArraySimilarityFunction; /* @@ -927,10 +946,14 @@ rumDataPageLeafReadItemPointer(char *ptr, ItemPointer iptr, bool *addInfoIsNull) * Reads next item pointer and additional information from leaf data page. * Replaces current item pointer with the next one. Zero item pointer should be * passed in order to read the first item pointer. + * + * It is necessary to pass copyAddInfo=true if additional information is used + * when the data page is unlocked. If the additional information is used without + * locking one can get unexpected behaviour. */ static inline Pointer rumDataPageLeafRead(Pointer ptr, OffsetNumber attnum, RumItem * item, - RumState * rumstate) + bool copyAddInfo, RumState * rumstate) { Form_pg_attribute attr; @@ -995,8 +1018,13 @@ rumDataPageLeafRead(Pointer ptr, OffsetNumber attnum, RumItem * item, } else { - ptr = (Pointer) att_align_pointer(ptr, attr->attalign, attr->attlen, ptr); - item->addInfo = fetch_att(ptr, attr->attbyval, attr->attlen); + Datum addInfo; + + ptr = (Pointer) att_align_pointer(ptr, attr->attalign, attr->attlen, + ptr); + addInfo = fetch_att(ptr, attr->attbyval, attr->attlen); + item->addInfo = copyAddInfo ? 
+ datumCopy(addInfo, attr->attbyval, attr->attlen) : addInfo; } ptr = (Pointer) att_addlength_pointer(ptr, attr->attlen, ptr); @@ -1058,7 +1086,10 @@ extern Datum FunctionCall10Coll(FmgrInfo *flinfo, Oid collation, Datum arg9, Datum arg10); /* PostgreSQL version-agnostic creation of memory context */ -#if PG_VERSION_NUM >= 110000 +#if PG_VERSION_NUM >= 120000 +#define RumContextCreate(parent, name) \ + AllocSetContextCreate(parent, name, ALLOCSET_DEFAULT_SIZES) +#elif PG_VERSION_NUM >= 110000 #define RumContextCreate(parent, name) \ AllocSetContextCreateExtended(parent, name, \ ALLOCSET_DEFAULT_MINSIZE, \ diff --git a/src/rum_arr_utils.c b/src/rum_arr_utils.c index 78b788eb8b..d8dc00699a 100644 --- a/src/rum_arr_utils.c +++ b/src/rum_arr_utils.c @@ -3,7 +3,7 @@ * rum_arr_utils.c * various anyarray-search functions * - * Portions Copyright (c) 2015-2016, Postgres Professional + * Portions Copyright (c) 2015-2024, Postgres Professional * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group * *------------------------------------------------------------------------- @@ -51,7 +51,11 @@ #define CHECKARRVALID(x) \ do { \ - if (x) { \ + if (x == NULL) \ + ereport(ERROR, \ + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), \ + errmsg("array must not be NULL"))); \ + else if (x) { \ if (ARR_NDIM(x) != NDIM && ARR_NDIM(x) != 0) \ ereport(ERROR, \ (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), \ @@ -733,6 +737,7 @@ cmpAscArrayElem(const void *a, const void *b, void *arg) { FmgrInfo *cmpFunc = (FmgrInfo*)arg; + Assert(a && b); return DatumGetInt32(FunctionCall2Coll(cmpFunc, DEFAULT_COLLATION_OID, *(Datum*)a, *(Datum*)b)); } diff --git a/src/rum_ts_utils.c b/src/rum_ts_utils.c index b43ac3b0a3..d3b9c5478a 100644 --- a/src/rum_ts_utils.c +++ b/src/rum_ts_utils.c @@ -3,8 +3,8 @@ * rum_ts_utils.c * various text-search functions * - * Portions Copyright (c) 2015-2016, Postgres Professional - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 2015-2024, Postgres Professional + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group * *------------------------------------------------------------------------- */ @@ -34,6 +34,33 @@ #define TS_EXEC_PHRASE_NO_POS TS_EXEC_PHRASE_AS_AND #endif +#if PG_VERSION_NUM >= 130000 +/* Since v13 TS_execute flag naming and defaults have reverted: + * - before v13 - - since v13 - + * TS_EXEC_CALC_NOT (0x01) TS_EXEC_SKIP_NOT (0x01) + */ +#define TS_EXEC_CALC_NOT (0x01) /* Defined here for use with rum_TS_execute for + * compatibility with version < 13 where this + * flag was defined globally. 
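The flag mapping this comment goes on to describe is why the later calls into the core TS_execute() differ by server version: before v13 the caller opts in to NOT evaluation with TS_EXEC_CALC_NOT, while from v13 on NOT is evaluated by default and the caller opts out with TS_EXEC_SKIP_NOT. A sketch of that mapping; ts_exec_flags() is a hypothetical helper and the numeric values are simplified stand-ins:

#include <stdio.h>

#ifndef PG_VERSION_NUM
#define PG_VERSION_NUM 130000        /* pretend we build against v13 */
#endif

#if PG_VERSION_NUM >= 130000
#define TS_EXEC_SKIP_NOT 0x01        /* v13+: NOT is computed unless skipped */
#else
#define TS_EXEC_CALC_NOT 0x01        /* pre-13: NOT is skipped unless requested */
#endif

static unsigned
ts_exec_flags(int want_not_evaluated)
{
#if PG_VERSION_NUM >= 130000
    return want_not_evaluated ? 0 : TS_EXEC_SKIP_NOT;
#else
    return want_not_evaluated ? TS_EXEC_CALC_NOT : 0;
#endif
}

int main(void)
{
    printf("flags when NOT must be honoured: 0x%x\n", ts_exec_flags(1));
    printf("flags when NOT may be skipped:   0x%x\n", ts_exec_flags(0));
    return 0;
}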
+ * XXX Since v13 current global flag + * TS_EXEC_SKIP_NOT has reverted meaning for + * TS_execute but TS_EXEC_CALC_NOT should still + * be passed to rum_TS_execute in unchanged (previous) + * meaning but should not be passed into TS_execute: + * (TS_execute will do 'calc not' by default, and + * if you need skip it, use new TS_EXEC_SKIP_NOT) + */ +typedef TSTernaryValue RumTernaryValue; +#else +typedef enum +{ + TS_NO, /* definitely no match */ + TS_YES, /* definitely does match */ + TS_MAYBE /* can't verify match for lack of pos data */ +} RumTernaryValue; +#endif +typedef RumTernaryValue (*RumExecuteCallbackTernary) (void *arg, QueryOperand *val, ExecPhraseData *data); + PG_FUNCTION_INFO_V1(rum_extract_tsvector); PG_FUNCTION_INFO_V1(rum_extract_tsvector_hash); PG_FUNCTION_INFO_V1(rum_extract_tsquery); @@ -53,13 +80,27 @@ PG_FUNCTION_INFO_V1(rum_ts_join_pos); PG_FUNCTION_INFO_V1(tsquery_to_distance_query); -static int count_pos(char *ptr, int len); +static unsigned int count_pos(char *ptr, int len); static char *decompress_pos(char *ptr, WordEntryPos *pos); static Datum build_tsvector_entry(TSVector vector, WordEntry *we); static Datum build_tsvector_hash_entry(TSVector vector, WordEntry *we); static Datum build_tsquery_entry(TSQuery query, QueryOperand *operand); static Datum build_tsquery_hash_entry(TSQuery query, QueryOperand *operand); +static RumTernaryValue +rum_phrase_output(ExecPhraseData *data, ExecPhraseData *Ldata, ExecPhraseData *Rdata, + int emit, + int Loffset, + int Roffset, + int max_npos); +static RumTernaryValue +rum_phrase_execute(QueryItem *curitem, void *arg, uint32 flags, + RumExecuteCallbackTernary chkcond, + ExecPhraseData *data); +static RumTernaryValue +rum_TS_execute(QueryItem *curitem, void *arg, uint32 flags, + RumExecuteCallbackTernary chkcond); + typedef Datum (*TSVectorEntryBuilder)(TSVector vector, WordEntry *we); typedef Datum (*TSQueryEntryBuilder)(TSQuery query, QueryOperand *operand); @@ -131,7 +172,7 @@ typedef struct DocRepresentation *end; } Extention; -static float weights[] = {1.0/0.1f, 1.0/0.2f, 1.0/0.4f, 1.0/1.0f}; +static float weights[] = {1.0f/0.1f, 1.0f/0.2f, 1.0f/0.4f, 1.0f/1.0f}; /* A dummy WordEntryPos array to use when haspos is false */ static WordEntryPosVector POSNULL = { @@ -148,10 +189,20 @@ static WordEntryPosVector POSNULL = { #define RANK_NORM_RDIVRPLUS1 0x20 #define DEF_NORM_METHOD RANK_NO_NORM +/* + * Should not conflict with defines + * TS_EXEC_EMPTY/TS_EXEC_CALC_NOT/TS_EXEC_PHRASE_NO_POS + */ +#define TS_EXEC_IN_NEG 0x04 + #define QR_GET_OPERAND(q, v) \ (&((q)->operandData[ ((QueryItem*)(v)) - GETQUERY((q)->query) ])) +#if PG_VERSION_NUM >= 130000 +static TSTernaryValue +#else static bool +#endif pre_checkcondition_rum(void *checkval, QueryOperand *val, ExecPhraseData *data) { RumChkVal *gcv = (RumChkVal *) checkval; @@ -163,134 +214,696 @@ pre_checkcondition_rum(void *checkval, QueryOperand *val, ExecPhraseData *data) /* convert item's number to corresponding entry's (operand's) number */ j = gcv->map_item_operand[((QueryItem *) val) - gcv->first_item]; - /* return presence of current entry in indexed value */ + #if PG_VERSION_NUM >= 130000 + return ( *(gcv->need_recheck) ? TS_MAYBE : (gcv->check[j] ? 
TS_YES : TS_NO) ); + #else return gcv->check[j]; + #endif } Datum rum_tsquery_pre_consistent(PG_FUNCTION_ARGS) { bool *check = (bool *) PG_GETARG_POINTER(0); - TSQuery query = PG_GETARG_TSQUERY(2); - - Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); - bool recheck; + Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); + bool recheck = false; bool res = false; if (query->size > 0) { - QueryItem *item; RumChkVal gcv; /* * check-parameter array has one entry for each value (operand) in the * query. */ - gcv.first_item = item = GETQUERY(query); + gcv.first_item = GETQUERY(query); gcv.check = check; gcv.map_item_operand = (int *) (extra_data[0]); gcv.need_recheck = &recheck; res = TS_execute(GETQUERY(query), &gcv, - TS_EXEC_PHRASE_NO_POS, + TS_EXEC_PHRASE_NO_POS +#if PG_VERSION_NUM >= 130000 + | TS_EXEC_SKIP_NOT +#endif + , pre_checkcondition_rum); } - PG_RETURN_BOOL(res); } -static bool + +static RumTernaryValue checkcondition_rum(void *checkval, QueryOperand *val, ExecPhraseData *data) { RumChkVal *gcv = (RumChkVal *) checkval; int j; - /* if any val requiring a weight is used, set recheck flag */ - if (val->weight != 0) - *(gcv->need_recheck) = true; - /* convert item's number to corresponding entry's (operand's) number */ j = gcv->map_item_operand[((QueryItem *) val) - gcv->first_item]; - /* return presence of current entry in indexed value */ if (!gcv->check[j]) - return false; + /* lexeme not present in indexed value */ + return TS_NO; - /* - * Fill position list for phrase operator if it's needed end it exists - */ - if (data) + else if (gcv->addInfo && gcv->addInfoIsNull[j] == false) { - /* caller wants an array of positions (phrase search) */ + bytea *positions; + int32 i; + char *ptrt; + WordEntryPos post = 0; + int32 npos; + int32 k = 0; + /* + * we don't have positions in index because we store a timestamp in + * addInfo + */ if (gcv->recheckPhrase) { /* - * we don't have a positions because we store a timestamp in - * addInfo + * We cannot return TS_YES here (if "val->weight > 0"), because + * data->npos = 0 and we have incorrect porocessing of this result + * at the upper levels. So return TS_MAYBE. */ - *(gcv->need_recheck) = true; + return TS_MAYBE; } - else if (gcv->addInfo && gcv->addInfoIsNull[j] == false) + + positions = DatumGetByteaP(gcv->addInfo[j]); + ptrt = (char *) VARDATA_ANY(positions); + npos = count_pos(VARDATA_ANY(positions), + VARSIZE_ANY_EXHDR(positions)); + + /* caller wants an array of positions (phrase search) */ + if (data) { - bytea *positions; - int32 i; - char *ptrt; - WordEntryPos post; - - positions = DatumGetByteaP(gcv->addInfo[j]); - data->npos = count_pos(VARDATA_ANY(positions), - VARSIZE_ANY_EXHDR(positions)); - data->pos = palloc(sizeof(*data->pos) * data->npos); + data->pos = palloc(sizeof(*data->pos) * npos); data->allocated = true; - ptrt = (char *) VARDATA_ANY(positions); - post = 0; + /* Fill positions that has right weight to return to a caller */ + for (i = 0; i < npos; i++) + { + ptrt = decompress_pos(ptrt, &post); + + /* + * Weight mark is stored as 2 bits inside position mark in RUM + * index. We compare it to a list of requested positions in + * query operand (4 bits one for each weight mark). + */ + if ((val->weight == 0) || (val->weight >> WEP_GETWEIGHT(post)) & 1) + { + data->pos[k] = post; + k++; + } + } + data->npos = k; + data->pos = repalloc(data->pos, sizeof(*data->pos) * k); + return (k ? TS_YES : TS_NO); + } + + /* + * Not phrase search. 
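The branch above filters positions by weight: each WordEntryPos keeps its weight mark (D=0 through A=3) in its top two bits, while a query operand carries a four-bit mask of acceptable weights, so a position qualifies when bit WEP_GETWEIGHT(post) of val->weight is set, or when the mask is zero, meaning no restriction. A self-contained sketch with simplified stand-ins for the PostgreSQL macros:

/* Illustrative stand-ins for WordEntryPos weight handling; not RUM code. */
#include <stdio.h>
#include <stdint.h>

typedef uint16_t WordEntryPos;            /* 14 bits position + 2 bits weight */
#define WEP_GETWEIGHT(p)   ((p) >> 14)    /* 0 = D, 1 = C, 2 = B, 3 = A */
#define WEP_GETPOS(p)      ((p) & 0x3fff)

/* query weight mask: bit 0 = D ... bit 3 = A; 0 means "any weight" */
static int
position_matches(WordEntryPos post, uint8_t query_weight_mask)
{
    return query_weight_mask == 0 ||
           ((query_weight_mask >> WEP_GETWEIGHT(post)) & 1);
}

int main(void)
{
    WordEntryPos p = (3u << 14) | 7;      /* position 7, weight A */
    printf("matches ':A'  -> %d\n", position_matches(p, 1 << 3));        /* 1 */
    printf("matches ':BD' -> %d\n", position_matches(p, (1 << 2) | 1));  /* 0 */
    return 0;
}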
We only need to know if there's at least one + * position with right weight then return TS_YES, otherwise return + * TS_NO. For this search work without recheck we need that any + * negation in recursion will give TS_MAYBE and initiate recheck as + * "!word:A" can mean both: "word:BCÐ’" or "!word" + */ + else if (val->weight == 0) + /* Query without weights */ + return TS_YES; + else + { + char KeyWeightsMask = 0; - for (i = 0; i < data->npos; i++) + /* Fill KeyWeightMask contains with weights from all positions */ + for (i = 0; i < npos; i++) { ptrt = decompress_pos(ptrt, &post); - data->pos[i] = post; + KeyWeightsMask |= 1 << WEP_GETWEIGHT(post); + } + return ((KeyWeightsMask & val->weight) ? TS_YES : TS_NO); + } + } +/* Should never come here */ + return TS_MAYBE; +} + +/* + * Compute output position list for a tsquery operator in phrase mode. + * + * Merge the position lists in Ldata and Rdata as specified by "emit", + * returning the result list into *data. The input position lists must be + * sorted and unique, and the output will be as well. + * + * data: pointer to initially-all-zeroes output struct, or NULL + * Ldata, Rdata: input position lists + * emit: bitmask of TSPO_XXX flags + * Loffset: offset to be added to Ldata positions before comparing/outputting + * Roffset: offset to be added to Rdata positions before comparing/outputting + * max_npos: maximum possible required size of output position array + * + * Loffset and Roffset should not be negative, else we risk trying to output + * negative positions, which won't fit into WordEntryPos. + * + * The result is boolean (TS_YES or TS_NO), but for the caller's convenience + * we return it as RumTernaryValue. + * + * Returns TS_YES if any positions were emitted to *data; or if data is NULL, + * returns TS_YES if any positions would have been emitted. + */ +#define TSPO_L_ONLY 0x01 /* emit positions appearing only in L */ +#define TSPO_R_ONLY 0x02 /* emit positions appearing only in R */ +#define TSPO_BOTH 0x04 /* emit positions appearing in both L&R */ + +static RumTernaryValue +rum_phrase_output(ExecPhraseData *data, + ExecPhraseData *Ldata, + ExecPhraseData *Rdata, + int emit, + int Loffset, + int Roffset, + int max_npos) +{ + int Lindex, + Rindex; + + /* Loop until both inputs are exhausted */ + Lindex = Rindex = 0; + while (Lindex < Ldata->npos || Rindex < Rdata->npos) + { + int Lpos, + Rpos; + int output_pos = 0; + + /* + * Fetch current values to compare. WEP_GETPOS() is needed because + * ExecPhraseData->data can point to a tsvector's WordEntryPosVector. + */ + if (Lindex < Ldata->npos) + Lpos = WEP_GETPOS(Ldata->pos[Lindex]) + Loffset; + else + { + /* L array exhausted, so we're done if R_ONLY isn't set */ + if (!(emit & TSPO_R_ONLY)) + break; + Lpos = INT_MAX; + } + if (Rindex < Rdata->npos) + Rpos = WEP_GETPOS(Rdata->pos[Rindex]) + Roffset; + else + { + /* R array exhausted, so we're done if L_ONLY isn't set */ + if (!(emit & TSPO_L_ONLY)) + break; + Rpos = INT_MAX; + } + + /* Merge-join the two input lists */ + if (Lpos < Rpos) + { + /* Lpos is not matched in Rdata, should we output it? */ + if (emit & TSPO_L_ONLY) + output_pos = Lpos; + Lindex++; + } + else if (Lpos == Rpos) + { + /* Lpos and Rpos match ... should we output it? */ + if (emit & TSPO_BOTH) + output_pos = Rpos; + Lindex++; + Rindex++; + } + else /* Lpos > Rpos */ + { + /* Rpos is not matched in Ldata, should we output it? 
*/ + if (emit & TSPO_R_ONLY) + output_pos = Rpos; + Rindex++; + } + + if (output_pos > 0) + { + if (data) + { + /* Store position, first allocating output array if needed */ + if (data->pos == NULL) + { + data->pos = (WordEntryPos *) + palloc(max_npos * sizeof(WordEntryPos)); + data->allocated = true; + } + data->pos[data->npos++] = output_pos; + } + else + { + /* + * Exact positions not needed, so return TS_YES as soon as we + * know there is at least one. + */ + return TS_YES; } } } - return true; + if (data && data->npos > 0) + { + /* Let's assert we didn't overrun the array */ + Assert(data->npos <= max_npos); + return TS_YES; + } + return TS_NO; +} + +/* + * Execute tsquery at or below an OP_PHRASE operator. + * + * This handles tsquery execution at recursion levels where we need to care + * about match locations. + * + * In addition to the same arguments used for TS_execute, the caller may pass + * a preinitialized-to-zeroes ExecPhraseData struct, to be filled with lexeme + * match position info on success. data == NULL if no position data need be + * returned. (In practice, outside callers pass NULL, and only the internal + * recursion cases pass a data pointer.) + * Note: the function assumes data != NULL for operators other than OP_PHRASE. + * This is OK because an outside call always starts from an OP_PHRASE node. + * + * The detailed semantics of the match data, given that the function returned + * TS_YES (successful match), are: + * + * npos > 0, negate = false: + * query is matched at specified position(s) (and only those positions) + * npos > 0, negate = true: + * query is matched at all positions *except* specified position(s) + * npos = 0, negate = true: + * query is matched at all positions + * npos = 0, negate = false: + * disallowed (this should result in TS_NO or TS_MAYBE, as appropriate) + * + * Successful matches also return a "width" value which is the match width in + * lexemes, less one. Hence, "width" is zero for simple one-lexeme matches, + * and is the sum of the phrase operator distances for phrase matches. Note + * that when width > 0, the listed positions represent the ends of matches not + * the starts. (This unintuitive rule is needed to avoid possibly generating + * negative positions, which wouldn't fit into the WordEntryPos arrays.) + * + * If the RumExecuteCallback function reports that an operand is present + * but fails to provide position(s) for it, we will return TS_MAYBE when + * it is possible but not certain that the query is matched. + * + * When the function returns TS_NO or TS_MAYBE, it must return npos = 0, + * negate = false (which is the state initialized by the caller); but the + * "width" output in such cases is undefined. + */ +static RumTernaryValue +rum_phrase_execute(QueryItem *curitem, void *arg, uint32 flags, + RumExecuteCallbackTernary chkcond, + ExecPhraseData *data) +{ + ExecPhraseData Ldata, + Rdata; + RumTernaryValue lmatch, + rmatch; + int Loffset, + Roffset, + maxwidth; + + /* since this function recurses, it could be driven to stack overflow */ + check_stack_depth(); + + if (curitem->type == QI_VAL) + return (chkcond(arg, (QueryOperand *) curitem, data)); + + switch (curitem->qoperator.oper) + { + case OP_NOT: + + /* + * We need not touch data->width, since a NOT operation does not + * change the match width. 
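rum_phrase_output() above is a plain merge-join over two sorted position lists, with the TSPO_* bits selecting which side of the join is emitted. A stand-alone model of the same loop on bare integer arrays (offsets omitted, names hypothetical):

/* Minimal model of the TSPO_* merge; not the RUM implementation itself. */
#include <stdio.h>
#include <limits.h>

#define TSPO_L_ONLY 0x01
#define TSPO_R_ONLY 0x02
#define TSPO_BOTH   0x04

static int
merge_positions(const int *L, int nL, const int *R, int nR, int emit, int *out)
{
    int i = 0, j = 0, n = 0;

    while (i < nL || j < nR)
    {
        int lp = (i < nL) ? L[i] : INT_MAX;
        int rp = (j < nR) ? R[j] : INT_MAX;

        if (lp < rp)       { if (emit & TSPO_L_ONLY) out[n++] = lp; i++; }
        else if (lp == rp) { if (emit & TSPO_BOTH)   out[n++] = lp; i++; j++; }
        else               { if (emit & TSPO_R_ONLY) out[n++] = rp; j++; }
    }
    return n;
}

int main(void)
{
    int L[] = {2, 5, 9}, R[] = {5, 9, 11}, out[6];
    int n = merge_positions(L, 3, R, 3, TSPO_BOTH, out);   /* straight AND */

    for (int k = 0; k < n; k++)
        printf("%d ", out[k]);                             /* prints: 5 9 */
    printf("\n");
    return 0;
}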
+ */ + if (!(flags & TS_EXEC_CALC_NOT)) + { + /* without CALC_NOT, report NOT as "match everywhere" */ + Assert(data->npos == 0 && !data->negate); + data->negate = true; + return TS_YES; + } + switch (rum_phrase_execute(curitem + 1, arg, flags, chkcond, data)) + { + case TS_NO: + /* change "match nowhere" to "match everywhere" */ + Assert(data->npos == 0 && !data->negate); + data->negate = true; + return TS_YES; + case TS_YES: + if (data->npos > 0) + { + /* we have some positions, invert negate flag */ + data->negate = !data->negate; + return TS_YES; + } + else if (data->negate) + { + /* change "match everywhere" to "match nowhere" */ + data->negate = false; + return TS_NO; + } + /* Should not get here if result was TS_YES */ + Assert(false); + break; + case TS_MAYBE: + /* match positions are, and remain, uncertain */ + return TS_MAYBE; + } + break; + + case OP_PHRASE: + case OP_AND: + memset(&Ldata, 0, sizeof(Ldata)); + memset(&Rdata, 0, sizeof(Rdata)); + + lmatch = rum_phrase_execute(curitem + curitem->qoperator.left, + arg, flags, chkcond, &Ldata); + if (lmatch == TS_NO) + return TS_NO; + + rmatch = rum_phrase_execute(curitem + 1, + arg, flags, chkcond, &Rdata); + if (rmatch == TS_NO) + return TS_NO; + + /* + * If either operand has no position information, then we can't + * return reliable position data, only a MAYBE result. + */ + if (lmatch == TS_MAYBE || rmatch == TS_MAYBE) + return TS_MAYBE; + + if (curitem->qoperator.oper == OP_PHRASE) + { + /* In case of index where position is not available + * (e.g. addon_ops) output TS_MAYBE even in case both + * lmatch and rmatch are TS_YES. Otherwise we can lose + * results of phrase queries. + */ + if (flags & TS_EXEC_PHRASE_NO_POS) + return TS_MAYBE; + + /* + * Compute Loffset and Roffset suitable for phrase match, and + * compute overall width of whole phrase match. + */ + Loffset = curitem->qoperator.distance + Rdata.width; + Roffset = 0; + if (data) + data->width = curitem->qoperator.distance + + Ldata.width + Rdata.width; + } + else + { + /* + * For OP_AND, set output width and alignment like OP_OR (see + * comment below) + */ + maxwidth = Max(Ldata.width, Rdata.width); + Loffset = maxwidth - Ldata.width; + Roffset = maxwidth - Rdata.width; + if (data) + data->width = maxwidth; + } + + if (Ldata.negate && Rdata.negate) + { + /* !L & !R: treat as !(L | R) */ + (void) rum_phrase_output(data, &Ldata, &Rdata, + TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY, + Loffset, Roffset, + Ldata.npos + Rdata.npos); + if (data) + data->negate = true; + return TS_YES; + } + else if (Ldata.negate) + { + /* !L & R */ + return rum_phrase_output(data, &Ldata, &Rdata, + TSPO_R_ONLY, + Loffset, Roffset, + Rdata.npos); + } + else if (Rdata.negate) + { + /* L & !R */ + return rum_phrase_output(data, &Ldata, &Rdata, + TSPO_L_ONLY, + Loffset, Roffset, + Ldata.npos); + } + else + { + /* straight AND */ + return rum_phrase_output(data, &Ldata, &Rdata, + TSPO_BOTH, + Loffset, Roffset, + Min(Ldata.npos, Rdata.npos)); + } + + case OP_OR: + memset(&Ldata, 0, sizeof(Ldata)); + memset(&Rdata, 0, sizeof(Rdata)); + + lmatch = rum_phrase_execute(curitem + curitem->qoperator.left, + arg, flags, chkcond, &Ldata); + rmatch = rum_phrase_execute(curitem + 1, + arg, flags, chkcond, &Rdata); + + if (lmatch == TS_NO && rmatch == TS_NO) + return TS_NO; + + /* + * If either operand has no position information, then we can't + * return reliable position data, only a MAYBE result. 
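For OP_PHRASE the offsets computed above make the reported positions point at the end of the match: the left operand's positions are shifted right by the operator distance plus the right operand's width before the two lists are intersected. A small worked example, assuming a document 'a b c' and the query 'a <2> c':

#include <stdio.h>

int main(void)
{
    int left_pos  = 1;   /* "a" occurs at position 1 */
    int right_pos = 3;   /* "c" occurs at position 3 */
    int distance  = 2;   /* the <2> operator */
    int l_width   = 0;   /* both operands are single lexemes */
    int r_width   = 0;

    int l_offset = distance + r_width;            /* as in rum_phrase_execute() */
    int match    = (left_pos + l_offset == right_pos);
    int width    = distance + l_width + r_width;  /* match spans width+1 lexemes */

    printf("match=%d end_position=%d width=%d\n", match, right_pos, width);
    return 0;
}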
+ */ + if (lmatch == TS_MAYBE || rmatch == TS_MAYBE) + return TS_MAYBE; + + /* + * Cope with undefined output width from failed submatch. (This + * takes less code than trying to ensure that all failure returns + * et data->width to zero.) + */ + if (lmatch == TS_NO) + Ldata.width = 0; + if (rmatch == TS_NO) + Rdata.width = 0; + + /* + * For OP_AND and OP_OR, report the width of the wider of the two + * inputs, and align the narrower input's positions to the right + * end of that width. This rule deals at least somewhat + * reasonably with cases like "x <-> (y | z <-> q)". + */ + maxwidth = Max(Ldata.width, Rdata.width); + Loffset = maxwidth - Ldata.width; + Roffset = maxwidth - Rdata.width; + data->width = maxwidth; + + if (Ldata.negate && Rdata.negate) + { + /* !L | !R: treat as !(L & R) */ + (void) rum_phrase_output(data, &Ldata, &Rdata, + TSPO_BOTH, + Loffset, Roffset, + Min(Ldata.npos, Rdata.npos)); + data->negate = true; + return TS_YES; + } + else if (Ldata.negate) + { + /* !L | R: treat as !(L & !R) */ + (void) rum_phrase_output(data, &Ldata, &Rdata, + TSPO_L_ONLY, + Loffset, Roffset, + Ldata.npos); + data->negate = true; + return TS_YES; + } + else if (Rdata.negate) + { + /* L | !R: treat as !(!L & R) */ + (void) rum_phrase_output(data, &Ldata, &Rdata, + TSPO_R_ONLY, + Loffset, Roffset, + Rdata.npos); + data->negate = true; + return TS_YES; + } + else + { + /* straight OR */ + return rum_phrase_output(data, &Ldata, &Rdata, + TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY, + Loffset, Roffset, + Ldata.npos + Rdata.npos); + } + + default: + elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper); + } + + /* not reachable, but keep compiler quiet */ + return TS_NO; +} + +/* + * Evaluates tsquery boolean expression. It is similar to adt/tsvector_op.c + * TS_execute_recurse() but in most cases when ! operator is used it should set + * TS_MAYBE to recheck. The reason is that inside negation we can have one or several + * operands with weights (which we can not easily know) and negative of them is not + * precisely defined i.e. "!word:A" can mean "word:BCD" or "!word" (the same applies to + * logical combination of them). One easily only case we can avoid recheck is when before negation there + * is QI_VAL which doesn't have weight. 
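The exception mentioned above, a bare QI_VAL with no weight restriction, is what the weight != 15 test in rum_TS_execute() below checks for: 15 is the four weight bits D, C, B and A all set, which restricts nothing, so such a negation can still be evaluated exactly from the index. A sketch of that decision with stand-in values:

/* Sketch of the "can we evaluate NOT exactly?" test; values are stand-ins. */
#include <stdio.h>
#include <stdint.h>

#define WEIGHT_MASK_ALL 0x0F   /* bits D,C,B,A all set: no real restriction */

static int
negation_needs_recheck(uint8_t operand_weight_mask)
{
    /* '!word' and '!word:ABCD' are exact; '!word:A' is not, because the
     * index entry alone cannot tell which weights its positions carry. */
    return operand_weight_mask != 0 && operand_weight_mask != WEIGHT_MASK_ALL;
}

int main(void)
{
    printf("!word      -> recheck=%d\n", negation_needs_recheck(0x00)); /* 0 */
    printf("!word:A    -> recheck=%d\n", negation_needs_recheck(0x08)); /* 1 */
    printf("!word:ABCD -> recheck=%d\n", negation_needs_recheck(0x0F)); /* 0 */
    return 0;
}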
+ * + * curitem: current tsquery item (initially, the first one) + * arg: opaque value to pass through to callback function + * flags: bitmask of flag bits shown in ts_utils.h + * chkcond: callback function to check whether a primitive value is present + */ + +static RumTernaryValue +rum_TS_execute(QueryItem *curitem, void *arg, uint32 flags, + RumExecuteCallbackTernary chkcond) +{ + RumTernaryValue lmatch; + /* since this function recurses, it could be driven to stack overflow */ + check_stack_depth(); + + if (curitem->type == QI_VAL) + { + if ((flags & TS_EXEC_IN_NEG) && curitem->qoperand.weight && + curitem->qoperand.weight != 15) + return TS_MAYBE; + else + return chkcond(arg, (QueryOperand *) curitem, NULL); + } + + switch (curitem->qoperator.oper) + { + case OP_NOT: + if (!(flags & TS_EXEC_CALC_NOT)) + return TS_YES; + switch (rum_TS_execute(curitem + 1, arg, flags | TS_EXEC_IN_NEG, chkcond)) + { + case TS_NO: + return TS_YES; + case TS_YES: + return TS_NO; + case TS_MAYBE: + return TS_MAYBE; + } + break; + + case OP_AND: + lmatch = rum_TS_execute(curitem + curitem->qoperator.left, arg, + flags, chkcond); + if (lmatch == TS_NO) + return TS_NO; + switch (rum_TS_execute(curitem + 1, arg, flags, chkcond)) + { + case TS_NO: + return TS_NO; + case TS_YES: + return lmatch; + case TS_MAYBE: + return TS_MAYBE; + } + break; + + case OP_OR: + lmatch = rum_TS_execute(curitem + curitem->qoperator.left, arg, + flags, chkcond); + if (lmatch == TS_YES) + return TS_YES; + switch (rum_TS_execute(curitem + 1, arg, flags, chkcond)) + { + case TS_NO: + return lmatch; + case TS_YES: + return TS_YES; + case TS_MAYBE: + return TS_MAYBE; + } + break; + + case OP_PHRASE: + + /* + * If we get a MAYBE result, and the caller doesn't want that, + * convert it to NO. It would be more consistent, perhaps, to + * return the result of TS_phrase_execute() verbatim and then + * convert MAYBE results at the top of the recursion. But + * converting at the topmost phrase operator gives results that + * are bug-compatible with the old implementation, so do it like + * this for now. + * + * Checking for TS_EXEC_PHRASE_NO_POS has been moved inside + * rum_phrase_execute, otherwise we can lose results of phrase + * operator when position information is not available in index + * (e.g. index built with addon_ops) + */ + switch (rum_phrase_execute(curitem, arg, flags, chkcond, NULL)) + { + case TS_NO: + return TS_NO; + case TS_YES: + return TS_YES; + case TS_MAYBE: + return (flags & TS_EXEC_PHRASE_NO_POS) ? TS_MAYBE : TS_NO; + } + break; + + default: + elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper); + } + + /* not reachable, but keep compiler quiet */ + return TS_NO; } Datum rum_tsquery_consistent(PG_FUNCTION_ARGS) { bool *check = (bool *) PG_GETARG_POINTER(0); - /* StrategyNumber strategy = PG_GETARG_UINT16(1); */ TSQuery query = PG_GETARG_TSQUERY(2); - /* int32 nkeys = PG_GETARG_INT32(3); */ - Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); + Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); bool *recheck = (bool *) PG_GETARG_POINTER(5); Datum *addInfo = (Datum *) PG_GETARG_POINTER(8); bool *addInfoIsNull = (bool *) PG_GETARG_POINTER(9); - bool res = false; + + RumTernaryValue res = TS_NO; /* - * The query requires recheck only if it involves weights + * The query doesn't require recheck by default */ *recheck = false; if (query->size > 0) { - QueryItem *item; RumChkVal gcv; /* * check-parameter array has one entry for each value (operand) in the * query. 
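The OP_AND/OP_OR arms of rum_TS_execute() above combine operand results in three-valued logic, with TS_MAYBE propagating upward until rum_tsquery_consistent() turns it into *recheck = true. A stand-alone model of those combination rules, not the RUM code itself:

#include <stdio.h>

typedef enum { TS_NO, TS_YES, TS_MAYBE } Ternary;

static Ternary
ternary_and(Ternary l, Ternary r)
{
    if (l == TS_NO || r == TS_NO)           return TS_NO;
    if (l == TS_MAYBE || r == TS_MAYBE)     return TS_MAYBE;
    return TS_YES;
}

static Ternary
ternary_or(Ternary l, Ternary r)
{
    if (l == TS_YES || r == TS_YES)         return TS_YES;
    if (l == TS_MAYBE || r == TS_MAYBE)     return TS_MAYBE;
    return TS_NO;
}

int main(void)
{
    /* TS_MAYBE survives AND with YES and OR with NO, and forces a recheck */
    printf("YES AND MAYBE = %d (MAYBE)\n", ternary_and(TS_YES, TS_MAYBE));
    printf("NO  OR  MAYBE = %d (MAYBE)\n", ternary_or(TS_NO, TS_MAYBE));
    printf("YES OR  MAYBE = %d (YES)\n",  ternary_or(TS_YES, TS_MAYBE));
    return 0;
}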
*/ - gcv.first_item = item = GETQUERY(query); + gcv.first_item = GETQUERY(query); gcv.check = check; gcv.map_item_operand = (int *) (extra_data[0]); gcv.need_recheck = recheck; @@ -298,28 +911,27 @@ rum_tsquery_consistent(PG_FUNCTION_ARGS) gcv.addInfoIsNull = addInfoIsNull; gcv.recheckPhrase = false; - res = TS_execute(GETQUERY(query), &gcv, - TS_EXEC_CALC_NOT, - checkcondition_rum); + res = rum_TS_execute(GETQUERY(query), &gcv, + TS_EXEC_CALC_NOT, + checkcondition_rum); + if (res == TS_MAYBE) + *recheck = true; } - PG_RETURN_BOOL(res); -} +} Datum rum_tsquery_timestamp_consistent(PG_FUNCTION_ARGS) { bool *check = (bool *) PG_GETARG_POINTER(0); - /* StrategyNumber strategy = PG_GETARG_UINT16(1); */ TSQuery query = PG_GETARG_TSQUERY(2); - /* int32 nkeys = PG_GETARG_INT32(3); */ - Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); + Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); bool *recheck = (bool *) PG_GETARG_POINTER(5); Datum *addInfo = (Datum *) PG_GETARG_POINTER(8); bool *addInfoIsNull = (bool *) PG_GETARG_POINTER(9); - bool res = false; + RumTernaryValue res = TS_NO; /* * The query requires recheck only if it involves weights @@ -328,14 +940,13 @@ rum_tsquery_timestamp_consistent(PG_FUNCTION_ARGS) if (query->size > 0) { - QueryItem *item; RumChkVal gcv; /* * check-parameter array has one entry for each value (operand) in the * query. */ - gcv.first_item = item = GETQUERY(query); + gcv.first_item = GETQUERY(query); gcv.check = check; gcv.map_item_operand = (int *) (extra_data[0]); gcv.need_recheck = recheck; @@ -343,18 +954,19 @@ rum_tsquery_timestamp_consistent(PG_FUNCTION_ARGS) gcv.addInfoIsNull = addInfoIsNull; gcv.recheckPhrase = true; - res = TS_execute(GETQUERY(query), &gcv, - TS_EXEC_CALC_NOT | TS_EXEC_PHRASE_NO_POS, - checkcondition_rum); + res = rum_TS_execute(GETQUERY(query), &gcv, + TS_EXEC_CALC_NOT | TS_EXEC_PHRASE_NO_POS, + checkcondition_rum); + if (res == TS_MAYBE) + *recheck = true; } - PG_RETURN_BOOL(res); } #define SIXTHBIT 0x20 #define LOWERMASK 0x1F -static int +static unsigned int compress_pos(char *target, WordEntryPos *pos, int npos) { int i; @@ -406,6 +1018,7 @@ decompress_pos(char *ptr, WordEntryPos *pos) else { delta |= (v & LOWERMASK) << i; + Assert(delta <= 0x3fff); *pos += delta; WEP_SETWEIGHT(*pos, v >> 5); return ptr; @@ -414,7 +1027,7 @@ decompress_pos(char *ptr, WordEntryPos *pos) } } -static int +static unsigned int count_pos(char *ptr, int len) { int count = 0, @@ -425,6 +1038,7 @@ count_pos(char *ptr, int len) if (!(ptr[i] & HIGHBIT)) count++; } + Assert((ptr[i-1] & HIGHBIT) == 0); return count; } @@ -834,7 +1448,16 @@ compareDocR(const void *va, const void *vb) return (a->pos > b->pos) ? 1 : -1; } -static bool +/* + * Be carefull: clang 11+ is very sensitive to casting function + * with different return value. + */ +static +#if PG_VERSION_NUM >= 130000 +TSTernaryValue +#else +bool +#endif checkcondition_QueryOperand(void *checkval, QueryOperand *val, ExecPhraseData *data) { @@ -855,7 +1478,11 @@ checkcondition_QueryOperand(void *checkval, QueryOperand *val, data->allocated = false; } - return qro->operandexist; + return qro->operandexist +#if PG_VERSION_NUM >= 130000 + ? 
TS_YES : TS_NO +#endif + ; } static bool @@ -873,7 +1500,7 @@ Cover(DocRepresentation *doc, uint32 len, QueryRepresentation *qr, memset(qr->operandData, 0, sizeof(qr->operandData[0]) * qr->length); - ext->p = 0x7fffffff; + ext->p = PG_INT32_MAX; ext->q = 0; ptr = doc + ext->pos; @@ -900,8 +1527,12 @@ Cover(DocRepresentation *doc, uint32 len, QueryRepresentation *qr, } } - - if (TS_execute(GETQUERY(qr->query), (void *) qr, TS_EXEC_EMPTY, + if (TS_execute(GETQUERY(qr->query), (void *) qr, +#if PG_VERSION_NUM >= 130000 + TS_EXEC_SKIP_NOT, +#else + TS_EXEC_EMPTY, +#endif checkcondition_QueryOperand)) { if (ptr->pos > ext->q) @@ -942,7 +1573,12 @@ Cover(DocRepresentation *doc, uint32 len, QueryRepresentation *qr, WEP_SETWEIGHT(qro->pos, ptr->wclass); } } - if (TS_execute(GETQUERY(qr->query), (void *) qr, TS_EXEC_CALC_NOT, + if (TS_execute(GETQUERY(qr->query), (void *) qr, +#if PG_VERSION_NUM >= 130000 + TS_EXEC_EMPTY, +#else + TS_EXEC_CALC_NOT, +#endif checkcondition_QueryOperand)) { if (ptr->pos < ext->p) @@ -1242,7 +1878,7 @@ calc_score_docr(float4 *arrdata, DocRepresentation *doc, uint32 doclen, int new_cover_key = 0; int nitems = 0; - while (ptr <= ext.end) + while (ptr && ptr <= ext.end) { InvSum += arrdata[ptr->wclass]; /* SK: Quick and dirty hash key. Hope collisions will be not too frequent. */ @@ -1642,7 +2278,8 @@ rum_ts_join_pos(PG_FUNCTION_ARGS) count2 = count_pos(in2, VARSIZE_ANY_EXHDR(addInfo2)), countRes = 0; int i1 = 0, i2 = 0; - Size size; + Size size, + size_compressed; WordEntryPos pos1 = 0, pos2 = 0, *pos; @@ -1654,61 +2291,71 @@ rum_ts_join_pos(PG_FUNCTION_ARGS) in1 = decompress_pos(in1, &pos1); in2 = decompress_pos(in2, &pos2); - while(i1 < count1 && i2 < count2) + for(;;) { if (WEP_GETPOS(pos1) > WEP_GETPOS(pos2)) { pos[countRes++] = pos2; - if (i2 < count2) - in2 = decompress_pos(in2, &pos2); i2++; + if (i2 >= count2) + break; + in2 = decompress_pos(in2, &pos2); } else if (WEP_GETPOS(pos1) < WEP_GETPOS(pos2)) { pos[countRes++] = pos1; - if (i1 < count1) - in1 = decompress_pos(in1, &pos1); i1++; + if (i1 >= count1) + break; + in1 = decompress_pos(in1, &pos1); } else { pos[countRes++] = pos1; + i1++; + i2++; if (i1 < count1) in1 = decompress_pos(in1, &pos1); if (i2 < count2) in2 = decompress_pos(in2, &pos2); - i1++; - i2++; + if (i2 >= count2 || i1 >= count1) + break; } } - while(i1 < count1) - { - pos[countRes++] = pos1; - if (i1 < count1) + if (i1 < count1) + for(;;) + { + pos[countRes++] = pos1; + i1++; + if (i1 >= count1) + break; in1 = decompress_pos(in1, &pos1); - i1++; - } - - while(i2 < count2) + } + else if (i2 < count2) { - pos[countRes++] = pos2; - if (i2 < count2) + for(;;) + { + pos[countRes++] = pos2; + i2++; + if (i2 >= count2) + break; in2 = decompress_pos(in2, &pos2); - i2++; + } } - Assert(countRes <= (count1 + count2)); + Assert(countRes <= count1 + count2); /* * In some cases compressed positions may take more memory than * uncompressed positions. So allocate memory with a margin. 
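The margin mentioned in that comment exists because the delta encoding written by compress_pos() can be larger than the raw positions it replaces: deltas are emitted 7 bits per continuation byte, and the final byte keeps only 5 delta bits, with the next two bits carrying the weight mark and the top bit left clear as the terminator, so a 14-bit delta costs 3 bytes against the 2-byte WordEntryPos it came from. A rough stand-alone model of that size calculation (constants are stand-ins):

#include <stdio.h>

static int
bytes_for_delta(unsigned delta)
{
    int nbytes = 1;

    while (delta >= 0x20)        /* final byte can hold only 5 delta bits */
    {
        delta >>= 7;             /* each continuation byte stores 7 bits */
        nbytes++;
    }
    return nbytes;
}

int main(void)
{
    /* A 14-bit delta needs 3 bytes, more than the 2-byte WordEntryPos it
     * replaces, which is why rum_ts_join_pos() allocates with a margin. */
    printf("delta 31    -> %d byte(s)\n", bytes_for_delta(31));     /* 1 */
    printf("delta 4095  -> %d byte(s)\n", bytes_for_delta(4095));   /* 2 */
    printf("delta 16383 -> %d byte(s)\n", bytes_for_delta(16383));  /* 3 */
    return 0;
}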
*/ size = VARHDRSZ + 2 * sizeof(WordEntryPos) * countRes; - result = palloc(size); + result = palloc0(size); - size = compress_pos(result->vl_dat, pos, countRes) + VARHDRSZ; - SET_VARSIZE(result, size); + size_compressed = compress_pos(result->vl_dat, pos, countRes) + VARHDRSZ; + Assert(size >= size_compressed); + SET_VARSIZE(result, size_compressed); PG_RETURN_BYTEA_P(result); } diff --git a/src/rumbtree.c b/src/rumbtree.c index f6244f5308..dfe2f10c30 100644 --- a/src/rumbtree.c +++ b/src/rumbtree.c @@ -4,7 +4,7 @@ * page utilities routines for the postgres inverted index access method. * * - * Portions Copyright (c) 2015-2016, Postgres Professional + * Portions Copyright (c) 2015-2024, Postgres Professional * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -527,7 +527,7 @@ rumInsertValue(Relation index, RumBtree btree, RumBtreeStack * stack, else { BlockNumber rightrightBlkno = InvalidBlockNumber; - Buffer rightrightBuffer; + Buffer rightrightBuffer = InvalidBuffer; /* split non-root page */ if (btree->rumstate->isBuild) diff --git a/src/rumbulk.c b/src/rumbulk.c index 6e08056c55..7a03bf64b4 100644 --- a/src/rumbulk.c +++ b/src/rumbulk.c @@ -4,7 +4,7 @@ * routines for fast build of inverted index * * - * Portions Copyright (c) 2015-2016, Postgres Professional + * Portions Copyright (c) 2015-2024, Postgres Professional * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -14,17 +14,24 @@ #include "postgres.h" #include "utils/datum.h" -#include "utils/memutils.h" #include "rum.h" #define DEF_NENTRY 2048 /* RumEntryAccumulator allocation quantum */ #define DEF_NPTR 5 /* ItemPointer initial allocation quantum */ +/* PostgreSQL pre 10 has different names for this functions */ +#if PG_VERSION_NUM <= 100006 || PG_VERSION_NUM == 110000 +#define rbt_create(node_size, comparator, combiner, allocfunc, freefunc, arg) \ + (rb_create(node_size, comparator, combiner, allocfunc, freefunc, arg)) +#define rbt_insert(rbt, data, isNew) \ + (rb_insert(rbt, data, isNew)) +#endif + /* Combiner function for rbtree.c */ static void -rumCombineData(RBNode *existing, const RBNode *newdata, void *arg) +rumCombineData(RBTNode *existing, const RBTNode *newdata, void *arg) { RumEntryAccumulator *eo = (RumEntryAccumulator *) existing; const RumEntryAccumulator *en = (const RumEntryAccumulator *) newdata; @@ -65,7 +72,7 @@ rumCombineData(RBNode *existing, const RBNode *newdata, void *arg) /* Comparator function for rbtree.c */ static int -cmpEntryAccumulator(const RBNode *a, const RBNode *b, void *arg) +cmpEntryAccumulator(const RBTNode *a, const RBTNode *b, void *arg) { const RumEntryAccumulator *ea = (const RumEntryAccumulator *) a; const RumEntryAccumulator *eb = (const RumEntryAccumulator *) b; @@ -77,7 +84,7 @@ cmpEntryAccumulator(const RBNode *a, const RBNode *b, void *arg) } /* Allocator function for rbtree.c */ -static RBNode * +static RBTNode * rumAllocEntryAccumulator(void *arg) { BuildAccumulator *accum = (BuildAccumulator *) arg; @@ -85,7 +92,7 @@ rumAllocEntryAccumulator(void *arg) /* * Allocate memory by rather big chunks to decrease overhead. We have no - * need to reclaim RBNodes individually, so this costs nothing. + * need to reclaim RBTNodes individually, so this costs nothing. 
*/ if (accum->entryallocator == NULL || accum->eas_used >= DEF_NENTRY) { @@ -94,11 +101,11 @@ rumAllocEntryAccumulator(void *arg) accum->eas_used = 0; } - /* Allocate new RBNode from current chunk */ + /* Allocate new RBTNode from current chunk */ ea = accum->entryallocator + accum->eas_used; accum->eas_used++; - return (RBNode *) ea; + return (RBTNode *) ea; } void @@ -108,12 +115,12 @@ rumInitBA(BuildAccumulator *accum) accum->allocatedMemory = 0; accum->entryallocator = NULL; accum->eas_used = 0; - accum->tree = rb_create(sizeof(RumEntryAccumulator), - cmpEntryAccumulator, - rumCombineData, - rumAllocEntryAccumulator, - NULL, /* no freefunc needed */ - (void *) accum); + accum->tree = rbt_create(sizeof(RumEntryAccumulator), + cmpEntryAccumulator, + rumCombineData, + rumAllocEntryAccumulator, + NULL, /* no freefunc needed */ + (void *) accum); } /* @@ -159,12 +166,13 @@ rumInsertBAEntry(BuildAccumulator *accum, eatmp.category = category; /* temporarily set up single-entry itempointer list */ eatmp.list = &item; + memset(&item, 0, sizeof(item)); item.iptr = *heapptr; item.addInfo = addInfo; item.addInfoIsNull = addInfoIsNull; - ea = (RumEntryAccumulator *) rb_insert(accum->tree, (RBNode *) &eatmp, - &isNew); + ea = (RumEntryAccumulator *) rbt_insert(accum->tree, (RBTNode *) &eatmp, + &isNew); if (isNew) { @@ -272,7 +280,9 @@ qsortCompareRumItem(const void *a, const void *b, void *arg) void rumBeginBAScan(BuildAccumulator *accum) { -#if PG_VERSION_NUM >= 100000 +#if (PG_VERSION_NUM > 100006 && PG_VERSION_NUM < 110000) || PG_VERSION_NUM >= 110001 + rbt_begin_iterate(accum->tree, LeftRightWalk, &accum->tree_walk); +#elif PG_VERSION_NUM >= 100000 rb_begin_iterate(accum->tree, LeftRightWalk, &accum->tree_walk); #else rb_begin_iterate(accum->tree, LeftRightWalk); @@ -292,7 +302,9 @@ rumGetBAEntry(BuildAccumulator *accum, RumEntryAccumulator *entry; RumItem *list; -#if PG_VERSION_NUM >= 100000 +#if (PG_VERSION_NUM > 100006 && PG_VERSION_NUM < 110000) || PG_VERSION_NUM >= 110001 + entry = (RumEntryAccumulator *) rbt_iterate(&accum->tree_walk); +#elif PG_VERSION_NUM >= 100000 entry = (RumEntryAccumulator *) rb_iterate(&accum->tree_walk); #else entry = (RumEntryAccumulator *) rb_iterate(accum->tree); diff --git a/src/rumdatapage.c b/src/rumdatapage.c index bd112930dd..922bb7d19a 100644 --- a/src/rumdatapage.c +++ b/src/rumdatapage.c @@ -4,7 +4,7 @@ * page utilities routines for the postgres inverted index access method. 
* * - * Portions Copyright (c) 2015-2016, Postgres Professional + * Portions Copyright (c) 2015-2024, Postgres Professional * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -99,7 +99,7 @@ rumDatumWrite(Pointer ptr, Datum datum, bool typbyval, char typalign, elog(ERROR, "unsupported byval length: %d", (int) (typlen)); } - data_length = typlen; + data_length = (Size)typlen; } else if (typlen == -1) { @@ -149,7 +149,7 @@ rumDatumWrite(Pointer ptr, Datum datum, bool typbyval, char typalign, /* fixed-length pass-by-reference */ ptr = (char *) att_align_nominal(ptr, typalign); Assert(typlen > 0); - data_length = typlen; + data_length = (Size)typlen; memmove(ptr, DatumGetPointer(datum), data_length); } @@ -589,7 +589,7 @@ findInLeafPage(RumBtree btree, Page page, OffsetNumber *offset, *iptrOut = item.iptr; ptr = rumDataPageLeafRead(ptr, btree->entryAttnum, &item, - btree->rumstate); + false, btree->rumstate); cmp = compareRumItem(btree->rumstate, btree->entryAttnum, &btree->items[btree->curitem], &item); @@ -736,7 +736,7 @@ RumDataPageAddItem(Page page, void *data, OffsetNumber offset) if (offset <= maxoff) memmove(ptr + sizeof(PostingItem), ptr, - (maxoff - offset + 1) * sizeof(PostingItem)); + ((uint16)(maxoff - offset + 1)) * sizeof(PostingItem)); } memcpy(ptr, data, sizeof(PostingItem)); RumPageGetOpaque(page)->maxoff++; @@ -763,7 +763,7 @@ RumPageDeletePostingItem(Page page, OffsetNumber offset) char *dstptr = RumDataPageGetItem(page, offset), *sourceptr = RumDataPageGetItem(page, offset + 1); - memmove(dstptr, sourceptr, sizeof(PostingItem) * (maxoff - offset)); + memmove(dstptr, sourceptr, sizeof(PostingItem) * (uint16)(maxoff - offset)); } RumPageGetOpaque(page)->maxoff--; @@ -853,7 +853,14 @@ dataPlaceToPage(RumBtree btree, Page page, OffsetNumber off) ItemPointerData iptr = {{0, 0}, 0}; RumItem copyItem; bool copyItemEmpty = true; - char pageCopy[BLCKSZ]; + /* + * Must have pageCopy MAXALIGNed to use PG macros to access data in + * it. Should not rely on compiler alignment preferences to avoid + * pageCopy overflow related to PG in-memory page items alignment + * inside rumDataPageLeafRead() or elsewhere. + */ + char pageCopyStorage[BLCKSZ + MAXIMUM_ALIGNOF]; + char *pageCopy = (char *) MAXALIGN(pageCopyStorage); int maxoff = RumPageGetOpaque(page)->maxoff; int freespace, insertCount = 0; @@ -899,7 +906,8 @@ dataPlaceToPage(RumBtree btree, Page page, OffsetNumber off) if (copyItemEmpty == true && off <= maxoff) { copyPtr = rumDataPageLeafRead(copyPtr, btree->entryAttnum, - ©Item, btree->rumstate); + ©Item, false, + btree->rumstate); copyItemEmpty = false; } @@ -1052,10 +1060,17 @@ dataSplitPageLeaf(RumBtree btree, Buffer lbuf, Buffer rbuf, RumItem maxLeftItem, curItem; RumItem item; - int totalCount = 0; int maxItemIndex = btree->curitem; - static char lpageCopy[BLCKSZ]; + /* + * Must have lpageCopy MAXALIGNed to use PG macros to access data in + * it. Should not rely on compiler alignment preferences to avoid + * lpageCopy overflow related to PG in-memory page items alignment + * inside rumDataPageLeafRead() etc. 
+ */ + static char lpageCopyStorage[BLCKSZ + MAXIMUM_ALIGNOF]; + char *lpageCopy = (char *) MAXALIGN(lpageCopyStorage); + memset(&item, 0, sizeof(item)); dataPrepareData(btree, newlPage, off); maxoff = RumPageGetOpaque(newlPage)->maxoff; @@ -1084,19 +1099,17 @@ dataSplitPageLeaf(RumBtree btree, Buffer lbuf, Buffer rbuf, &item, &prevIptr, btree->rumstate, totalsize); maxItemIndex++; - totalCount++; maxItemSize = Max(maxItemSize, totalsize - prevTotalsize); } prevIptr = item.iptr; copyPtr = rumDataPageLeafRead(copyPtr, btree->entryAttnum, &item, - btree->rumstate); + false, btree->rumstate); prevTotalsize = totalsize; totalsize = rumCheckPlaceToDataPageLeaf(btree->entryAttnum, &item, &prevIptr, btree->rumstate, totalsize); - totalCount++; maxItemSize = Max(maxItemSize, totalsize - prevTotalsize); } @@ -1118,7 +1131,6 @@ dataSplitPageLeaf(RumBtree btree, Buffer lbuf, Buffer rbuf, 2 * RumDataPageSize - 2 * maxItemSize - 2 * MAXIMUM_ALIGNOF) { maxItemIndex++; - totalCount++; maxItemSize = Max(maxItemSize, newTotalsize - totalsize); totalsize = newTotalsize; @@ -1132,8 +1144,6 @@ dataSplitPageLeaf(RumBtree btree, Buffer lbuf, Buffer rbuf, totalsize = rumCheckPlaceToDataPageLeaf(btree->entryAttnum, &item, &prevIptr, btree->rumstate, totalsize); maxItemIndex++; - - totalCount++; } } @@ -1168,7 +1178,7 @@ dataSplitPageLeaf(RumBtree btree, Buffer lbuf, Buffer rbuf, } copyPtr = rumDataPageLeafRead(copyPtr, btree->entryAttnum, &item, - btree->rumstate); + false, btree->rumstate); curItem = item; ptr = rumPlaceToDataPageLeaf(ptr, btree->entryAttnum, &item, @@ -1227,12 +1237,18 @@ dataSplitPageInternal(RumBtree btree, Buffer lbuf, Buffer rbuf, RumItem *bound; Page newlPage = PageGetTempPageCopy(BufferGetPage(lbuf)); RumItem oldbound = *RumDataPageGetRightBound(newlPage); - int sizeofitem = sizeof(PostingItem); + unsigned int sizeofitem = sizeof(PostingItem); OffsetNumber maxoff = RumPageGetOpaque(newlPage)->maxoff; Size pageSize = PageGetPageSize(newlPage); Size freeSpace; - - static char vector[2 * BLCKSZ]; + /* + * Must have vector MAXALIGNed to use PG macros to access data in + * it. Should not rely on compiler alignment preferences to avoid + * vector overflow related to PG in-memory page items alignment + * inside rumDataPageLeafRead() etc. 
+ */ + static char vectorStorage[2 * BLCKSZ + MAXIMUM_ALIGNOF]; + char *vector = (char *) MAXALIGN(vectorStorage); RumInitPage(rPage, RumPageGetOpaque(newlPage)->flags, pageSize); freeSpace = RumDataPageGetFreeSpace(rPage); @@ -1244,7 +1260,7 @@ dataSplitPageInternal(RumBtree btree, Buffer lbuf, Buffer rbuf, Assert(!RumPageIsLeaf(newlPage)); ptr = vector + (off - 1) * sizeofitem; if (maxoff + 1 - off != 0) - memmove(ptr + sizeofitem, ptr, (maxoff - off + 1) * sizeofitem); + memmove(ptr + sizeofitem, ptr, (uint16)(maxoff - off + 1) * sizeofitem); memcpy(ptr, &(btree->pitem), sizeofitem); maxoff++; @@ -1271,7 +1287,7 @@ dataSplitPageInternal(RumBtree btree, Buffer lbuf, Buffer rbuf, ptr = RumDataPageGetItem(rPage, FirstOffsetNumber); memcpy(ptr, vector + separator * sizeofitem, - (maxoff - separator) * sizeofitem); + (uint16)(maxoff - separator) * sizeofitem); RumPageGetOpaque(rPage)->maxoff = maxoff - separator; /* Adjust pd_lower */ ((PageHeader) rPage)->pd_lower = (ptr + @@ -1280,8 +1296,8 @@ dataSplitPageInternal(RumBtree btree, Buffer lbuf, Buffer rbuf, PostingItemSetBlockNumber(&(btree->pitem), BufferGetBlockNumber(lbuf)); if (RumPageIsLeaf(newlPage)) - btree->pitem.item.iptr = *(ItemPointerData *) RumDataPageGetItem(newlPage, - RumPageGetOpaque(newlPage)->maxoff); + btree->pitem.item.iptr = ((PostingItem *) RumDataPageGetItem(newlPage, + RumPageGetOpaque(newlPage)->maxoff))->item.iptr; else btree->pitem.item = ((PostingItem *) RumDataPageGetItem(newlPage, RumPageGetOpaque(newlPage)->maxoff))->item; @@ -1350,7 +1366,7 @@ updateItemIndexes(Page page, OffsetNumber attnum, RumState * rumstate) } j++; } - ptr = rumDataPageLeafRead(ptr, attnum, &item, rumstate); + ptr = rumDataPageLeafRead(ptr, attnum, &item, false, rumstate); } /* Fill rest of page indexes with InvalidOffsetNumber if any */ for (; j < RumDataLeafIndexCount; j++) @@ -1433,10 +1449,12 @@ rumDataFillRoot(RumBtree btree, Buffer root, Buffer lbuf, Buffer rbuf, PostingItem li, ri; + memset(&li, 0, sizeof(PostingItem)); li.item = *RumDataPageGetRightBound(lpage); PostingItemSetBlockNumber(&li, BufferGetBlockNumber(lbuf)); RumDataPageAddItem(page, &li, InvalidOffsetNumber); + memset(&ri, 0, sizeof(PostingItem)); ri.item = *RumDataPageGetRightBound(rpage); PostingItemSetBlockNumber(&ri, BufferGetBlockNumber(rbuf)); RumDataPageAddItem(page, &ri, InvalidOffsetNumber); @@ -1497,8 +1515,10 @@ rumInsertItemPointers(RumState * rumstate, RumItem * items, uint32 nitem, GinStatsData *buildStats) { - BlockNumber rootBlkno = gdi->stack->blkno; + BlockNumber rootBlkno; + Assert(gdi->stack); + rootBlkno = gdi->stack->blkno; gdi->btree.items = items; gdi->btree.nitem = nitem; gdi->btree.curitem = 0; diff --git a/src/rumentrypage.c b/src/rumentrypage.c index 7029942e78..29e1dd25bb 100644 --- a/src/rumentrypage.c +++ b/src/rumentrypage.c @@ -4,7 +4,7 @@ * page utilities routines for the postgres inverted index access method. 
* * - * Portions Copyright (c) 2015-2016, Postgres Professional + * Portions Copyright (c) 2015-2024, Postgres Professional * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -21,7 +21,7 @@ */ void rumReadTuple(RumState * rumstate, OffsetNumber attnum, - IndexTuple itup, RumItem * items) + IndexTuple itup, RumItem * items, bool copyAddInfo) { Pointer ptr = RumGetPosting(itup); RumItem item; @@ -31,7 +31,7 @@ rumReadTuple(RumState * rumstate, OffsetNumber attnum, ItemPointerSetMin(&item.iptr); for (i = 0; i < nipd; i++) { - ptr = rumDataPageLeafRead(ptr, attnum, &item, rumstate); + ptr = rumDataPageLeafRead(ptr, attnum, &item, copyAddInfo, rumstate); items[i] = item; } } @@ -113,6 +113,8 @@ getRightMostTuple(Page page) { OffsetNumber maxoff = PageGetMaxOffsetNumber(page); + Assert(maxoff != InvalidOffsetNumber); + return (IndexTuple) PageGetItem(page, PageGetItemId(page, maxoff)); } @@ -426,8 +428,14 @@ entrySplitPage(RumBtree btree, Buffer lbuf, Buffer rbuf, Page page; Page newlPage = PageGetTempPageCopy(lPage); Size pageSize = PageGetPageSize(newlPage); - - static char tupstore[2 * BLCKSZ]; + /* + * Must have tupstore MAXALIGNed to use PG macros to access data in + * it. Should not rely on compiler alignment preferences to avoid + * tupstore overflow related to PG in-memory page items alignment + * inside rumDataPageLeafRead() or elsewhere. + */ + static char tupstoreStorage[2 * BLCKSZ + MAXIMUM_ALIGNOF]; + char *tupstore = (char *) MAXALIGN(tupstoreStorage); entryPreparePage(btree, newlPage, off); diff --git a/src/rumget.c b/src/rumget.c index 2e5dd2593b..ca5d83ee00 100644 --- a/src/rumget.c +++ b/src/rumget.c @@ -4,8 +4,8 @@ * fetch tuples from a RUM scan. 
* * - * Portions Copyright (c) 2015-2016, Postgres Professional - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 2015-2024, Postgres Professional + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * *------------------------------------------------------------------------- @@ -22,8 +22,9 @@ #if PG_VERSION_NUM >= 120000 #include "utils/float.h" #endif -#include "utils/memutils.h" - +#if PG_VERSION_NUM >= 150000 +#include "common/pg_prng.h" +#endif #include "rum.h" /* GUC parameter */ @@ -253,12 +254,14 @@ scanPostingTree(Relation index, RumScanEntry scanEntry, RumScanItem item; Pointer ptr; + MemSet(&item, 0, sizeof(item)); ItemPointerSetMin(&item.item.iptr); ptr = RumDataPageGetData(page); for (i = FirstOffsetNumber; i <= maxoff; i++) { - ptr = rumDataPageLeafRead(ptr, attnum, &item.item, rumstate); + ptr = rumDataPageLeafRead(ptr, attnum, &item.item, false, + rumstate); SCAN_ITEM_PUT_KEY(scanEntry, item, idatum, icategory); rum_tuplesort_putrumitem(scanEntry->matchSortstate, &item); } @@ -464,11 +467,12 @@ collectMatchBitmap(RumBtreeData * btree, RumBtreeStack * stack, char *ptr = RumGetPosting(itup); RumScanItem item; + MemSet(&item, 0, sizeof(item)); ItemPointerSetMin(&item.item.iptr); for (i = 0; i < RumGetNPosting(itup); i++) { ptr = rumDataPageLeafRead(ptr, scanEntry->attnum, &item.item, - rumstate); + true, rumstate); SCAN_ITEM_PUT_KEY(scanEntry, item, idatum, icategory); rum_tuplesort_putrumitem(scanEntry->matchSortstate, &item); } @@ -628,7 +632,6 @@ startScanEntry(RumState * rumstate, RumScanEntry entry, Snapshot snapshot) { BlockNumber rootPostingTree = RumGetPostingTree(itup); RumPostingTreeScan *gdi; - Page page; OffsetNumber maxoff, i; Pointer ptr; @@ -674,7 +677,8 @@ startScanEntry(RumState * rumstate, RumScanEntry entry, Snapshot snapshot) for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { - ptr = rumDataPageLeafRead(ptr, entry->attnum, &item, rumstate); + ptr = rumDataPageLeafRead(ptr, entry->attnum, &item, true, + rumstate); entry->list[i - FirstOffsetNumber] = item; } @@ -686,10 +690,10 @@ startScanEntry(RumState * rumstate, RumScanEntry entry, Snapshot snapshot) else if (RumGetNPosting(itup) > 0) { entry->nlist = RumGetNPosting(itup); - entry->predictNumberResult = entry->nlist; + entry->predictNumberResult = (uint32)entry->nlist; entry->list = (RumItem *) palloc(sizeof(RumItem) * entry->nlist); - rumReadTuple(rumstate, entry->attnum, itup, entry->list); + rumReadTuple(rumstate, entry->attnum, itup, entry->list, true); entry->isFinished = setListPositionScanEntry(rumstate, entry); if (!entry->isFinished) entry->curItem = entry->list[entry->offset]; @@ -935,7 +939,8 @@ entryGetNextItem(RumState * rumstate, RumScanEntry entry, Snapshot snapshot) for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { - ptr = rumDataPageLeafRead(ptr, entry->attnum, &item, rumstate); + ptr = rumDataPageLeafRead(ptr, entry->attnum, &item, true, + rumstate); entry->list[i - FirstOffsetNumber] = item; if (searchBorder) @@ -1045,7 +1050,6 @@ entryGetNextItemList(RumState * rumstate, RumScanEntry entry, Snapshot snapshot) { BlockNumber rootPostingTree = RumGetPostingTree(itup); RumPostingTreeScan *gdi; - Page page; OffsetNumber maxoff, i; Pointer ptr; @@ -1091,7 +1095,8 @@ entryGetNextItemList(RumState * rumstate, RumScanEntry entry, Snapshot snapshot) for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { - 
ptr = rumDataPageLeafRead(ptr, entry->attnum, &item, rumstate); + ptr = rumDataPageLeafRead(ptr, entry->attnum, &item, true, + rumstate); entry->list[i - FirstOffsetNumber] = item; } @@ -1101,14 +1106,14 @@ entryGetNextItemList(RumState * rumstate, RumScanEntry entry, Snapshot snapshot) else if (RumGetNPosting(itup) > 0) { entry->nlist = RumGetNPosting(itup); - entry->predictNumberResult = entry->nlist; + entry->predictNumberResult = (uint32)entry->nlist; entry->list = (RumItem *) palloc(sizeof(RumItem) * entry->nlist); - rumReadTuple(rumstate, entry->attnum, itup, entry->list); + rumReadTuple(rumstate, entry->attnum, itup, entry->list, true); entry->isFinished = setListPositionScanEntry(rumstate, entry); } - Assert(entry->nlist > 0); + Assert(entry->nlist > 0 && entry->list); entry->curItem = entry->list[entry->offset]; entry->offset += entry->scanDirection; @@ -1126,7 +1131,12 @@ entryGetNextItemList(RumState * rumstate, RumScanEntry entry, Snapshot snapshot) return true; } +#if PG_VERSION_NUM < 150000 #define rum_rand() (((double) random()) / ((double) MAX_RANDOM_VALUE)) +#else +#define rum_rand() pg_prng_double(&pg_global_prng_state) +#endif + #define dropItem(e) ( rum_rand() > ((double)RumFuzzySearchLimit)/((double)((e)->predictNumberResult)) ) /* @@ -1165,7 +1175,7 @@ entryGetItem(RumState * rumstate, RumScanEntry entry, bool *nextEntryList, Snaps if (!ItemPointerIsMin(&entry->collectRumItem.item.iptr)) collected = entry->collectRumItem; else - ItemPointerSetMin(&collected.item.iptr); + MemSet(&collected, 0, sizeof(collected)); ItemPointerSetMin(&entry->curItem.iptr); @@ -1659,7 +1669,8 @@ scanPage(RumState * rumstate, RumScanEntry entry, RumItem *item, bool equalOk) bound = -1; for (i = first; i <= maxoff; i++) { - ptr = rumDataPageLeafRead(ptr, entry->attnum, &iter_item, rumstate); + ptr = rumDataPageLeafRead(ptr, entry->attnum, &iter_item, true, + rumstate); entry->list[i - first] = iter_item; if (bound != -1) @@ -2241,8 +2252,8 @@ insertScanItem(RumScanOpaque so, bool recheck) j; item = (RumSortItem *) - MemoryContextAlloc(rum_tuplesort_get_memorycontext(so->sortstate), - RumSortItemSize(so->norderbys)); + MemoryContextAllocZero(rum_tuplesort_get_memorycontext(so->sortstate), + RumSortItemSize(so->norderbys)); item->iptr = so->item.iptr; item->recheck = recheck; @@ -2352,6 +2363,14 @@ rumgettuple(IndexScanDesc scan, ScanDirection direction) RumSortItem *item; bool should_free; +#if PG_VERSION_NUM >= 120000 +#define GET_SCAN_TID(scan) ((scan)->xs_heaptid) +#define SET_SCAN_TID(scan, tid) ((scan)->xs_heaptid = (tid)) +#else +#define GET_SCAN_TID(scan) ((scan)->xs_ctup.t_self) +#define SET_SCAN_TID(scan, tid) ((scan)->xs_ctup.t_self = (tid)) +#endif + if (so->firstCall) { /* @@ -2361,6 +2380,7 @@ rumgettuple(IndexScanDesc scan, ScanDirection direction) rumNewScanKey(scan); so->firstCall = false; + ItemPointerSetInvalid(&GET_SCAN_TID(scan)); if (RumIsVoidRes(scan)) return false; @@ -2384,7 +2404,7 @@ rumgettuple(IndexScanDesc scan, ScanDirection direction) { if (scanGetItem(scan, &so->item, &so->item, &recheck)) { - scan->xs_ctup.t_self = so->item.iptr; + SET_SCAN_TID(scan, so->item.iptr); scan->xs_recheck = recheck; scan->xs_recheckorderby = false; @@ -2406,7 +2426,7 @@ rumgettuple(IndexScanDesc scan, ScanDirection direction) uint32 i, j = 0; - if (rumCompareItemPointers(&scan->xs_ctup.t_self, &item->iptr) == 0) + if (rumCompareItemPointers(&GET_SCAN_TID(scan), &item->iptr) == 0) { if (should_free) pfree(item); @@ -2414,7 +2434,7 @@ rumgettuple(IndexScanDesc scan, ScanDirection 
direction) continue; } - scan->xs_ctup.t_self = item->iptr; + SET_SCAN_TID(scan, item->iptr); scan->xs_recheck = item->recheck; scan->xs_recheckorderby = false; diff --git a/src/ruminsert.c b/src/ruminsert.c index f9ce47a30d..255e616c99 100644 --- a/src/ruminsert.c +++ b/src/ruminsert.c @@ -4,7 +4,7 @@ * insert routines for the postgres inverted index access method. * * - * Portions Copyright (c) 2015-2016, Postgres Professional + * Portions Copyright (c) 2015-2024, Postgres Professional * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -14,10 +14,12 @@ #include "postgres.h" #include "access/generic_xlog.h" +#if PG_VERSION_NUM >= 120000 +#include "access/tableam.h" +#endif #include "storage/predicate.h" #include "catalog/index.h" #include "miscadmin.h" -#include "utils/memutils.h" #include "utils/datum.h" #include "rum.h" @@ -32,7 +34,11 @@ typedef struct BuildAccumulator accum; } RumBuildState; -#if PG_VERSION_NUM >= 110000 + +#if PG_VERSION_NUM >= 120000 +#define IndexBuildHeapScan(A, B, C, D, E, F) \ +table_index_build_scan(A, B, C, D, true, E, F, NULL) +#elif PG_VERSION_NUM >= 110000 #define IndexBuildHeapScan(A, B, C, D, E, F) \ IndexBuildHeapScan(A, B, C, D, E, F, NULL) #endif @@ -272,7 +278,7 @@ addItemPointersToLeafTuple(RumState * rumstate, newNPosting = oldNPosting + nitem; newItems = (RumItem *) palloc(sizeof(RumItem) * newNPosting); - rumReadTuple(rumstate, attnum, old, oldItems); + rumReadTuple(rumstate, attnum, old, oldItems, false); newNPosting = rumMergeRumItems(rumstate, attnum, newItems, items, nitem, oldItems, oldNPosting); @@ -524,11 +530,11 @@ rumHeapTupleBulkInsert(RumBuildState * buildstate, OffsetNumber attnum, /* Check existance of additional information attribute in index */ if (!attr) { - Form_pg_attribute attr = RumTupleDescAttr( + Form_pg_attribute current_attr = RumTupleDescAttr( buildstate->rumstate.origTupdesc, attnum - 1); elog(ERROR, "additional information attribute \"%s\" is not found in index", - NameStr(attr->attname)); + NameStr(current_attr->attname)); } addInfo[i] = datumCopy(addInfo[i], attr->attbyval, attr->attlen); @@ -544,7 +550,13 @@ rumHeapTupleBulkInsert(RumBuildState * buildstate, OffsetNumber attnum, } static void -rumBuildCallback(Relation index, HeapTuple htup, Datum *values, +rumBuildCallback(Relation index, +#if PG_VERSION_NUM < 130000 + HeapTuple htup, +#else + ItemPointer tid, +#endif + Datum *values, bool *isnull, bool tupleIsAlive, void *state) { RumBuildState *buildstate = (RumBuildState *) state; @@ -552,6 +564,9 @@ rumBuildCallback(Relation index, HeapTuple htup, Datum *values, int i; Datum outerAddInfo = (Datum) 0; bool outerAddInfoIsNull = true; +#if PG_VERSION_NUM < 130000 + ItemPointer tid = &htup->t_self; +#endif if (AttributeNumberIsValid(buildstate->rumstate.attrnAttachColumn)) { @@ -564,7 +579,7 @@ rumBuildCallback(Relation index, HeapTuple htup, Datum *values, for (i = 0; i < buildstate->rumstate.origTupdesc->natts; i++) rumHeapTupleBulkInsert(buildstate, (OffsetNumber) (i + 1), values[i], isnull[i], - &htup->t_self, + tid, outerAddInfo, outerAddInfoIsNull); /* If we've maxed out our available memory, dump everything to the index */ @@ -789,6 +804,7 @@ rumHeapTupleInsert(RumState * rumstate, OffsetNumber attnum, NameStr(attr->attname)); } + memset(&insert_item, 0, sizeof(insert_item)); insert_item.iptr = *item; insert_item.addInfo = addInfo[i]; insert_item.addInfoIsNull = addInfoIsNull[i]; @@ -802,6 +818,9 @@ bool 
ruminsert(Relation index, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique +#if PG_VERSION_NUM >= 140000 + , bool indexUnchanged +#endif #if PG_VERSION_NUM >= 100000 , struct IndexInfo *indexInfo #endif diff --git a/src/rumscan.c b/src/rumscan.c index 27d7f05c2d..089730fac4 100644 --- a/src/rumscan.c +++ b/src/rumscan.c @@ -4,7 +4,7 @@ * routines to manage scans of inverted index relations * * - * Portions Copyright (c) 2015-2016, Postgres Professional + * Portions Copyright (c) 2015-2024, Postgres Professional * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -15,7 +15,6 @@ #include "access/relscan.h" #include "pgstat.h" -#include "utils/memutils.h" #include "rum.h" @@ -42,6 +41,13 @@ rumbeginscan(Relation rel, int nkeys, int norderbys) initRumState(&so->rumstate, scan->indexRelation); +#if PG_VERSION_NUM >= 120000 + /* + * Starting from PG 12 we need to invalidate result's item pointer. Earlier + * it was done by invalidating scan->xs_ctup by RelationGetIndexScan(). + */ + ItemPointerSetInvalid(&scan->xs_heaptid); +#endif scan->opaque = so; return scan; @@ -208,7 +214,7 @@ rumFillScanKey(RumScanOpaque so, OffsetNumber attnum, } if (scanKey == NULL) - elog(ERROR, "cannot order without attribute %d in WHERE clause", + elog(ERROR, "cannot order without attribute %d in ORDER BY clause", key->attnum); else if (scanKey->nentries > 1) elog(ERROR, "scan key should contain only one value"); @@ -463,12 +469,12 @@ lookupScanDirection(RumState *state, AttrNumber attno, StrategyNumber strategy) int i; RumConfig *rumConfig = state->rumConfig + attno - 1; - for(i = 0; rumConfig->strategyInfo[i].strategy != InvalidStrategy && - i < MAX_STRATEGIES; i++) + for(i = 0; i < MAX_STRATEGIES; i++) { + if (rumConfig->strategyInfo[i].strategy != InvalidStrategy) + break; if (rumConfig->strategyInfo[i].strategy == strategy) return rumConfig->strategyInfo[i].direction; - } return NoMovementScanDirection; @@ -697,9 +703,12 @@ rumNewScanKey(IndexScanDesc scan) repalloc(so->entries, so->allocentries * sizeof(RumScanEntry)); } - memcpy(so->entries + so->totalentries, - key->scanEntry, sizeof(*key->scanEntry) * key->nentries); - so->totalentries += key->nentries; + if ( key->scanEntry != NULL ) + { + memcpy(so->entries + so->totalentries, + key->scanEntry, sizeof(*key->scanEntry) * key->nentries); + so->totalentries += key->nentries; + } } /* diff --git a/src/rumsort.c b/src/rumsort.c index 94da17252e..0c395f03e7 100644 --- a/src/rumsort.c +++ b/src/rumsort.c @@ -1,118 +1,15 @@ /*------------------------------------------------------------------------- * - * rumsort.h + * rumsort.c * Generalized tuple sorting routines. * - * This module handles sorting of heap tuples, index tuples, or single - * Datums (and could easily support other kinds of sortable objects, - * if necessary). It works efficiently for both small and large amounts - * of data. Small amounts are sorted in-memory using qsort(). Large - * amounts are sorted using temporary files and a standard external sort - * algorithm. + * This module handles sorting of RumSortItem or RumScanItem structures. + * It contains copy of static functions from + * src/backend/utils/sort/tuplesort.c. * - * See Knuth, volume 3, for more than you want to know about the external - * sorting algorithm. 
Historically, we divided the input into sorted runs - * using replacement selection, in the form of a priority tree implemented - * as a heap (essentially his Algorithm 5.2.3H -- although that strategy is - * often avoided altogether), but that can now only happen first the first - * run. We merge the runs using polyphase merge, Knuth's Algorithm - * 5.4.2D. The logical "tapes" used by Algorithm D are implemented by - * logtape.c, which avoids space wastage by recycling disk space as soon - * as each block is read from its "tape". * - * We never form the initial runs using Knuth's recommended replacement - * selection data structure (Algorithm 5.4.1R), because it uses a fixed - * number of records in memory at all times. Since we are dealing with - * tuples that may vary considerably in size, we want to be able to vary - * the number of records kept in memory to ensure full utilization of the - * allowed sort memory space. So, we keep the tuples in a variable-size - * heap, with the next record to go out at the top of the heap. Like - * Algorithm 5.4.1R, each record is stored with the run number that it - * must go into, and we use (run number, key) as the ordering key for the - * heap. When the run number at the top of the heap changes, we know that - * no more records of the prior run are left in the heap. Note that there - * are in practice only ever two distinct run numbers, due to the greatly - * reduced use of replacement selection in PostgreSQL 9.6. - * - * In PostgreSQL 9.6, a heap (based on Knuth's Algorithm H, with some small - * customizations) is only used with the aim of producing just one run, - * thereby avoiding all merging. Only the first run can use replacement - * selection, which is why there are now only two possible valid run - * numbers, and why heapification is customized to not distinguish between - * tuples in the second run (those will be quicksorted). We generally - * prefer a simple hybrid sort-merge strategy, where runs are sorted in much - * the same way as the entire input of an internal sort is sorted (using - * qsort()). The replacement_sort_tuples GUC controls the limited remaining - * use of replacement selection for the first run. - * - * There are several reasons to favor a hybrid sort-merge strategy. - * Maintaining a priority tree/heap has poor CPU cache characteristics. - * Furthermore, the growth in main memory sizes has greatly diminished the - * value of having runs that are larger than available memory, even in the - * case where there is partially sorted input and runs can be made far - * larger by using a heap. In most cases, a single-pass merge step is all - * that is required even when runs are no larger than available memory. - * Avoiding multiple merge passes was traditionally considered to be the - * major advantage of using replacement selection. - * - * The approximate amount of memory allowed for any one sort operation - * is specified in kilobytes by the caller (most pass work_mem). Initially, - * we absorb tuples and simply store them in an unsorted array as long as - * we haven't exceeded workMem. If we reach the end of the input without - * exceeding workMem, we sort the array using qsort() and subsequently return - * tuples just by scanning the tuple array sequentially. If we do exceed - * workMem, we begin to emit tuples into sorted runs in temporary tapes. - * When tuples are dumped in batch after quicksorting, we begin a new run - * with a new output tape (selected per Algorithm D). 
After the end of the - * input is reached, we dump out remaining tuples in memory into a final run - * (or two, when replacement selection is still used), then merge the runs - * using Algorithm D. - * - * When merging runs, we use a heap containing just the frontmost tuple from - * each source run; we repeatedly output the smallest tuple and insert the - * next tuple from its source tape (if any). When the heap empties, the merge - * is complete. The basic merge algorithm thus needs very little memory --- - * only M tuples for an M-way merge, and M is constrained to a small number. - * However, we can still make good use of our full workMem allocation by - * pre-reading additional tuples from each source tape. Without prereading, - * our access pattern to the temporary file would be very erratic; on average - * we'd read one block from each of M source tapes during the same time that - * we're writing M blocks to the output tape, so there is no sequentiality of - * access at all, defeating the read-ahead methods used by most Unix kernels. - * Worse, the output tape gets written into a very random sequence of blocks - * of the temp file, ensuring that things will be even worse when it comes - * time to read that tape. A straightforward merge pass thus ends up doing a - * lot of waiting for disk seeks. We can improve matters by prereading from - * each source tape sequentially, loading about workMem/M bytes from each tape - * in turn. Then we run the merge algorithm, writing but not reading until - * one of the preloaded tuple series runs out. Then we switch back to preread - * mode, fill memory again, and repeat. This approach helps to localize both - * read and write accesses. - * - * When the caller requests random access to the sort result, we form - * the final sorted run on a logical tape which is then "frozen", so - * that we can access it randomly. When the caller does not need random - * access, we return from rum_tuplesort_performsort() as soon as we are down - * to one run per logical tape. The final merge is then performed - * on-the-fly as the caller repeatedly calls rum_tuplesort_getXXX; this - * saves one cycle of writing all the data out to disk and reading it in. - * - * Before Postgres 8.2, we always used a seven-tape polyphase merge, on the - * grounds that 7 is the "sweet spot" on the tapes-to-passes curve according - * to Knuth's figure 70 (section 5.4.2). However, Knuth is assuming that - * tape drives are expensive beasts, and in particular that there will always - * be many more runs than tape drives. In our implementation a "tape drive" - * doesn't cost much more than a few Kb of memory buffers, so we can afford - * to have lots of them. In particular, if we can have as many tape drives - * as sorted runs, we can eliminate any repeated I/O at all. In the current - * code we determine the number of tapes M on the basis of workMem: we want - * workMem/M to be large enough that we read a fair amount of data each time - * we preread from a tape, so as to maintain the locality of access described - * above. Nonetheless, with large workMem we can have many tapes. 
- * - * - * Portions Copyright (c) 2015-2016, Postgres Professional - * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 2015-2024, Postgres Professional + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * *------------------------------------------------------------------------- @@ -122,3885 +19,145 @@ #include "miscadmin.h" #include "rumsort.h" -#include - -#include "access/htup_details.h" -#include "access/nbtree.h" -#include "catalog/index.h" -#include "catalog/pg_am.h" #include "commands/tablespace.h" #include "executor/executor.h" -#include "utils/datum.h" #include "utils/logtape.h" -#include "utils/lsyscache.h" -#include "utils/memutils.h" #include "utils/pg_rusage.h" -#include "utils/probes.h" -#include "utils/rel.h" -#include "utils/sortsupport.h" - -#include "rum.h" /* RumItem */ - -/* sort-type codes for sort__start probes */ -#define HEAP_SORT 0 -#define INDEX_SORT 1 -#define DATUM_SORT 2 -#define CLUSTER_SORT 3 - -/* GUC variables */ -#ifdef TRACE_SORT -bool trace_sort = false; -#endif - -#ifdef DEBUG_BOUNDED_SORT -bool optimize_bounded_sort = true; -#endif - -#if PG_VERSION_NUM < 100000 -/* Provide fallback for old version of tape interface for 9.6 */ -#define LogicalTapeRewindForRead(x, y, z) LogicalTapeRewind((x), (y), false) -#define LogicalTapeRewindForWrite(x, y) LogicalTapeRewind((x), (y), true) -#endif - -#if PG_VERSION_NUM >= 110000 -#define RUM_SORT_START(INT1, INT2, INT3, INT4, INT5) \ -TRACE_POSTGRESQL_SORT_START(INT1, INT2, INT3, INT4, INT5, false) -#else -#define RUM_SORT_START(INT1, INT2, INT3, INT4, INT5) \ -TRACE_POSTGRESQL_SORT_START(INT1, INT2, INT3, INT4, INT5) -#endif - -#if PG_VERSION_NUM >= 110000 -#define LogicalTapeSetCreate(X) LogicalTapeSetCreate(X, NULL, NULL, 1) -#define LogicalTapeFreeze(X, Y) LogicalTapeFreeze(X, Y, NULL) -#endif - -/* - * The objects we actually sort are SortTuple structs. These contain - * a pointer to the tuple proper (might be a MinimalTuple or IndexTuple), - * which is a separate palloc chunk --- we assume it is just one chunk and - * can be freed by a simple pfree(). SortTuples also contain the tuple's - * first key column in Datum/nullflag format, and an index integer. - * - * Storing the first key column lets us save heap_getattr or index_getattr - * calls during tuple comparisons. We could extract and save all the key - * columns not just the first, but this would increase code complexity and - * overhead, and wouldn't actually save any comparison cycles in the common - * case where the first key determines the comparison result. Note that - * for a pass-by-reference datatype, datum1 points into the "tuple" storage. - * - * When sorting single Datums, the data value is represented directly by - * datum1/isnull1. If the datatype is pass-by-reference and isnull1 is false, - * then datum1 points to a separately palloc'd data value that is also pointed - * to by the "tuple" pointer; otherwise "tuple" is NULL. - * - * While building initial runs, tupindex holds the tuple's run number. During - * merge passes, we re-use it to hold the input tape number that each tuple in - * the heap was read from, or to hold the index of the next tuple pre-read - * from the same tape in the case of pre-read entries. tupindex goes unused - * if the sort occurs entirely in memory. 
- */ -typedef struct -{ - void *tuple; /* the tuple proper */ - Datum datum1; /* value of first key column */ - bool isnull1; /* is first key column NULL? */ - int tupindex; /* see notes above */ -} SortTuple; - - -/* - * Possible states of a Tuplesort object. These denote the states that - * persist between calls of Tuplesort routines. - */ -typedef enum -{ - TSS_INITIAL, /* Loading tuples; still within memory limit */ - TSS_BOUNDED, /* Loading tuples into bounded-size heap */ - TSS_BUILDRUNS, /* Loading tuples; writing to tape */ - TSS_SORTEDINMEM, /* Sort completed entirely in memory */ - TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ - TSS_FINALMERGE /* Performing final merge on-the-fly */ -} TupSortStatus; - -/* - * Parameters for calculation of number of tapes to use --- see inittapes() - * and rum_tuplesort_merge_order(). - * - * In this calculation we assume that each tape will cost us about 3 blocks - * worth of buffer space (which is an underestimate for very large data - * volumes, but it's probably close enough --- see logtape.c). - * - * MERGE_BUFFER_SIZE is how much data we'd like to read from each input - * tape during a preread cycle (see discussion at top of file). - */ -#define MINORDER 6 /* minimum merge order */ -#define TAPE_BUFFER_OVERHEAD (BLCKSZ * 3) -#define MERGE_BUFFER_SIZE (BLCKSZ * 32) - -typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b, - RumTuplesortstate *state); - -/* - * Private state of a Tuplesort operation. - */ -struct RumTuplesortstate -{ - TupSortStatus status; /* enumerated value as shown above */ - int nKeys; /* number of columns in sort key */ - bool randomAccess; /* did caller request random access? */ - bool bounded; /* did caller specify a maximum number of - * tuples to return? */ - bool boundUsed; /* true if we made use of a bounded heap */ - int bound; /* if bounded, the maximum number of tuples */ - long availMem; /* remaining memory available, in bytes */ - long allowedMem; /* total memory allowed, in bytes */ - int maxTapes; /* number of tapes (Knuth's T) */ - int tapeRange; /* maxTapes-1 (Knuth's P) */ - MemoryContext sortcontext; /* memory context holding all sort data */ - LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ - - /* - * These function pointers decouple the routines that must know what kind - * of tuple we are sorting from the routines that don't need to know it. - * They are set up by the rum_tuplesort_begin_xxx routines. - * - * Function to compare two tuples; result is per qsort() convention, ie: - * <0, 0, >0 according as ab. The API must match - * qsort_arg_comparator. - */ - SortTupleComparator comparetup; - - /* - * Function to copy a supplied input tuple into palloc'd space and set up - * its SortTuple representation (ie, set tuple/datum1/isnull1). Also, - * state->availMem must be decreased by the amount of space used for the - * tuple copy (note the SortTuple struct itself is not counted). - */ - void (*copytup) (RumTuplesortstate *state, SortTuple *stup, void *tup); - - /* - * Function to write a stored tuple onto tape. The representation of the - * tuple on tape need not be the same as it is in memory; requirements on - * the tape representation are given below. After writing the tuple, - * pfree() the out-of-line data (not the SortTuple struct!), and increase - * state->availMem by the amount of memory space thereby released. 
- */ - void (*writetup) (RumTuplesortstate *state, int tapenum, - SortTuple *stup); - - /* - * Function to read a stored tuple from tape back into memory. 'len' is - * the already-read length of the stored tuple. Create a palloc'd copy, - * initialize tuple/datum1/isnull1 in the target SortTuple struct, and - * decrease state->availMem by the amount of memory space consumed. - */ - void (*readtup) (RumTuplesortstate *state, SortTuple *stup, - int tapenum, unsigned int len); - - /* - * Function to reverse the sort direction from its current state. (We - * could dispense with this if we wanted to enforce that all variants - * represent the sort key information alike.) - */ - void (*reversedirection) (RumTuplesortstate *state); - - /* - * This array holds the tuples now in sort memory. If we are in state - * INITIAL, the tuples are in no particular order; if we are in state - * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS - * and FINALMERGE, the tuples are organized in "heap" order per Algorithm - * H. (Note that memtupcount only counts the tuples that are part of the - * heap --- during merge passes, memtuples[] entries beyond tapeRange are - * never in the heap and are used to hold pre-read tuples.) In state - * SORTEDONTAPE, the array is not used. - */ - SortTuple *memtuples; /* array of SortTuple structs */ - int memtupcount; /* number of tuples currently present */ - int memtupsize; /* allocated length of memtuples array */ - bool growmemtuples; /* memtuples' growth still underway? */ - - /* Buffer size to use for reading input tapes, during merge. */ - size_t read_buffer_size; - - /* - * While building initial runs, this is the current output run number - * (starting at 0). Afterwards, it is the number of initial runs we made. - */ - int currentRun; - - /* - * Unless otherwise noted, all pointer variables below are pointers to - * arrays of length maxTapes, holding per-tape data. - */ - - /* - * These variables are only used during merge passes. mergeactive[i] is - * true if we are reading an input run from (actual) tape number i and - * have not yet exhausted that run. mergenext[i] is the memtuples index - * of the next pre-read tuple (next to be loaded into the heap) for tape - * i, or 0 if we are out of pre-read tuples. mergelast[i] similarly - * points to the last pre-read tuple from each tape. mergeavailslots[i] - * is the number of unused memtuples[] slots reserved for tape i, and - * mergeavailmem[i] is the amount of unused space allocated for tape i. - * mergefreelist and mergefirstfree keep track of unused locations in the - * memtuples[] array. The memtuples[].tupindex fields link together - * pre-read tuples for each tape as well as recycled locations in - * mergefreelist. It is OK to use 0 as a null link in these lists, because - * memtuples[0] is part of the merge heap and is never a pre-read tuple. - */ - bool *mergeactive; /* active input run source? */ - int *mergenext; /* first preread tuple for each source */ - int *mergelast; /* last preread tuple for each source */ - int *mergeavailslots; /* slots left for prereading each tape */ - long *mergeavailmem; /* availMem for prereading each tape */ - int mergefreelist; /* head of freelist of recycled slots */ - int mergefirstfree; /* first slot never used in this merge */ - - /* - * Variables for Algorithm D. Note that destTape is a "logical" tape - * number, ie, an index into the tp_xxx[] arrays. Be careful to keep - * "logical" and "actual" tape numbers straight! 
- */ - int Level; /* Knuth's l */ - int destTape; /* current output tape (Knuth's j, less 1) */ - int *tp_fib; /* Target Fibonacci run counts (A[]) */ - int *tp_runs; /* # of real runs on each tape */ - int *tp_dummy; /* # of dummy runs for each tape (D[]) */ - int *tp_tapenum; /* Actual tape numbers (TAPE[]) */ - int activeTapes; /* # of active input tapes in merge pass */ - - /* - * These variables are used after completion of sorting to keep track of - * the next tuple to return. (In the tape case, the tape's current read - * position is also critical state.) - */ - int result_tape; /* actual tape number of finished output */ - int current; /* array index (only used if SORTEDINMEM) */ - bool eof_reached; /* reached EOF (needed for cursors) */ - - /* markpos_xxx holds marked position for mark and restore */ - long markpos_block; /* tape block# (only used if SORTEDONTAPE) */ - int markpos_offset; /* saved "current", or offset in tape block */ - bool markpos_eof; /* saved "eof_reached" */ - - /* - * These variables are specific to the MinimalTuple case; they are set by - * rum_tuplesort_begin_heap and used only by the MinimalTuple routines. - */ - TupleDesc tupDesc; - SortSupport sortKeys; /* array of length nKeys */ - - /* - * This variable is shared by the single-key MinimalTuple case and the - * Datum case (which both use qsort_ssup()). Otherwise it's NULL. - */ - SortSupport onlyKey; - - /* - * These variables are specific to the CLUSTER case; they are set by - * rum_tuplesort_begin_cluster. Note CLUSTER also uses tupDesc and - * indexScanKey. - */ - IndexInfo *indexInfo; /* info about index being used for reference */ - EState *estate; /* for evaluating index expressions */ - - /* - * These variables are specific to the IndexTuple case; they are set by - * rum_tuplesort_begin_index_xxx and used only by the IndexTuple routines. - */ - Relation heapRel; /* table the index is being built on */ - Relation indexRel; /* index being built */ - - /* These are specific to the index_btree subcase: */ - ScanKey indexScanKey; - bool enforceUnique; /* complain if we find duplicate tuples */ - - /* These are specific to the index_hash subcase: */ - uint32 hash_mask; /* mask for sortable part of hash code */ - - /* - * These variables are specific to the Datum case; they are set by - * rum_tuplesort_begin_datum and used only by the DatumTuple routines. - */ - Oid datumType; - /* we need typelen and byval in order to know how to copy the Datums. */ - int datumTypeLen; - bool datumTypeByVal; - - bool reverse; - - /* Do we need ItemPointer comparison in comparetup_rum()? */ - bool compareItemPointer; - - /* compare_rumitem */ - FmgrInfo *cmp; - - /* - * Resource snapshot for time of sort start. 
- */ -#ifdef TRACE_SORT - PGRUsage ru_start; -#endif -}; - -#define COMPARETUP(state,a,b) ((*(state)->comparetup) (a, b, state)) -#define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup)) -#define WRITETUP(state,tape,stup) ((*(state)->writetup) (state, tape, stup)) -#define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len)) -#define REVERSEDIRECTION(state) ((*(state)->reversedirection) (state)) -#define LACKMEM(state) ((state)->availMem < 0) -#define USEMEM(state,amt) ((state)->availMem -= (amt)) -#define FREEMEM(state,amt) ((state)->availMem += (amt)) - -/* - * NOTES about on-tape representation of tuples: - * - * We require the first "unsigned int" of a stored tuple to be the total size - * on-tape of the tuple, including itself (so it is never zero; an all-zero - * unsigned int is used to delimit runs). The remainder of the stored tuple - * may or may not match the in-memory representation of the tuple --- - * any conversion needed is the job of the writetup and readtup routines. - * - * If state->randomAccess is true, then the stored representation of the - * tuple must be followed by another "unsigned int" that is a copy of the - * length --- so the total tape space used is actually sizeof(unsigned int) - * more than the stored length value. This allows read-backwards. When - * randomAccess is not true, the write/read routines may omit the extra - * length word. - * - * writetup is expected to write both length words as well as the tuple - * data. When readtup is called, the tape is positioned just after the - * front length word; readtup must read the tuple data and advance past - * the back length word (if present). - * - * The write/read routines can make use of the tuple description data - * stored in the Tuplesortstate record, if needed. They are also expected - * to adjust state->availMem by the amount of memory space (not tape space!) - * released or consumed. There is no error return from either writetup - * or readtup; they should ereport() on failure. - * - * - * NOTES about memory consumption calculations: - * - * We count space allocated for tuples against the workMem limit, plus - * the space used by the variable-size memtuples array. Fixed-size space - * is not counted; it's small enough to not be interesting. - * - * Note that we count actual space used (as shown by GetMemoryChunkSpace) - * rather than the originally-requested size. This is important since - * palloc can add substantial overhead. It's not a complete answer since - * we won't count any wasted space in palloc allocation blocks, but it's - * a lot better than what we were doing before 7.3. 
- */ - -/* When using this macro, beware of double evaluation of len */ -#define LogicalTapeReadExact(tapeset, tapenum, ptr, len) \ - do { \ - if (LogicalTapeRead(tapeset, tapenum, ptr, len) != (size_t) (len)) \ - elog(ERROR, "unexpected end of data"); \ - } while(0) - - -static RumTuplesortstate *rum_tuplesort_begin_common(int workMem, bool randomAccess); -static void puttuple_common(RumTuplesortstate *state, SortTuple *tuple); -static void inittapes(RumTuplesortstate *state); -static void selectnewtape(RumTuplesortstate *state); -static void mergeruns(RumTuplesortstate *state); -static void mergeonerun(RumTuplesortstate *state); -static void beginmerge(RumTuplesortstate *state); -static void mergepreread(RumTuplesortstate *state); -static void mergeprereadone(RumTuplesortstate *state, int srcTape); -static void dumptuples(RumTuplesortstate *state, bool alltuples); -static void make_bounded_heap(RumTuplesortstate *state); -static void sort_bounded_heap(RumTuplesortstate *state); -static void rum_tuplesort_heap_insert(RumTuplesortstate *state, SortTuple *tuple, - int tupleindex, bool checkIndex); -static void rum_tuplesort_heap_siftup(RumTuplesortstate *state, bool checkIndex); -static unsigned int getlen(RumTuplesortstate *state, int tapenum, bool eofOK); -static void markrunend(RumTuplesortstate *state, int tapenum); -static int comparetup_heap(const SortTuple *a, const SortTuple *b, - RumTuplesortstate *state); -static void copytup_heap(RumTuplesortstate *state, SortTuple *stup, void *tup); -static void writetup_heap(RumTuplesortstate *state, int tapenum, - SortTuple *stup); -static void readtup_heap(RumTuplesortstate *state, SortTuple *stup, - int tapenum, unsigned int len); -static void reversedirection_heap(RumTuplesortstate *state); -static int comparetup_cluster(const SortTuple *a, const SortTuple *b, - RumTuplesortstate *state); -static void copytup_cluster(RumTuplesortstate *state, SortTuple *stup, void *tup); -static void writetup_cluster(RumTuplesortstate *state, int tapenum, - SortTuple *stup); -static void readtup_cluster(RumTuplesortstate *state, SortTuple *stup, - int tapenum, unsigned int len); -static int comparetup_index_btree(const SortTuple *a, const SortTuple *b, - RumTuplesortstate *state); -static int comparetup_index_hash(const SortTuple *a, const SortTuple *b, - RumTuplesortstate *state); -static void copytup_index(RumTuplesortstate *state, SortTuple *stup, void *tup); -static void writetup_index(RumTuplesortstate *state, int tapenum, - SortTuple *stup); -static void readtup_index(RumTuplesortstate *state, SortTuple *stup, - int tapenum, unsigned int len); -static void reversedirection_index_btree(RumTuplesortstate *state); -static void reversedirection_index_hash(RumTuplesortstate *state); -static int comparetup_datum(const SortTuple *a, const SortTuple *b, - RumTuplesortstate *state); -static void copytup_datum(RumTuplesortstate *state, SortTuple *stup, void *tup); -static void writetup_datum(RumTuplesortstate *state, int tapenum, - SortTuple *stup); -static void readtup_datum(RumTuplesortstate *state, SortTuple *stup, - int tapenum, unsigned int len); -static void reversedirection_datum(RumTuplesortstate *state); -static void free_sort_tuple(RumTuplesortstate *state, SortTuple *stup); -static int comparetup_rum(const SortTuple *a, const SortTuple *b, - RumTuplesortstate *state); -static void copytup_rum(RumTuplesortstate *state, SortTuple *stup, void *tup); -static void writetup_rum(RumTuplesortstate *state, int tapenum, - SortTuple *stup); -static void 
readtup_rum(RumTuplesortstate *state, SortTuple *stup, - int tapenum, unsigned int len); -static void reversedirection_rum(RumTuplesortstate *state); -static int comparetup_rumitem(const SortTuple *a, const SortTuple *b, - RumTuplesortstate *state); -static void copytup_rumitem(RumTuplesortstate *state, SortTuple *stup, void *tup); -static void writetup_rumitem(RumTuplesortstate *state, int tapenum, - SortTuple *stup); -static void readtup_rumitem(RumTuplesortstate *state, SortTuple *stup, - int tapenum, unsigned int len); - -/* - * Special versions of qsort just for SortTuple objects. qsort_tuple() sorts - * any variant of SortTuples, using the appropriate comparetup function. - * qsort_ssup() is specialized for the case where the comparetup function - * reduces to ApplySortComparator(), that is single-key MinimalTuple sorts - * and Datum sorts. - */ -/* #include "qsort_tuple.c" */ - -static void -swapfunc(SortTuple *a, SortTuple *b, size_t n) -{ - do - { - SortTuple t = *a; - - *a++ = *b; - *b++ = t; - } while (--n > 0); -} - -#define cmp_ssup(a, b, ssup) \ - ApplySortComparator((a)->datum1, (a)->isnull1, \ - (b)->datum1, (b)->isnull1, ssup) - -#define swap(a, b) \ - do { \ - SortTuple t = *(a); \ - *(a) = *(b); \ - *(b) = t; \ - } while (0); - -#define vecswap(a, b, n) if ((n) > 0) swapfunc(a, b, n) - -static SortTuple * -med3_tuple(SortTuple *a, SortTuple *b, SortTuple *c, SortTupleComparator cmp_tuple, RumTuplesortstate *state) -{ - return cmp_tuple(a, b, state) < 0 ? - (cmp_tuple(b, c, state) < 0 ? b : - (cmp_tuple(a, c, state) < 0 ? c : a)) - : (cmp_tuple(b, c, state) > 0 ? b : - (cmp_tuple(a, c, state) < 0 ? a : c)); -} - -static SortTuple * -med3_ssup(SortTuple *a, SortTuple *b, SortTuple *c, SortSupport ssup) -{ - return cmp_ssup(a, b, ssup) < 0 ? - (cmp_ssup(b, c, ssup) < 0 ? b : - (cmp_ssup(a, c, ssup) < 0 ? c : a)) - : (cmp_ssup(b, c, ssup) > 0 ? b : - (cmp_ssup(a, c, ssup) < 0 ? 
a : c)); -} - -static void -qsort_ssup(SortTuple *a, size_t n, SortSupport ssup) -{ - SortTuple *pa, - *pb, - *pc, - *pd, - *pl, - *pm, - *pn; - size_t d1, - d2; - int r, - presorted; - -loop: - CHECK_FOR_INTERRUPTS(); - if (n < 7) - { - for (pm = a + 1; pm < a + n; pm++) - for (pl = pm; pl > a && cmp_ssup(pl - 1, pl, ssup) > 0; pl--) - swap(pl, pl - 1); - return; - } - presorted = 1; - for (pm = a + 1; pm < a + n; pm++) - { - CHECK_FOR_INTERRUPTS(); - if (cmp_ssup(pm - 1, pm, ssup) > 0) - { - presorted = 0; - break; - } - } - if (presorted) - return; - pm = a + (n / 2); - if (n > 7) - { - pl = a; - pn = a + (n - 1); - if (n > 40) - { - size_t d = (n / 8); - - pl = med3_ssup(pl, pl + d, pl + 2 * d, ssup); - pm = med3_ssup(pm - d, pm, pm + d, ssup); - pn = med3_ssup(pn - 2 * d, pn - d, pn, ssup); - } - pm = med3_ssup(pl, pm, pn, ssup); - } - swap(a, pm); - pa = pb = a + 1; - pc = pd = a + (n - 1); - for (;;) - { - while (pb <= pc && (r = cmp_ssup(pb, a, ssup)) <= 0) - { - if (r == 0) - { - swap(pa, pb); - pa++; - } - pb++; - CHECK_FOR_INTERRUPTS(); - } - while (pb <= pc && (r = cmp_ssup(pc, a, ssup)) >= 0) - { - if (r == 0) - { - swap(pc, pd); - pd--; - } - pc--; - CHECK_FOR_INTERRUPTS(); - } - if (pb > pc) - break; - swap(pb, pc); - pb++; - pc--; - } - pn = a + n; - d1 = Min(pa - a, pb - pa); - vecswap(a, pb - d1, d1); - d1 = Min(pd - pc, pn - pd - 1); - vecswap(pb, pn - d1, d1); - d1 = pb - pa; - d2 = pd - pc; - if (d1 <= d2) - { - /* Recurse on left partition, then iterate on right partition */ - if (d1 > 1) - qsort_ssup(a, d1, ssup); - if (d2 > 1) - { - /* Iterate rather than recurse to save stack space */ - /* qsort_ssup(pn - d2, d2, ssup); */ - a = pn - d2; - n = d2; - goto loop; - } - } - else - { - /* Recurse on right partition, then iterate on left partition */ - if (d2 > 1) - qsort_ssup(pn - d2, d2, ssup); - if (d1 > 1) - { - /* Iterate rather than recurse to save stack space */ - /* qsort_ssup(a, d1, ssup); */ - n = d1; - goto loop; - } - } -} - -static void -qsort_tuple(SortTuple *a, size_t n, SortTupleComparator cmp_tuple, RumTuplesortstate *state) -{ - SortTuple *pa, - *pb, - *pc, - *pd, - *pl, - *pm, - *pn; - size_t d1, - d2; - int r, - presorted; - -loop: - CHECK_FOR_INTERRUPTS(); - if (n < 7) - { - for (pm = a + 1; pm < a + n; pm++) - for (pl = pm; pl > a && cmp_tuple(pl - 1, pl, state) > 0; pl--) - swap(pl, pl - 1); - return; - } - presorted = 1; - for (pm = a + 1; pm < a + n; pm++) - { - CHECK_FOR_INTERRUPTS(); - if (cmp_tuple(pm - 1, pm, state) > 0) - { - presorted = 0; - break; - } - } - if (presorted) - return; - pm = a + (n / 2); - if (n > 7) - { - pl = a; - pn = a + (n - 1); - if (n > 40) - { - size_t d = (n / 8); - - pl = med3_tuple(pl, pl + d, pl + 2 * d, cmp_tuple, state); - pm = med3_tuple(pm - d, pm, pm + d, cmp_tuple, state); - pn = med3_tuple(pn - 2 * d, pn - d, pn, cmp_tuple, state); - } - pm = med3_tuple(pl, pm, pn, cmp_tuple, state); - } - swap(a, pm); - pa = pb = a + 1; - pc = pd = a + (n - 1); - for (;;) - { - while (pb <= pc && (r = cmp_tuple(pb, a, state)) <= 0) - { - if (r == 0) - { - swap(pa, pb); - pa++; - } - pb++; - CHECK_FOR_INTERRUPTS(); - } - while (pb <= pc && (r = cmp_tuple(pc, a, state)) >= 0) - { - if (r == 0) - { - swap(pc, pd); - pd--; - } - pc--; - CHECK_FOR_INTERRUPTS(); - } - if (pb > pc) - break; - swap(pb, pc); - pb++; - pc--; - } - pn = a + n; - d1 = Min(pa - a, pb - pa); - vecswap(a, pb - d1, d1); - d1 = Min(pd - pc, pn - pd - 1); - vecswap(pb, pn - d1, d1); - d1 = pb - pa; - d2 = pd - pc; - if (d1 <= d2) - { - /* Recurse on left 
partition, then iterate on right partition */ - if (d1 > 1) - qsort_tuple(a, d1, cmp_tuple, state); - if (d2 > 1) - { - /* Iterate rather than recurse to save stack space */ - /* qsort_tuple(pn - d2, d2, cmp_tuple, state); */ - a = pn - d2; - n = d2; - goto loop; - } - } - else - { - /* Recurse on right partition, then iterate on left partition */ - if (d2 > 1) - qsort_tuple(pn - d2, d2, cmp_tuple, state); - if (d1 > 1) - { - /* Iterate rather than recurse to save stack space */ - /* qsort_tuple(a, d1, cmp_tuple, state); */ - n = d1; - goto loop; - } - } -} - -/* - * rum_tuplesort_begin_xxx - * - * Initialize for a tuple sort operation. - * - * After calling rum_tuplesort_begin, the caller should call rum_tuplesort_putXXX - * zero or more times, then call rum_tuplesort_performsort when all the tuples - * have been supplied. After performsort, retrieve the tuples in sorted - * order by calling rum_tuplesort_getXXX until it returns false/NULL. (If random - * access was requested, rescan, markpos, and restorepos can also be called.) - * Call rum_tuplesort_end to terminate the operation and release memory/disk space. - * - * Each variant of rum_tuplesort_begin has a workMem parameter specifying the - * maximum number of kilobytes of RAM to use before spilling data to disk. - * (The normal value of this parameter is work_mem, but some callers use - * other values.) Each variant also has a randomAccess parameter specifying - * whether the caller needs non-sequential access to the sort result. - */ - -static RumTuplesortstate * -rum_tuplesort_begin_common(int workMem, bool randomAccess) -{ - RumTuplesortstate *state; - MemoryContext sortcontext; - MemoryContext oldcontext; - - /* - * Create a working memory context for this sort operation. All data - * needed by the sort will live inside this context. - */ - sortcontext = RumContextCreate(CurrentMemoryContext, "TupleSort"); - - /* - * Make the Tuplesortstate within the per-sort context. This way, we - * don't need a separate pfree() operation for it at shutdown. - */ - oldcontext = MemoryContextSwitchTo(sortcontext); - - state = (RumTuplesortstate *) palloc0(sizeof(RumTuplesortstate)); - -#ifdef TRACE_SORT - if (trace_sort) - pg_rusage_init(&state->ru_start); +#include "utils/tuplesort.h" + +#include "rum.h" /* RumItem */ + +#if PG_VERSION_NUM >= 160000 +/* + * After allocating a public interface for Tuplesortstate, no need to include + * source code from pg-core. + */ +#elif PG_VERSION_NUM >= 150000 +#include "tuplesort15.c" +#elif PG_VERSION_NUM >= 140000 +#include "tuplesort14.c" +#elif PG_VERSION_NUM >= 130000 +#include "tuplesort13.c" +#elif PG_VERSION_NUM >= 120000 +#include "tuplesort12.c" +#elif PG_VERSION_NUM >= 110000 +#include "tuplesort11.c" +#elif PG_VERSION_NUM >= 100000 +#include "tuplesort10.c" +#elif PG_VERSION_NUM >= 90600 +#include "tuplesort96.c" #endif - state->status = TSS_INITIAL; - state->randomAccess = randomAccess; - state->bounded = false; - state->boundUsed = false; - state->allowedMem = workMem * 1024L; - state->availMem = state->allowedMem; - state->sortcontext = sortcontext; - state->tapeset = NULL; - - state->memtupcount = 0; - - /* - * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD; - * see comments in grow_memtuples(). 
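Editor's note: the only lines this hunk adds are the version-gated includes above. On PostgreSQL 16 and later the core tuplesort machinery is reachable through the public utils/tuplesort.h interface, while older servers still need a bundled copy of the matching tuplesort source. As a trivial, hypothetical illustration (not part of the patch), the same PG_VERSION_NUM dispatch can gate any per-version code path:

#include "postgres.h"

/* Hypothetical helper, for illustration only. */
static const char *
rum_tuplesort_backend(void)
{
#if PG_VERSION_NUM >= 160000
	return "core tuplesort (public API)";
#elif PG_VERSION_NUM >= 150000
	return "bundled tuplesort15.c";
#else
	return "bundled pre-15 tuplesort copy";
#endif
}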
- */ - state->memtupsize = Max(1024, - ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1); - - state->growmemtuples = true; - state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple)); - - USEMEM(state, GetMemoryChunkSpace(state->memtuples)); - - /* workMem must be large enough for the minimal memtuples array */ - if (LACKMEM(state)) - elog(ERROR, "insufficient memory allowed for sort"); - - state->currentRun = 0; - - /* - * maxTapes, tapeRange, and Algorithm D variables will be initialized by - * inittapes(), if needed - */ - - state->result_tape = -1; /* flag that result tape has not been formed */ - - MemoryContextSwitchTo(oldcontext); - - return state; -} - -MemoryContext -rum_tuplesort_get_memorycontext(RumTuplesortstate *state) -{ - return state->sortcontext; -} - -RumTuplesortstate * -rum_tuplesort_begin_heap(TupleDesc tupDesc, - int nkeys, AttrNumber *attNums, - Oid *sortOperators, Oid *sortCollations, - bool *nullsFirstFlags, - int workMem, bool randomAccess) -{ - RumTuplesortstate *state = rum_tuplesort_begin_common(workMem, randomAccess); - MemoryContext oldcontext; - int i; - - oldcontext = MemoryContextSwitchTo(state->sortcontext); - - AssertArg(nkeys > 0); - -#ifdef TRACE_SORT - if (trace_sort) - elog(LOG, - "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", - nkeys, workMem, randomAccess ? 't' : 'f'); -#endif - - state->nKeys = nkeys; - - RUM_SORT_START(HEAP_SORT, - false, /* no unique check */ - nkeys, - workMem, - randomAccess); - - state->comparetup = comparetup_heap; - state->copytup = copytup_heap; - state->writetup = writetup_heap; - state->readtup = readtup_heap; - state->reversedirection = reversedirection_heap; - - state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ - - /* Prepare SortSupport data for each column */ - state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData)); - - for (i = 0; i < nkeys; i++) - { - SortSupport sortKey = state->sortKeys + i; - - AssertArg(attNums[i] != 0); - AssertArg(sortOperators[i] != 0); - - sortKey->ssup_cxt = CurrentMemoryContext; - sortKey->ssup_collation = sortCollations[i]; - sortKey->ssup_nulls_first = nullsFirstFlags[i]; - sortKey->ssup_attno = attNums[i]; - - PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey); - } - - if (nkeys == 1) - state->onlyKey = state->sortKeys; - - MemoryContextSwitchTo(oldcontext); - - return state; -} - -RumTuplesortstate * -rum_tuplesort_begin_cluster(TupleDesc tupDesc, - Relation indexRel, - int workMem, bool randomAccess) -{ - RumTuplesortstate *state = rum_tuplesort_begin_common(workMem, randomAccess); - MemoryContext oldcontext; - - Assert(indexRel->rd_rel->relam == BTREE_AM_OID); - - oldcontext = MemoryContextSwitchTo(state->sortcontext); - -#ifdef TRACE_SORT - if (trace_sort) - elog(LOG, - "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", - RelationGetNumberOfAttributes(indexRel), - workMem, randomAccess ? 
't' : 'f'); -#endif - - state->nKeys = RelationGetNumberOfAttributes(indexRel); - - RUM_SORT_START(CLUSTER_SORT, - false, /* no unique check */ - state->nKeys, - workMem, - randomAccess); - - state->comparetup = comparetup_cluster; - state->copytup = copytup_cluster; - state->writetup = writetup_cluster; - state->readtup = readtup_cluster; - state->reversedirection = reversedirection_index_btree; - - state->indexInfo = BuildIndexInfo(indexRel); - state->indexScanKey = _bt_mkscankey_nodata(indexRel); - - state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ - - if (state->indexInfo->ii_Expressions != NULL) - { - TupleTableSlot *slot; - ExprContext *econtext; - - /* - * We will need to use FormIndexDatum to evaluate the index - * expressions. To do that, we need an EState, as well as a - * TupleTableSlot to put the table tuples into. The econtext's - * scantuple has to point to that slot, too. - */ - state->estate = CreateExecutorState(); - slot = MakeSingleTupleTableSlot(tupDesc); - econtext = GetPerTupleExprContext(state->estate); - econtext->ecxt_scantuple = slot; - } - - MemoryContextSwitchTo(oldcontext); - - return state; -} - -RumTuplesortstate * -rum_tuplesort_begin_index_btree(Relation heapRel, - Relation indexRel, - bool enforceUnique, - int workMem, bool randomAccess) -{ - RumTuplesortstate *state = rum_tuplesort_begin_common(workMem, randomAccess); - MemoryContext oldcontext; - - oldcontext = MemoryContextSwitchTo(state->sortcontext); - -#ifdef TRACE_SORT - if (trace_sort) - elog(LOG, - "begin index sort: unique = %c, workMem = %d, randomAccess = %c", - enforceUnique ? 't' : 'f', - workMem, randomAccess ? 't' : 'f'); -#endif - - state->nKeys = RelationGetNumberOfAttributes(indexRel); - - RUM_SORT_START(INDEX_SORT, - enforceUnique, - state->nKeys, - workMem, - randomAccess); - - state->comparetup = comparetup_index_btree; - state->copytup = copytup_index; - state->writetup = writetup_index; - state->readtup = readtup_index; - state->reversedirection = reversedirection_index_btree; - - state->heapRel = heapRel; - state->indexRel = indexRel; - state->indexScanKey = _bt_mkscankey_nodata(indexRel); - state->enforceUnique = enforceUnique; - - MemoryContextSwitchTo(oldcontext); - - return state; -} - -RumTuplesortstate * -rum_tuplesort_begin_index_hash(Relation heapRel, - Relation indexRel, - uint32 hash_mask, - int workMem, bool randomAccess) -{ - RumTuplesortstate *state = rum_tuplesort_begin_common(workMem, randomAccess); - MemoryContext oldcontext; - - oldcontext = MemoryContextSwitchTo(state->sortcontext); - -#ifdef TRACE_SORT - if (trace_sort) - elog(LOG, - "begin index sort: hash_mask = 0x%x, workMem = %d, randomAccess = %c", - hash_mask, - workMem, randomAccess ? 
't' : 'f'); -#endif - - state->nKeys = 1; /* Only one sort column, the hash code */ - - state->comparetup = comparetup_index_hash; - state->copytup = copytup_index; - state->writetup = writetup_index; - state->readtup = readtup_index; - state->reversedirection = reversedirection_index_hash; - - state->heapRel = heapRel; - state->indexRel = indexRel; - - state->hash_mask = hash_mask; - - MemoryContextSwitchTo(oldcontext); - - return state; -} - -RumTuplesortstate * -rum_tuplesort_begin_rum(int workMem, int nKeys, bool randomAccess, - bool compareItemPointer) -{ - RumTuplesortstate *state = rum_tuplesort_begin_common(workMem, randomAccess); - MemoryContext oldcontext; - - oldcontext = MemoryContextSwitchTo(state->sortcontext); - -#ifdef TRACE_SORT - if (trace_sort) - elog(LOG, - "begin rum sort: nKeys = %d, workMem = %d, randomAccess = %c", - nKeys, workMem, randomAccess ? 't' : 'f'); -#endif - - state->nKeys = nKeys; - - RUM_SORT_START(INDEX_SORT, - false, /* no unique check */ - state->nKeys, - workMem, - randomAccess); - - state->comparetup = comparetup_rum; - state->copytup = copytup_rum; - state->writetup = writetup_rum; - state->readtup = readtup_rum; - state->reversedirection = reversedirection_rum; - state->reverse = false; - state->compareItemPointer = compareItemPointer; - - MemoryContextSwitchTo(oldcontext); - - return state; -} - -RumTuplesortstate * -rum_tuplesort_begin_rumitem(int workMem, FmgrInfo *cmp) -{ - RumTuplesortstate *state = rum_tuplesort_begin_common(workMem, false); - MemoryContext oldcontext; - - oldcontext = MemoryContextSwitchTo(state->sortcontext); - -#ifdef TRACE_SORT - if (trace_sort) - elog(LOG, - "begin rumitem sort: workMem = %d", workMem); -#endif - - RUM_SORT_START(INDEX_SORT, - false, /* no unique check */ - 2, - workMem, - false); - - state->cmp = cmp; - state->comparetup = comparetup_rumitem; - state->copytup = copytup_rumitem; - state->writetup = writetup_rumitem; - state->readtup = readtup_rumitem; - state->reversedirection = reversedirection_rum; - state->reverse = false; - state->compareItemPointer = false; - - MemoryContextSwitchTo(oldcontext); - - return state; -} - -RumTuplesortstate * -rum_tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, - bool nullsFirstFlag, - int workMem, bool randomAccess) -{ - RumTuplesortstate *state = rum_tuplesort_begin_common(workMem, randomAccess); - MemoryContext oldcontext; - int16 typlen; - bool typbyval; - - oldcontext = MemoryContextSwitchTo(state->sortcontext); - -#ifdef TRACE_SORT - if (trace_sort) - elog(LOG, - "begin datum sort: workMem = %d, randomAccess = %c", - workMem, randomAccess ? 
't' : 'f'); -#endif - - state->nKeys = 1; /* always a one-column sort */ - - RUM_SORT_START(DATUM_SORT, - false, /* no unique check */ - 1, - workMem, - randomAccess); - - state->comparetup = comparetup_datum; - state->copytup = copytup_datum; - state->writetup = writetup_datum; - state->readtup = readtup_datum; - state->reversedirection = reversedirection_datum; - - state->datumType = datumType; - - /* Prepare SortSupport data */ - state->onlyKey = (SortSupport) palloc0(sizeof(SortSupportData)); - - state->onlyKey->ssup_cxt = CurrentMemoryContext; - state->onlyKey->ssup_collation = sortCollation; - state->onlyKey->ssup_nulls_first = nullsFirstFlag; - - PrepareSortSupportFromOrderingOp(sortOperator, state->onlyKey); - - /* lookup necessary attributes of the datum type */ - get_typlenbyval(datumType, &typlen, &typbyval); - state->datumTypeLen = typlen; - state->datumTypeByVal = typbyval; - - MemoryContextSwitchTo(oldcontext); - - return state; -} - -/* - * rum_tuplesort_set_bound - * - * Advise tuplesort that at most the first N result tuples are required. - * - * Must be called before inserting any tuples. (Actually, we could allow it - * as long as the sort hasn't spilled to disk, but there seems no need for - * delayed calls at the moment.) - * - * This is a hint only. The tuplesort may still return more tuples than - * requested. - */ -void -rum_tuplesort_set_bound(RumTuplesortstate *state, int64 bound) -{ - /* Assert we're called before loading any tuples */ - Assert(state->status == TSS_INITIAL); - Assert(state->memtupcount == 0); - Assert(!state->bounded); - -#ifdef DEBUG_BOUNDED_SORT - /* Honor GUC setting that disables the feature (for easy testing) */ - if (!optimize_bounded_sort) - return; -#endif - - /* We want to be able to compute bound * 2, so limit the setting */ - if (bound > (int64) (INT_MAX / 2)) - return; - - state->bounded = true; - state->bound = (int) bound; -} - -/* - * rum_tuplesort_end - * - * Release resources and clean up. - * - * NOTE: after calling this, any pointers returned by rum_tuplesort_getXXX are - * pointing to garbage. Be careful not to attempt to use or free such - * pointers afterwards! - */ -void -rum_tuplesort_end(RumTuplesortstate *state) -{ - /* context swap probably not needed, but let's be safe */ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - -#ifdef TRACE_SORT - long spaceUsed; - - if (state->tapeset) - spaceUsed = LogicalTapeSetBlocks(state->tapeset); - else - spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; -#endif - - /* - * Delete temporary "tape" files, if any. - * - * Note: want to include this in reported total cost of sort, hence need - * for two #ifdef TRACE_SORT sections. - */ - if (state->tapeset) - LogicalTapeSetClose(state->tapeset); - -#ifdef TRACE_SORT - if (trace_sort) - { - if (state->tapeset) - elog(LOG, "external sort ended, %ld disk blocks used: %s", - spaceUsed, pg_rusage_show(&state->ru_start)); - else - elog(LOG, "internal sort ended, %ld KB used: %s", - spaceUsed, pg_rusage_show(&state->ru_start)); - } - - TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, spaceUsed); -#else - - /* - * If you disabled TRACE_SORT, you can still probe sort__done, but you - * ain't getting space-used stats. 
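Editor's note: the rum_tuplesort_begin_xxx comment above spells out the caller contract: begin, put tuples, performsort, fetch until exhausted, then end. A minimal sketch of that lifecycle for the RumSortItem path, using only the signatures visible in this file and assuming rumsort.h declares them; the helper name and loop body are illustrative, not code from this patch:

#include "postgres.h"
#include "miscadmin.h"		/* work_mem */
#include "rumsort.h"

static void
rum_sort_items_example(RumSortItem **items, int nitems, int nKeys)
{
	RumTuplesortstate *sortstate;
	RumSortItem *item;
	bool		should_free;
	int			i;

	/* no random access; break ties on ItemPointer */
	sortstate = rum_tuplesort_begin_rum(work_mem, nKeys, false, true);

	for (i = 0; i < nitems; i++)
		rum_tuplesort_putrum(sortstate, items[i]);	/* input is copied */

	rum_tuplesort_performsort(sortstate);

	while ((item = rum_tuplesort_getrum(sortstate, true, &should_free)) != NULL)
	{
		/* consume the sorted item here */
		if (should_free)
			pfree(item);
	}

	rum_tuplesort_end(sortstate);
}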
- */ - TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, 0L); -#endif - - /* Free any execution state created for CLUSTER case */ - if (state->estate != NULL) - { - ExprContext *econtext = GetPerTupleExprContext(state->estate); - - ExecDropSingleTupleTableSlot(econtext->ecxt_scantuple); - FreeExecutorState(state->estate); - } - - MemoryContextSwitchTo(oldcontext); - - /* - * Free the per-sort memory context, thereby releasing all working memory, - * including the Tuplesortstate struct itself. - */ - MemoryContextDelete(state->sortcontext); -} - -/* - * Grow the memtuples[] array, if possible within our memory constraint. - * Return true if we were able to enlarge the array, false if not. - * - * Normally, at each increment we double the size of the array. When we no - * longer have enough memory to do that, we attempt one last, smaller increase - * (and then clear the growmemtuples flag so we don't try any more). That - * allows us to use allowedMem as fully as possible; sticking to the pure - * doubling rule could result in almost half of allowedMem going unused. - * Because availMem moves around with tuple addition/removal, we need some - * rule to prevent making repeated small increases in memtupsize, which would - * just be useless thrashing. The growmemtuples flag accomplishes that and - * also prevents useless recalculations in this function. - */ -static bool -grow_memtuples(RumTuplesortstate *state) -{ - int newmemtupsize; - int memtupsize = state->memtupsize; - long memNowUsed = state->allowedMem - state->availMem; - - /* Forget it if we've already maxed out memtuples, per comment above */ - if (!state->growmemtuples) - return false; - - /* Select new value of memtupsize */ - if (memNowUsed <= state->availMem) - { - /* - * It is surely safe to double memtupsize if we've used no more than - * half of allowedMem. - * - * Note: it might seem that we need to worry about memtupsize * 2 - * overflowing an int, but the MaxAllocSize clamp applied below - * ensures the existing memtupsize can't be large enough for that. - */ - newmemtupsize = memtupsize * 2; - } - else - { - /* - * This will be the last increment of memtupsize. Abandon doubling - * strategy and instead increase as much as we safely can. - * - * To stay within allowedMem, we can't increase memtupsize by more - * than availMem / sizeof(SortTuple) elements. In practice, we want - * to increase it by considerably less, because we need to leave some - * space for the tuples to which the new array slots will refer. We - * assume the new tuples will be about the same size as the tuples - * we've already seen, and thus we can extrapolate from the space - * consumption so far to estimate an appropriate new size for the - * memtuples array. The optimal value might be higher or lower than - * this estimate, but it's hard to know that in advance. - * - * This calculation is safe against enlarging the array so much that - * LACKMEM becomes true, because the memory currently used includes - * the present array; thus, there would be enough allowedMem for the - * new array elements even if no other memory were currently used. - * - * We do the arithmetic in float8, because otherwise the product of - * memtupsize and allowedMem could overflow. (A little algebra shows - * that grow_ratio must be less than 2 here, so we are not risking - * integer overflow this way.) 
Any inaccuracy in the result should be - * insignificant; but even if we computed a completely insane result, - * the checks below will prevent anything really bad from happening. - */ - double grow_ratio; - - grow_ratio = (double) state->allowedMem / (double) memNowUsed; - newmemtupsize = (int) (memtupsize * grow_ratio); - - /* We won't make any further enlargement attempts */ - state->growmemtuples = false; - } - - /* Must enlarge array by at least one element, else report failure */ - if (newmemtupsize <= memtupsize) - goto noalloc; - - /* - * On a 64-bit machine, allowedMem could be more than MaxAllocSize. Clamp - * to ensure our request won't be rejected by palloc. - */ - if ((Size) newmemtupsize >= MaxAllocSize / sizeof(SortTuple)) - { - newmemtupsize = (int) (MaxAllocSize / sizeof(SortTuple)); - state->growmemtuples = false; /* can't grow any more */ - } - - /* - * We need to be sure that we do not cause LACKMEM to become true, else - * the space management algorithm will go nuts. The code above should - * never generate a dangerous request, but to be safe, check explicitly - * that the array growth fits within availMem. (We could still cause - * LACKMEM if the memory chunk overhead associated with the memtuples - * array were to increase. That shouldn't happen because we chose the - * initial array size large enough to ensure that palloc will be treating - * both old and new arrays as separate chunks. But we'll check LACKMEM - * explicitly below just in case.) - */ - if (state->availMem < (long) ((newmemtupsize - memtupsize) * sizeof(SortTuple))) - goto noalloc; - - /* OK, do it */ - FREEMEM(state, GetMemoryChunkSpace(state->memtuples)); - state->memtupsize = newmemtupsize; - state->memtuples = (SortTuple *) - repalloc(state->memtuples, - state->memtupsize * sizeof(SortTuple)); - USEMEM(state, GetMemoryChunkSpace(state->memtuples)); - if (LACKMEM(state)) - elog(ERROR, "unexpected out-of-memory situation in tuplesort"); - return true; - -noalloc: - /* If for any reason we didn't realloc, shut off future attempts */ - state->growmemtuples = false; - return false; -} - -/* - * Accept one tuple while collecting input data for sort. - * - * Note that the input data is always copied; the caller need not save it. - */ -void -rum_tuplesort_puttupleslot(RumTuplesortstate *state, TupleTableSlot *slot) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - SortTuple stup; - - /* - * Copy the given tuple into memory we control, and decrease availMem. - * Then call the common code. - */ - COPYTUP(state, &stup, (void *) slot); - - puttuple_common(state, &stup); - - MemoryContextSwitchTo(oldcontext); -} - -/* - * Accept one tuple while collecting input data for sort. - * - * Note that the input data is always copied; the caller need not save it. - */ -void -rum_tuplesort_putheaptuple(RumTuplesortstate *state, HeapTuple tup) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - SortTuple stup; - - /* - * Copy the given tuple into memory we control, and decrease availMem. - * Then call the common code. - */ - COPYTUP(state, &stup, (void *) tup); - - puttuple_common(state, &stup); - - MemoryContextSwitchTo(oldcontext); -} - -/* - * Accept one index tuple while collecting input data for sort. - * - * Note that the input tuple is always copied; the caller need not save it. 
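Editor's note: the growth policy that grow_memtuples() documents above (double while cheap, then one final proportional enlargement) can be summarized in isolation. The standalone sketch below shows only the sizing rule; the real function additionally clamps to MaxAllocSize and re-checks LACKMEM before repalloc'ing:

#include <stdbool.h>

static long
next_memtupsize(long memtupsize, long allowedMem, long availMem,
				bool *growAgain)
{
	long		memNowUsed = allowedMem - availMem;

	if (memNowUsed <= availMem)
	{
		/* no more than half of allowedMem is in use: just double */
		*growAgain = true;
		return memtupsize * 2;
	}
	else
	{
		/* last enlargement: grow in proportion to the remaining budget */
		double		grow_ratio = (double) allowedMem / (double) memNowUsed;

		*growAgain = false;
		return (long) (memtupsize * grow_ratio);
	}
}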
- */ -void -rum_tuplesort_putindextuple(RumTuplesortstate *state, IndexTuple tuple) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - SortTuple stup; - - /* - * Copy the given tuple into memory we control, and decrease availMem. - * Then call the common code. - */ - COPYTUP(state, &stup, (void *) tuple); - - puttuple_common(state, &stup); - - MemoryContextSwitchTo(oldcontext); -} - -/* - * Accept one Datum while collecting input data for sort. - * - * If the Datum is pass-by-ref type, the value will be copied. - */ -void -rum_tuplesort_putdatum(RumTuplesortstate *state, Datum val, bool isNull) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - SortTuple stup; - - /* - * If it's a pass-by-reference value, copy it into memory we control, and - * decrease availMem. Then call the common code. - */ - if (isNull || state->datumTypeByVal) - { - stup.datum1 = val; - stup.isnull1 = isNull; - stup.tuple = NULL; /* no separate storage */ - } - else - { - stup.datum1 = datumCopy(val, false, state->datumTypeLen); - stup.isnull1 = false; - stup.tuple = DatumGetPointer(stup.datum1); - USEMEM(state, GetMemoryChunkSpace(stup.tuple)); - } - - puttuple_common(state, &stup); - - MemoryContextSwitchTo(oldcontext); -} - -void -rum_tuplesort_putrum(RumTuplesortstate *state, RumSortItem * item) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - SortTuple stup; - - /* - * Copy the given tuple into memory we control, and decrease availMem. - * Then call the common code. - */ - COPYTUP(state, &stup, (void *) item); - - puttuple_common(state, &stup); - - MemoryContextSwitchTo(oldcontext); -} - -void -rum_tuplesort_putrumitem(RumTuplesortstate *state, RumScanItem * item) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - SortTuple stup; - - /* - * Copy the given tuple into memory we control, and decrease availMem. - * Then call the common code. - */ - COPYTUP(state, &stup, (void *) item); - - puttuple_common(state, &stup); - - MemoryContextSwitchTo(oldcontext); -} - -/* - * Shared code for tuple and datum cases. - */ -static void -puttuple_common(RumTuplesortstate *state, SortTuple *tuple) -{ - switch (state->status) - { - case TSS_INITIAL: - - /* - * Save the tuple into the unsorted array. First, grow the array - * as needed. Note that we try to grow the array when there is - * still one free slot remaining --- if we fail, there'll still be - * room to store the incoming tuple, and then we'll switch to - * tape-based operation. - */ - if (state->memtupcount >= state->memtupsize - 1) - { - (void) grow_memtuples(state); - Assert(state->memtupcount < state->memtupsize); - } - state->memtuples[state->memtupcount++] = *tuple; - - /* - * Check if it's time to switch over to a bounded heapsort. We do - * so if the input tuple count exceeds twice the desired tuple - * count (this is a heuristic for where heapsort becomes cheaper - * than a quicksort), or if we've just filled workMem and have - * enough tuples to meet the bound. - * - * Note that once we enter TSS_BOUNDED state we will always try to - * complete the sort that way. In the worst case, if later input - * tuples are larger than earlier ones, this might cause us to - * exceed workMem significantly. 
- */ - if (state->bounded && - (state->memtupcount > state->bound * 2 || - (state->memtupcount > state->bound && LACKMEM(state)))) - { -#ifdef TRACE_SORT - if (trace_sort) - elog(LOG, "switching to bounded heapsort at %d tuples: %s", - state->memtupcount, - pg_rusage_show(&state->ru_start)); -#endif - make_bounded_heap(state); - return; - } - - /* - * Done if we still fit in available memory and have array slots. - */ - if (state->memtupcount < state->memtupsize && !LACKMEM(state)) - return; - - /* - * Nope; time to switch to tape-based operation. - */ - inittapes(state); - - /* - * Dump tuples until we are back under the limit. - */ - dumptuples(state, false); - break; - - case TSS_BOUNDED: - - /* - * We don't want to grow the array here, so check whether the new - * tuple can be discarded before putting it in. This should be a - * good speed optimization, too, since when there are many more - * input tuples than the bound, most input tuples can be discarded - * with just this one comparison. Note that because we currently - * have the sort direction reversed, we must check for <= not >=. - */ - if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0) - { - /* new tuple <= top of the heap, so we can discard it */ - free_sort_tuple(state, tuple); - CHECK_FOR_INTERRUPTS(); - } - else - { - /* discard top of heap, sift up, insert new tuple */ - free_sort_tuple(state, &state->memtuples[0]); - rum_tuplesort_heap_siftup(state, false); - rum_tuplesort_heap_insert(state, tuple, 0, false); - } - break; - - case TSS_BUILDRUNS: - - /* - * Insert the tuple into the heap, with run number currentRun if - * it can go into the current run, else run number currentRun+1. - * The tuple can go into the current run if it is >= the first - * not-yet-output tuple. (Actually, it could go into the current - * run if it is >= the most recently output tuple ... but that - * would require keeping around the tuple we last output, and it's - * simplest to let writetup free each tuple as soon as it's - * written.) - * - * Note there will always be at least one tuple in the heap at - * this point; see dumptuples. - */ - Assert(state->memtupcount > 0); - if (COMPARETUP(state, tuple, &state->memtuples[0]) >= 0) - rum_tuplesort_heap_insert(state, tuple, state->currentRun, true); - else - rum_tuplesort_heap_insert(state, tuple, state->currentRun + 1, true); - - /* - * If we are over the memory limit, dump tuples till we're under. - */ - dumptuples(state, false); - break; - - default: - elog(ERROR, "invalid tuplesort state"); - break; - } -} - -/* - * All tuples have been provided; finish the sort. - */ -void -rum_tuplesort_performsort(RumTuplesortstate *state) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - -#ifdef TRACE_SORT - if (trace_sort) - elog(LOG, "performsort starting: %s", - pg_rusage_show(&state->ru_start)); -#endif - - switch (state->status) - { - case TSS_INITIAL: - - /* - * We were able to accumulate all the tuples within the allowed - * amount of memory. Just qsort 'em and we're done. - */ - if (state->memtupcount > 1) - { - /* Can we use the single-key sort function? 
*/ - if (state->onlyKey != NULL) - qsort_ssup(state->memtuples, state->memtupcount, - state->onlyKey); - else - qsort_tuple(state->memtuples, - state->memtupcount, - state->comparetup, - state); - } - state->current = 0; - state->eof_reached = false; - state->markpos_offset = 0; - state->markpos_eof = false; - state->status = TSS_SORTEDINMEM; - break; - - case TSS_BOUNDED: - - /* - * We were able to accumulate all the tuples required for output - * in memory, using a heap to eliminate excess tuples. Now we - * have to transform the heap to a properly-sorted array. - */ - sort_bounded_heap(state); - state->current = 0; - state->eof_reached = false; - state->markpos_offset = 0; - state->markpos_eof = false; - state->status = TSS_SORTEDINMEM; - break; - - case TSS_BUILDRUNS: - - /* - * Finish tape-based sort. First, flush all tuples remaining in - * memory out to tape; then merge until we have a single remaining - * run (or, if !randomAccess, one run per tape). Note that - * mergeruns sets the correct state->status. - */ - dumptuples(state, true); - mergeruns(state); - state->eof_reached = false; - state->markpos_block = 0L; - state->markpos_offset = 0; - state->markpos_eof = false; - break; - - default: - elog(ERROR, "invalid tuplesort state"); - break; - } - -#ifdef TRACE_SORT - if (trace_sort) - { - if (state->status == TSS_FINALMERGE) - elog(LOG, "performsort done (except %d-way final merge): %s", - state->activeTapes, - pg_rusage_show(&state->ru_start)); - else - elog(LOG, "performsort done: %s", - pg_rusage_show(&state->ru_start)); - } -#endif - - MemoryContextSwitchTo(oldcontext); -} - -/* - * Internal routine to fetch the next tuple in either forward or back - * direction into *stup. Returns false if no more tuples. - * If *should_free is set, the caller must pfree stup.tuple when done with it. - */ -static bool -rum_tuplesort_gettuple_common(RumTuplesortstate *state, bool forward, - SortTuple *stup, bool *should_free) -{ - unsigned int tuplen; - - switch (state->status) - { - case TSS_SORTEDINMEM: - Assert(forward || state->randomAccess); - *should_free = false; - if (forward) - { - if (state->current < state->memtupcount) - { - *stup = state->memtuples[state->current++]; - return true; - } - state->eof_reached = true; - - /* - * Complain if caller tries to retrieve more tuples than - * originally asked for in a bounded sort. This is because - * returning EOF here might be the wrong thing. - */ - if (state->bounded && state->current >= state->bound) - elog(ERROR, "retrieved too many tuples in a bounded sort"); - - return false; - } - else - { - if (state->current <= 0) - return false; - - /* - * if all tuples are fetched already then we return last - * tuple, else - tuple before last returned. - */ - if (state->eof_reached) - state->eof_reached = false; - else - { - state->current--; /* last returned tuple */ - if (state->current <= 0) - return false; - } - *stup = state->memtuples[state->current - 1]; - return true; - } - break; - - case TSS_SORTEDONTAPE: - Assert(forward || state->randomAccess); - *should_free = true; - if (forward) - { - if (state->eof_reached) - return false; - if ((tuplen = getlen(state, state->result_tape, true)) != 0) - { - READTUP(state, stup, state->result_tape, tuplen); - return true; - } - else - { - state->eof_reached = true; - return false; - } - } - - /* - * Backward. - * - * if all tuples are fetched already then we return last tuple, - * else - tuple before last returned. 
- */ - if (state->eof_reached) - { - /* - * Seek position is pointing just past the zero tuplen at the - * end of file; back up to fetch last tuple's ending length - * word. If seek fails we must have a completely empty file. - */ - if (!LogicalTapeBackspace(state->tapeset, - state->result_tape, - 2 * sizeof(unsigned int))) - return false; - state->eof_reached = false; - } - else - { - /* - * Back up and fetch previously-returned tuple's ending length - * word. If seek fails, assume we are at start of file. - */ - if (!LogicalTapeBackspace(state->tapeset, - state->result_tape, - sizeof(unsigned int))) - return false; - tuplen = getlen(state, state->result_tape, false); - - /* - * Back up to get ending length word of tuple before it. - */ - if (!LogicalTapeBackspace(state->tapeset, - state->result_tape, - tuplen + 2 * sizeof(unsigned int))) - { - /* - * If that fails, presumably the prev tuple is the first - * in the file. Back up so that it becomes next to read - * in forward direction (not obviously right, but that is - * what in-memory case does). - */ - if (!LogicalTapeBackspace(state->tapeset, - state->result_tape, - tuplen + sizeof(unsigned int))) - elog(ERROR, "bogus tuple length in backward scan"); - return false; - } - } - - tuplen = getlen(state, state->result_tape, false); - - /* - * Now we have the length of the prior tuple, back up and read it. - * Note: READTUP expects we are positioned after the initial - * length word of the tuple, so back up to that point. - */ - if (!LogicalTapeBackspace(state->tapeset, - state->result_tape, - tuplen)) - elog(ERROR, "bogus tuple length in backward scan"); - READTUP(state, stup, state->result_tape, tuplen); - return true; - - case TSS_FINALMERGE: - Assert(forward); - *should_free = true; - - /* - * This code should match the inner loop of mergeonerun(). - */ - if (state->memtupcount > 0) - { - int srcTape = state->memtuples[0].tupindex; - Size tuplen; - int tupIndex; - SortTuple *newtup; - - *stup = state->memtuples[0]; - /* returned tuple is no longer counted in our memory space */ - if (stup->tuple) - { - tuplen = GetMemoryChunkSpace(stup->tuple); - state->availMem += tuplen; - state->mergeavailmem[srcTape] += tuplen; - } - rum_tuplesort_heap_siftup(state, false); - if ((tupIndex = state->mergenext[srcTape]) == 0) - { - /* - * out of preloaded data on this tape, try to read more - * - * Unlike mergeonerun(), we only preload from the single - * tape that's run dry. See mergepreread() comments. - */ - mergeprereadone(state, srcTape); - - /* - * if still no data, we've reached end of run on this tape - */ - if ((tupIndex = state->mergenext[srcTape]) == 0) - return true; - } - /* pull next preread tuple from list, insert in heap */ - newtup = &state->memtuples[tupIndex]; - state->mergenext[srcTape] = newtup->tupindex; - if (state->mergenext[srcTape] == 0) - state->mergelast[srcTape] = 0; - rum_tuplesort_heap_insert(state, newtup, srcTape, false); - /* put the now-unused memtuples entry on the freelist */ - newtup->tupindex = state->mergefreelist; - state->mergefreelist = tupIndex; - state->mergeavailslots[srcTape]++; - return true; - } - return false; - - default: - elog(ERROR, "invalid tuplesort state"); - return false; /* keep compiler quiet */ - } -} - -/* - * Fetch the next tuple in either forward or back direction. - * If successful, put tuple in slot and return true; else, clear the slot - * and return false. 
- */ -bool -rum_tuplesort_gettupleslot(RumTuplesortstate *state, bool forward, - TupleTableSlot *slot) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - SortTuple stup; - bool should_free; - - if (!rum_tuplesort_gettuple_common(state, forward, &stup, &should_free)) - stup.tuple = NULL; - - MemoryContextSwitchTo(oldcontext); - - if (stup.tuple) - { - ExecStoreMinimalTuple((MinimalTuple) stup.tuple, slot, should_free); - return true; - } - else - { - ExecClearTuple(slot); - return false; - } -} - -/* - * Fetch the next tuple in either forward or back direction. - * Returns NULL if no more tuples. If *should_free is set, the - * caller must pfree the returned tuple when done with it. - */ -HeapTuple -rum_tuplesort_getheaptuple(RumTuplesortstate *state, bool forward, bool *should_free) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - SortTuple stup; - - if (!rum_tuplesort_gettuple_common(state, forward, &stup, should_free)) - stup.tuple = NULL; - - MemoryContextSwitchTo(oldcontext); - - return stup.tuple; -} - -/* - * Fetch the next index tuple in either forward or back direction. - * Returns NULL if no more tuples. If *should_free is set, the - * caller must pfree the returned tuple when done with it. - */ -IndexTuple -rum_tuplesort_getindextuple(RumTuplesortstate *state, bool forward, - bool *should_free) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - SortTuple stup; - - if (!rum_tuplesort_gettuple_common(state, forward, &stup, should_free)) - stup.tuple = NULL; - - MemoryContextSwitchTo(oldcontext); - - return (IndexTuple) stup.tuple; -} - -/* - * Fetch the next Datum in either forward or back direction. - * Returns false if no more datums. - * - * If the Datum is pass-by-ref type, the returned value is freshly palloc'd - * and is now owned by the caller. - */ -bool -rum_tuplesort_getdatum(RumTuplesortstate *state, bool forward, - Datum *val, bool *isNull) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - SortTuple stup; - bool should_free; - - if (!rum_tuplesort_gettuple_common(state, forward, &stup, &should_free)) - { - MemoryContextSwitchTo(oldcontext); - return false; - } - - if (stup.isnull1 || state->datumTypeByVal) - { - *val = stup.datum1; - *isNull = stup.isnull1; - } - else - { - if (should_free) - *val = stup.datum1; - else - *val = datumCopy(stup.datum1, false, state->datumTypeLen); - *isNull = false; - } - - MemoryContextSwitchTo(oldcontext); - - return true; -} - -RumSortItem * -rum_tuplesort_getrum(RumTuplesortstate *state, bool forward, bool *should_free) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - SortTuple stup; - - if (!rum_tuplesort_gettuple_common(state, forward, &stup, should_free)) - stup.tuple = NULL; - - MemoryContextSwitchTo(oldcontext); - - return (RumSortItem *) stup.tuple; -} - -RumScanItem * -rum_tuplesort_getrumitem(RumTuplesortstate *state, bool forward, bool *should_free) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - SortTuple stup; - - if (!rum_tuplesort_gettuple_common(state, forward, &stup, should_free)) - stup.tuple = NULL; - - MemoryContextSwitchTo(oldcontext); - - return (RumScanItem *) stup.tuple; -} - -/* - * rum_tuplesort_merge_order - report merge order we'll use for given memory - * (note: "merge order" just means the number of input tapes in the merge). - * - * This is exported for use by the planner. allowedMem is in bytes. 
- */ -int -rum_tuplesort_merge_order(long allowedMem) -{ - int mOrder; - - /* - * We need one tape for each merge input, plus another one for the output, - * and each of these tapes needs buffer space. In addition we want - * MERGE_BUFFER_SIZE workspace per input tape (but the output tape doesn't - * count). - * - * Note: you might be thinking we need to account for the memtuples[] - * array in this calculation, but we effectively treat that as part of the - * MERGE_BUFFER_SIZE workspace. - */ - mOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) / - (MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD); - - /* Even in minimum memory, use at least a MINORDER merge */ - mOrder = Max(mOrder, MINORDER); - - return mOrder; -} - -/* - * inittapes - initialize for tape sorting. - * - * This is called only if we have found we don't have room to sort in memory. - */ -static void -inittapes(RumTuplesortstate *state) -{ - int maxTapes, - ntuples, - j; - long tapeSpace; - - /* Compute number of tapes to use: merge order plus 1 */ - maxTapes = rum_tuplesort_merge_order(state->allowedMem) + 1; - - /* - * We must have at least 2*maxTapes slots in the memtuples[] array, else - * we'd not have room for merge heap plus preread. It seems unlikely that - * this case would ever occur, but be safe. - */ - maxTapes = Min(maxTapes, state->memtupsize / 2); - - state->maxTapes = maxTapes; - state->tapeRange = maxTapes - 1; - -#ifdef TRACE_SORT - if (trace_sort) - elog(LOG, "switching to external sort with %d tapes: %s", - maxTapes, pg_rusage_show(&state->ru_start)); -#endif - - /* - * Decrease availMem to reflect the space needed for tape buffers; but - * don't decrease it to the point that we have no room for tuples. (That - * case is only likely to occur if sorting pass-by-value Datums; in all - * other scenarios the memtuples[] array is unlikely to occupy more than - * half of allowedMem. In the pass-by-value case it's not important to - * account for tuple space, so we don't care if LACKMEM becomes - * inaccurate.) - */ - tapeSpace = (long) maxTapes *TAPE_BUFFER_OVERHEAD; - - if (tapeSpace + GetMemoryChunkSpace(state->memtuples) < state->allowedMem) - USEMEM(state, tapeSpace); - - /* - * Make sure that the temp file(s) underlying the tape set are created in - * suitable temp tablespaces. - */ - PrepareTempTablespaces(); - - /* - * Create the tape set and allocate the per-tape data arrays. - */ - state->tapeset = LogicalTapeSetCreate(maxTapes); - - state->mergeactive = (bool *) palloc0(maxTapes * sizeof(bool)); - state->mergenext = (int *) palloc0(maxTapes * sizeof(int)); - state->mergelast = (int *) palloc0(maxTapes * sizeof(int)); - state->mergeavailslots = (int *) palloc0(maxTapes * sizeof(int)); - state->mergeavailmem = (long *) palloc0(maxTapes * sizeof(long)); - state->tp_fib = (int *) palloc0(maxTapes * sizeof(int)); - state->tp_runs = (int *) palloc0(maxTapes * sizeof(int)); - state->tp_dummy = (int *) palloc0(maxTapes * sizeof(int)); - state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int)); - - /* - * Convert the unsorted contents of memtuples[] into a heap. Each tuple is - * marked as belonging to run number zero. - * - * NOTE: we pass false for checkIndex since there's no point in comparing - * indexes in this step, even though we do intend the indexes to be part - * of the sort key... 
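Editor's note: rum_tuplesort_merge_order() above is a straight formula, so a worked example makes the numbers concrete. The constants below are the values tuplesort.c of this era used (TAPE_BUFFER_OVERHEAD = BLCKSZ, MERGE_BUFFER_SIZE = BLCKSZ * 32, MINORDER = 6); treat them as illustrative assumptions rather than definitions taken from this patch:

#include <stdio.h>

#define BLCKSZ					8192
#define MINORDER				6
#define TAPE_BUFFER_OVERHEAD	BLCKSZ
#define MERGE_BUFFER_SIZE		(BLCKSZ * 32)

static int
merge_order(long allowedMem)
{
	long		mOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) /
						 (MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD);

	return (int) (mOrder > MINORDER ? mOrder : MINORDER);
}

int
main(void)
{
	/* 4MB of sort memory allows a 15-way merge; 64kB falls back to MINORDER */
	printf("%d\n", merge_order(4L * 1024 * 1024));	/* prints 15 */
	printf("%d\n", merge_order(64L * 1024));		/* prints 6 */
	return 0;
}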
- */ - ntuples = state->memtupcount; - state->memtupcount = 0; /* make the heap empty */ - for (j = 0; j < ntuples; j++) - { - /* Must copy source tuple to avoid possible overwrite */ - SortTuple stup = state->memtuples[j]; - - rum_tuplesort_heap_insert(state, &stup, 0, false); - } - Assert(state->memtupcount == ntuples); - - state->currentRun = 0; - - /* - * Initialize variables of Algorithm D (step D1). - */ - for (j = 0; j < maxTapes; j++) - { - state->tp_fib[j] = 1; - state->tp_runs[j] = 0; - state->tp_dummy[j] = 1; - state->tp_tapenum[j] = j; - } - state->tp_fib[state->tapeRange] = 0; - state->tp_dummy[state->tapeRange] = 0; - - state->Level = 1; - state->destTape = 0; - - state->status = TSS_BUILDRUNS; -} - -/* - * selectnewtape -- select new tape for new initial run. - * - * This is called after finishing a run when we know another run - * must be started. This implements steps D3, D4 of Algorithm D. - */ -static void -selectnewtape(RumTuplesortstate *state) -{ - int j; - int a; - - /* Step D3: advance j (destTape) */ - if (state->tp_dummy[state->destTape] < state->tp_dummy[state->destTape + 1]) - { - state->destTape++; - return; - } - if (state->tp_dummy[state->destTape] != 0) - { - state->destTape = 0; - return; - } - - /* Step D4: increase level */ - state->Level++; - a = state->tp_fib[0]; - for (j = 0; j < state->tapeRange; j++) - { - state->tp_dummy[j] = a + state->tp_fib[j + 1] - state->tp_fib[j]; - state->tp_fib[j] = a + state->tp_fib[j + 1]; - } - state->destTape = 0; -} - -/* - * mergeruns -- merge all the completed initial runs. - * - * This implements steps D5, D6 of Algorithm D. All input data has - * already been written to initial runs on tape (see dumptuples). - */ -static void -mergeruns(RumTuplesortstate *state) -{ - int tapenum, - svTape, - svRuns, - svDummy; - int numTapes; - int numInputTapes; - - Assert(state->status == TSS_BUILDRUNS); - Assert(state->memtupcount == 0); - - /* - * If we produced only one initial run (quite likely if the total data - * volume is between 1X and 2X workMem), we can just use that tape as the - * finished output, rather than doing a useless merge. (This obvious - * optimization is not in Knuth's algorithm.) - */ - if (state->currentRun == 1) - { - state->result_tape = state->tp_tapenum[state->destTape]; - /* must freeze and rewind the finished output tape */ - LogicalTapeFreeze(state->tapeset, state->result_tape); - state->status = TSS_SORTEDONTAPE; - return; - } - - /* - * If we had fewer runs than tapes, refund the memory that we imagined we - * would need for the tape buffers of the unused tapes. - * - * numTapes and numInputTapes reflect the actual number of tapes we will - * use. Note that the output tape's tape number is maxTapes - 1, so the - * tape numbers of the used tapes are not consecutive, and you cannot just - * loop from 0 to numTapes to visit all used tapes! - */ - if (state->Level == 1) - { - numInputTapes = state->currentRun; - numTapes = numInputTapes + 1; - FREEMEM(state, (state->maxTapes - numTapes) * TAPE_BUFFER_OVERHEAD); - } - else - { - numInputTapes = state->tapeRange; - numTapes = state->maxTapes; - } - - state->read_buffer_size = Max(state->availMem / numInputTapes, 0); - USEMEM(state, state->read_buffer_size * numInputTapes); - - /* End of step D2: rewind all output tapes to prepare for merging */ - for (tapenum = 0; tapenum < state->tapeRange; tapenum++) - LogicalTapeRewindForRead(state->tapeset, tapenum, state->read_buffer_size); - - for (;;) - { - /* - * At this point we know that tape[T] is empty. 
If there's just one - * (real or dummy) run left on each input tape, then only one merge - * pass remains. If we don't have to produce a materialized sorted - * tape, we can stop at this point and do the final merge on-the-fly. - */ - if (!state->randomAccess) - { - bool allOneRun = true; - - Assert(state->tp_runs[state->tapeRange] == 0); - for (tapenum = 0; tapenum < state->tapeRange; tapenum++) - { - if (state->tp_runs[tapenum] + state->tp_dummy[tapenum] != 1) - { - allOneRun = false; - break; - } - } - if (allOneRun) - { - /* Tell logtape.c we won't be writing anymore */ - LogicalTapeSetForgetFreeSpace(state->tapeset); - /* Initialize for the final merge pass */ - beginmerge(state); - state->status = TSS_FINALMERGE; - return; - } - } - - /* Step D5: merge runs onto tape[T] until tape[P] is empty */ - while (state->tp_runs[state->tapeRange - 1] || - state->tp_dummy[state->tapeRange - 1]) - { - bool allDummy = true; - - for (tapenum = 0; tapenum < state->tapeRange; tapenum++) - { - if (state->tp_dummy[tapenum] == 0) - { - allDummy = false; - break; - } - } - - if (allDummy) - { - state->tp_dummy[state->tapeRange]++; - for (tapenum = 0; tapenum < state->tapeRange; tapenum++) - state->tp_dummy[tapenum]--; - } - else - mergeonerun(state); - } - - /* Step D6: decrease level */ - if (--state->Level == 0) - break; - /* rewind output tape T to use as new input */ - LogicalTapeRewindForRead(state->tapeset, state->tp_tapenum[state->tapeRange], - state->read_buffer_size); - /* rewind used-up input tape P, and prepare it for write pass */ - LogicalTapeRewindForWrite(state->tapeset, state->tp_tapenum[state->tapeRange - 1]); - state->tp_runs[state->tapeRange - 1] = 0; - - /* - * reassign tape units per step D6; note we no longer care about A[] - */ - svTape = state->tp_tapenum[state->tapeRange]; - svDummy = state->tp_dummy[state->tapeRange]; - svRuns = state->tp_runs[state->tapeRange]; - for (tapenum = state->tapeRange; tapenum > 0; tapenum--) - { - state->tp_tapenum[tapenum] = state->tp_tapenum[tapenum - 1]; - state->tp_dummy[tapenum] = state->tp_dummy[tapenum - 1]; - state->tp_runs[tapenum] = state->tp_runs[tapenum - 1]; - } - state->tp_tapenum[0] = svTape; - state->tp_dummy[0] = svDummy; - state->tp_runs[0] = svRuns; - } - - /* - * Done. Knuth says that the result is on TAPE[1], but since we exited - * the loop without performing the last iteration of step D6, we have not - * rearranged the tape unit assignment, and therefore the result is on - * TAPE[T]. We need to do it this way so that we can freeze the final - * output tape while rewinding it. The last iteration of step D6 would be - * a waste of cycles anyway... - */ - state->result_tape = state->tp_tapenum[state->tapeRange]; - LogicalTapeFreeze(state->tapeset, state->result_tape); - state->status = TSS_SORTEDONTAPE; -} - -/* - * Merge one run from each input tape, except ones with dummy runs. - * - * This is the inner loop of Algorithm D step D5. We know that the - * output tape is TAPE[T]. - */ -static void -mergeonerun(RumTuplesortstate *state) -{ - int destTape = state->tp_tapenum[state->tapeRange]; - int srcTape; - int tupIndex; - SortTuple *tup; - long priorAvail, - spaceFreed; - - /* - * Start the merge by loading one tuple from each active source tape into - * the heap. We can also decrease the input run/dummy run counts. - */ - beginmerge(state); - - /* - * Execute merge by repeatedly extracting lowest tuple in heap, writing it - * out, and replacing it with next tuple from same tape (if there is - * another one). 
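Editor's note: mergeonerun() above is the heart of a merge pass: emit the smallest pending tuple, then refill from whichever input run it came from. A toy, self-contained version of the same principle, with sorted int arrays standing in for tapes and a linear scan standing in for the heap (not the extension's code):

#include <stdio.h>

static void
kway_merge(const int *runs[], const int lens[], int nruns)
{
	int			pos[8] = {0};	/* read position in each run; nruns <= 8 here */

	for (;;)
	{
		int			best = -1;
		int			i;

		/* pick the run whose next element is smallest */
		for (i = 0; i < nruns; i++)
			if (pos[i] < lens[i] &&
				(best < 0 || runs[i][pos[i]] < runs[best][pos[best]]))
				best = i;

		if (best < 0)
			break;				/* every run is exhausted */

		printf("%d ", runs[best][pos[best]++]);
	}
	putchar('\n');
}

int
main(void)
{
	const int	a[] = {1, 4, 9}, b[] = {2, 3, 10}, c[] = {5, 6, 7};
	const int  *runs[] = {a, b, c};
	const int	lens[] = {3, 3, 3};

	kway_merge(runs, lens, 3);	/* prints: 1 2 3 4 5 6 7 9 10 */
	return 0;
}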
- */ - while (state->memtupcount > 0) - { - /* write the tuple to destTape */ - priorAvail = state->availMem; - srcTape = state->memtuples[0].tupindex; - WRITETUP(state, destTape, &state->memtuples[0]); - /* writetup adjusted total free space, now fix per-tape space */ - spaceFreed = state->availMem - priorAvail; - state->mergeavailmem[srcTape] += spaceFreed; - /* compact the heap */ - rum_tuplesort_heap_siftup(state, false); - if ((tupIndex = state->mergenext[srcTape]) == 0) - { - /* out of preloaded data on this tape, try to read more */ - mergepreread(state); - /* if still no data, we've reached end of run on this tape */ - if ((tupIndex = state->mergenext[srcTape]) == 0) - continue; - } - /* pull next preread tuple from list, insert in heap */ - tup = &state->memtuples[tupIndex]; - state->mergenext[srcTape] = tup->tupindex; - if (state->mergenext[srcTape] == 0) - state->mergelast[srcTape] = 0; - rum_tuplesort_heap_insert(state, tup, srcTape, false); - /* put the now-unused memtuples entry on the freelist */ - tup->tupindex = state->mergefreelist; - state->mergefreelist = tupIndex; - state->mergeavailslots[srcTape]++; - } - - /* - * When the heap empties, we're done. Write an end-of-run marker on the - * output tape, and increment its count of real runs. - */ - markrunend(state, destTape); - state->tp_runs[state->tapeRange]++; - -#ifdef TRACE_SORT - if (trace_sort) - elog(LOG, "finished %d-way merge step: %s", state->activeTapes, - pg_rusage_show(&state->ru_start)); -#endif -} - -/* - * beginmerge - initialize for a merge pass - * - * We decrease the counts of real and dummy runs for each tape, and mark - * which tapes contain active input runs in mergeactive[]. Then, load - * as many tuples as we can from each active input tape, and finally - * fill the merge heap with the first tuple from each active tape. - */ -static void -beginmerge(RumTuplesortstate *state) -{ - int activeTapes; - int tapenum; - int srcTape; - int slotsPerTape; - long spacePerTape; - - /* Heap should be empty here */ - Assert(state->memtupcount == 0); - - /* Adjust run counts and mark the active tapes */ - memset(state->mergeactive, 0, - state->maxTapes * sizeof(*state->mergeactive)); - activeTapes = 0; - for (tapenum = 0; tapenum < state->tapeRange; tapenum++) - { - if (state->tp_dummy[tapenum] > 0) - state->tp_dummy[tapenum]--; - else - { - Assert(state->tp_runs[tapenum] > 0); - state->tp_runs[tapenum]--; - srcTape = state->tp_tapenum[tapenum]; - state->mergeactive[srcTape] = true; - activeTapes++; - } - } - state->activeTapes = activeTapes; - - /* Clear merge-pass state variables */ - memset(state->mergenext, 0, - state->maxTapes * sizeof(*state->mergenext)); - memset(state->mergelast, 0, - state->maxTapes * sizeof(*state->mergelast)); - state->mergefreelist = 0; /* nothing in the freelist */ - state->mergefirstfree = activeTapes; /* 1st slot avail for preread */ - - /* - * Initialize space allocation to let each active input tape have an equal - * share of preread space. 
- */ - Assert(activeTapes > 0); - slotsPerTape = (state->memtupsize - state->mergefirstfree) / activeTapes; - Assert(slotsPerTape > 0); - spacePerTape = state->availMem / activeTapes; - for (srcTape = 0; srcTape < state->maxTapes; srcTape++) - { - if (state->mergeactive[srcTape]) - { - state->mergeavailslots[srcTape] = slotsPerTape; - state->mergeavailmem[srcTape] = spacePerTape; - } - } - - /* - * Preread as many tuples as possible (and at least one) from each active - * tape - */ - mergepreread(state); - - /* Load the merge heap with the first tuple from each input tape */ - for (srcTape = 0; srcTape < state->maxTapes; srcTape++) - { - int tupIndex = state->mergenext[srcTape]; - SortTuple *tup; - - if (tupIndex) - { - tup = &state->memtuples[tupIndex]; - state->mergenext[srcTape] = tup->tupindex; - if (state->mergenext[srcTape] == 0) - state->mergelast[srcTape] = 0; - rum_tuplesort_heap_insert(state, tup, srcTape, false); - /* put the now-unused memtuples entry on the freelist */ - tup->tupindex = state->mergefreelist; - state->mergefreelist = tupIndex; - state->mergeavailslots[srcTape]++; - } - } -} - -/* - * mergepreread - load tuples from merge input tapes - * - * This routine exists to improve sequentiality of reads during a merge pass, - * as explained in the header comments of this file. Load tuples from each - * active source tape until the tape's run is exhausted or it has used up - * its fair share of available memory. In any case, we guarantee that there - * is at least one preread tuple available from each unexhausted input tape. - * - * We invoke this routine at the start of a merge pass for initial load, - * and then whenever any tape's preread data runs out. Note that we load - * as much data as possible from all tapes, not just the one that ran out. - * This is because logtape.c works best with a usage pattern that alternates - * between reading a lot of data and writing a lot of data, so whenever we - * are forced to read, we should fill working memory completely. - * - * In FINALMERGE state, we *don't* use this routine, but instead just preread - * from the single tape that ran dry. There's no read/write alternation in - * that state and so no point in scanning through all the tapes to fix one. - * (Moreover, there may be quite a lot of inactive tapes in that state, since - * we might have had many fewer runs than tapes. In a regular tape-to-tape - * merge we can expect most of the tapes to be active.) - */ -static void -mergepreread(RumTuplesortstate *state) -{ - int srcTape; - - for (srcTape = 0; srcTape < state->maxTapes; srcTape++) - mergeprereadone(state, srcTape); -} - -/* - * mergeprereadone - load tuples from one merge input tape - * - * Read tuples from the specified tape until it has used up its free memory - * or array slots; but ensure that we have at least one tuple, if any are - * to be had. 
- */ -static void -mergeprereadone(RumTuplesortstate *state, int srcTape) -{ - unsigned int tuplen; - SortTuple stup; - int tupIndex; - long priorAvail, - spaceUsed; - - if (!state->mergeactive[srcTape]) - return; /* tape's run is already exhausted */ - priorAvail = state->availMem; - state->availMem = state->mergeavailmem[srcTape]; - while ((state->mergeavailslots[srcTape] > 0 && !LACKMEM(state)) || - state->mergenext[srcTape] == 0) - { - /* read next tuple, if any */ - if ((tuplen = getlen(state, srcTape, true)) == 0) - { - state->mergeactive[srcTape] = false; - break; - } - READTUP(state, &stup, srcTape, tuplen); - /* find a free slot in memtuples[] for it */ - tupIndex = state->mergefreelist; - if (tupIndex) - state->mergefreelist = state->memtuples[tupIndex].tupindex; - else - { - tupIndex = state->mergefirstfree++; - Assert(tupIndex < state->memtupsize); - } - state->mergeavailslots[srcTape]--; - /* store tuple, append to list for its tape */ - stup.tupindex = 0; - state->memtuples[tupIndex] = stup; - if (state->mergelast[srcTape]) - state->memtuples[state->mergelast[srcTape]].tupindex = tupIndex; - else - state->mergenext[srcTape] = tupIndex; - state->mergelast[srcTape] = tupIndex; - } - /* update per-tape and global availmem counts */ - spaceUsed = state->mergeavailmem[srcTape] - state->availMem; - state->mergeavailmem[srcTape] = state->availMem; - state->availMem = priorAvail - spaceUsed; -} - -/* - * dumptuples - remove tuples from heap and write to tape - * - * This is used during initial-run building, but not during merging. - * - * When alltuples = false, dump only enough tuples to get under the - * availMem limit (and leave at least one tuple in the heap in any case, - * since puttuple assumes it always has a tuple to compare to). We also - * insist there be at least one free slot in the memtuples[] array. - * - * When alltuples = true, dump everything currently in memory. - * (This case is only used at end of input data.) - * - * If we empty the heap, close out the current run and return (this should - * only happen at end of input data). If we see that the tuple run number - * at the top of the heap has changed, start a new run. - */ -static void -dumptuples(RumTuplesortstate *state, bool alltuples) -{ - while (alltuples || - (LACKMEM(state) && state->memtupcount > 1) || - state->memtupcount >= state->memtupsize) - { - /* - * Dump the heap's frontmost entry, and sift up to remove it from the - * heap. - */ - Assert(state->memtupcount > 0); - WRITETUP(state, state->tp_tapenum[state->destTape], - &state->memtuples[0]); - rum_tuplesort_heap_siftup(state, true); - - /* - * If the heap is empty *or* top run number has changed, we've - * finished the current run. - */ - if (state->memtupcount == 0 || - state->currentRun != state->memtuples[0].tupindex) - { - markrunend(state, state->tp_tapenum[state->destTape]); - state->currentRun++; - state->tp_runs[state->destTape]++; - state->tp_dummy[state->destTape]--; /* per Alg D step D2 */ - -#ifdef TRACE_SORT - if (trace_sort) - elog(LOG, "finished writing%s run %d to tape %d: %s", - (state->memtupcount == 0) ? " final" : "", - state->currentRun, state->destTape, - pg_rusage_show(&state->ru_start)); -#endif - - /* - * Done if heap is empty, else prepare for new run. 
- */ - if (state->memtupcount == 0) - break; - Assert(state->currentRun == state->memtuples[0].tupindex); - selectnewtape(state); - } - } -} - -/* - * rum_tuplesort_rescan - rewind and replay the scan - */ -void -rum_tuplesort_rescan(RumTuplesortstate *state) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - - Assert(state->randomAccess); - - switch (state->status) - { - case TSS_SORTEDINMEM: - state->current = 0; - state->eof_reached = false; - state->markpos_offset = 0; - state->markpos_eof = false; - break; - case TSS_SORTEDONTAPE: - LogicalTapeRewindForRead(state->tapeset, - state->result_tape, - state->read_buffer_size); - state->eof_reached = false; - state->markpos_block = 0L; - state->markpos_offset = 0; - state->markpos_eof = false; - break; - default: - elog(ERROR, "invalid tuplesort state"); - break; - } - - MemoryContextSwitchTo(oldcontext); -} - -/* - * rum_tuplesort_markpos - saves current position in the merged sort file - */ -void -rum_tuplesort_markpos(RumTuplesortstate *state) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - - Assert(state->randomAccess); - - switch (state->status) - { - case TSS_SORTEDINMEM: - state->markpos_offset = state->current; - state->markpos_eof = state->eof_reached; - break; - case TSS_SORTEDONTAPE: - LogicalTapeTell(state->tapeset, - state->result_tape, - &state->markpos_block, - &state->markpos_offset); - state->markpos_eof = state->eof_reached; - break; - default: - elog(ERROR, "invalid tuplesort state"); - break; - } - - MemoryContextSwitchTo(oldcontext); -} - -/* - * rum_tuplesort_restorepos - restores current position in merged sort file to - * last saved position - */ -void -rum_tuplesort_restorepos(RumTuplesortstate *state) -{ - MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); - - Assert(state->randomAccess); - - switch (state->status) - { - case TSS_SORTEDINMEM: - state->current = state->markpos_offset; - state->eof_reached = state->markpos_eof; - break; - case TSS_SORTEDONTAPE: -#if PG_VERSION_NUM < 100000 - if (!LogicalTapeSeek(state->tapeset, - state->result_tape, - state->markpos_block, - state->markpos_offset)) - elog(ERROR, "rum_tuplesort_restorepos failed"); -#else - LogicalTapeSeek(state->tapeset, - state->result_tape, - state->markpos_block, - state->markpos_offset); -#endif - state->eof_reached = state->markpos_eof; - break; - default: - elog(ERROR, "invalid tuplesort state"); - break; - } - - MemoryContextSwitchTo(oldcontext); -} - -/* - * rum_tuplesort_get_stats - extract summary statistics - * - * This can be called after rum_tuplesort_performsort() finishes to obtain - * printable summary information about how the sort was performed. - * spaceUsed is measured in kilobytes. - */ -void -rum_tuplesort_get_stats(RumTuplesortstate *state, - const char **sortMethod, - const char **spaceType, - long *spaceUsed) -{ - /* - * Note: it might seem we should provide both memory and disk usage for a - * disk-based sort. However, the current code doesn't track memory space - * accurately once we have begun to return tuples to the caller (since we - * don't account for pfree's the caller is expected to do), so we cannot - * rely on availMem in a disk sort. This does not seem worth the overhead - * to fix. Is it worth creating an API for the memory context code to - * tell us how much is actually used in sortcontext? 
- */ - if (state->tapeset) - { - *spaceType = "Disk"; - *spaceUsed = LogicalTapeSetBlocks(state->tapeset) * (BLCKSZ / 1024); - } - else - { - *spaceType = "Memory"; - *spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; - } - - switch (state->status) - { - case TSS_SORTEDINMEM: - if (state->boundUsed) - *sortMethod = "top-N heapsort"; - else - *sortMethod = "quicksort"; - break; - case TSS_SORTEDONTAPE: - *sortMethod = "external sort"; - break; - case TSS_FINALMERGE: - *sortMethod = "external merge"; - break; - default: - *sortMethod = "still in progress"; - break; - } -} - - -/* - * Heap manipulation routines, per Knuth's Algorithm 5.2.3H. - * - * Compare two SortTuples. If checkIndex is true, use the tuple index - * as the front of the sort key; otherwise, no. - */ - -#define HEAPCOMPARE(tup1,tup2) \ - (checkIndex && ((tup1)->tupindex != (tup2)->tupindex) ? \ - ((tup1)->tupindex) - ((tup2)->tupindex) : \ - COMPARETUP(state, tup1, tup2)) - -/* - * Convert the existing unordered array of SortTuples to a bounded heap, - * discarding all but the smallest "state->bound" tuples. - * - * When working with a bounded heap, we want to keep the largest entry - * at the root (array entry zero), instead of the smallest as in the normal - * sort case. This allows us to discard the largest entry cheaply. - * Therefore, we temporarily reverse the sort direction. - * - * We assume that all entries in a bounded heap will always have tupindex - * zero; it therefore doesn't matter that HEAPCOMPARE() doesn't reverse - * the direction of comparison for tupindexes. - */ -static void -make_bounded_heap(RumTuplesortstate *state) -{ - int tupcount = state->memtupcount; - int i; - - Assert(state->status == TSS_INITIAL); - Assert(state->bounded); - Assert(tupcount >= state->bound); - - /* Reverse sort direction so largest entry will be at root */ - REVERSEDIRECTION(state); - - state->memtupcount = 0; /* make the heap empty */ - for (i = 0; i < tupcount; i++) - { - if (state->memtupcount >= state->bound && - COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0) - { - /* New tuple would just get thrown out, so skip it */ - free_sort_tuple(state, &state->memtuples[i]); - CHECK_FOR_INTERRUPTS(); - } - else - { - /* Insert next tuple into heap */ - /* Must copy source tuple to avoid possible overwrite */ - SortTuple stup = state->memtuples[i]; - - rum_tuplesort_heap_insert(state, &stup, 0, false); - - /* If heap too full, discard largest entry */ - if (state->memtupcount > state->bound) - { - free_sort_tuple(state, &state->memtuples[0]); - rum_tuplesort_heap_siftup(state, false); - } - } - } - - Assert(state->memtupcount == state->bound); - state->status = TSS_BOUNDED; -} - -/* - * Convert the bounded heap to a properly-sorted array - */ -static void -sort_bounded_heap(RumTuplesortstate *state) -{ - int tupcount = state->memtupcount; - - Assert(state->status == TSS_BOUNDED); - Assert(state->bounded); - Assert(tupcount == state->bound); - - /* - * We can unheapify in place because each sift-up will remove the largest - * entry, which we can promptly store in the newly freed slot at the end. - * Once we're down to a single-entry heap, we're done. - */ - while (state->memtupcount > 1) - { - SortTuple stup = state->memtuples[0]; - - /* this sifts-up the next-largest entry and decreases memtupcount */ - rum_tuplesort_heap_siftup(state, false); - state->memtuples[state->memtupcount] = stup; - } - state->memtupcount = tupcount; - - /* - * Reverse sort direction back to the original state. 
This is not - * actually necessary but seems like a good idea for tidiness. - */ - REVERSEDIRECTION(state); - - state->status = TSS_SORTEDINMEM; - state->boundUsed = true; -} - -/* - * Insert a new tuple into an empty or existing heap, maintaining the - * heap invariant. Caller is responsible for ensuring there's room. - * - * Note: we assume *tuple is a temporary variable that can be scribbled on. - * For some callers, tuple actually points to a memtuples[] entry above the - * end of the heap. This is safe as long as it's not immediately adjacent - * to the end of the heap (ie, in the [memtupcount] array entry) --- if it - * is, it might get overwritten before being moved into the heap! - */ -static void -rum_tuplesort_heap_insert(RumTuplesortstate *state, SortTuple *tuple, - int tupleindex, bool checkIndex) -{ - SortTuple *memtuples; - int j; - - /* - * Save the tupleindex --- see notes above about writing on *tuple. It's a - * historical artifact that tupleindex is passed as a separate argument - * and not in *tuple, but it's notationally convenient so let's leave it - * that way. - */ - tuple->tupindex = tupleindex; - - memtuples = state->memtuples; - Assert(state->memtupcount < state->memtupsize); - - CHECK_FOR_INTERRUPTS(); - - /* - * Sift-up the new entry, per Knuth 5.2.3 exercise 16. Note that Knuth is - * using 1-based array indexes, not 0-based. - */ - j = state->memtupcount++; - while (j > 0) - { - int i = (j - 1) >> 1; - - if (HEAPCOMPARE(tuple, &memtuples[i]) >= 0) - break; - memtuples[j] = memtuples[i]; - j = i; - } - memtuples[j] = *tuple; -} - -/* - * The tuple at state->memtuples[0] has been removed from the heap. - * Decrement memtupcount, and sift up to maintain the heap invariant. - */ -static void -rum_tuplesort_heap_siftup(RumTuplesortstate *state, bool checkIndex) -{ - SortTuple *memtuples = state->memtuples; - SortTuple *tuple; - int i, - n; - - if (--state->memtupcount <= 0) - return; - - CHECK_FOR_INTERRUPTS(); - - n = state->memtupcount; - tuple = &memtuples[n]; /* tuple that must be reinserted */ - i = 0; /* i is where the "hole" is */ - for (;;) - { - int j = 2 * i + 1; - - if (j >= n) - break; - if (j + 1 < n && - HEAPCOMPARE(&memtuples[j], &memtuples[j + 1]) > 0) - j++; - if (HEAPCOMPARE(tuple, &memtuples[j]) <= 0) - break; - memtuples[i] = memtuples[j]; - i = j; - } - memtuples[i] = *tuple; -} - - -/* - * Tape interface routines - */ - -static unsigned int -getlen(RumTuplesortstate *state, int tapenum, bool eofOK) -{ - unsigned int len; - - if (LogicalTapeRead(state->tapeset, tapenum, - &len, sizeof(len)) != sizeof(len)) - elog(ERROR, "unexpected end of tape"); - if (len == 0 && !eofOK) - elog(ERROR, "unexpected end of data"); - return len; -} - -static void -markrunend(RumTuplesortstate *state, int tapenum) -{ - unsigned int len = 0; - - LogicalTapeWrite(state->tapeset, tapenum, (void *) &len, sizeof(len)); -} - - -/* - * Inline-able copy of FunctionCall2Coll() to save some cycles in sorting. 
- */ -static inline Datum -myFunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2) -{ - FunctionCallInfoData fcinfo; - Datum result; - - InitFunctionCallInfoData(fcinfo, flinfo, 2, collation, NULL, NULL); - - fcinfo.arg[0] = arg1; - fcinfo.arg[1] = arg2; - fcinfo.argnull[0] = false; - fcinfo.argnull[1] = false; - - result = FunctionCallInvoke(&fcinfo); - - /* Check for null result, since caller is clearly not expecting one */ - if (fcinfo.isnull) - elog(ERROR, "function %u returned NULL", fcinfo.flinfo->fn_oid); - - return result; -} - -/* - * Apply a sort function (by now converted to fmgr lookup form) - * and return a 3-way comparison result. This takes care of handling - * reverse-sort and NULLs-ordering properly. We assume that DESC and - * NULLS_FIRST options are encoded in sk_flags the same way btree does it. - */ -static inline int32 -inlineApplySortFunction(FmgrInfo *sortFunction, int sk_flags, Oid collation, - Datum datum1, bool isNull1, - Datum datum2, bool isNull2) -{ - int32 compare; - - if (isNull1) - { - if (isNull2) - compare = 0; /* NULL "=" NULL */ - else if (sk_flags & SK_BT_NULLS_FIRST) - compare = -1; /* NULL "<" NOT_NULL */ - else - compare = 1; /* NULL ">" NOT_NULL */ - } - else if (isNull2) - { - if (sk_flags & SK_BT_NULLS_FIRST) - compare = 1; /* NOT_NULL ">" NULL */ - else - compare = -1; /* NOT_NULL "<" NULL */ - } - else - { - compare = DatumGetInt32(myFunctionCall2Coll(sortFunction, collation, - datum1, datum2)); - - if (sk_flags & SK_BT_DESC) - compare = -compare; - } - - return compare; -} - - -/* - * Routines specialized for HeapTuple (actually MinimalTuple) case - */ - -static int -comparetup_heap(const SortTuple *a, const SortTuple *b, RumTuplesortstate *state) -{ - SortSupport sortKey = state->sortKeys; - HeapTupleData ltup; - HeapTupleData rtup; - TupleDesc tupDesc; - int nkey; - int32 compare; - - /* Compare the leading sort key */ - compare = ApplySortComparator(a->datum1, a->isnull1, - b->datum1, b->isnull1, - sortKey); - if (compare != 0) - return compare; - - /* Compare additional sort keys */ - ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET; - ltup.t_data = (HeapTupleHeader) ((char *) a->tuple - MINIMAL_TUPLE_OFFSET); - rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET; - rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET); - tupDesc = state->tupDesc; - sortKey++; - for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++) - { - AttrNumber attno = sortKey->ssup_attno; - Datum datum1, - datum2; - bool isnull1, - isnull2; - - datum1 = heap_getattr(<up, attno, tupDesc, &isnull1); - datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2); - - compare = ApplySortComparator(datum1, isnull1, - datum2, isnull2, - sortKey); - if (compare != 0) - return compare; - } - - return 0; -} - -static void -copytup_heap(RumTuplesortstate *state, SortTuple *stup, void *tup) -{ - /* - * We expect the passed "tup" to be a TupleTableSlot, and form a - * MinimalTuple using the exported interface for that. 
- */ - TupleTableSlot *slot = (TupleTableSlot *) tup; - MinimalTuple tuple; - HeapTupleData htup; - - /* copy the tuple into sort storage */ - tuple = ExecCopySlotMinimalTuple(slot); - stup->tuple = (void *) tuple; - USEMEM(state, GetMemoryChunkSpace(tuple)); - /* set up first-column key value */ - htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; - htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); - stup->datum1 = heap_getattr(&htup, - state->sortKeys[0].ssup_attno, - state->tupDesc, - &stup->isnull1); -} - -static void -writetup_heap(RumTuplesortstate *state, int tapenum, SortTuple *stup) -{ - MinimalTuple tuple = (MinimalTuple) stup->tuple; - - /* the part of the MinimalTuple we'll write: */ - char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; - unsigned int tupbodylen = tuple->t_len - MINIMAL_TUPLE_DATA_OFFSET; - - /* total on-disk footprint: */ - unsigned int tuplen = tupbodylen + sizeof(int); - - LogicalTapeWrite(state->tapeset, tapenum, - (void *) &tuplen, sizeof(tuplen)); - LogicalTapeWrite(state->tapeset, tapenum, - (void *) tupbody, tupbodylen); - if (state->randomAccess) /* need trailing length word? */ - LogicalTapeWrite(state->tapeset, tapenum, - (void *) &tuplen, sizeof(tuplen)); - - FREEMEM(state, GetMemoryChunkSpace(tuple)); - heap_free_minimal_tuple(tuple); -} - -static void -readtup_heap(RumTuplesortstate *state, SortTuple *stup, - int tapenum, unsigned int len) -{ - unsigned int tupbodylen = len - sizeof(int); - unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET; - MinimalTuple tuple = (MinimalTuple) palloc(tuplen); - char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; - HeapTupleData htup; - - USEMEM(state, GetMemoryChunkSpace(tuple)); - /* read in the tuple proper */ - tuple->t_len = tuplen; - LogicalTapeReadExact(state->tapeset, tapenum, - tupbody, tupbodylen); - if (state->randomAccess) /* need trailing length word? 
*/ - LogicalTapeReadExact(state->tapeset, tapenum, - &tuplen, sizeof(tuplen)); - stup->tuple = (void *) tuple; - /* set up first-column key value */ - htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; - htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); - stup->datum1 = heap_getattr(&htup, - state->sortKeys[0].ssup_attno, - state->tupDesc, - &stup->isnull1); -} - -static void -reversedirection_heap(RumTuplesortstate *state) -{ - SortSupport sortKey = state->sortKeys; - int nkey; - - for (nkey = 0; nkey < state->nKeys; nkey++, sortKey++) - { - sortKey->ssup_reverse = !sortKey->ssup_reverse; - sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first; - } -} - - -/* - * Routines specialized for the CLUSTER case (HeapTuple data, with - * comparisons per a btree index definition) - */ - -static int -comparetup_cluster(const SortTuple *a, const SortTuple *b, - RumTuplesortstate *state) -{ - ScanKey scanKey = state->indexScanKey; - HeapTuple ltup; - HeapTuple rtup; - TupleDesc tupDesc; - int nkey; - int32 compare; - - /* Compare the leading sort key, if it's simple */ -#if PG_VERSION_NUM >= 110000 - if (state->indexInfo->ii_IndexAttrNumbers[0] != 0) -#else - if (state->indexInfo->ii_KeyAttrNumbers[0] != 0) -#endif - { - compare = inlineApplySortFunction(&scanKey->sk_func, scanKey->sk_flags, - scanKey->sk_collation, - a->datum1, a->isnull1, - b->datum1, b->isnull1); - if (compare != 0 || state->nKeys == 1) - return compare; - /* Compare additional columns the hard way */ - scanKey++; - nkey = 1; - } - else - { - /* Must compare all keys the hard way */ - nkey = 0; - } - - /* Compare additional sort keys */ - ltup = (HeapTuple) a->tuple; - rtup = (HeapTuple) b->tuple; - - if (state->indexInfo->ii_Expressions == NULL) - { - /* If not expression index, just compare the proper heap attrs */ - tupDesc = state->tupDesc; - - for (; nkey < state->nKeys; nkey++, scanKey++) - { -#if PG_VERSION_NUM >= 110000 - AttrNumber attno = state->indexInfo->ii_IndexAttrNumbers[nkey]; -#else - AttrNumber attno = state->indexInfo->ii_KeyAttrNumbers[nkey]; -#endif - Datum datum1, - datum2; - bool isnull1, - isnull2; - - datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1); - datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2); - - compare = inlineApplySortFunction(&scanKey->sk_func, - scanKey->sk_flags, - scanKey->sk_collation, - datum1, isnull1, - datum2, isnull2); - if (compare != 0) - return compare; - } - } - else - { - /* - * In the expression index case, compute the whole index tuple and - * then compare values. It would perhaps be faster to compute only as - * many columns as we need to compare, but that would require - * duplicating all the logic in FormIndexDatum. 
- */ - Datum l_index_values[INDEX_MAX_KEYS]; - bool l_index_isnull[INDEX_MAX_KEYS]; - Datum r_index_values[INDEX_MAX_KEYS]; - bool r_index_isnull[INDEX_MAX_KEYS]; - TupleTableSlot *ecxt_scantuple; - - /* Reset context each time to prevent memory leakage */ - ResetPerTupleExprContext(state->estate); - - ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple; - -#if PG_VERSION_NUM >= 120000 - ExecStoreHeapTuple(ltup, ecxt_scantuple, false); -#else - ExecStoreTuple(ltup, ecxt_scantuple, InvalidBuffer, false); -#endif - FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, - l_index_values, l_index_isnull); - -#if PG_VERSION_NUM >= 120000 - ExecStoreHeapTuple(rtup, ecxt_scantuple, false); -#else - ExecStoreTuple(rtup, ecxt_scantuple, InvalidBuffer, false); -#endif - FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, - r_index_values, r_index_isnull); - - for (; nkey < state->nKeys; nkey++, scanKey++) - { - compare = inlineApplySortFunction(&scanKey->sk_func, - scanKey->sk_flags, - scanKey->sk_collation, - l_index_values[nkey], - l_index_isnull[nkey], - r_index_values[nkey], - r_index_isnull[nkey]); - if (compare != 0) - return compare; - } - } - - return 0; -} - -static void -copytup_cluster(RumTuplesortstate *state, SortTuple *stup, void *tup) -{ - HeapTuple tuple = (HeapTuple) tup; -#if PG_VERSION_NUM >= 110000 - AttrNumber attno = state->indexInfo->ii_IndexAttrNumbers[0]; -#else - AttrNumber attno = state->indexInfo->ii_KeyAttrNumbers[0]; -#endif - - /* copy the tuple into sort storage */ - tuple = heap_copytuple(tuple); - stup->tuple = (void *) tuple; - USEMEM(state, GetMemoryChunkSpace(tuple)); - /* set up first-column key value, if it's a simple column */ - if (attno != 0) - stup->datum1 = heap_getattr(tuple, - attno, - state->tupDesc, - &stup->isnull1); -} - -static void -writetup_cluster(RumTuplesortstate *state, int tapenum, SortTuple *stup) -{ - HeapTuple tuple = (HeapTuple) stup->tuple; - unsigned int tuplen = tuple->t_len + sizeof(ItemPointerData) + sizeof(int); - - /* We need to store t_self, but not other fields of HeapTupleData */ - LogicalTapeWrite(state->tapeset, tapenum, - &tuplen, sizeof(tuplen)); - LogicalTapeWrite(state->tapeset, tapenum, - &tuple->t_self, sizeof(ItemPointerData)); - LogicalTapeWrite(state->tapeset, tapenum, - tuple->t_data, tuple->t_len); - if (state->randomAccess) /* need trailing length word? */ - LogicalTapeWrite(state->tapeset, tapenum, - &tuplen, sizeof(tuplen)); - - FREEMEM(state, GetMemoryChunkSpace(tuple)); - heap_freetuple(tuple); -} - -static void -readtup_cluster(RumTuplesortstate *state, SortTuple *stup, - int tapenum, unsigned int tuplen) -{ - unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int); - HeapTuple tuple = (HeapTuple) palloc(t_len + HEAPTUPLESIZE); -#if PG_VERSION_NUM >= 110000 - AttrNumber attno = state->indexInfo->ii_IndexAttrNumbers[0]; -#else - AttrNumber attno = state->indexInfo->ii_KeyAttrNumbers[0]; -#endif - - USEMEM(state, GetMemoryChunkSpace(tuple)); - /* Reconstruct the HeapTupleData header */ - tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE); - tuple->t_len = t_len; - LogicalTapeReadExact(state->tapeset, tapenum, - &tuple->t_self, sizeof(ItemPointerData)); - /* We don't currently bother to reconstruct t_tableOid */ - tuple->t_tableOid = InvalidOid; - /* Read in the tuple body */ - LogicalTapeReadExact(state->tapeset, tapenum, - tuple->t_data, tuple->t_len); - if (state->randomAccess) /* need trailing length word? 
*/ - LogicalTapeReadExact(state->tapeset, tapenum, - &tuplen, sizeof(tuplen)); - stup->tuple = (void *) tuple; - /* set up first-column key value, if it's a simple column */ - if (attno != 0) - stup->datum1 = heap_getattr(tuple, - attno, - state->tupDesc, - &stup->isnull1); -} - - -/* - * Routines specialized for IndexTuple case - * - * The btree and hash cases require separate comparison functions, but the - * IndexTuple representation is the same so the copy/write/read support - * functions can be shared. - */ - -static int -comparetup_index_btree(const SortTuple *a, const SortTuple *b, - RumTuplesortstate *state) -{ - /* - * This is similar to _bt_tuplecompare(), but we have already done the - * index_getattr calls for the first column, and we need to keep track of - * whether any null fields are present. Also see the special treatment - * for equal keys at the end. - */ - ScanKey scanKey = state->indexScanKey; - IndexTuple tuple1; - IndexTuple tuple2; - int keysz; - TupleDesc tupDes; - bool equal_hasnull = false; - int nkey; - int32 compare; - - /* Compare the leading sort key */ - compare = inlineApplySortFunction(&scanKey->sk_func, scanKey->sk_flags, - scanKey->sk_collation, - a->datum1, a->isnull1, - b->datum1, b->isnull1); - if (compare != 0) - return compare; - - /* they are equal, so we only need to examine one null flag */ - if (a->isnull1) - equal_hasnull = true; - - /* Compare additional sort keys */ - tuple1 = (IndexTuple) a->tuple; - tuple2 = (IndexTuple) b->tuple; - keysz = state->nKeys; - tupDes = RelationGetDescr(state->indexRel); - scanKey++; - for (nkey = 2; nkey <= keysz; nkey++, scanKey++) - { - Datum datum1, - datum2; - bool isnull1, - isnull2; - - datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1); - datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2); - - compare = inlineApplySortFunction(&scanKey->sk_func, scanKey->sk_flags, - scanKey->sk_collation, - datum1, isnull1, - datum2, isnull2); - if (compare != 0) - return compare; /* done when we find unequal attributes */ - - /* they are equal, so we only need to examine one null flag */ - if (isnull1) - equal_hasnull = true; - } - - /* - * If btree has asked us to enforce uniqueness, complain if two equal - * tuples are detected (unless there was at least one NULL field). - * - * It is sufficient to make the test here, because if two tuples are equal - * they *must* get compared at some stage of the sort --- otherwise the - * sort algorithm wouldn't have checked whether one must appear before the - * other. - */ - if (state->enforceUnique && !equal_hasnull) - { - Datum values[INDEX_MAX_KEYS]; - bool isnull[INDEX_MAX_KEYS]; - char *key_desc; - - /* - * Some rather brain-dead implementations of qsort (such as the one in - * QNX 4) will sometimes call the comparison routine to compare a - * value to itself, but we always use our own implementation, which - * does not. - */ - Assert(tuple1 != tuple2); - - index_deform_tuple(tuple1, tupDes, values, isnull); - - key_desc = BuildIndexValueDescription(state->indexRel, values, isnull); - - ereport(ERROR, - (errcode(ERRCODE_UNIQUE_VIOLATION), - errmsg("could not create unique index \"%s\"", - RelationGetRelationName(state->indexRel)), - key_desc ? errdetail("Key %s is duplicated.", key_desc) : - errdetail("Duplicate keys exist."), - errtableconstraint(state->heapRel, - RelationGetRelationName(state->indexRel)))); - } - - /* - * If key values are equal, we sort on ItemPointer. 
This does not affect - * validity of the finished index, but it may be useful to have index - * scans in physical order. - */ - { - BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); - BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); - - if (blk1 != blk2) - return (blk1 < blk2) ? -1 : 1; - } - { - OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); - OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); - - if (pos1 != pos2) - return (pos1 < pos2) ? -1 : 1; - } - - return 0; -} - -static int -comparetup_index_hash(const SortTuple *a, const SortTuple *b, - RumTuplesortstate *state) -{ - uint32 hash1; - uint32 hash2; - IndexTuple tuple1; - IndexTuple tuple2; - - /* - * Fetch hash keys and mask off bits we don't want to sort by. We know - * that the first column of the index tuple is the hash key. - */ - Assert(!a->isnull1); - hash1 = DatumGetUInt32(a->datum1) & state->hash_mask; - Assert(!b->isnull1); - hash2 = DatumGetUInt32(b->datum1) & state->hash_mask; - - if (hash1 > hash2) - return 1; - else if (hash1 < hash2) - return -1; - - /* - * If hash values are equal, we sort on ItemPointer. This does not affect - * validity of the finished index, but it may be useful to have index - * scans in physical order. - */ - tuple1 = (IndexTuple) a->tuple; - tuple2 = (IndexTuple) b->tuple; - - { - BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); - BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); - - if (blk1 != blk2) - return (blk1 < blk2) ? -1 : 1; - } - { - OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); - OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); - - if (pos1 != pos2) - return (pos1 < pos2) ? -1 : 1; - } - - return 0; -} - -static void -copytup_index(RumTuplesortstate *state, SortTuple *stup, void *tup) -{ - IndexTuple tuple = (IndexTuple) tup; - unsigned int tuplen = IndexTupleSize(tuple); - IndexTuple newtuple; - - /* copy the tuple into sort storage */ - newtuple = (IndexTuple) palloc(tuplen); - memcpy(newtuple, tuple, tuplen); - USEMEM(state, GetMemoryChunkSpace(newtuple)); - stup->tuple = (void *) newtuple; - /* set up first-column key value */ - stup->datum1 = index_getattr(newtuple, - 1, - RelationGetDescr(state->indexRel), - &stup->isnull1); -} - -static void -writetup_index(RumTuplesortstate *state, int tapenum, SortTuple *stup) -{ - IndexTuple tuple = (IndexTuple) stup->tuple; - unsigned int tuplen; - - tuplen = IndexTupleSize(tuple) + sizeof(tuplen); - LogicalTapeWrite(state->tapeset, tapenum, - (void *) &tuplen, sizeof(tuplen)); - LogicalTapeWrite(state->tapeset, tapenum, - (void *) tuple, IndexTupleSize(tuple)); - if (state->randomAccess) /* need trailing length word? */ - LogicalTapeWrite(state->tapeset, tapenum, - (void *) &tuplen, sizeof(tuplen)); - - FREEMEM(state, GetMemoryChunkSpace(tuple)); - pfree(tuple); -} - -static void -readtup_index(RumTuplesortstate *state, SortTuple *stup, - int tapenum, unsigned int len) -{ - unsigned int tuplen = len - sizeof(unsigned int); - IndexTuple tuple = (IndexTuple) palloc(tuplen); - - USEMEM(state, GetMemoryChunkSpace(tuple)); - LogicalTapeReadExact(state->tapeset, tapenum, - tuple, tuplen); - if (state->randomAccess) /* need trailing length word? 
 */
-    LogicalTapeReadExact(state->tapeset, tapenum,
-                         &tuplen, sizeof(tuplen));
-    stup->tuple = (void *) tuple;
-    /* set up first-column key value */
-    stup->datum1 = index_getattr(tuple,
-                                 1,
-                                 RelationGetDescr(state->indexRel),
-                                 &stup->isnull1);
-}
-
-static void
-reversedirection_index_btree(RumTuplesortstate *state)
-{
-    ScanKey     scanKey = state->indexScanKey;
-    int         nkey;
-
-    for (nkey = 0; nkey < state->nKeys; nkey++, scanKey++)
-    {
-        scanKey->sk_flags ^= (SK_BT_DESC | SK_BT_NULLS_FIRST);
-    }
-}
-
-static void
-reversedirection_index_hash(RumTuplesortstate *state)
-{
-    /* We don't support reversing direction in a hash index sort */
-    elog(ERROR, "reversedirection_index_hash is not implemented");
-}
-
-
-/*
- * Routines specialized for DatumTuple case
- */
-
-static int
-comparetup_datum(const SortTuple *a, const SortTuple *b, RumTuplesortstate *state)
-{
-    return ApplySortComparator(a->datum1, a->isnull1,
-                               b->datum1, b->isnull1,
-                               state->onlyKey);
-}
-
-static void
-copytup_datum(RumTuplesortstate *state, SortTuple *stup, void *tup)
-{
-    /* Not currently needed */
-    elog(ERROR, "copytup_datum() should not be called");
-}
-
-static void
-writetup_datum(RumTuplesortstate *state, int tapenum, SortTuple *stup)
-{
-    void       *waddr;
-    unsigned int tuplen;
-    unsigned int writtenlen;
-
-    if (stup->isnull1)
-    {
-        waddr = NULL;
-        tuplen = 0;
-    }
-    else if (state->datumTypeByVal)
-    {
-        waddr = &stup->datum1;
-        tuplen = sizeof(Datum);
-    }
-    else
-    {
-        waddr = DatumGetPointer(stup->datum1);
-        tuplen = datumGetSize(stup->datum1, false, state->datumTypeLen);
-        Assert(tuplen != 0);
-    }
-
-    writtenlen = tuplen + sizeof(unsigned int);
+/*
+ * If a custom compare function is used, we should store the function pointer
+ * in the sort state so that it can be used later.
+ */
-    LogicalTapeWrite(state->tapeset, tapenum,
-                     (void *) &writtenlen, sizeof(writtenlen));
-    LogicalTapeWrite(state->tapeset, tapenum,
-                     waddr, tuplen);
-    if (state->randomAccess)    /* need trailing length word? */
-        LogicalTapeWrite(state->tapeset, tapenum,
-                         (void *) &writtenlen, sizeof(writtenlen));
+#if PG_VERSION_NUM >= 160000
+/*
+ * Now that Tuplesortstate has a public interface, we may use the
+ * TuplesortPublic->arg field to store a pointer to the compare function.
+ */
-    if (stup->tuple)
-    {
-        FREEMEM(state, GetMemoryChunkSpace(stup->tuple));
-        pfree(stup->tuple);
-    }
-}
+/* GUC variables */
+#ifdef TRACE_SORT
+extern PGDLLIMPORT bool trace_sort;
+#endif
-static void
-readtup_datum(RumTuplesortstate *state, SortTuple *stup,
-              int tapenum, unsigned int len)
+/* All memory management should be inside the Tuplesortstate module. */
+#define USEMEM(state,amt) do {} while(0)
+
+#else /* PG_VERSION_NUM >= 160000 */
+/*
+ * We need an extra field in the state structure, but we should not modify
+ * struct RumTuplesortstate, which is just an alias for the core Tuplesortstate.
+ */ +typedef struct RumTuplesortstateExt { - unsigned int tuplen = len - sizeof(unsigned int); + RumTuplesortstate ts; + FmgrInfo *cmp; +} RumTuplesortstateExt; +#endif /* PG_VERSION_NUM < 160000 */ - if (tuplen == 0) - { - /* it's NULL */ - stup->datum1 = (Datum) 0; - stup->isnull1 = true; - stup->tuple = NULL; - } - else if (state->datumTypeByVal) - { - Assert(tuplen == sizeof(Datum)); - LogicalTapeReadExact(state->tapeset, tapenum, - &stup->datum1, tuplen); - stup->isnull1 = false; - stup->tuple = NULL; - } - else - { - void *raddr = palloc(tuplen); +static int comparetup_rum(const SortTuple *a, const SortTuple *b, + RumTuplesortstate *state, bool compareItemPointer); +static int comparetup_rum_true(const SortTuple *a, const SortTuple *b, + RumTuplesortstate *state); +static int comparetup_rum_false(const SortTuple *a, const SortTuple *b, + RumTuplesortstate *state); +static int comparetup_rumitem(const SortTuple *a, const SortTuple *b, + RumTuplesortstate *state); +static void copytup_rum(RumTuplesortstate *state, SortTuple *stup, void *tup); +static void copytup_rumitem(RumTuplesortstate *state, SortTuple *stup, + void *tup); +static void *rum_tuplesort_getrum_internal(RumTuplesortstate *state, + bool forward, bool *should_free); - LogicalTapeReadExact(state->tapeset, tapenum, - raddr, tuplen); - stup->datum1 = PointerGetDatum(raddr); - stup->isnull1 = false; - stup->tuple = raddr; - USEMEM(state, GetMemoryChunkSpace(raddr)); - } +/* + * Tuplesortstate handling should be done through this macro. + */ +#if PG_VERSION_NUM >= 160000 +# define TSS_GET(state) TuplesortstateGetPublic((state)) +#else +# define TSS_GET(state) (state) +#endif - if (state->randomAccess) /* need trailing length word? */ - LogicalTapeReadExact(state->tapeset, tapenum, - &tuplen, sizeof(tuplen)); -} +/* + * Logical tape handling should be done through this macro. + */ +#if PG_VERSION_NUM >= 150000 +#define LT_TYPE LogicalTape * +#define LT_ARG tape +#define TAPE(state, LT_ARG) LT_ARG +#else +#define LT_TYPE int +#define LT_ARG tapenum +#define TAPE(state, LT_ARG) state->tapeset, LT_ARG +#endif -static void -reversedirection_datum(RumTuplesortstate *state) -{ - state->onlyKey->ssup_reverse = !state->onlyKey->ssup_reverse; - state->onlyKey->ssup_nulls_first = !state->onlyKey->ssup_nulls_first; -} +/* + * Just for convenience and uniformity. + */ +#if PG_VERSION_NUM >= 110000 +#define tuplesort_begin_common(x,y) tuplesort_begin_common((x), NULL, (y)) +#endif /* - * Convenience routine to free a tuple previously loaded into sort memory + * Trace log wrapper. */ -static void -free_sort_tuple(RumTuplesortstate *state, SortTuple *stup) +#ifdef TRACE_SORT +# define LOG_SORT(...) \ + if (trace_sort) \ + ereport(LOG, errmsg_internal(__VA_ARGS__)) +#else +# define LOG_SORT(...) 
\ + {} +#endif + +static inline int +compare_rum_itempointer(ItemPointerData p1, ItemPointerData p2) { - FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); - pfree(stup->tuple); + if (p1.ip_blkid.bi_hi < p2.ip_blkid.bi_hi) + return -1; + else if (p1.ip_blkid.bi_hi > p2.ip_blkid.bi_hi) + return 1; + + if (p1.ip_blkid.bi_lo < p2.ip_blkid.bi_lo) + return -1; + else if (p1.ip_blkid.bi_lo > p2.ip_blkid.bi_lo) + return 1; + + if (p1.ip_posid < p2.ip_posid) + return -1; + else if (p1.ip_posid > p2.ip_posid) + return 1; + + return 0; } static int -comparetup_rum(const SortTuple *a, const SortTuple *b, RumTuplesortstate *state) +comparetup_rum(const SortTuple *a, const SortTuple *b, + RumTuplesortstate *state, bool compareItemPointer) { RumSortItem *i1, *i2; @@ -4015,7 +172,8 @@ comparetup_rum(const SortTuple *a, const SortTuple *b, RumTuplesortstate *state) i1 = (RumSortItem *) a->tuple; i2 = (RumSortItem *) b->tuple; - for (i = 1; i < state->nKeys; i++) + + for (i = 1; i < TSS_GET(state)->nKeys; i++) { if (i1->data[i] < i2->data[i]) return -1; @@ -4023,97 +181,53 @@ comparetup_rum(const SortTuple *a, const SortTuple *b, RumTuplesortstate *state) return 1; } - if (!state->compareItemPointer) + if (!compareItemPointer) return 0; /* * If key values are equal, we sort on ItemPointer. */ - if (i1->iptr.ip_blkid.bi_hi < i2->iptr.ip_blkid.bi_hi) - return -1; - else if (i1->iptr.ip_blkid.bi_hi > i2->iptr.ip_blkid.bi_hi) - return 1; - - if (i1->iptr.ip_blkid.bi_lo < i2->iptr.ip_blkid.bi_lo) - return -1; - else if (i1->iptr.ip_blkid.bi_lo > i2->iptr.ip_blkid.bi_lo) - return 1; - - if (i1->iptr.ip_posid < i2->iptr.ip_posid) - return -1; - else if (i1->iptr.ip_posid > i2->iptr.ip_posid) - return 1; - - return 0; -} - -static void -copytup_rum(RumTuplesortstate *state, SortTuple *stup, void *tup) -{ - RumSortItem *item = (RumSortItem *) tup; - - stup->datum1 = Float8GetDatum(state->nKeys > 0 ? item->data[0] : 0); - stup->isnull1 = false; - stup->tuple = tup; - USEMEM(state, GetMemoryChunkSpace(tup)); + return compare_rum_itempointer(i1->iptr, i2->iptr); } -static void -writetup_rum(RumTuplesortstate *state, int tapenum, SortTuple *stup) +static int +comparetup_rum_true(const SortTuple *a, const SortTuple *b, + RumTuplesortstate *state) { - RumSortItem *item = (RumSortItem *) stup->tuple; - unsigned int writtenlen = RumSortItemSize(state->nKeys) + sizeof(unsigned int); - - - LogicalTapeWrite(state->tapeset, tapenum, - (void *) &writtenlen, sizeof(writtenlen)); - LogicalTapeWrite(state->tapeset, tapenum, - (void *) item, RumSortItemSize(state->nKeys)); - if (state->randomAccess) /* need trailing length word? */ - LogicalTapeWrite(state->tapeset, tapenum, - (void *) &writtenlen, sizeof(writtenlen)); - - FREEMEM(state, GetMemoryChunkSpace(item)); - pfree(item); + return comparetup_rum(a, b, state, true); } -static void -readtup_rum(RumTuplesortstate *state, SortTuple *stup, - int tapenum, unsigned int len) +static int +comparetup_rum_false(const SortTuple *a, const SortTuple *b, + RumTuplesortstate *state) { - unsigned int tuplen = len - sizeof(unsigned int); - RumSortItem *item = (RumSortItem *) palloc(RumSortItemSize(state->nKeys)); - - Assert(tuplen == RumSortItemSize(state->nKeys)); - - USEMEM(state, GetMemoryChunkSpace(item)); - LogicalTapeReadExact(state->tapeset, tapenum, - (void *) item, RumSortItemSize(state->nKeys)); - stup->datum1 = Float8GetDatum(state->nKeys > 0 ? item->data[0] : 0); - stup->isnull1 = false; - stup->tuple = item; - - if (state->randomAccess) /* need trailing length word? 
*/ - LogicalTapeReadExact(state->tapeset, tapenum, - &tuplen, sizeof(tuplen)); + return comparetup_rum(a, b, state, false); } -static void -reversedirection_rum(RumTuplesortstate *state) +static inline FmgrInfo * +comparetup_rumitem_custom_fun(RumTuplesortstate *state) { - state->reverse = !state->reverse; +#if PG_VERSION_NUM >= 160000 + return (FmgrInfo *) TSS_GET(state)->arg; +#else + return ((RumTuplesortstateExt *) state)->cmp; +#endif } static int -comparetup_rumitem(const SortTuple *a, const SortTuple *b, RumTuplesortstate *state) +comparetup_rumitem(const SortTuple *a, const SortTuple *b, + RumTuplesortstate *state) { - RumItem *i1, *i2; + RumItem *i1, + *i2; + FmgrInfo *cmp; /* Extract RumItem from RumScanItem */ i1 = (RumItem *) a->tuple; i2 = (RumItem *) b->tuple; - if (state->cmp) + cmp = comparetup_rumitem_custom_fun(state); + if (cmp != NULL) { if (i1->addInfoIsNull || i2->addInfoIsNull) { @@ -4123,9 +237,9 @@ comparetup_rumitem(const SortTuple *a, const SortTuple *b, RumTuplesortstate *st } else { - int r; + int r; - r = DatumGetInt32(FunctionCall2(state->cmp, + r = DatumGetInt32(FunctionCall2(cmp, i1->addInfo, i2->addInfo)); @@ -4137,22 +251,19 @@ comparetup_rumitem(const SortTuple *a, const SortTuple *b, RumTuplesortstate *st /* * If key values are equal, we sort on ItemPointer. */ - if (i1->iptr.ip_blkid.bi_hi < i2->iptr.ip_blkid.bi_hi) - return -1; - else if (i1->iptr.ip_blkid.bi_hi > i2->iptr.ip_blkid.bi_hi) - return 1; - - if (i1->iptr.ip_blkid.bi_lo < i2->iptr.ip_blkid.bi_lo) - return -1; - else if (i1->iptr.ip_blkid.bi_lo > i2->iptr.ip_blkid.bi_lo) - return 1; + return compare_rum_itempointer(i1->iptr, i2->iptr); +} - if (i1->iptr.ip_posid < i2->iptr.ip_posid) - return -1; - else if (i1->iptr.ip_posid > i2->iptr.ip_posid) - return 1; +static void +copytup_rum(RumTuplesortstate *state, SortTuple *stup, void *tup) +{ + RumSortItem *item = (RumSortItem *) tup; + int nKeys = TSS_GET(state)->nKeys; - return 0; + stup->datum1 = Float8GetDatum(nKeys > 0 ? item->data[0] : 0); + stup->isnull1 = false; + stup->tuple = tup; + USEMEM(state, GetMemoryChunkSpace(tup)); } static void @@ -4164,41 +275,314 @@ copytup_rumitem(RumTuplesortstate *state, SortTuple *stup, void *tup) USEMEM(state, GetMemoryChunkSpace(stup->tuple)); } +static void readtup_rum(RumTuplesortstate *state, SortTuple *stup, + LT_TYPE LT_ARG, unsigned int len); + +static void readtup_rumitem(RumTuplesortstate *state, SortTuple *stup, + LT_TYPE LT_ARG, unsigned int len); + +static Size +rum_item_size(RumTuplesortstate *state) +{ + if (TSS_GET(state)->readtup == readtup_rum) + return RumSortItemSize(TSS_GET(state)->nKeys); + else if (TSS_GET(state)->readtup == readtup_rumitem) + return sizeof(RumScanItem); + + elog (FATAL, "Unknown RUM state"); + return 0; /* keep compiler quiet */ +} + static void -writetup_rumitem(RumTuplesortstate *state, int tapenum, SortTuple *stup) +writetup_rum_internal(RumTuplesortstate *state, LT_TYPE LT_ARG, + SortTuple *stup) { - RumScanItem *item = (RumScanItem *) stup->tuple; - unsigned int writtenlen = sizeof(*item) + sizeof(unsigned int); + void *item = stup->tuple; + size_t size = rum_item_size(state); + unsigned int writtenlen = size + sizeof(unsigned int); + bool randomAccess; - LogicalTapeWrite(state->tapeset, tapenum, + LogicalTapeWrite(TAPE(state, LT_ARG), (void *) &writtenlen, sizeof(writtenlen)); - LogicalTapeWrite(state->tapeset, tapenum, - (void *) item, sizeof(*item)); - if (state->randomAccess) /* need trailing length word? 
*/ - LogicalTapeWrite(state->tapeset, tapenum, - (void *) &writtenlen, sizeof(writtenlen)); + LogicalTapeWrite(TAPE(state, LT_ARG), + (void *) item, size); - FREEMEM(state, GetMemoryChunkSpace(item)); - pfree(item); + randomAccess = +# if PG_VERSION_NUM >= 150000 + (TSS_GET(state)->sortopt & TUPLESORT_RANDOMACCESS) != 0; +# else + TSS_GET(state)->randomAccess; +# endif + + if (randomAccess) + LogicalTapeWrite(TAPE(TSS_GET(state), LT_ARG), (void *) &writtenlen, + sizeof(writtenlen)); +} + +static void +writetup_rum(RumTuplesortstate *state, LT_TYPE LT_ARG, SortTuple *stup) +{ + writetup_rum_internal(state, LT_ARG, stup); +} + +static void +writetup_rumitem(RumTuplesortstate *state, LT_TYPE LT_ARG, SortTuple *stup) +{ + writetup_rum_internal(state, LT_ARG, stup); } static void -readtup_rumitem(RumTuplesortstate *state, SortTuple *stup, - int tapenum, unsigned int len) +readtup_rum_internal(RumTuplesortstate *state, SortTuple *stup, + LT_TYPE LT_ARG, unsigned int len, bool is_item) { unsigned int tuplen = len - sizeof(unsigned int); - RumScanItem *item = (RumScanItem *) palloc(sizeof(RumScanItem)); + size_t size = rum_item_size(state); + void *item = palloc(size); - Assert(tuplen == sizeof(RumScanItem)); + Assert(tuplen == size); USEMEM(state, GetMemoryChunkSpace(item)); - LogicalTapeReadExact(state->tapeset, tapenum, - (void *) item, tuplen); - stup->isnull1 = true; + +#if PG_VERSION_NUM >= 150000 + LogicalTapeReadExact(LT_ARG, item, size); +#else + LogicalTapeReadExact(TSS_GET(state)->tapeset, LT_ARG, item, size); +#endif stup->tuple = item; + stup->isnull1 = is_item; + + if (!is_item) + stup->datum1 = Float8GetDatum(TSS_GET(state)->nKeys > 0 ? + ((RumSortItem *) item)->data[0] : 0); +#if PG_VERSION_NUM >= 150000 + if (TSS_GET(state)->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing + * length word? */ + LogicalTapeReadExact(LT_ARG, &tuplen, sizeof(tuplen)); +#else + if (TSS_GET(state)->randomAccess) + LogicalTapeReadExact(TSS_GET(state)->tapeset, LT_ARG, &tuplen, + sizeof(tuplen)); +#endif +} + +static void +readtup_rum(RumTuplesortstate *state, SortTuple *stup, LT_TYPE LT_ARG, + unsigned int len) +{ + readtup_rum_internal(state, stup, LT_ARG, len, false); +} + +static void +readtup_rumitem(RumTuplesortstate *state, SortTuple *stup, LT_TYPE LT_ARG, + unsigned int len) +{ + readtup_rum_internal(state, stup, LT_ARG, len, true); +} + +RumTuplesortstate * +rum_tuplesort_begin_rum(int workMem, int nKeys, bool randomAccess, + bool compareItemPointer) +{ +#if PG_VERSION_NUM >= 150000 + RumTuplesortstate *state = tuplesort_begin_common(workMem, + randomAccess ? + TUPLESORT_RANDOMACCESS : + TUPLESORT_NONE); +#else + RumTuplesortstate *state = tuplesort_begin_common(workMem, randomAccess); +#endif + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(TSS_GET(state)->sortcontext); + + LOG_SORT("begin rum sort: nKeys = %d, workMem = %d, randomAccess = %c", + nKeys, workMem, randomAccess ? 't' : 'f'); + + TSS_GET(state)->nKeys = nKeys; + TSS_GET(state)->comparetup = compareItemPointer ? 
comparetup_rum_true : + comparetup_rum_false; + TSS_GET(state)->writetup = writetup_rum; + TSS_GET(state)->readtup = readtup_rum; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +RumTuplesortstate * +rum_tuplesort_begin_rumitem(int workMem, FmgrInfo *cmp) +{ +#if PG_VERSION_NUM >= 160000 + RumTuplesortstate *state = tuplesort_begin_common(workMem, false); + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(TSS_GET(state)->sortcontext); + + LOG_SORT("begin rumitem sort: workMem = %d", workMem); + + TSS_GET(state)->comparetup = comparetup_rumitem; + TSS_GET(state)->writetup = writetup_rumitem; + TSS_GET(state)->readtup = readtup_rumitem; + TSS_GET(state)->arg = cmp; + + MemoryContextSwitchTo(oldcontext); + + return state; +#else + RumTuplesortstate *state = tuplesort_begin_common(workMem, false); + RumTuplesortstateExt *rs; + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(TSS_GET(state)->sortcontext); + + /* Allocate extended state in the same context as state */ + rs = palloc(sizeof(*rs)); + + LOG_SORT("begin rumitem sort: workMem = %d", workMem); + + rs->cmp = cmp; + TSS_GET(state)->comparetup = comparetup_rumitem; + TSS_GET(state)->writetup = writetup_rumitem; + TSS_GET(state)->readtup = readtup_rumitem; + memcpy(&rs->ts, state, sizeof(RumTuplesortstate)); + pfree(state); /* just to be sure *state isn't used anywhere + * else */ + + MemoryContextSwitchTo(oldcontext); + + return (RumTuplesortstate *) rs; +#endif +} + +/* + * rum_tuplesort_end + * + * Release resources and clean up. + * + * NOTE: after calling this, any pointers returned by rum_tuplesort_getXXX are + * pointing to garbage. Be careful not to attempt to use or free such + * pointers afterwards! + */ +void +rum_tuplesort_end(RumTuplesortstate *state) +{ +#if PG_VERSION_NUM < 160000 && PG_VERSION_NUM >= 130000 + tuplesort_free(state); +#else + tuplesort_end(state); +#endif +} + +/* + * Get sort state memory context. Currently it is used only to allocate + * RumSortItem. 
+ */ +MemoryContext +rum_tuplesort_get_memorycontext(RumTuplesortstate *state) +{ + return TSS_GET(state)->sortcontext; +} + +void +rum_tuplesort_putrum(RumTuplesortstate *state, RumSortItem *item) +{ + MemoryContext oldcontext; + SortTuple stup; +#if PG_VERSION_NUM >= 170000 + MinimalTuple tuple = (MinimalTuple)item; + Size tuplen; + TuplesortPublic *base = TuplesortstateGetPublic((TuplesortPublic *)state); +#endif + + oldcontext = MemoryContextSwitchTo(rum_tuplesort_get_memorycontext(state)); + copytup_rum(state, &stup, item); + +#if PG_VERSION_NUM >= 170000 + /* GetMemoryChunkSpace is not supported for bump contexts */ + if (TupleSortUseBumpTupleCxt(base->sortopt)) + tuplen = MAXALIGN(tuple->t_len); + else + tuplen = GetMemoryChunkSpace(tuple); + tuplesort_puttuple_common(state, &stup, false, tuplen); +#elif PG_VERSION_NUM >= 160000 + tuplesort_puttuple_common(state, &stup, false); +#else + puttuple_common(state, &stup); +#endif + + MemoryContextSwitchTo(oldcontext); +} + +void +rum_tuplesort_putrumitem(RumTuplesortstate *state, RumScanItem *item) +{ + MemoryContext oldcontext; + SortTuple stup; +#if PG_VERSION_NUM >= 170000 + MinimalTuple tuple = (MinimalTuple)item; + Size tuplen; + TuplesortPublic *base = TuplesortstateGetPublic((TuplesortPublic *)state); +#endif + + oldcontext = MemoryContextSwitchTo(rum_tuplesort_get_memorycontext(state)); + copytup_rumitem(state, &stup, item); + +#if PG_VERSION_NUM >= 170000 + /* GetMemoryChunkSpace is not supported for bump contexts */ + if (TupleSortUseBumpTupleCxt(base->sortopt)) + tuplen = MAXALIGN(tuple->t_len); + else + tuplen = GetMemoryChunkSpace(tuple); + tuplesort_puttuple_common(state, &stup, false, tuplen); +#elif PG_VERSION_NUM >= 160000 + tuplesort_puttuple_common(state, &stup, false); +#else + puttuple_common(state, &stup); +#endif + + MemoryContextSwitchTo(oldcontext); +} + +void +rum_tuplesort_performsort(RumTuplesortstate *state) +{ + tuplesort_performsort(state); +} - if (state->randomAccess) /* need trailing length word? */ - LogicalTapeReadExact(state->tapeset, tapenum, - &tuplen, sizeof(tuplen)); +/* + * Internal routine to fetch the next index tuple in either forward or back + * direction. Returns NULL if no more tuples. Returned tuple belongs to + * tuplesort memory context. Caller may not rely on tuple remaining valid after + * any further manipulation of tuplesort. + * + * If *should_free is set, the caller must pfree stup.tuple when done with it. + * + * NOTE: in PG 10 and newer tuple is always allocated tuple in tuplesort context + * and should not be freed by caller. + */ +static void * +rum_tuplesort_getrum_internal(RumTuplesortstate *state, bool forward, + bool *should_free) +{ +#if PG_VERSION_NUM >= 100000 + *should_free = false; + return (RumSortItem *)tuplesort_getindextuple(state, forward); +#else + return (RumSortItem *)tuplesort_getindextuple(state, forward, should_free); +#endif } +RumSortItem * +rum_tuplesort_getrum(RumTuplesortstate *state, bool forward, bool *should_free) +{ + return (RumSortItem *) rum_tuplesort_getrum_internal(state, forward, + should_free); +} + +RumScanItem * +rum_tuplesort_getrumitem(RumTuplesortstate *state, bool forward, + bool *should_free) +{ + return (RumScanItem *) rum_tuplesort_getrum_internal(state, forward, + should_free); +} diff --git a/src/rumsort.h b/src/rumsort.h index 9cb74feeab..160aa5c8da 100644 --- a/src/rumsort.h +++ b/src/rumsort.h @@ -3,13 +3,11 @@ * rumsort.h * Generalized tuple sorting routines. 
 *
- * This module handles sorting of heap tuples, index tuples, or single
- * Datums (and could easily support other kinds of sortable objects,
- * if necessary). It works efficiently for both small and large amounts
- * of data. Small amounts are sorted in-memory using qsort(). Large
- * amounts are sorted using temporary files and a standard external sort
- * algorithm.
+ * This module handles sorting of RumSortItem or RumScanItem structures.
+ * It contains copies of static functions from
+ * src/backend/utils/sort/tuplesort.c.
 *
+ * Portions Copyright (c) 2015-2024, Postgres Professional
 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
@@ -22,43 +20,14 @@
 #include "postgres.h"
 #include "fmgr.h"
-#include "access/itup.h"
 #include "executor/tuptable.h"
-#include "utils/relcache.h"
-/* Tuplesortstate is an opaque type whose details are not known outside
- * tuplesort.c.
+/* RumTuplesortstate is an opaque type whose details are not known outside
+ * rumsort.c.
 */
-typedef struct RumTuplesortstate RumTuplesortstate;
+typedef struct Tuplesortstate RumTuplesortstate;
 struct RumScanItem;
-/*
- * We provide multiple interfaces to what is essentially the same code,
- * since different callers have different data to be sorted and want to
- * specify the sort key information differently. There are two APIs for
- * sorting HeapTuples and two more for sorting IndexTuples. Yet another
- * API supports sorting bare Datums.
- *
- * The "heap" API actually stores/sorts MinimalTuples, which means it doesn't
- * preserve the system columns (tuple identity and transaction visibility
- * info). The sort keys are specified by column numbers within the tuples
- * and sort operator OIDs. We save some cycles by passing and returning the
- * tuples in TupleTableSlots, rather than forming actual HeapTuples (which'd
- * have to be converted to MinimalTuples). This API works well for sorts
- * executed as parts of plan trees.
- *
- * The "cluster" API stores/sorts full HeapTuples including all visibility
- * info. The sort keys are specified by reference to a btree index that is
- * defined on the relation to be sorted. Note that putheaptuple/getheaptuple
- * go with this API, not the "begin_heap" one!
- *
- * The "index_btree" API stores/sorts IndexTuples (preserving all their
- * header fields). The sort keys are specified by a btree index definition.
- *
- * The "index_hash" API is similar to index_btree, but the tuples are
- * actually sorted by their hash codes not the raw data.
- */ - typedef struct { ItemPointerData iptr; @@ -69,52 +38,16 @@ typedef struct #define RumSortItemSize(nKeys) (offsetof(RumSortItem,data)+(nKeys)*sizeof(float8)) extern MemoryContext rum_tuplesort_get_memorycontext(RumTuplesortstate *state); -extern RumTuplesortstate *rum_tuplesort_begin_heap(TupleDesc tupDesc, - int nkeys, AttrNumber *attNums, - Oid *sortOperators, Oid *sortCollations, - bool *nullsFirstFlags, - int workMem, bool randomAccess); -extern RumTuplesortstate *rum_tuplesort_begin_cluster(TupleDesc tupDesc, - Relation indexRel, - int workMem, bool randomAccess); -extern RumTuplesortstate *rum_tuplesort_begin_index_btree(Relation heapRel, - Relation indexRel, - bool enforceUnique, - int workMem, bool randomAccess); -extern RumTuplesortstate *rum_tuplesort_begin_index_hash(Relation heapRel, - Relation indexRel, - uint32 hash_mask, - int workMem, bool randomAccess); -extern RumTuplesortstate *rum_tuplesort_begin_datum(Oid datumType, - Oid sortOperator, Oid sortCollation, - bool nullsFirstFlag, - int workMem, bool randomAccess); extern RumTuplesortstate *rum_tuplesort_begin_rum(int workMem, int nKeys, bool randomAccess, bool compareItemPointer); extern RumTuplesortstate *rum_tuplesort_begin_rumitem(int workMem, FmgrInfo *cmp); -extern void rum_tuplesort_set_bound(RumTuplesortstate *state, int64 bound); - -extern void rum_tuplesort_puttupleslot(RumTuplesortstate *state, - TupleTableSlot *slot); -extern void rum_tuplesort_putheaptuple(RumTuplesortstate *state, HeapTuple tup); -extern void rum_tuplesort_putindextuple(RumTuplesortstate *state, IndexTuple tuple); -extern void rum_tuplesort_putdatum(RumTuplesortstate *state, Datum val, - bool isNull); extern void rum_tuplesort_putrum(RumTuplesortstate *state, RumSortItem * item); extern void rum_tuplesort_putrumitem(RumTuplesortstate *state, struct RumScanItem * item); extern void rum_tuplesort_performsort(RumTuplesortstate *state); -extern bool rum_tuplesort_gettupleslot(RumTuplesortstate *state, bool forward, - TupleTableSlot *slot); -extern HeapTuple rum_tuplesort_getheaptuple(RumTuplesortstate *state, bool forward, - bool *should_free); -extern IndexTuple rum_tuplesort_getindextuple(RumTuplesortstate *state, bool forward, - bool *should_free); -extern bool rum_tuplesort_getdatum(RumTuplesortstate *state, bool forward, - Datum *val, bool *isNull); extern RumSortItem *rum_tuplesort_getrum(RumTuplesortstate *state, bool forward, bool *should_free); extern struct RumScanItem *rum_tuplesort_getrumitem(RumTuplesortstate *state, bool forward, @@ -122,21 +55,4 @@ extern struct RumScanItem *rum_tuplesort_getrumitem(RumTuplesortstate *state, bo extern void rum_tuplesort_end(RumTuplesortstate *state); -extern void rum_tuplesort_get_stats(RumTuplesortstate *state, - const char **sortMethod, - const char **spaceType, - long *spaceUsed); - -extern int rum_tuplesort_merge_order(long allowedMem); - -/* - * These routines may only be called if randomAccess was specified 'true'. - * Likewise, backwards scan in gettuple/getdatum is only allowed if - * randomAccess was specified. - */ - -extern void rum_tuplesort_rescan(RumTuplesortstate *state); -extern void rum_tuplesort_markpos(RumTuplesortstate *state); -extern void rum_tuplesort_restorepos(RumTuplesortstate *state); - #endif /* RUMSORT_H */ diff --git a/src/rumtsquery.c b/src/rumtsquery.c index 205526ff37..6c6b3c86d0 100644 --- a/src/rumtsquery.c +++ b/src/rumtsquery.c @@ -3,7 +3,7 @@ * rumtsquery.c * Inverted fulltext search: indexing tsqueries. 
* - * Portions Copyright (c) 2015-2016, Postgres Professional + * Portions Copyright (c) 2015-2024, Postgres Professional * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group * *------------------------------------------------------------------------- @@ -108,6 +108,7 @@ make_query_item_wrap(QueryItem *item, QueryItemWrap *parent, bool not) } case OP_PHRASE: elog(ERROR, "Indexing of phrase tsqueries isn't supported yet"); + break; default: elog(ERROR, "Invalid tsquery operator"); } diff --git a/src/rumutil.c b/src/rumutil.c index 838674882f..4a239c85c7 100644 --- a/src/rumutil.c +++ b/src/rumutil.c @@ -4,7 +4,7 @@ * utilities routines for the postgres inverted index access method. * * - * Portions Copyright (c) 2015-2016, Postgres Professional + * Portions Copyright (c) 2015-2024, Postgres Professional * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -83,13 +83,25 @@ _PG_init(void) add_string_reloption(rum_relopt_kind, "attach", "Column name to attach as additional info", - NULL, NULL); + NULL, NULL +#if PG_VERSION_NUM >= 130000 + , AccessExclusiveLock +#endif + ); add_string_reloption(rum_relopt_kind, "to", "Column name to add a order by column", - NULL, NULL); + NULL, NULL +#if PG_VERSION_NUM >= 130000 + , AccessExclusiveLock +#endif + ); add_bool_reloption(rum_relopt_kind, "order_by_attach", "Use (addinfo, itempointer) order instead of just itempointer", - false); + false +#if PG_VERSION_NUM >= 130000 + , AccessExclusiveLock +#endif + ); } /* @@ -199,6 +211,9 @@ initRumState(RumState * state, Relation index) if (!AttributeNumberIsValid(state->attrnAddToColumn)) elog(ERROR, "attribute \"%s\" is not found in index", colname); + + if (state->attrnAddToColumn == state->attrnAttachColumn) + elog(ERROR, "column \"%s\" and attached column cannot be the same", colname); } if (!(AttributeNumberIsValid(state->attrnAttachColumn) && @@ -239,13 +254,20 @@ initRumState(RumState * state, Relation index) if (OidIsValid(rumConfig->addInfoTypeOid)) elog(ERROR, "AddTo could should not have AddInfo"); + if (state->useAlternativeOrder && origAddAttr->attbyval == false) + elog(ERROR, "doesn't support order index over pass-by-reference column"); + rumConfig->addInfoTypeOid = origAddAttr->atttypid; } if (state->oneCol) { state->tupdesc[i] = CreateTemplateTupleDesc( +#if PG_VERSION_NUM >= 120000 + OidIsValid(rumConfig->addInfoTypeOid) ? 2 : 1); +#else OidIsValid(rumConfig->addInfoTypeOid) ? 2 : 1, false); +#endif TupleDescInitEntry(state->tupdesc[i], (AttrNumber) 1, NULL, origAttr->atttypid, origAttr->atttypmod, @@ -266,7 +288,11 @@ initRumState(RumState * state, Relation index) else { state->tupdesc[i] = CreateTemplateTupleDesc( +#if PG_VERSION_NUM >= 120000 + OidIsValid(rumConfig->addInfoTypeOid) ? 3 : 2); +#else OidIsValid(rumConfig->addInfoTypeOid) ? 
3 : 2, false); +#endif TupleDescInitEntry(state->tupdesc[i], (AttrNumber) 1, NULL, INT2OID, -1, 0); TupleDescInitEntry(state->tupdesc[i], (AttrNumber) 2, NULL, @@ -557,6 +583,7 @@ RumInitPage(Page page, uint32 f, Size pageSize) opaque->flags = f; opaque->leftlink = InvalidBlockNumber; opaque->rightlink = InvalidBlockNumber; + RumItemSetMin(RumDataPageGetRightBound(page)); } void @@ -863,14 +890,15 @@ rumExtractEntries(RumState * rumstate, OffsetNumber attnum, bytea * rumoptions(Datum reloptions, bool validate) { - relopt_value *options; - RumOptions *rdopts; - int numoptions; static const relopt_parse_elt tab[] = { {"attach", RELOPT_TYPE_STRING, offsetof(RumOptions, attachColumn)}, {"to", RELOPT_TYPE_STRING, offsetof(RumOptions, addToColumn)}, {"order_by_attach", RELOPT_TYPE_BOOL, offsetof(RumOptions, useAlternativeOrder)} }; +#if PG_VERSION_NUM < 130000 + relopt_value *options; + RumOptions *rdopts; + int numoptions; options = parseRelOptions(reloptions, validate, rum_relopt_kind, &numoptions); @@ -887,6 +915,10 @@ rumoptions(Datum reloptions, bool validate) pfree(options); return (bytea *) rdopts; +#else + return (bytea *) build_reloptions(reloptions, validate, rum_relopt_kind, + sizeof(RumOptions), tab, lengthof(tab)); +#endif } bool @@ -1045,8 +1077,40 @@ FunctionCall10Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2, Datum arg6, Datum arg7, Datum arg8, Datum arg9, Datum arg10) { - FunctionCallInfoData fcinfo; Datum result; +#if PG_VERSION_NUM >= 120000 + LOCAL_FCINFO(fcinfo, 10); + + InitFunctionCallInfoData(*fcinfo, flinfo, 10, collation, NULL, NULL); + + fcinfo->args[0].value = arg1; + fcinfo->args[0].isnull = false; + fcinfo->args[1].value = arg2; + fcinfo->args[1].isnull = false; + fcinfo->args[2].value = arg3; + fcinfo->args[2].isnull = false; + fcinfo->args[3].value = arg4; + fcinfo->args[3].isnull = false; + fcinfo->args[4].value = arg5; + fcinfo->args[4].isnull = false; + fcinfo->args[5].value = arg6; + fcinfo->args[5].isnull = false; + fcinfo->args[6].value = arg7; + fcinfo->args[6].isnull = false; + fcinfo->args[7].value = arg8; + fcinfo->args[7].isnull = false; + fcinfo->args[8].value = arg9; + fcinfo->args[8].isnull = false; + fcinfo->args[9].value = arg10; + fcinfo->args[9].isnull = false; + + result = FunctionCallInvoke(fcinfo); + + /* Check for null result, since caller is clearly not expecting one */ + if (fcinfo->isnull) + elog(ERROR, "function %u returned NULL", fcinfo->flinfo->fn_oid); +#else + FunctionCallInfoData fcinfo; InitFunctionCallInfoData(fcinfo, flinfo, 10, collation, NULL, NULL); @@ -1076,6 +1140,7 @@ FunctionCall10Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2, /* Check for null result, since caller is clearly not expecting one */ if (fcinfo.isnull) elog(ERROR, "function %u returned NULL", fcinfo.flinfo->fn_oid); +#endif return result; } diff --git a/src/rumvacuum.c b/src/rumvacuum.c index 32662c65f6..fd5e4206b4 100644 --- a/src/rumvacuum.c +++ b/src/rumvacuum.c @@ -4,7 +4,7 @@ * delete & vacuum routines for the postgres RUM * * - * Portions Copyright (c) 2015-2016, Postgres Professional + * Portions Copyright (c) 2015-2024, Postgres Professional * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -53,6 +53,7 @@ rumVacuumPostingList(RumVacuumState * gvs, OffsetNumber attnum, Pointer src, prev, ptr = src; + *newSize = 0; ItemPointerSetMin(&item.iptr); /* @@ -63,7 +64,7 @@ rumVacuumPostingList(RumVacuumState * gvs, OffsetNumber attnum, 
Pointer src, for (i = 0; i < nitem; i++) { prev = ptr; - ptr = rumDataPageLeafRead(ptr, attnum, &item, &gvs->rumstate); + ptr = rumDataPageLeafRead(ptr, attnum, &item, false, &gvs->rumstate); if (gvs->callback(&item.iptr, gvs->callback_state)) { gvs->result->tuples_removed += 1; diff --git a/src/rumvalidate.c b/src/rumvalidate.c index 39c2f5c1e1..0adbb10ac7 100644 --- a/src/rumvalidate.c +++ b/src/rumvalidate.c @@ -3,7 +3,7 @@ * rumvalidate.c * Opclass validator for RUM. * - * Portions Copyright (c) 2015-2016, Postgres Professional + * Portions Copyright (c) 2015-2024, Postgres Professional * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * diff --git a/src/tuplesort10.c b/src/tuplesort10.c new file mode 100644 index 0000000000..5a829a9240 --- /dev/null +++ b/src/tuplesort10.c @@ -0,0 +1,4469 @@ +/*------------------------------------------------------------------------- + * + * tuplesort.c + * Generalized tuple sorting routines. + * + * This module handles sorting of heap tuples, index tuples, or single + * Datums (and could easily support other kinds of sortable objects, + * if necessary). It works efficiently for both small and large amounts + * of data. Small amounts are sorted in-memory using qsort(). Large + * amounts are sorted using temporary files and a standard external sort + * algorithm. + * + * See Knuth, volume 3, for more than you want to know about the external + * sorting algorithm. Historically, we divided the input into sorted runs + * using replacement selection, in the form of a priority tree implemented + * as a heap (essentially his Algorithm 5.2.3H), but now we only do that + * for the first run, and only if the run would otherwise end up being very + * short. We merge the runs using polyphase merge, Knuth's Algorithm + * 5.4.2D. The logical "tapes" used by Algorithm D are implemented by + * logtape.c, which avoids space wastage by recycling disk space as soon + * as each block is read from its "tape". + * + * We do not use Knuth's recommended data structure (Algorithm 5.4.1R) for + * the replacement selection, because it uses a fixed number of records + * in memory at all times. Since we are dealing with tuples that may vary + * considerably in size, we want to be able to vary the number of records + * kept in memory to ensure full utilization of the allowed sort memory + * space. So, we keep the tuples in a variable-size heap, with the next + * record to go out at the top of the heap. Like Algorithm 5.4.1R, each + * record is stored with the run number that it must go into, and we use + * (run number, key) as the ordering key for the heap. When the run number + * at the top of the heap changes, we know that no more records of the prior + * run are left in the heap. Note that there are in practice only ever two + * distinct run numbers, because since PostgreSQL 9.6, we only use + * replacement selection to form the first run. + * + * In PostgreSQL 9.6, a heap (based on Knuth's Algorithm H, with some small + * customizations) is only used with the aim of producing just one run, + * thereby avoiding all merging. Only the first run can use replacement + * selection, which is why there are now only two possible valid run + * numbers, and why heapification is customized to not distinguish between + * tuples in the second run (those will be quicksorted). 
We generally + * prefer a simple hybrid sort-merge strategy, where runs are sorted in much + * the same way as the entire input of an internal sort is sorted (using + * qsort()). The replacement_sort_tuples GUC controls the limited remaining + * use of replacement selection for the first run. + * + * There are several reasons to favor a hybrid sort-merge strategy. + * Maintaining a priority tree/heap has poor CPU cache characteristics. + * Furthermore, the growth in main memory sizes has greatly diminished the + * value of having runs that are larger than available memory, even in the + * case where there is partially sorted input and runs can be made far + * larger by using a heap. In most cases, a single-pass merge step is all + * that is required even when runs are no larger than available memory. + * Avoiding multiple merge passes was traditionally considered to be the + * major advantage of using replacement selection. + * + * The approximate amount of memory allowed for any one sort operation + * is specified in kilobytes by the caller (most pass work_mem). Initially, + * we absorb tuples and simply store them in an unsorted array as long as + * we haven't exceeded workMem. If we reach the end of the input without + * exceeding workMem, we sort the array using qsort() and subsequently return + * tuples just by scanning the tuple array sequentially. If we do exceed + * workMem, we begin to emit tuples into sorted runs in temporary tapes. + * When tuples are dumped in batch after quicksorting, we begin a new run + * with a new output tape (selected per Algorithm D). After the end of the + * input is reached, we dump out remaining tuples in memory into a final run + * (or two, when replacement selection is still used), then merge the runs + * using Algorithm D. + * + * When merging runs, we use a heap containing just the frontmost tuple from + * each source run; we repeatedly output the smallest tuple and replace it + * with the next tuple from its source tape (if any). When the heap empties, + * the merge is complete. The basic merge algorithm thus needs very little + * memory --- only M tuples for an M-way merge, and M is constrained to a + * small number. However, we can still make good use of our full workMem + * allocation by pre-reading additional blocks from each source tape. Without + * prereading, our access pattern to the temporary file would be very erratic; + * on average we'd read one block from each of M source tapes during the same + * time that we're writing M blocks to the output tape, so there is no + * sequentiality of access at all, defeating the read-ahead methods used by + * most Unix kernels. Worse, the output tape gets written into a very random + * sequence of blocks of the temp file, ensuring that things will be even + * worse when it comes time to read that tape. A straightforward merge pass + * thus ends up doing a lot of waiting for disk seeks. We can improve matters + * by prereading from each source tape sequentially, loading about workMem/M + * bytes from each tape in turn, and making the sequential blocks immediately + * available for reuse. This approach helps to localize both read and write + * accesses. The pre-reading is handled by logtape.c, we just tell it how + * much memory to use for the buffers. + * + * When the caller requests random access to the sort result, we form + * the final sorted run on a logical tape which is then "frozen", so + * that we can access it randomly. 
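The run-then-merge scheme this header comment describes can be reduced to a toy example in plain C. The sketch below is illustrative only and not part of the patch: each fixed-length "run" is quicksorted in memory, and the merge step repeatedly emits the smallest head element; the real code keeps the per-tape head tuples in a binary heap instead of scanning them, and the runs live on logical tapes rather than in one array.

#include <stdio.h>
#include <stdlib.h>
#include <limits.h>

static int
cmp_int(const void *a, const void *b)
{
    int ia = *(const int *) a;
    int ib = *(const int *) b;

    return (ia > ib) - (ia < ib);
}

int
main(void)
{
    int data[] = {9, 4, 7, 1, 8, 2, 6, 3};
    int nruns = 2;
    int runlen = 4;
    int pos[2] = {0, 0};

    /* Step 1: form sorted runs (tuplesort quicksorts each run). */
    for (int r = 0; r < nruns; r++)
        qsort(data + r * runlen, runlen, sizeof(int), cmp_int);

    /* Step 2: merge, always emitting the smallest not-yet-output head. */
    for (int emitted = 0; emitted < nruns * runlen; emitted++)
    {
        int best = -1;
        int bestval = INT_MAX;

        for (int r = 0; r < nruns; r++)
        {
            if (pos[r] < runlen && data[r * runlen + pos[r]] < bestval)
            {
                best = r;
                bestval = data[r * runlen + pos[r]];
            }
        }
        pos[best]++;
        printf("%d ", bestval);        /* prints 1 2 3 4 6 7 8 9 */
    }
    putchar('\n');
    return 0;
}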
When the caller does not need random + * access, we return from tuplesort_performsort() as soon as we are down + * to one run per logical tape. The final merge is then performed + * on-the-fly as the caller repeatedly calls tuplesort_getXXX; this + * saves one cycle of writing all the data out to disk and reading it in. + * + * Before Postgres 8.2, we always used a seven-tape polyphase merge, on the + * grounds that 7 is the "sweet spot" on the tapes-to-passes curve according + * to Knuth's figure 70 (section 5.4.2). However, Knuth is assuming that + * tape drives are expensive beasts, and in particular that there will always + * be many more runs than tape drives. In our implementation a "tape drive" + * doesn't cost much more than a few Kb of memory buffers, so we can afford + * to have lots of them. In particular, if we can have as many tape drives + * as sorted runs, we can eliminate any repeated I/O at all. In the current + * code we determine the number of tapes M on the basis of workMem: we want + * workMem/M to be large enough that we read a fair amount of data each time + * we preread from a tape, so as to maintain the locality of access described + * above. Nonetheless, with large workMem we can have many tapes (but not + * too many -- see the comments in tuplesort_merge_order). + * + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/utils/sort/tuplesort.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <limits.h> + +#include "access/htup_details.h" +#include "access/nbtree.h" +#include "access/hash.h" +#include "catalog/index.h" +#include "catalog/pg_am.h" +#include "commands/tablespace.h" +#include "executor/executor.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "utils/datum.h" +#include "utils/logtape.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/pg_rusage.h" +#include "utils/rel.h" +#include "utils/sortsupport.h" +#include "utils/tuplesort.h" + +/* Should be the last include */ +#include "disable_core_macro.h" + +/* sort-type codes for sort__start probes */ +#define HEAP_SORT 0 +#define INDEX_SORT 1 +#define DATUM_SORT 2 +#define CLUSTER_SORT 3 + +/* GUC variables */ +#ifdef TRACE_SORT +bool trace_sort = false; +#endif + +#ifdef DEBUG_BOUNDED_SORT +bool optimize_bounded_sort = true; +#endif + + +/* + * The objects we actually sort are SortTuple structs. These contain + * a pointer to the tuple proper (might be a MinimalTuple or IndexTuple), + * which is a separate palloc chunk --- we assume it is just one chunk and + * can be freed by a simple pfree() (except during merge, when we use a + * simple slab allocator). SortTuples also contain the tuple's first key + * column in Datum/nullflag format, and an index integer. + * + * Storing the first key column lets us save heap_getattr or index_getattr + * calls during tuple comparisons. We could extract and save all the key + * columns not just the first, but this would increase code complexity and + * overhead, and wouldn't actually save any comparison cycles in the common + * case where the first key determines the comparison result. Note that + * for a pass-by-reference datatype, datum1 points into the "tuple" storage.
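A stripped-down illustration of the point about caching the first key column (not part of the patch; MiniSortTuple and mini_compare are invented names, and the real SortTuple stores the key as a Datum compared through SortSupport): in the common case the comparison is decided without dereferencing the full tuple at all.

typedef struct
{
    void   *tuple;      /* full tuple, only examined on ties */
    long    key1;       /* cached copy of the first sort key */
} MiniSortTuple;

static int
mini_compare(const MiniSortTuple *a, const MiniSortTuple *b)
{
    if (a->key1 < b->key1)
        return -1;
    if (a->key1 > b->key1)
        return 1;

    /*
     * Only on a first-key tie would the remaining columns be fetched from
     * a->tuple and b->tuple (omitted here).
     */
    return 0;
}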
+ * + * There is one special case: when the sort support infrastructure provides an + * "abbreviated key" representation, where the key is (typically) a pass by + * value proxy for a pass by reference type. In this case, the abbreviated key + * is stored in datum1 in place of the actual first key column. + * + * When sorting single Datums, the data value is represented directly by + * datum1/isnull1 for pass by value types (or null values). If the datatype is + * pass-by-reference and isnull1 is false, then "tuple" points to a separately + * palloc'd data value, otherwise "tuple" is NULL. The value of datum1 is then + * either the same pointer as "tuple", or is an abbreviated key value as + * described above. Accordingly, "tuple" is always used in preference to + * datum1 as the authoritative value for pass-by-reference cases. + * + * While building initial runs, tupindex holds the tuple's run number. + * Historically, the run number could meaningfully distinguish many runs, but + * it now only distinguishes RUN_FIRST and HEAP_RUN_NEXT, since replacement + * selection is always abandoned after the first run; no other run number + * should be represented here. During merge passes, we re-use it to hold the + * input tape number that each tuple in the heap was read from. tupindex goes + * unused if the sort occurs entirely in memory. + */ +typedef struct +{ + void *tuple; /* the tuple itself */ + Datum datum1; /* value of first key column */ + bool isnull1; /* is first key column NULL? */ + int tupindex; /* see notes above */ +} SortTuple; + +/* + * During merge, we use a pre-allocated set of fixed-size slots to hold + * tuples. To avoid palloc/pfree overhead. + * + * Merge doesn't require a lot of memory, so we can afford to waste some, + * by using gratuitously-sized slots. If a tuple is larger than 1 kB, the + * palloc() overhead is not significant anymore. + * + * 'nextfree' is valid when this chunk is in the free list. When in use, the + * slot holds a tuple. + */ +#define SLAB_SLOT_SIZE 1024 + +typedef union SlabSlot +{ + union SlabSlot *nextfree; + char buffer[SLAB_SLOT_SIZE]; +} SlabSlot; + +/* + * Possible states of a Tuplesort object. These denote the states that + * persist between calls of Tuplesort routines. + */ +typedef enum +{ + TSS_INITIAL, /* Loading tuples; still within memory limit */ + TSS_BOUNDED, /* Loading tuples into bounded-size heap */ + TSS_BUILDRUNS, /* Loading tuples; writing to tape */ + TSS_SORTEDINMEM, /* Sort completed entirely in memory */ + TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ + TSS_FINALMERGE /* Performing final merge on-the-fly */ +} TupSortStatus; + +/* + * Parameters for calculation of number of tapes to use --- see inittapes() + * and tuplesort_merge_order(). + * + * In this calculation we assume that each tape will cost us about 1 blocks + * worth of buffer space. This ignores the overhead of all the other data + * structures needed for each tape, but it's probably close enough. + * + * MERGE_BUFFER_SIZE is how much data we'd like to read from each input + * tape during a preread cycle (see discussion at top of file). + */ +#define MINORDER 6 /* minimum merge order */ +#define MAXORDER 500 /* maximum merge order */ +#define TAPE_BUFFER_OVERHEAD BLCKSZ +#define MERGE_BUFFER_SIZE (BLCKSZ * 32) + + /* + * Run numbers, used during external sort operations. + * + * HEAP_RUN_NEXT is only used for SortTuple.tupindex, never state.currentRun. 
+ */ +#define RUN_FIRST 0 +#define HEAP_RUN_NEXT INT_MAX +#define RUN_SECOND 1 + +typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); + +/* + * Private state of a Tuplesort operation. + */ +struct Tuplesortstate +{ + TupSortStatus status; /* enumerated value as shown above */ + int nKeys; /* number of columns in sort key */ + bool randomAccess; /* did caller request random access? */ + bool bounded; /* did caller specify a maximum number of + * tuples to return? */ + bool boundUsed; /* true if we made use of a bounded heap */ + int bound; /* if bounded, the maximum number of tuples */ + bool tuples; /* Can SortTuple.tuple ever be set? */ + int64 availMem; /* remaining memory available, in bytes */ + int64 allowedMem; /* total memory allowed, in bytes */ + int maxTapes; /* number of tapes (Knuth's T) */ + int tapeRange; /* maxTapes-1 (Knuth's P) */ + MemoryContext sortcontext; /* memory context holding most sort data */ + MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */ + LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ + + /* + * These function pointers decouple the routines that must know what kind + * of tuple we are sorting from the routines that don't need to know it. + * They are set up by the tuplesort_begin_xxx routines. + * + * Function to compare two tuples; result is per qsort() convention, ie: + * <0, 0, >0 according as a<b, a=b, a>b. The API must match + * qsort_arg_comparator. + */ + SortTupleComparator comparetup; + + /* + * Function to copy a supplied input tuple into palloc'd space and set up + * its SortTuple representation (ie, set tuple/datum1/isnull1). Also, + * state->availMem must be decreased by the amount of space used for the + * tuple copy (note the SortTuple struct itself is not counted). + */ + void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup); + + /* + * Function to write a stored tuple onto tape. The representation of the + * tuple on tape need not be the same as it is in memory; requirements on + * the tape representation are given below. Unless the slab allocator is + * used, after writing the tuple, pfree() the out-of-line data (not the + * SortTuple struct!), and increase state->availMem by the amount of + * memory space thereby released. + */ + void (*writetup) (Tuplesortstate *state, int tapenum, + SortTuple *stup); + + /* + * Function to read a stored tuple from tape back into memory. 'len' is + * the already-read length of the stored tuple. The tuple is allocated + * from the slab memory arena, or is palloc'd, see readtup_alloc(). + */ + void (*readtup) (Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); + + /* + * This array holds the tuples now in sort memory. If we are in state + * INITIAL, the tuples are in no particular order; if we are in state + * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS + * and FINALMERGE, the tuples are organized in "heap" order per Algorithm + * H. In state SORTEDONTAPE, the array is not used. + */ + SortTuple *memtuples; /* array of SortTuple structs */ + int memtupcount; /* number of tuples currently present */ + int memtupsize; /* allocated length of memtuples array */ + bool growmemtuples; /* memtuples' growth still underway? */ + + /* + * Memory for tuples is sometimes allocated using a simple slab allocator, + * rather than with palloc(). Currently, we switch to slab allocation + * when we start merging.
Merging only needs to keep a small, fixed + * number of tuples in memory at any time, so we can avoid the + * palloc/pfree overhead by recycling a fixed number of fixed-size slots + * to hold the tuples. + * + * For the slab, we use one large allocation, divided into SLAB_SLOT_SIZE + * slots. The allocation is sized to have one slot per tape, plus one + * additional slot. We need that many slots to hold all the tuples kept + * in the heap during merge, plus the one we have last returned from the + * sort, with tuplesort_gettuple. + * + * Initially, all the slots are kept in a linked list of free slots. When + * a tuple is read from a tape, it is put to the next available slot, if + * it fits. If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd + * instead. + * + * When we're done processing a tuple, we return the slot back to the free + * list, or pfree() if it was palloc'd. We know that a tuple was + * allocated from the slab, if its pointer value is between + * slabMemoryBegin and -End. + * + * When the slab allocator is used, the USEMEM/LACKMEM mechanism of + * tracking memory usage is not used. + */ + bool slabAllocatorUsed; + + char *slabMemoryBegin; /* beginning of slab memory arena */ + char *slabMemoryEnd; /* end of slab memory arena */ + SlabSlot *slabFreeHead; /* head of free list */ + + /* Buffer size to use for reading input tapes, during merge. */ + size_t read_buffer_size; + + /* + * When we return a tuple to the caller in tuplesort_gettuple_XXX, that + * came from a tape (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE + * modes), we remember the tuple in 'lastReturnedTuple', so that we can + * recycle the memory on next gettuple call. + */ + void *lastReturnedTuple; + + /* + * While building initial runs, this indicates if the replacement + * selection strategy is in use. When it isn't, then a simple hybrid + * sort-merge strategy is in use instead (runs are quicksorted). + */ + bool replaceActive; + + /* + * While building initial runs, this is the current output run number + * (starting at RUN_FIRST). Afterwards, it is the number of initial runs + * we made. + */ + int currentRun; + + /* + * Unless otherwise noted, all pointer variables below are pointers to + * arrays of length maxTapes, holding per-tape data. + */ + + /* + * This variable is only used during merge passes. mergeactive[i] is true + * if we are reading an input run from (actual) tape number i and have not + * yet exhausted that run. + */ + bool *mergeactive; /* active input run source? */ + + /* + * Variables for Algorithm D. Note that destTape is a "logical" tape + * number, ie, an index into the tp_xxx[] arrays. Be careful to keep + * "logical" and "actual" tape numbers straight! + */ + int Level; /* Knuth's l */ + int destTape; /* current output tape (Knuth's j, less 1) */ + int *tp_fib; /* Target Fibonacci run counts (A[]) */ + int *tp_runs; /* # of real runs on each tape */ + int *tp_dummy; /* # of dummy runs for each tape (D[]) */ + int *tp_tapenum; /* Actual tape numbers (TAPE[]) */ + int activeTapes; /* # of active input tapes in merge pass */ + + /* + * These variables are used after completion of sorting to keep track of + * the next tuple to return. (In the tape case, the tape's current read + * position is also critical state.) 
+ */ + int result_tape; /* actual tape number of finished output */ + int current; /* array index (only used if SORTEDINMEM) */ + bool eof_reached; /* reached EOF (needed for cursors) */ + + /* markpos_xxx holds marked position for mark and restore */ + long markpos_block; /* tape block# (only used if SORTEDONTAPE) */ + int markpos_offset; /* saved "current", or offset in tape block */ + bool markpos_eof; /* saved "eof_reached" */ + + /* + * The sortKeys variable is used by every case other than the hash index + * case; it is set by tuplesort_begin_xxx. tupDesc is only used by the + * MinimalTuple and CLUSTER routines, though. + */ + TupleDesc tupDesc; + SortSupport sortKeys; /* array of length nKeys */ + + /* + * This variable is shared by the single-key MinimalTuple case and the + * Datum case (which both use qsort_ssup()). Otherwise it's NULL. + */ + SortSupport onlyKey; + + /* + * Additional state for managing "abbreviated key" sortsupport routines + * (which currently may be used by all cases except the hash index case). + * Tracks the intervals at which the optimization's effectiveness is + * tested. + */ + int64 abbrevNext; /* Tuple # at which to next check + * applicability */ + + /* + * These variables are specific to the CLUSTER case; they are set by + * tuplesort_begin_cluster. + */ + IndexInfo *indexInfo; /* info about index being used for reference */ + EState *estate; /* for evaluating index expressions */ + + /* + * These variables are specific to the IndexTuple case; they are set by + * tuplesort_begin_index_xxx and used only by the IndexTuple routines. + */ + Relation heapRel; /* table the index is being built on */ + Relation indexRel; /* index being built */ + + /* These are specific to the index_btree subcase: */ + bool enforceUnique; /* complain if we find duplicate tuples */ + + /* These are specific to the index_hash subcase: */ + uint32 high_mask; /* masks for sortable part of hash code */ + uint32 low_mask; + uint32 max_buckets; + + /* + * These variables are specific to the Datum case; they are set by + * tuplesort_begin_datum and used only by the DatumTuple routines. + */ + Oid datumType; + /* we need typelen in order to know how to copy the Datums. */ + int datumTypeLen; + + /* + * Resource snapshot for time of sort start. + */ +#ifdef TRACE_SORT + PGRUsage ru_start; +#endif +}; + +/* + * Is the given tuple allocated from the slab memory arena? + */ +#define IS_SLAB_SLOT(state, tuple) \ + ((char *) (tuple) >= (state)->slabMemoryBegin && \ + (char *) (tuple) < (state)->slabMemoryEnd) + +/* + * Return the given tuple to the slab memory free list, or free it + * if it was palloc'd. 
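The free-list recycling just described follows the usual fixed-size slab pattern. Below is a self-contained sketch that is not taken from the patch: the names (slab_init, slab_get, slab_release) and the 4-slot arena are made up, and oversized requests fall back to the general allocator exactly as tuples larger than SLAB_SLOT_SIZE are palloc'd.

#include <stdlib.h>

#define SLOT_SIZE   1024
#define NSLOTS      4

typedef union Slot
{
    union Slot *nextfree;       /* valid while the slot sits on the free list */
    char        buffer[SLOT_SIZE];
} Slot;

static Slot *arena;
static Slot *freehead;

static void
slab_init(void)
{
    arena = malloc(NSLOTS * sizeof(Slot));
    freehead = NULL;
    for (int i = 0; i < NSLOTS; i++)
    {
        arena[i].nextfree = freehead;   /* push every slot onto the free list */
        freehead = &arena[i];
    }
}

static void *
slab_get(size_t len)
{
    if (len <= SLOT_SIZE && freehead != NULL)
    {
        Slot *s = freehead;

        freehead = s->nextfree;
        return s->buffer;
    }
    return malloc(len);         /* oversized: fall back to the heap */
}

static void
slab_release(void *p)
{
    if ((char *) p >= (char *) arena && (char *) p < (char *) (arena + NSLOTS))
    {
        Slot *s = (Slot *) p;

        s->nextfree = freehead; /* back onto the free list */
        freehead = s;
    }
    else
        free(p);                /* was not a slab slot */
}

int
main(void)
{
    slab_init();
    slab_release(slab_get(100));    /* served from a recycled slot */
    slab_release(slab_get(4096));   /* oversized, malloc'd and freed */
    return 0;
}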
+ */ +#define RELEASE_SLAB_SLOT(state, tuple) \ + do { \ + SlabSlot *buf = (SlabSlot *) tuple; \ + \ + if (IS_SLAB_SLOT((state), buf)) \ + { \ + buf->nextfree = (state)->slabFreeHead; \ + (state)->slabFreeHead = buf; \ + } else \ + pfree(buf); \ + } while(0) + +#define COMPARETUP(state,a,b) ((*(state)->comparetup) (a, b, state)) +#define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup)) +#define WRITETUP(state,tape,stup) ((*(state)->writetup) (state, tape, stup)) +#define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len)) +#define LACKMEM(state) ((state)->availMem < 0 && !(state)->slabAllocatorUsed) +#define USEMEM(state,amt) ((state)->availMem -= (amt)) +#define FREEMEM(state,amt) ((state)->availMem += (amt)) + +/* + * NOTES about on-tape representation of tuples: + * + * We require the first "unsigned int" of a stored tuple to be the total size + * on-tape of the tuple, including itself (so it is never zero; an all-zero + * unsigned int is used to delimit runs). The remainder of the stored tuple + * may or may not match the in-memory representation of the tuple --- + * any conversion needed is the job of the writetup and readtup routines. + * + * If state->randomAccess is true, then the stored representation of the + * tuple must be followed by another "unsigned int" that is a copy of the + * length --- so the total tape space used is actually sizeof(unsigned int) + * more than the stored length value. This allows read-backwards. When + * randomAccess is not true, the write/read routines may omit the extra + * length word. + * + * writetup is expected to write both length words as well as the tuple + * data. When readtup is called, the tape is positioned just after the + * front length word; readtup must read the tuple data and advance past + * the back length word (if present). + * + * The write/read routines can make use of the tuple description data + * stored in the Tuplesortstate record, if needed. They are also expected + * to adjust state->availMem by the amount of memory space (not tape space!) + * released or consumed. There is no error return from either writetup + * or readtup; they should ereport() on failure. + * + * + * NOTES about memory consumption calculations: + * + * We count space allocated for tuples against the workMem limit, plus + * the space used by the variable-size memtuples array. Fixed-size space + * is not counted; it's small enough to not be interesting. + * + * Note that we count actual space used (as shown by GetMemoryChunkSpace) + * rather than the originally-requested size. This is important since + * palloc can add substantial overhead. It's not a complete answer since + * we won't count any wasted space in palloc allocation blocks, but it's + * a lot better than what we were doing before 7.3. As of 9.6, a + * separate memory context is used for caller passed tuples. Resetting + * it at certain key increments significantly ameliorates fragmentation. + * Note that this places a responsibility on readtup and copytup routines + * to use the right memory context for these tuples (and to not use the + * reset context for anything whose lifetime needs to span multiple + * external sort runs). 
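The on-tape format in the NOTES above boils down to a leading length word that counts itself plus the tuple body, with a trailing copy of that word when randomAccess was requested so the tape can be read backwards. A sketch of a writer for that layout, not from the patch, writing into a plain memory buffer instead of going through logtape.c:

#include <stdbool.h>
#include <string.h>

static void
write_stored_tuple(char *dest, const void *body, unsigned int bodylen,
                   bool randomAccess)
{
    unsigned int len = sizeof(unsigned int) + bodylen;  /* length word counts itself */

    memcpy(dest, &len, sizeof(unsigned int));           /* leading length word */
    memcpy(dest + sizeof(unsigned int), body, bodylen); /* tuple body */
    if (randomAccess)
        memcpy(dest + len, &len, sizeof(unsigned int)); /* trailing copy, for reading backwards */
}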
+ */ + +/* When using this macro, beware of double evaluation of len */ +#define LogicalTapeReadExact(tapeset, tapenum, ptr, len) \ + do { \ + if (LogicalTapeRead(tapeset, tapenum, ptr, len) != (size_t) (len)) \ + elog(ERROR, "unexpected end of data"); \ + } while(0) + + +static Tuplesortstate *tuplesort_begin_common(int workMem, bool randomAccess); +static void puttuple_common(Tuplesortstate *state, SortTuple *tuple); +static bool consider_abort_common(Tuplesortstate *state); +static bool useselection(Tuplesortstate *state); +static void inittapes(Tuplesortstate *state); +static void selectnewtape(Tuplesortstate *state); +static void init_slab_allocator(Tuplesortstate *state, int numSlots); +static void mergeruns(Tuplesortstate *state); +static void mergeonerun(Tuplesortstate *state); +static void beginmerge(Tuplesortstate *state); +static bool mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup); +static void dumptuples(Tuplesortstate *state, bool alltuples); +static void dumpbatch(Tuplesortstate *state, bool alltuples); +static void make_bounded_heap(Tuplesortstate *state); +static void sort_bounded_heap(Tuplesortstate *state); +static void tuplesort_sort_memtuples(Tuplesortstate *state); +static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple, + bool checkIndex); +static void tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple, + bool checkIndex); +static void tuplesort_heap_delete_top(Tuplesortstate *state, bool checkIndex); +static void reversedirection(Tuplesortstate *state); +static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK); +static void markrunend(Tuplesortstate *state, int tapenum); +static void *readtup_alloc(Tuplesortstate *state, Size tuplen); +static int comparetup_heap(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_heap(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_heap(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int comparetup_cluster(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_cluster(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_cluster(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int comparetup_index_btree(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static int comparetup_index_hash(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_index(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_index(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int comparetup_datum(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_datum(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_datum(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup); + +/* + * Special versions of qsort just for SortTuple objects. qsort_tuple() sorts + * any variant of SortTuples, using the appropriate comparetup function. 
+ * qsort_ssup() is specialized for the case where the comparetup function + * reduces to ApplySortComparator(), that is single-key MinimalTuple sorts + * and Datum sorts. + */ +#include "qsort_tuple.c" + + +/* + * tuplesort_begin_xxx + * + * Initialize for a tuple sort operation. + * + * After calling tuplesort_begin, the caller should call tuplesort_putXXX + * zero or more times, then call tuplesort_performsort when all the tuples + * have been supplied. After performsort, retrieve the tuples in sorted + * order by calling tuplesort_getXXX until it returns false/NULL. (If random + * access was requested, rescan, markpos, and restorepos can also be called.) + * Call tuplesort_end to terminate the operation and release memory/disk space. + * + * Each variant of tuplesort_begin has a workMem parameter specifying the + * maximum number of kilobytes of RAM to use before spilling data to disk. + * (The normal value of this parameter is work_mem, but some callers use + * other values.) Each variant also has a randomAccess parameter specifying + * whether the caller needs non-sequential access to the sort result. + */ + +static Tuplesortstate * +tuplesort_begin_common(int workMem, bool randomAccess) +{ + Tuplesortstate *state; + MemoryContext sortcontext; + MemoryContext tuplecontext; + MemoryContext oldcontext; + + /* + * Create a working memory context for this sort operation. All data + * needed by the sort will live inside this context. + */ + sortcontext = AllocSetContextCreate(CurrentMemoryContext, + "TupleSort main", + ALLOCSET_DEFAULT_SIZES); + + /* + * Caller tuple (e.g. IndexTuple) memory context. + * + * A dedicated child context used exclusively for caller passed tuples + * eases memory management. Resetting at key points reduces + * fragmentation. Note that the memtuples array of SortTuples is allocated + * in the parent context, not this context, because there is no need to + * free memtuples early. + */ + tuplecontext = AllocSetContextCreate(sortcontext, + "Caller tuples", + ALLOCSET_DEFAULT_SIZES); + + /* + * Make the Tuplesortstate within the per-sort context. This way, we + * don't need a separate pfree() operation for it at shutdown. + */ + oldcontext = MemoryContextSwitchTo(sortcontext); + + state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate)); + +#ifdef TRACE_SORT + if (trace_sort) + pg_rusage_init(&state->ru_start); +#endif + + state->status = TSS_INITIAL; + state->randomAccess = randomAccess; + state->bounded = false; + state->tuples = true; + state->boundUsed = false; + state->allowedMem = workMem * (int64) 1024; + state->availMem = state->allowedMem; + state->sortcontext = sortcontext; + state->tuplecontext = tuplecontext; + state->tapeset = NULL; + + state->memtupcount = 0; + + /* + * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD; + * see comments in grow_memtuples(). 
+ */ + state->memtupsize = Max(1024, + ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1); + + state->growmemtuples = true; + state->slabAllocatorUsed = false; + state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple)); + + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + + /* workMem must be large enough for the minimal memtuples array */ + if (LACKMEM(state)) + elog(ERROR, "insufficient memory allowed for sort"); + + state->currentRun = RUN_FIRST; + + /* + * maxTapes, tapeRange, and Algorithm D variables will be initialized by + * inittapes(), if needed + */ + + state->result_tape = -1; /* flag that result tape has not been formed */ + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_heap(TupleDesc tupDesc, + int nkeys, AttrNumber *attNums, + Oid *sortOperators, Oid *sortCollations, + bool *nullsFirstFlags, + int workMem, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess); + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + + AssertArg(nkeys > 0); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", + nkeys, workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = nkeys; + + TRACE_POSTGRESQL_SORT_START(HEAP_SORT, + false, /* no unique check */ + nkeys, + workMem, + randomAccess); + + state->comparetup = comparetup_heap; + state->copytup = copytup_heap; + state->writetup = writetup_heap; + state->readtup = readtup_heap; + + state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + state->abbrevNext = 10; + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData)); + + for (i = 0; i < nkeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + + AssertArg(attNums[i] != 0); + AssertArg(sortOperators[i] != 0); + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = sortCollations[i]; + sortKey->ssup_nulls_first = nullsFirstFlags[i]; + sortKey->ssup_attno = attNums[i]; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey); + } + + /* + * The "onlyKey" optimization cannot be used with abbreviated keys, since + * tie-breaker comparisons may be required. Typically, the optimization + * is only of value to pass-by-value types anyway, whereas abbreviated + * keys are typically only of value to pass-by-reference types. + */ + if (nkeys == 1 && !state->sortKeys->abbrev_converter) + state->onlyKey = state->sortKeys; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_cluster(TupleDesc tupDesc, + Relation indexRel, + int workMem, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess); + ScanKey indexScanKey; + MemoryContext oldcontext; + int i; + + Assert(indexRel->rd_rel->relam == BTREE_AM_OID); + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", + RelationGetNumberOfAttributes(indexRel), + workMem, randomAccess ? 
't' : 'f'); +#endif + + state->nKeys = RelationGetNumberOfAttributes(indexRel); + + TRACE_POSTGRESQL_SORT_START(CLUSTER_SORT, + false, /* no unique check */ + state->nKeys, + workMem, + randomAccess); + + state->comparetup = comparetup_cluster; + state->copytup = copytup_cluster; + state->writetup = writetup_cluster; + state->readtup = readtup_cluster; + state->abbrevNext = 10; + + state->indexInfo = BuildIndexInfo(indexRel); + + state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + + indexScanKey = _bt_mkscankey_nodata(indexRel); + + if (state->indexInfo->ii_Expressions != NULL) + { + TupleTableSlot *slot; + ExprContext *econtext; + + /* + * We will need to use FormIndexDatum to evaluate the index + * expressions. To do that, we need an EState, as well as a + * TupleTableSlot to put the table tuples into. The econtext's + * scantuple has to point to that slot, too. + */ + state->estate = CreateExecutorState(); + slot = MakeSingleTupleTableSlot(tupDesc); + econtext = GetPerTupleExprContext(state->estate); + econtext->ecxt_scantuple = slot; + } + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + ScanKey scanKey = indexScanKey + i; + int16 strategy; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = scanKey->sk_collation; + sortKey->ssup_nulls_first = + (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; + sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + AssertState(sortKey->ssup_attno != 0); + + strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? + BTGreaterStrategyNumber : BTLessStrategyNumber; + + PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); + } + + _bt_freeskey(indexScanKey); + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_btree(Relation heapRel, + Relation indexRel, + bool enforceUnique, + int workMem, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess); + ScanKey indexScanKey; + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: unique = %c, workMem = %d, randomAccess = %c", + enforceUnique ? 't' : 'f', + workMem, randomAccess ? 
't' : 'f'); +#endif + + state->nKeys = RelationGetNumberOfAttributes(indexRel); + + TRACE_POSTGRESQL_SORT_START(INDEX_SORT, + enforceUnique, + state->nKeys, + workMem, + randomAccess); + + state->comparetup = comparetup_index_btree; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + state->abbrevNext = 10; + + state->heapRel = heapRel; + state->indexRel = indexRel; + state->enforceUnique = enforceUnique; + + indexScanKey = _bt_mkscankey_nodata(indexRel); + state->nKeys = RelationGetNumberOfAttributes(indexRel); + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + ScanKey scanKey = indexScanKey + i; + int16 strategy; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = scanKey->sk_collation; + sortKey->ssup_nulls_first = + (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; + sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + AssertState(sortKey->ssup_attno != 0); + + strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? + BTGreaterStrategyNumber : BTLessStrategyNumber; + + PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); + } + + _bt_freeskey(indexScanKey); + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_hash(Relation heapRel, + Relation indexRel, + uint32 high_mask, + uint32 low_mask, + uint32 max_buckets, + int workMem, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess); + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: high_mask = 0x%x, low_mask = 0x%x, " + "max_buckets = 0x%x, workMem = %d, randomAccess = %c", + high_mask, + low_mask, + max_buckets, + workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = 1; /* Only one sort column, the hash code */ + + state->comparetup = comparetup_index_hash; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + + state->heapRel = heapRel; + state->indexRel = indexRel; + + state->high_mask = high_mask; + state->low_mask = low_mask; + state->max_buckets = max_buckets; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, + bool nullsFirstFlag, + int workMem, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess); + MemoryContext oldcontext; + int16 typlen; + bool typbyval; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin datum sort: workMem = %d, randomAccess = %c", + workMem, randomAccess ? 
't' : 'f'); +#endif + + state->nKeys = 1; /* always a one-column sort */ + + TRACE_POSTGRESQL_SORT_START(DATUM_SORT, + false, /* no unique check */ + 1, + workMem, + randomAccess); + + state->comparetup = comparetup_datum; + state->copytup = copytup_datum; + state->writetup = writetup_datum; + state->readtup = readtup_datum; + state->abbrevNext = 10; + + state->datumType = datumType; + + /* lookup necessary attributes of the datum type */ + get_typlenbyval(datumType, &typlen, &typbyval); + state->datumTypeLen = typlen; + state->tuples = !typbyval; + + /* Prepare SortSupport data */ + state->sortKeys = (SortSupport) palloc0(sizeof(SortSupportData)); + + state->sortKeys->ssup_cxt = CurrentMemoryContext; + state->sortKeys->ssup_collation = sortCollation; + state->sortKeys->ssup_nulls_first = nullsFirstFlag; + + /* + * Abbreviation is possible here only for by-reference types. In theory, + * a pass-by-value datatype could have an abbreviated form that is cheaper + * to compare. In a tuple sort, we could support that, because we can + * always extract the original datum from the tuple is needed. Here, we + * can't, because a datum sort only stores a single copy of the datum; the + * "tuple" field of each sortTuple is NULL. + */ + state->sortKeys->abbreviate = !typbyval; + + PrepareSortSupportFromOrderingOp(sortOperator, state->sortKeys); + + /* + * The "onlyKey" optimization cannot be used with abbreviated keys, since + * tie-breaker comparisons may be required. Typically, the optimization + * is only of value to pass-by-value types anyway, whereas abbreviated + * keys are typically only of value to pass-by-reference types. + */ + if (!state->sortKeys->abbrev_converter) + state->onlyKey = state->sortKeys; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +/* + * tuplesort_set_bound + * + * Advise tuplesort that at most the first N result tuples are required. + * + * Must be called before inserting any tuples. (Actually, we could allow it + * as long as the sort hasn't spilled to disk, but there seems no need for + * delayed calls at the moment.) + * + * This is a hint only. The tuplesort may still return more tuples than + * requested. + */ +void +tuplesort_set_bound(Tuplesortstate *state, int64 bound) +{ + /* Assert we're called before loading any tuples */ + Assert(state->status == TSS_INITIAL); + Assert(state->memtupcount == 0); + Assert(!state->bounded); + +#ifdef DEBUG_BOUNDED_SORT + /* Honor GUC setting that disables the feature (for easy testing) */ + if (!optimize_bounded_sort) + return; +#endif + + /* We want to be able to compute bound * 2, so limit the setting */ + if (bound > (int64) (INT_MAX / 2)) + return; + + state->bounded = true; + state->bound = (int) bound; + + /* + * Bounded sorts are not an effective target for abbreviated key + * optimization. Disable by setting state to be consistent with no + * abbreviation support. + */ + state->sortKeys->abbrev_converter = NULL; + if (state->sortKeys->abbrev_full_comparator) + state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator; + + /* Not strictly necessary, but be tidy */ + state->sortKeys->abbrev_abort = NULL; + state->sortKeys->abbrev_full_comparator = NULL; +} + +/* + * tuplesort_end + * + * Release resources and clean up. + * + * NOTE: after calling this, any pointers returned by tuplesort_getXXX are + * pointing to garbage. Be careful not to attempt to use or free such + * pointers afterwards! 
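Taken together with the begin / put / performsort / end contract described earlier in this file, a caller-side use of tuplesort_set_bound() for a single-Datum sort could look like the sketch below. It is illustrative only and not part of the patch: the values, the bound of 10 and the use of work_mem are arbitrary, Int4LessOperator is assumed to be the int4 "<" operator OID macro from pg_operator.h, and retrieval through the matching tuplesort_getdatum() calls is omitted.

#include "postgres.h"

#include "catalog/pg_operator.h"
#include "catalog/pg_type.h"
#include "miscadmin.h"
#include "utils/tuplesort.h"

/* Hypothetical helper: keep only the 10 smallest of some int4 values. */
static void
bounded_datum_sort_example(const int32 *values, int nvalues)
{
    Tuplesortstate *ts;
    int         i;

    ts = tuplesort_begin_datum(INT4OID,
                               Int4LessOperator,    /* ascending by "<" */
                               InvalidOid,          /* default collation */
                               false,               /* nulls last */
                               work_mem,
                               false);              /* no random access */

    /* Must be called before any tuples are loaded. */
    tuplesort_set_bound(ts, 10);

    for (i = 0; i < nvalues; i++)
        tuplesort_putdatum(ts, Int32GetDatum(values[i]), false);

    tuplesort_performsort(ts);

    /* ... fetch the smallest values with the corresponding get calls ... */

    tuplesort_end(ts);
}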
+ */ +void +tuplesort_end(Tuplesortstate *state) +{ + /* context swap probably not needed, but let's be safe */ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + long spaceUsed; + + if (state->tapeset) + spaceUsed = LogicalTapeSetBlocks(state->tapeset); + else + spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; +#endif + + /* + * Delete temporary "tape" files, if any. + * + * Note: want to include this in reported total cost of sort, hence need + * for two #ifdef TRACE_SORT sections. + */ + if (state->tapeset) + LogicalTapeSetClose(state->tapeset); + +#ifdef TRACE_SORT + if (trace_sort) + { + if (state->tapeset) + elog(LOG, "external sort ended, %ld disk blocks used: %s", + spaceUsed, pg_rusage_show(&state->ru_start)); + else + elog(LOG, "internal sort ended, %ld KB used: %s", + spaceUsed, pg_rusage_show(&state->ru_start)); + } + + TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, spaceUsed); +#else + + /* + * If you disabled TRACE_SORT, you can still probe sort__done, but you + * ain't getting space-used stats. + */ + TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, 0L); +#endif + + /* Free any execution state created for CLUSTER case */ + if (state->estate != NULL) + { + ExprContext *econtext = GetPerTupleExprContext(state->estate); + + ExecDropSingleTupleTableSlot(econtext->ecxt_scantuple); + FreeExecutorState(state->estate); + } + + MemoryContextSwitchTo(oldcontext); + + /* + * Free the per-sort memory context, thereby releasing all working memory, + * including the Tuplesortstate struct itself. + */ + MemoryContextDelete(state->sortcontext); +} + +/* + * Grow the memtuples[] array, if possible within our memory constraint. We + * must not exceed INT_MAX tuples in memory or the caller-provided memory + * limit. Return TRUE if we were able to enlarge the array, FALSE if not. + * + * Normally, at each increment we double the size of the array. When doing + * that would exceed a limit, we attempt one last, smaller increase (and then + * clear the growmemtuples flag so we don't try any more). That allows us to + * use memory as fully as permitted; sticking to the pure doubling rule could + * result in almost half going unused. Because availMem moves around with + * tuple addition/removal, we need some rule to prevent making repeated small + * increases in memtupsize, which would just be useless thrashing. The + * growmemtuples flag accomplishes that and also prevents useless + * recalculations in this function. + */ +static bool +grow_memtuples(Tuplesortstate *state) +{ + int newmemtupsize; + int memtupsize = state->memtupsize; + int64 memNowUsed = state->allowedMem - state->availMem; + + /* Forget it if we've already maxed out memtuples, per comment above */ + if (!state->growmemtuples) + return false; + + /* Select new value of memtupsize */ + if (memNowUsed <= state->availMem) + { + /* + * We've used no more than half of allowedMem; double our usage, + * clamping at INT_MAX tuples. + */ + if (memtupsize < INT_MAX / 2) + newmemtupsize = memtupsize * 2; + else + { + newmemtupsize = INT_MAX; + state->growmemtuples = false; + } + } + else + { + /* + * This will be the last increment of memtupsize. Abandon doubling + * strategy and instead increase as much as we safely can. + * + * To stay within allowedMem, we can't increase memtupsize by more + * than availMem / sizeof(SortTuple) elements. 
In practice, we want + * to increase it by considerably less, because we need to leave some + * space for the tuples to which the new array slots will refer. We + * assume the new tuples will be about the same size as the tuples + * we've already seen, and thus we can extrapolate from the space + * consumption so far to estimate an appropriate new size for the + * memtuples array. The optimal value might be higher or lower than + * this estimate, but it's hard to know that in advance. We again + * clamp at INT_MAX tuples. + * + * This calculation is safe against enlarging the array so much that + * LACKMEM becomes true, because the memory currently used includes + * the present array; thus, there would be enough allowedMem for the + * new array elements even if no other memory were currently used. + * + * We do the arithmetic in float8, because otherwise the product of + * memtupsize and allowedMem could overflow. Any inaccuracy in the + * result should be insignificant; but even if we computed a + * completely insane result, the checks below will prevent anything + * really bad from happening. + */ + double grow_ratio; + + grow_ratio = (double) state->allowedMem / (double) memNowUsed; + if (memtupsize * grow_ratio < INT_MAX) + newmemtupsize = (int) (memtupsize * grow_ratio); + else + newmemtupsize = INT_MAX; + + /* We won't make any further enlargement attempts */ + state->growmemtuples = false; + } + + /* Must enlarge array by at least one element, else report failure */ + if (newmemtupsize <= memtupsize) + goto noalloc; + + /* + * On a 32-bit machine, allowedMem could exceed MaxAllocHugeSize. Clamp + * to ensure our request won't be rejected. Note that we can easily + * exhaust address space before facing this outcome. (This is presently + * impossible due to guc.c's MAX_KILOBYTES limitation on work_mem, but + * don't rely on that at this distance.) + */ + if ((Size) newmemtupsize >= MaxAllocHugeSize / sizeof(SortTuple)) + { + newmemtupsize = (int) (MaxAllocHugeSize / sizeof(SortTuple)); + state->growmemtuples = false; /* can't grow any more */ + } + + /* + * We need to be sure that we do not cause LACKMEM to become true, else + * the space management algorithm will go nuts. The code above should + * never generate a dangerous request, but to be safe, check explicitly + * that the array growth fits within availMem. (We could still cause + * LACKMEM if the memory chunk overhead associated with the memtuples + * array were to increase. That shouldn't happen because we chose the + * initial array size large enough to ensure that palloc will be treating + * both old and new arrays as separate chunks. But we'll check LACKMEM + * explicitly below just in case.) + */ + if (state->availMem < (int64) ((newmemtupsize - memtupsize) * sizeof(SortTuple))) + goto noalloc; + + /* OK, do it */ + FREEMEM(state, GetMemoryChunkSpace(state->memtuples)); + state->memtupsize = newmemtupsize; + state->memtuples = (SortTuple *) + repalloc_huge(state->memtuples, + state->memtupsize * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + if (LACKMEM(state)) + elog(ERROR, "unexpected out-of-memory situation in tuplesort"); + return true; + +noalloc: + /* If for any reason we didn't realloc, shut off future attempts */ + state->growmemtuples = false; + return false; +} + +/* + * Accept one tuple while collecting input data for sort. + * + * Note that the input data is always copied; the caller need not save it. 
+ */ +void +tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + /* + * Copy the given tuple into memory we control, and decrease availMem. + * Then call the common code. + */ + COPYTUP(state, &stup, (void *) slot); + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Accept one tuple while collecting input data for sort. + * + * Note that the input data is always copied; the caller need not save it. + */ +void +tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + /* + * Copy the given tuple into memory we control, and decrease availMem. + * Then call the common code. + */ + COPYTUP(state, &stup, (void *) tup); + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Collect one index tuple while collecting input data for sort, building + * it from caller-supplied values. + */ +void +tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel, + ItemPointer self, Datum *values, + bool *isnull) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + SortTuple stup; + Datum original; + IndexTuple tuple; + + stup.tuple = index_form_tuple(RelationGetDescr(rel), values, isnull); + tuple = ((IndexTuple) stup.tuple); + tuple->t_tid = *self; + USEMEM(state, GetMemoryChunkSpace(stup.tuple)); + /* set up first-column key value */ + original = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup.isnull1); + + MemoryContextSwitchTo(state->sortcontext); + + if (!state->sortKeys || !state->sortKeys->abbrev_converter || stup.isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup.datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup.datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup.datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = mtup->tuple; + mtup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &mtup->isnull1); + } + } + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Accept one Datum while collecting input data for sort. + * + * If the Datum is pass-by-ref type, the value will be copied. 
+ */ +void +tuplesort_putdatum(Tuplesortstate *state, Datum val, bool isNull) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + SortTuple stup; + + /* + * Pass-by-value types or null values are just stored directly in + * stup.datum1 (and stup.tuple is not used and set to NULL). + * + * Non-null pass-by-reference values need to be copied into memory we + * control, and possibly abbreviated. The copied value is pointed to by + * stup.tuple and is treated as the canonical copy (e.g. to return via + * tuplesort_getdatum or when writing to tape); stup.datum1 gets the + * abbreviated value if abbreviation is happening, otherwise it's + * identical to stup.tuple. + */ + + if (isNull || !state->tuples) + { + /* + * Set datum1 to zeroed representation for NULLs (to be consistent, + * and to support cheap inequality tests for NULL abbreviated keys). + */ + stup.datum1 = !isNull ? val : (Datum) 0; + stup.isnull1 = isNull; + stup.tuple = NULL; /* no separate storage */ + MemoryContextSwitchTo(state->sortcontext); + } + else + { + Datum original = datumCopy(val, false, state->datumTypeLen); + + stup.isnull1 = false; + stup.tuple = DatumGetPointer(original); + USEMEM(state, GetMemoryChunkSpace(stup.tuple)); + MemoryContextSwitchTo(state->sortcontext); + + if (!state->sortKeys->abbrev_converter) + { + stup.datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup.datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup.datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any + * case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + mtup->datum1 = PointerGetDatum(mtup->tuple); + } + } + } + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Shared code for tuple and datum cases. + */ +static void +puttuple_common(Tuplesortstate *state, SortTuple *tuple) +{ + switch (state->status) + { + case TSS_INITIAL: + + /* + * Save the tuple into the unsorted array. First, grow the array + * as needed. Note that we try to grow the array when there is + * still one free slot remaining --- if we fail, there'll still be + * room to store the incoming tuple, and then we'll switch to + * tape-based operation. + */ + if (state->memtupcount >= state->memtupsize - 1) + { + (void) grow_memtuples(state); + Assert(state->memtupcount < state->memtupsize); + } + state->memtuples[state->memtupcount++] = *tuple; + + /* + * Check if it's time to switch over to a bounded heapsort. We do + * so if the input tuple count exceeds twice the desired tuple + * count (this is a heuristic for where heapsort becomes cheaper + * than a quicksort), or if we've just filled workMem and have + * enough tuples to meet the bound. + * + * Note that once we enter TSS_BOUNDED state we will always try to + * complete the sort that way. In the worst case, if later input + * tuples are larger than earlier ones, this might cause us to + * exceed workMem significantly. 
+ */ + if (state->bounded && + (state->memtupcount > state->bound * 2 || + (state->memtupcount > state->bound && LACKMEM(state)))) + { +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "switching to bounded heapsort at %d tuples: %s", + state->memtupcount, + pg_rusage_show(&state->ru_start)); +#endif + make_bounded_heap(state); + return; + } + + /* + * Done if we still fit in available memory and have array slots. + */ + if (state->memtupcount < state->memtupsize && !LACKMEM(state)) + return; + + /* + * Nope; time to switch to tape-based operation. + */ + inittapes(state); + + /* + * Dump tuples until we are back under the limit. + */ + dumptuples(state, false); + break; + + case TSS_BOUNDED: + + /* + * We don't want to grow the array here, so check whether the new + * tuple can be discarded before putting it in. This should be a + * good speed optimization, too, since when there are many more + * input tuples than the bound, most input tuples can be discarded + * with just this one comparison. Note that because we currently + * have the sort direction reversed, we must check for <= not >=. + */ + if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0) + { + /* new tuple <= top of the heap, so we can discard it */ + free_sort_tuple(state, tuple); + CHECK_FOR_INTERRUPTS(); + } + else + { + /* discard top of heap, replacing it with the new tuple */ + free_sort_tuple(state, &state->memtuples[0]); + tuple->tupindex = 0; /* not used */ + tuplesort_heap_replace_top(state, tuple, false); + } + break; + + case TSS_BUILDRUNS: + + /* + * Insert the tuple into the heap, with run number currentRun if + * it can go into the current run, else HEAP_RUN_NEXT. The tuple + * can go into the current run if it is >= the first + * not-yet-output tuple. (Actually, it could go into the current + * run if it is >= the most recently output tuple ... but that + * would require keeping around the tuple we last output, and it's + * simplest to let writetup free each tuple as soon as it's + * written.) + * + * Note that this only applies when: + * + * - currentRun is RUN_FIRST + * + * - Replacement selection is in use (typically it is never used). + * + * When these two conditions are not both true, all tuples are + * appended indifferently, much like the TSS_INITIAL case. + * + * There should always be room to store the incoming tuple. + */ + Assert(!state->replaceActive || state->memtupcount > 0); + if (state->replaceActive && + COMPARETUP(state, tuple, &state->memtuples[0]) >= 0) + { + Assert(state->currentRun == RUN_FIRST); + + /* + * Insert tuple into first, fully heapified run. + * + * Unlike classic replacement selection, which this module was + * previously based on, only RUN_FIRST tuples are fully + * heapified. Any second/next run tuples are appended + * indifferently. While HEAP_RUN_NEXT tuples may be sifted + * out of the way of first run tuples, COMPARETUP() will never + * be called for the run's tuples during sifting (only our + * initial COMPARETUP() call is required for the tuple, to + * determine that the tuple does not belong in RUN_FIRST). + */ + tuple->tupindex = state->currentRun; + tuplesort_heap_insert(state, tuple, true); + } + else + { + /* + * Tuple was determined to not belong to heapified RUN_FIRST, + * or replacement selection not in play. Append the tuple to + * memtuples indifferently. + * + * dumptuples() does not trust that the next run's tuples are + * heapified. Anything past the first run will always be + * quicksorted even when replacement selection is initially + * used. 
(When it's never used, every tuple still takes this + * path.) + */ + tuple->tupindex = HEAP_RUN_NEXT; + state->memtuples[state->memtupcount++] = *tuple; + } + + /* + * If we are over the memory limit, dump tuples till we're under. + */ + dumptuples(state, false); + break; + + default: + elog(ERROR, "invalid tuplesort state"); + break; + } +} + +static bool +consider_abort_common(Tuplesortstate *state) +{ + Assert(state->sortKeys[0].abbrev_converter != NULL); + Assert(state->sortKeys[0].abbrev_abort != NULL); + Assert(state->sortKeys[0].abbrev_full_comparator != NULL); + + /* + * Check effectiveness of abbreviation optimization. Consider aborting + * when still within memory limit. + */ + if (state->status == TSS_INITIAL && + state->memtupcount >= state->abbrevNext) + { + state->abbrevNext *= 2; + + /* + * Check opclass-supplied abbreviation abort routine. It may indicate + * that abbreviation should not proceed. + */ + if (!state->sortKeys->abbrev_abort(state->memtupcount, + state->sortKeys)) + return false; + + /* + * Finally, restore authoritative comparator, and indicate that + * abbreviation is not in play by setting abbrev_converter to NULL + */ + state->sortKeys[0].comparator = state->sortKeys[0].abbrev_full_comparator; + state->sortKeys[0].abbrev_converter = NULL; + /* Not strictly necessary, but be tidy */ + state->sortKeys[0].abbrev_abort = NULL; + state->sortKeys[0].abbrev_full_comparator = NULL; + + /* Give up - expect original pass-by-value representation */ + return true; + } + + return false; +} + +/* + * All tuples have been provided; finish the sort. + */ +void +tuplesort_performsort(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "performsort starting: %s", + pg_rusage_show(&state->ru_start)); +#endif + + switch (state->status) + { + case TSS_INITIAL: + + /* + * We were able to accumulate all the tuples within the allowed + * amount of memory. Just qsort 'em and we're done. + */ + tuplesort_sort_memtuples(state); + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + state->status = TSS_SORTEDINMEM; + break; + + case TSS_BOUNDED: + + /* + * We were able to accumulate all the tuples required for output + * in memory, using a heap to eliminate excess tuples. Now we + * have to transform the heap to a properly-sorted array. + */ + sort_bounded_heap(state); + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + state->status = TSS_SORTEDINMEM; + break; + + case TSS_BUILDRUNS: + + /* + * Finish tape-based sort. First, flush all tuples remaining in + * memory out to tape; then merge until we have a single remaining + * run (or, if !randomAccess, one run per tape). Note that + * mergeruns sets the correct state->status. 
+ */ + dumptuples(state, true); + mergeruns(state); + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + +#ifdef TRACE_SORT + if (trace_sort) + { + if (state->status == TSS_FINALMERGE) + elog(LOG, "performsort done (except %d-way final merge): %s", + state->activeTapes, + pg_rusage_show(&state->ru_start)); + else + elog(LOG, "performsort done: %s", + pg_rusage_show(&state->ru_start)); + } +#endif + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Internal routine to fetch the next tuple in either forward or back + * direction into *stup. Returns FALSE if no more tuples. + * Returned tuple belongs to tuplesort memory context, and must not be freed + * by caller. Note that fetched tuple is stored in memory that may be + * recycled by any future fetch. + */ +static bool +tuplesort_gettuple_common(Tuplesortstate *state, bool forward, + SortTuple *stup) +{ + unsigned int tuplen; + size_t nmoved; + + switch (state->status) + { + case TSS_SORTEDINMEM: + Assert(forward || state->randomAccess); + Assert(!state->slabAllocatorUsed); + if (forward) + { + if (state->current < state->memtupcount) + { + *stup = state->memtuples[state->current++]; + return true; + } + state->eof_reached = true; + + /* + * Complain if caller tries to retrieve more tuples than + * originally asked for in a bounded sort. This is because + * returning EOF here might be the wrong thing. + */ + if (state->bounded && state->current >= state->bound) + elog(ERROR, "retrieved too many tuples in a bounded sort"); + + return false; + } + else + { + if (state->current <= 0) + return false; + + /* + * if all tuples are fetched already then we return last + * tuple, else - tuple before last returned. + */ + if (state->eof_reached) + state->eof_reached = false; + else + { + state->current--; /* last returned tuple */ + if (state->current <= 0) + return false; + } + *stup = state->memtuples[state->current - 1]; + return true; + } + break; + + case TSS_SORTEDONTAPE: + Assert(forward || state->randomAccess); + Assert(state->slabAllocatorUsed); + + /* + * The slot that held the tuple that we returned in previous + * gettuple call can now be reused. + */ + if (state->lastReturnedTuple) + { + RELEASE_SLAB_SLOT(state, state->lastReturnedTuple); + state->lastReturnedTuple = NULL; + } + + if (forward) + { + if (state->eof_reached) + return false; + + if ((tuplen = getlen(state, state->result_tape, true)) != 0) + { + READTUP(state, stup, state->result_tape, tuplen); + + /* + * Remember the tuple we return, so that we can recycle + * its memory on next call. (This can be NULL, in the + * !state->tuples case). + */ + state->lastReturnedTuple = stup->tuple; + + return true; + } + else + { + state->eof_reached = true; + return false; + } + } + + /* + * Backward. + * + * if all tuples are fetched already then we return last tuple, + * else - tuple before last returned. + */ + if (state->eof_reached) + { + /* + * Seek position is pointing just past the zero tuplen at the + * end of file; back up to fetch last tuple's ending length + * word. If seek fails we must have a completely empty file. 
+ */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + 2 * sizeof(unsigned int)); + if (nmoved == 0) + return false; + else if (nmoved != 2 * sizeof(unsigned int)) + elog(ERROR, "unexpected tape position"); + state->eof_reached = false; + } + else + { + /* + * Back up and fetch previously-returned tuple's ending length + * word. If seek fails, assume we are at start of file. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + sizeof(unsigned int)); + if (nmoved == 0) + return false; + else if (nmoved != sizeof(unsigned int)) + elog(ERROR, "unexpected tape position"); + tuplen = getlen(state, state->result_tape, false); + + /* + * Back up to get ending length word of tuple before it. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + tuplen + 2 * sizeof(unsigned int)); + if (nmoved == tuplen + sizeof(unsigned int)) + { + /* + * We backed up over the previous tuple, but there was no + * ending length word before it. That means that the prev + * tuple is the first tuple in the file. It is now the + * next to read in forward direction (not obviously right, + * but that is what in-memory case does). + */ + return false; + } + else if (nmoved != tuplen + 2 * sizeof(unsigned int)) + elog(ERROR, "bogus tuple length in backward scan"); + } + + tuplen = getlen(state, state->result_tape, false); + + /* + * Now we have the length of the prior tuple, back up and read it. + * Note: READTUP expects we are positioned after the initial + * length word of the tuple, so back up to that point. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + tuplen); + if (nmoved != tuplen) + elog(ERROR, "bogus tuple length in backward scan"); + READTUP(state, stup, state->result_tape, tuplen); + + /* + * Remember the tuple we return, so that we can recycle its memory + * on next call. (This can be NULL, in the Datum case). + */ + state->lastReturnedTuple = stup->tuple; + + return true; + + case TSS_FINALMERGE: + Assert(forward); + /* We are managing memory ourselves, with the slab allocator. */ + Assert(state->slabAllocatorUsed); + + /* + * The slab slot holding the tuple that we returned in previous + * gettuple call can now be reused. + */ + if (state->lastReturnedTuple) + { + RELEASE_SLAB_SLOT(state, state->lastReturnedTuple); + state->lastReturnedTuple = NULL; + } + + /* + * This code should match the inner loop of mergeonerun(). + */ + if (state->memtupcount > 0) + { + int srcTape = state->memtuples[0].tupindex; + SortTuple newtup; + + *stup = state->memtuples[0]; + + /* + * Remember the tuple we return, so that we can recycle its + * memory on next call. (This can be NULL, in the Datum case). + */ + state->lastReturnedTuple = stup->tuple; + + /* + * Pull next tuple from tape, and replace the returned tuple + * at top of the heap with it. + */ + if (!mergereadnext(state, srcTape, &newtup)) + { + /* + * If no more data, we've reached end of run on this tape. + * Remove the top node from the heap. + */ + tuplesort_heap_delete_top(state, false); + + /* + * Rewind to free the read buffer. It'd go away at the + * end of the sort anyway, but better to release the + * memory early. 
+ */ + LogicalTapeRewindForWrite(state->tapeset, srcTape); + return true; + } + newtup.tupindex = srcTape; + tuplesort_heap_replace_top(state, &newtup, false); + return true; + } + return false; + + default: + elog(ERROR, "invalid tuplesort state"); + return false; /* keep compiler quiet */ + } +} + +/* + * Fetch the next tuple in either forward or back direction. + * If successful, put tuple in slot and return TRUE; else, clear the slot + * and return FALSE. + * + * Caller may optionally be passed back abbreviated value (on TRUE return + * value) when abbreviation was used, which can be used to cheaply avoid + * equality checks that might otherwise be required. Caller can safely make a + * determination of "non-equal tuple" based on simple binary inequality. A + * NULL value in leading attribute will set abbreviated value to zeroed + * representation, which caller may rely on in abbreviated inequality check. + * + * If copy is true, the slot receives a tuple that's been copied into the + * caller's memory context, so that it will stay valid regardless of future + * manipulations of the tuplesort's state (up to and including deleting the + * tuplesort). If copy is false, the slot will just receive a pointer to a + * tuple held within the tuplesort, which is more efficient, but only safe for + * callers that are prepared to have any subsequent manipulation of the + * tuplesort's state invalidate slot contents. + */ +bool +tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy, + TupleTableSlot *slot, Datum *abbrev) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + if (stup.tuple) + { + /* Record abbreviated key for caller */ + if (state->sortKeys->abbrev_converter && abbrev) + *abbrev = stup.datum1; + + if (copy) + stup.tuple = heap_copy_minimal_tuple((MinimalTuple) stup.tuple); + + ExecStoreMinimalTuple((MinimalTuple) stup.tuple, slot, copy); + return true; + } + else + { + ExecClearTuple(slot); + return false; + } +} + +/* + * Fetch the next tuple in either forward or back direction. + * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory + * context, and must not be freed by caller. Caller may not rely on tuple + * remaining valid after any further manipulation of tuplesort. + */ +HeapTuple +tuplesort_getheaptuple(Tuplesortstate *state, bool forward) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + return stup.tuple; +} + +/* + * Fetch the next index tuple in either forward or back direction. + * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory + * context, and must not be freed by caller. Caller may not rely on tuple + * remaining valid after any further manipulation of tuplesort. + */ +IndexTuple +tuplesort_getindextuple(Tuplesortstate *state, bool forward) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + return (IndexTuple) stup.tuple; +} + +/* + * Fetch the next Datum in either forward or back direction. + * Returns FALSE if no more datums. 
+ * + * If the Datum is pass-by-ref type, the returned value is freshly palloc'd + * in caller's context, and is now owned by the caller (this differs from + * similar routines for other types of tuplesorts). + * + * Caller may optionally be passed back abbreviated value (on TRUE return + * value) when abbreviation was used, which can be used to cheaply avoid + * equality checks that might otherwise be required. Caller can safely make a + * determination of "non-equal tuple" based on simple binary inequality. A + * NULL value will have a zeroed abbreviated value representation, which caller + * may rely on in abbreviated inequality check. + */ +bool +tuplesort_getdatum(Tuplesortstate *state, bool forward, + Datum *val, bool *isNull, Datum *abbrev) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + { + MemoryContextSwitchTo(oldcontext); + return false; + } + + /* Ensure we copy into caller's memory context */ + MemoryContextSwitchTo(oldcontext); + + /* Record abbreviated key for caller */ + if (state->sortKeys->abbrev_converter && abbrev) + *abbrev = stup.datum1; + + if (stup.isnull1 || !state->tuples) + { + *val = stup.datum1; + *isNull = stup.isnull1; + } + else + { + /* use stup.tuple because stup.datum1 may be an abbreviation */ + *val = datumCopy(PointerGetDatum(stup.tuple), false, state->datumTypeLen); + *isNull = false; + } + + return true; +} + +/* + * Advance over N tuples in either forward or back direction, + * without returning any data. N==0 is a no-op. + * Returns TRUE if successful, FALSE if ran out of tuples. + */ +bool +tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, bool forward) +{ + MemoryContext oldcontext; + + /* + * We don't actually support backwards skip yet, because no callers need + * it. The API is designed to allow for that later, though. + */ + Assert(forward); + Assert(ntuples >= 0); + + switch (state->status) + { + case TSS_SORTEDINMEM: + if (state->memtupcount - state->current >= ntuples) + { + state->current += ntuples; + return true; + } + state->current = state->memtupcount; + state->eof_reached = true; + + /* + * Complain if caller tries to retrieve more tuples than + * originally asked for in a bounded sort. This is because + * returning EOF here might be the wrong thing. + */ + if (state->bounded && state->current >= state->bound) + elog(ERROR, "retrieved too many tuples in a bounded sort"); + + return false; + + case TSS_SORTEDONTAPE: + case TSS_FINALMERGE: + + /* + * We could probably optimize these cases better, but for now it's + * not worth the trouble. + */ + oldcontext = MemoryContextSwitchTo(state->sortcontext); + while (ntuples-- > 0) + { + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + { + MemoryContextSwitchTo(oldcontext); + return false; + } + CHECK_FOR_INTERRUPTS(); + } + MemoryContextSwitchTo(oldcontext); + return true; + + default: + elog(ERROR, "invalid tuplesort state"); + return false; /* keep compiler quiet */ + } +} + +/* + * tuplesort_merge_order - report merge order we'll use for given memory + * (note: "merge order" just means the number of input tapes in the merge). + * + * This is exported for use by the planner. allowedMem is in bytes. + */ +int +tuplesort_merge_order(int64 allowedMem) +{ + int mOrder; + + /* + * We need one tape for each merge input, plus another one for the output, + * and each of these tapes needs buffer space. 
In addition we want + * MERGE_BUFFER_SIZE workspace per input tape (but the output tape doesn't + * count). + * + * Note: you might be thinking we need to account for the memtuples[] + * array in this calculation, but we effectively treat that as part of the + * MERGE_BUFFER_SIZE workspace. + */ + mOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) / + (MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD); + + /* + * Even in minimum memory, use at least a MINORDER merge. On the other + * hand, even when we have lots of memory, do not use more than a MAXORDER + * merge. Tapes are pretty cheap, but they're not entirely free. Each + * additional tape reduces the amount of memory available to build runs, + * which in turn can cause the same sort to need more runs, which makes + * merging slower even if it can still be done in a single pass. Also, + * high order merges are quite slow due to CPU cache effects; it can be + * faster to pay the I/O cost of a polyphase merge than to perform a + * single merge pass across many hundreds of tapes. + */ + mOrder = Max(mOrder, MINORDER); + mOrder = Min(mOrder, MAXORDER); + + return mOrder; +} + +/* + * useselection - determine algorithm to use to sort first run. + * + * It can sometimes be useful to use the replacement selection algorithm if it + * results in one large run, and there is little available workMem. See + * remarks on RUN_SECOND optimization within dumptuples(). + */ +static bool +useselection(Tuplesortstate *state) +{ + /* + * memtupsize might be noticeably higher than memtupcount here in atypical + * cases. It seems slightly preferable to not allow recent outliers to + * impact this determination. Note that caller's trace_sort output + * reports memtupcount instead. + */ + if (state->memtupsize <= replacement_sort_tuples) + return true; + + return false; +} + +/* + * inittapes - initialize for tape sorting. + * + * This is called only if we have found we don't have room to sort in memory. + */ +static void +inittapes(Tuplesortstate *state) +{ + int maxTapes, + j; + int64 tapeSpace; + + /* Compute number of tapes to use: merge order plus 1 */ + maxTapes = tuplesort_merge_order(state->allowedMem) + 1; + + state->maxTapes = maxTapes; + state->tapeRange = maxTapes - 1; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "switching to external sort with %d tapes: %s", + maxTapes, pg_rusage_show(&state->ru_start)); +#endif + + /* + * Decrease availMem to reflect the space needed for tape buffers, when + * writing the initial runs; but don't decrease it to the point that we + * have no room for tuples. (That case is only likely to occur if sorting + * pass-by-value Datums; in all other scenarios the memtuples[] array is + * unlikely to occupy more than half of allowedMem. In the pass-by-value + * case it's not important to account for tuple space, so we don't care if + * LACKMEM becomes inaccurate.) + */ + tapeSpace = (int64) maxTapes * TAPE_BUFFER_OVERHEAD; + + if (tapeSpace + GetMemoryChunkSpace(state->memtuples) < state->allowedMem) + USEMEM(state, tapeSpace); + + /* + * Make sure that the temp file(s) underlying the tape set are created in + * suitable temp tablespaces. + */ + PrepareTempTablespaces(); + + /* + * Create the tape set and allocate the per-tape data arrays. 
+ */ + state->tapeset = LogicalTapeSetCreate(maxTapes); + + state->mergeactive = (bool *) palloc0(maxTapes * sizeof(bool)); + state->tp_fib = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_runs = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_dummy = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int)); + + /* + * Give replacement selection a try based on user setting. There will be + * a switch to a simple hybrid sort-merge strategy after the first run + * (iff we could not output one long run). + */ + state->replaceActive = useselection(state); + + if (state->replaceActive) + { + /* + * Convert the unsorted contents of memtuples[] into a heap. Each + * tuple is marked as belonging to run number zero. + * + * NOTE: we pass false for checkIndex since there's no point in + * comparing indexes in this step, even though we do intend the + * indexes to be part of the sort key... + */ + int ntuples = state->memtupcount; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "replacement selection will sort %d first run tuples", + state->memtupcount); +#endif + state->memtupcount = 0; /* make the heap empty */ + + for (j = 0; j < ntuples; j++) + { + /* Must copy source tuple to avoid possible overwrite */ + SortTuple stup = state->memtuples[j]; + + stup.tupindex = RUN_FIRST; + tuplesort_heap_insert(state, &stup, false); + } + Assert(state->memtupcount == ntuples); + } + + state->currentRun = RUN_FIRST; + + /* + * Initialize variables of Algorithm D (step D1). + */ + for (j = 0; j < maxTapes; j++) + { + state->tp_fib[j] = 1; + state->tp_runs[j] = 0; + state->tp_dummy[j] = 1; + state->tp_tapenum[j] = j; + } + state->tp_fib[state->tapeRange] = 0; + state->tp_dummy[state->tapeRange] = 0; + + state->Level = 1; + state->destTape = 0; + + state->status = TSS_BUILDRUNS; +} + +/* + * selectnewtape -- select new tape for new initial run. + * + * This is called after finishing a run when we know another run + * must be started. This implements steps D3, D4 of Algorithm D. + */ +static void +selectnewtape(Tuplesortstate *state) +{ + int j; + int a; + + /* Step D3: advance j (destTape) */ + if (state->tp_dummy[state->destTape] < state->tp_dummy[state->destTape + 1]) + { + state->destTape++; + return; + } + if (state->tp_dummy[state->destTape] != 0) + { + state->destTape = 0; + return; + } + + /* Step D4: increase level */ + state->Level++; + a = state->tp_fib[0]; + for (j = 0; j < state->tapeRange; j++) + { + state->tp_dummy[j] = a + state->tp_fib[j + 1] - state->tp_fib[j]; + state->tp_fib[j] = a + state->tp_fib[j + 1]; + } + state->destTape = 0; +} + +/* + * Initialize the slab allocation arena, for the given number of slots. + */ +static void +init_slab_allocator(Tuplesortstate *state, int numSlots) +{ + if (numSlots > 0) + { + char *p; + int i; + + state->slabMemoryBegin = palloc(numSlots * SLAB_SLOT_SIZE); + state->slabMemoryEnd = state->slabMemoryBegin + + numSlots * SLAB_SLOT_SIZE; + state->slabFreeHead = (SlabSlot *) state->slabMemoryBegin; + USEMEM(state, numSlots * SLAB_SLOT_SIZE); + + p = state->slabMemoryBegin; + for (i = 0; i < numSlots - 1; i++) + { + ((SlabSlot *) p)->nextfree = (SlabSlot *) (p + SLAB_SLOT_SIZE); + p += SLAB_SLOT_SIZE; + } + ((SlabSlot *) p)->nextfree = NULL; + } + else + { + state->slabMemoryBegin = state->slabMemoryEnd = NULL; + state->slabFreeHead = NULL; + } + state->slabAllocatorUsed = true; +} + +/* + * mergeruns -- merge all the completed initial runs. + * + * This implements steps D5, D6 of Algorithm D. 
All input data has + * already been written to initial runs on tape (see dumptuples). + */ +static void +mergeruns(Tuplesortstate *state) +{ + int tapenum, + svTape, + svRuns, + svDummy; + int numTapes; + int numInputTapes; + + Assert(state->status == TSS_BUILDRUNS); + Assert(state->memtupcount == 0); + + if (state->sortKeys != NULL && state->sortKeys->abbrev_converter != NULL) + { + /* + * If there are multiple runs to be merged, when we go to read back + * tuples from disk, abbreviated keys will not have been stored, and + * we don't care to regenerate them. Disable abbreviation from this + * point on. + */ + state->sortKeys->abbrev_converter = NULL; + state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator; + + /* Not strictly necessary, but be tidy */ + state->sortKeys->abbrev_abort = NULL; + state->sortKeys->abbrev_full_comparator = NULL; + } + + /* + * Reset tuple memory. We've freed all the tuples that we previously + * allocated. We will use the slab allocator from now on. + */ + MemoryContextDelete(state->tuplecontext); + state->tuplecontext = NULL; + + /* + * We no longer need a large memtuples array. (We will allocate a smaller + * one for the heap later.) + */ + FREEMEM(state, GetMemoryChunkSpace(state->memtuples)); + pfree(state->memtuples); + state->memtuples = NULL; + + /* + * If we had fewer runs than tapes, refund the memory that we imagined we + * would need for the tape buffers of the unused tapes. + * + * numTapes and numInputTapes reflect the actual number of tapes we will + * use. Note that the output tape's tape number is maxTapes - 1, so the + * tape numbers of the used tapes are not consecutive, and you cannot just + * loop from 0 to numTapes to visit all used tapes! + */ + if (state->Level == 1) + { + numInputTapes = state->currentRun; + numTapes = numInputTapes + 1; + FREEMEM(state, (state->maxTapes - numTapes) * TAPE_BUFFER_OVERHEAD); + } + else + { + numInputTapes = state->tapeRange; + numTapes = state->maxTapes; + } + + /* + * Initialize the slab allocator. We need one slab slot per input tape, + * for the tuples in the heap, plus one to hold the tuple last returned + * from tuplesort_gettuple. (If we're sorting pass-by-val Datums, + * however, we don't need to do allocate anything.) + * + * From this point on, we no longer use the USEMEM()/LACKMEM() mechanism + * to track memory usage of individual tuples. + */ + if (state->tuples) + init_slab_allocator(state, numInputTapes + 1); + else + init_slab_allocator(state, 0); + + /* + * If we produced only one initial run (quite likely if the total data + * volume is between 1X and 2X workMem when replacement selection is used, + * but something we particularly count on when input is presorted), we can + * just use that tape as the finished output, rather than doing a useless + * merge. (This obvious optimization is not in Knuth's algorithm.) + */ + if (state->currentRun == RUN_SECOND) + { + state->result_tape = state->tp_tapenum[state->destTape]; + /* must freeze and rewind the finished output tape */ + LogicalTapeFreeze(state->tapeset, state->result_tape); + state->status = TSS_SORTEDONTAPE; + return; + } + + /* + * Allocate a new 'memtuples' array, for the heap. It will hold one tuple + * from each input tape. + */ + state->memtupsize = numInputTapes; + state->memtuples = (SortTuple *) palloc(numInputTapes * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + + /* + * Use all the remaining memory we have available for read buffers among + * the input tapes. 
+ * + * We do this only after checking for the case that we produced only one + * initial run, because there is no need to use a large read buffer when + * we're reading from a single tape. With one tape, the I/O pattern will + * be the same regardless of the buffer size. + * + * We don't try to "rebalance" the memory among tapes, when we start a new + * merge phase, even if some tapes are inactive in the new phase. That + * would be hard, because logtape.c doesn't know where one run ends and + * another begins. When a new merge phase begins, and a tape doesn't + * participate in it, its buffer nevertheless already contains tuples from + * the next run on same tape, so we cannot release the buffer. That's OK + * in practice, merge performance isn't that sensitive to the amount of + * buffers used, and most merge phases use all or almost all tapes, + * anyway. + */ +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "using " INT64_FORMAT " KB of memory for read buffers among %d input tapes", + (state->availMem) / 1024, numInputTapes); +#endif + + state->read_buffer_size = Max(state->availMem / numInputTapes, 0); + USEMEM(state, state->read_buffer_size * numInputTapes); + + /* End of step D2: rewind all output tapes to prepare for merging */ + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + LogicalTapeRewindForRead(state->tapeset, tapenum, state->read_buffer_size); + + for (;;) + { + /* + * At this point we know that tape[T] is empty. If there's just one + * (real or dummy) run left on each input tape, then only one merge + * pass remains. If we don't have to produce a materialized sorted + * tape, we can stop at this point and do the final merge on-the-fly. + */ + if (!state->randomAccess) + { + bool allOneRun = true; + + Assert(state->tp_runs[state->tapeRange] == 0); + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_runs[tapenum] + state->tp_dummy[tapenum] != 1) + { + allOneRun = false; + break; + } + } + if (allOneRun) + { + /* Tell logtape.c we won't be writing anymore */ + LogicalTapeSetForgetFreeSpace(state->tapeset); + /* Initialize for the final merge pass */ + beginmerge(state); + state->status = TSS_FINALMERGE; + return; + } + } + + /* Step D5: merge runs onto tape[T] until tape[P] is empty */ + while (state->tp_runs[state->tapeRange - 1] || + state->tp_dummy[state->tapeRange - 1]) + { + bool allDummy = true; + + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_dummy[tapenum] == 0) + { + allDummy = false; + break; + } + } + + if (allDummy) + { + state->tp_dummy[state->tapeRange]++; + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + state->tp_dummy[tapenum]--; + } + else + mergeonerun(state); + } + + /* Step D6: decrease level */ + if (--state->Level == 0) + break; + /* rewind output tape T to use as new input */ + LogicalTapeRewindForRead(state->tapeset, state->tp_tapenum[state->tapeRange], + state->read_buffer_size); + /* rewind used-up input tape P, and prepare it for write pass */ + LogicalTapeRewindForWrite(state->tapeset, state->tp_tapenum[state->tapeRange - 1]); + state->tp_runs[state->tapeRange - 1] = 0; + + /* + * reassign tape units per step D6; note we no longer care about A[] + */ + svTape = state->tp_tapenum[state->tapeRange]; + svDummy = state->tp_dummy[state->tapeRange]; + svRuns = state->tp_runs[state->tapeRange]; + for (tapenum = state->tapeRange; tapenum > 0; tapenum--) + { + state->tp_tapenum[tapenum] = state->tp_tapenum[tapenum - 1]; + state->tp_dummy[tapenum] = state->tp_dummy[tapenum - 
1]; + state->tp_runs[tapenum] = state->tp_runs[tapenum - 1]; + } + state->tp_tapenum[0] = svTape; + state->tp_dummy[0] = svDummy; + state->tp_runs[0] = svRuns; + } + + /* + * Done. Knuth says that the result is on TAPE[1], but since we exited + * the loop without performing the last iteration of step D6, we have not + * rearranged the tape unit assignment, and therefore the result is on + * TAPE[T]. We need to do it this way so that we can freeze the final + * output tape while rewinding it. The last iteration of step D6 would be + * a waste of cycles anyway... + */ + state->result_tape = state->tp_tapenum[state->tapeRange]; + LogicalTapeFreeze(state->tapeset, state->result_tape); + state->status = TSS_SORTEDONTAPE; + + /* Release the read buffers of all the other tapes, by rewinding them. */ + for (tapenum = 0; tapenum < state->maxTapes; tapenum++) + { + if (tapenum != state->result_tape) + LogicalTapeRewindForWrite(state->tapeset, tapenum); + } +} + +/* + * Merge one run from each input tape, except ones with dummy runs. + * + * This is the inner loop of Algorithm D step D5. We know that the + * output tape is TAPE[T]. + */ +static void +mergeonerun(Tuplesortstate *state) +{ + int destTape = state->tp_tapenum[state->tapeRange]; + int srcTape; + + /* + * Start the merge by loading one tuple from each active source tape into + * the heap. We can also decrease the input run/dummy run counts. + */ + beginmerge(state); + + /* + * Execute merge by repeatedly extracting lowest tuple in heap, writing it + * out, and replacing it with next tuple from same tape (if there is + * another one). + */ + while (state->memtupcount > 0) + { + SortTuple stup; + + /* write the tuple to destTape */ + srcTape = state->memtuples[0].tupindex; + WRITETUP(state, destTape, &state->memtuples[0]); + + /* recycle the slot of the tuple we just wrote out, for the next read */ + if (state->memtuples[0].tuple) + RELEASE_SLAB_SLOT(state, state->memtuples[0].tuple); + + /* + * pull next tuple from the tape, and replace the written-out tuple in + * the heap with it. + */ + if (mergereadnext(state, srcTape, &stup)) + { + stup.tupindex = srcTape; + tuplesort_heap_replace_top(state, &stup, false); + + } + else + tuplesort_heap_delete_top(state, false); + } + + /* + * When the heap empties, we're done. Write an end-of-run marker on the + * output tape, and increment its count of real runs. + */ + markrunend(state, destTape); + state->tp_runs[state->tapeRange]++; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "finished %d-way merge step: %s", state->activeTapes, + pg_rusage_show(&state->ru_start)); +#endif +} + +/* + * beginmerge - initialize for a merge pass + * + * We decrease the counts of real and dummy runs for each tape, and mark + * which tapes contain active input runs in mergeactive[]. Then, fill the + * merge heap with the first tuple from each active tape. 
+ */ +static void +beginmerge(Tuplesortstate *state) +{ + int activeTapes; + int tapenum; + int srcTape; + + /* Heap should be empty here */ + Assert(state->memtupcount == 0); + + /* Adjust run counts and mark the active tapes */ + memset(state->mergeactive, 0, + state->maxTapes * sizeof(*state->mergeactive)); + activeTapes = 0; + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_dummy[tapenum] > 0) + state->tp_dummy[tapenum]--; + else + { + Assert(state->tp_runs[tapenum] > 0); + state->tp_runs[tapenum]--; + srcTape = state->tp_tapenum[tapenum]; + state->mergeactive[srcTape] = true; + activeTapes++; + } + } + Assert(activeTapes > 0); + state->activeTapes = activeTapes; + + /* Load the merge heap with the first tuple from each input tape */ + for (srcTape = 0; srcTape < state->maxTapes; srcTape++) + { + SortTuple tup; + + if (mergereadnext(state, srcTape, &tup)) + { + tup.tupindex = srcTape; + tuplesort_heap_insert(state, &tup, false); + } + } +} + +/* + * mergereadnext - read next tuple from one merge input tape + * + * Returns false on EOF. + */ +static bool +mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup) +{ + unsigned int tuplen; + + if (!state->mergeactive[srcTape]) + return false; /* tape's run is already exhausted */ + + /* read next tuple, if any */ + if ((tuplen = getlen(state, srcTape, true)) == 0) + { + state->mergeactive[srcTape] = false; + return false; + } + READTUP(state, stup, srcTape, tuplen); + + return true; +} + +/* + * dumptuples - remove tuples from memtuples and write to tape + * + * This is used during initial-run building, but not during merging. + * + * When alltuples = false and replacement selection is still active, dump + * only enough tuples to get under the availMem limit (and leave at least + * one tuple in memtuples, since puttuple will then assume it is a heap that + * has a tuple to compare to). We always insist there be at least one free + * slot in the memtuples[] array. + * + * When alltuples = true, dump everything currently in memory. (This + * case is only used at end of input data, although in practice only the + * first run could fail to dump all tuples when we LACKMEM(), and only + * when replacement selection is active.) + * + * If, when replacement selection is active, we see that the tuple run + * number at the top of the heap has changed, start a new run. This must be + * the first run, because replacement selection is always abandoned for all + * further runs. + */ +static void +dumptuples(Tuplesortstate *state, bool alltuples) +{ + while (alltuples || + (LACKMEM(state) && state->memtupcount > 1) || + state->memtupcount >= state->memtupsize) + { + if (state->replaceActive) + { + /* + * Still holding out for a case favorable to replacement + * selection. Still incrementally spilling using heap. + * + * Dump the heap's frontmost entry, and remove it from the heap. + */ + Assert(state->memtupcount > 0); + WRITETUP(state, state->tp_tapenum[state->destTape], + &state->memtuples[0]); + tuplesort_heap_delete_top(state, true); + } + else + { + /* + * Once committed to quicksorting runs, never incrementally spill + */ + dumpbatch(state, alltuples); + break; + } + + /* + * If top run number has changed, we've finished the current run (this + * can only be the first run), and will no longer spill incrementally. 
+ */ + if (state->memtupcount == 0 || + state->memtuples[0].tupindex == HEAP_RUN_NEXT) + { + markrunend(state, state->tp_tapenum[state->destTape]); + Assert(state->currentRun == RUN_FIRST); + state->currentRun++; + state->tp_runs[state->destTape]++; + state->tp_dummy[state->destTape]--; /* per Alg D step D2 */ + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "finished incrementally writing %s run %d to tape %d: %s", + (state->memtupcount == 0) ? "only" : "first", + state->currentRun, state->destTape, + pg_rusage_show(&state->ru_start)); +#endif + + /* + * Done if heap is empty, which is possible when there is only one + * long run. + */ + Assert(state->currentRun == RUN_SECOND); + if (state->memtupcount == 0) + { + /* + * Replacement selection best case; no final merge required, + * because there was only one initial run (second run has no + * tuples). See RUN_SECOND case in mergeruns(). + */ + break; + } + + /* + * Abandon replacement selection for second run (as well as any + * subsequent runs). + */ + state->replaceActive = false; + + /* + * First tuple of next run should not be heapified, and so will + * bear placeholder run number. In practice this must actually be + * the second run, which just became the currentRun, so we're + * clear to quicksort and dump the tuples in batch next time + * memtuples becomes full. + */ + Assert(state->memtuples[0].tupindex == HEAP_RUN_NEXT); + selectnewtape(state); + } + } +} + +/* + * dumpbatch - sort and dump all memtuples, forming one run on tape + * + * Second or subsequent runs are never heapified by this module (although + * heapification still respects run number differences between the first and + * second runs), and a heap (replacement selection priority queue) is often + * avoided in the first place. + */ +static void +dumpbatch(Tuplesortstate *state, bool alltuples) +{ + int memtupwrite; + int i; + + /* + * Final call might require no sorting, in rare cases where we just so + * happen to have previously LACKMEM()'d at the point where exactly all + * remaining tuples are loaded into memory, just before input was + * exhausted. + * + * In general, short final runs are quite possible. Rather than allowing + * a special case where there was a superfluous selectnewtape() call (i.e. + * a call with no subsequent run actually written to destTape), we prefer + * to write out a 0 tuple run. + * + * mergereadnext() is prepared for 0 tuple runs, and will reliably mark + * the tape inactive for the merge when called from beginmerge(). This + * case is therefore similar to the case where mergeonerun() finds a dummy + * run for the tape, and so doesn't need to merge a run from the tape (or + * conceptually "merges" the dummy run, if you prefer). According to + * Knuth, Algorithm D "isn't strictly optimal" in its method of + * distribution and dummy run assignment; this edge case seems very + * unlikely to make that appreciably worse. 
+ */ + Assert(state->status == TSS_BUILDRUNS); + + /* + * It seems unlikely that this limit will ever be exceeded, but take no + * chances + */ + if (state->currentRun == INT_MAX) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot have more than %d runs for an external sort", + INT_MAX))); + + state->currentRun++; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "starting quicksort of run %d: %s", + state->currentRun, pg_rusage_show(&state->ru_start)); +#endif + + /* + * Sort all tuples accumulated within the allowed amount of memory for + * this run using quicksort + */ + tuplesort_sort_memtuples(state); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "finished quicksort of run %d: %s", + state->currentRun, pg_rusage_show(&state->ru_start)); +#endif + + memtupwrite = state->memtupcount; + for (i = 0; i < memtupwrite; i++) + { + WRITETUP(state, state->tp_tapenum[state->destTape], + &state->memtuples[i]); + state->memtupcount--; + } + + /* + * Reset tuple memory. We've freed all of the tuples that we previously + * allocated. It's important to avoid fragmentation when there is a stark + * change in the sizes of incoming tuples. Fragmentation due to + * AllocSetFree's bucketing by size class might be particularly bad if + * this step wasn't taken. + */ + MemoryContextReset(state->tuplecontext); + + markrunend(state, state->tp_tapenum[state->destTape]); + state->tp_runs[state->destTape]++; + state->tp_dummy[state->destTape]--; /* per Alg D step D2 */ + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "finished writing run %d to tape %d: %s", + state->currentRun, state->destTape, + pg_rusage_show(&state->ru_start)); +#endif + + if (!alltuples) + selectnewtape(state); +} + +/* + * tuplesort_rescan - rewind and replay the scan + */ +void +tuplesort_rescan(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + case TSS_SORTEDONTAPE: + LogicalTapeRewindForRead(state->tapeset, + state->result_tape, + 0); + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_markpos - saves current position in the merged sort file + */ +void +tuplesort_markpos(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->markpos_offset = state->current; + state->markpos_eof = state->eof_reached; + break; + case TSS_SORTEDONTAPE: + LogicalTapeTell(state->tapeset, + state->result_tape, + &state->markpos_block, + &state->markpos_offset); + state->markpos_eof = state->eof_reached; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_restorepos - restores current position in merged sort file to + * last saved position + */ +void +tuplesort_restorepos(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->current = state->markpos_offset; + state->eof_reached = state->markpos_eof; + break; + case 
TSS_SORTEDONTAPE: + LogicalTapeSeek(state->tapeset, + state->result_tape, + state->markpos_block, + state->markpos_offset); + state->eof_reached = state->markpos_eof; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_get_stats - extract summary statistics + * + * This can be called after tuplesort_performsort() finishes to obtain + * printable summary information about how the sort was performed. + * spaceUsed is measured in kilobytes. + */ +void +tuplesort_get_stats(Tuplesortstate *state, + const char **sortMethod, + const char **spaceType, + long *spaceUsed) +{ + /* + * Note: it might seem we should provide both memory and disk usage for a + * disk-based sort. However, the current code doesn't track memory space + * accurately once we have begun to return tuples to the caller (since we + * don't account for pfree's the caller is expected to do), so we cannot + * rely on availMem in a disk sort. This does not seem worth the overhead + * to fix. Is it worth creating an API for the memory context code to + * tell us how much is actually used in sortcontext? + */ + if (state->tapeset) + { + *spaceType = "Disk"; + *spaceUsed = LogicalTapeSetBlocks(state->tapeset) * (BLCKSZ / 1024); + } + else + { + *spaceType = "Memory"; + *spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; + } + + switch (state->status) + { + case TSS_SORTEDINMEM: + if (state->boundUsed) + *sortMethod = "top-N heapsort"; + else + *sortMethod = "quicksort"; + break; + case TSS_SORTEDONTAPE: + *sortMethod = "external sort"; + break; + case TSS_FINALMERGE: + *sortMethod = "external merge"; + break; + default: + *sortMethod = "still in progress"; + break; + } +} + + +/* + * Heap manipulation routines, per Knuth's Algorithm 5.2.3H. + * + * Compare two SortTuples. If checkIndex is true, use the tuple index + * as the front of the sort key; otherwise, no. + * + * Note that for checkIndex callers, the heap invariant is never + * maintained beyond the first run, and so there are no COMPARETUP() + * calls needed to distinguish tuples in HEAP_RUN_NEXT. + */ + +#define HEAPCOMPARE(tup1,tup2) \ + (checkIndex && ((tup1)->tupindex != (tup2)->tupindex || \ + (tup1)->tupindex == HEAP_RUN_NEXT) ? \ + ((tup1)->tupindex) - ((tup2)->tupindex) : \ + COMPARETUP(state, tup1, tup2)) + +/* + * Convert the existing unordered array of SortTuples to a bounded heap, + * discarding all but the smallest "state->bound" tuples. + * + * When working with a bounded heap, we want to keep the largest entry + * at the root (array entry zero), instead of the smallest as in the normal + * sort case. This allows us to discard the largest entry cheaply. + * Therefore, we temporarily reverse the sort direction. + * + * We assume that all entries in a bounded heap will always have tupindex + * zero; it therefore doesn't matter that HEAPCOMPARE() doesn't reverse + * the direction of comparison for tupindexes. 
+ */ +static void +make_bounded_heap(Tuplesortstate *state) +{ + int tupcount = state->memtupcount; + int i; + + Assert(state->status == TSS_INITIAL); + Assert(state->bounded); + Assert(tupcount >= state->bound); + + /* Reverse sort direction so largest entry will be at root */ + reversedirection(state); + + state->memtupcount = 0; /* make the heap empty */ + for (i = 0; i < tupcount; i++) + { + if (state->memtupcount < state->bound) + { + /* Insert next tuple into heap */ + /* Must copy source tuple to avoid possible overwrite */ + SortTuple stup = state->memtuples[i]; + + stup.tupindex = 0; /* not used */ + tuplesort_heap_insert(state, &stup, false); + } + else + { + /* + * The heap is full. Replace the largest entry with the new + * tuple, or just discard it, if it's larger than anything already + * in the heap. + */ + if (COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0) + { + free_sort_tuple(state, &state->memtuples[i]); + CHECK_FOR_INTERRUPTS(); + } + else + tuplesort_heap_replace_top(state, &state->memtuples[i], false); + } + } + + Assert(state->memtupcount == state->bound); + state->status = TSS_BOUNDED; +} + +/* + * Convert the bounded heap to a properly-sorted array + */ +static void +sort_bounded_heap(Tuplesortstate *state) +{ + int tupcount = state->memtupcount; + + Assert(state->status == TSS_BOUNDED); + Assert(state->bounded); + Assert(tupcount == state->bound); + + /* + * We can unheapify in place because each delete-top call will remove the + * largest entry, which we can promptly store in the newly freed slot at + * the end. Once we're down to a single-entry heap, we're done. + */ + while (state->memtupcount > 1) + { + SortTuple stup = state->memtuples[0]; + + /* this sifts-up the next-largest entry and decreases memtupcount */ + tuplesort_heap_delete_top(state, false); + state->memtuples[state->memtupcount] = stup; + } + state->memtupcount = tupcount; + + /* + * Reverse sort direction back to the original state. This is not + * actually necessary but seems like a good idea for tidiness. + */ + reversedirection(state); + + state->status = TSS_SORTEDINMEM; + state->boundUsed = true; +} + +/* + * Sort all memtuples using specialized qsort() routines. + * + * Quicksort is used for small in-memory sorts. Quicksort is also generally + * preferred to replacement selection for generating runs during external sort + * operations, although replacement selection is sometimes used for the first + * run. + */ +static void +tuplesort_sort_memtuples(Tuplesortstate *state) +{ + if (state->memtupcount > 1) + { + /* Can we use the single-key sort function? */ + if (state->onlyKey != NULL) + qsort_ssup(state->memtuples, state->memtupcount, + state->onlyKey); + else + qsort_tuple(state->memtuples, + state->memtupcount, + state->comparetup, + state); + } +} + +/* + * Insert a new tuple into an empty or existing heap, maintaining the + * heap invariant. Caller is responsible for ensuring there's room. + * + * Note: For some callers, tuple points to a memtuples[] entry above the + * end of the heap. This is safe as long as it's not immediately adjacent + * to the end of the heap (ie, in the [memtupcount] array entry) --- if it + * is, it might get overwritten before being moved into the heap! 
+ */ +static void +tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple, + bool checkIndex) +{ + SortTuple *memtuples; + int j; + + memtuples = state->memtuples; + Assert(state->memtupcount < state->memtupsize); + Assert(!checkIndex || tuple->tupindex == RUN_FIRST); + + CHECK_FOR_INTERRUPTS(); + + /* + * Sift-up the new entry, per Knuth 5.2.3 exercise 16. Note that Knuth is + * using 1-based array indexes, not 0-based. + */ + j = state->memtupcount++; + while (j > 0) + { + int i = (j - 1) >> 1; + + if (HEAPCOMPARE(tuple, &memtuples[i]) >= 0) + break; + memtuples[j] = memtuples[i]; + j = i; + } + memtuples[j] = *tuple; +} + +/* + * Remove the tuple at state->memtuples[0] from the heap. Decrement + * memtupcount, and sift up to maintain the heap invariant. + * + * The caller has already free'd the tuple the top node points to, + * if necessary. + */ +static void +tuplesort_heap_delete_top(Tuplesortstate *state, bool checkIndex) +{ + SortTuple *memtuples = state->memtuples; + SortTuple *tuple; + + Assert(!checkIndex || state->currentRun == RUN_FIRST); + if (--state->memtupcount <= 0) + return; + + /* + * Remove the last tuple in the heap, and re-insert it, by replacing the + * current top node with it. + */ + tuple = &memtuples[state->memtupcount]; + tuplesort_heap_replace_top(state, tuple, checkIndex); +} + +/* + * Replace the tuple at state->memtuples[0] with a new tuple. Sift up to + * maintain the heap invariant. + * + * This corresponds to Knuth's "sift-up" algorithm (Algorithm 5.2.3H, + * Heapsort, steps H3-H8). + */ +static void +tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple, + bool checkIndex) +{ + SortTuple *memtuples = state->memtuples; + unsigned int i, + n; + + Assert(!checkIndex || state->currentRun == RUN_FIRST); + Assert(state->memtupcount >= 1); + + CHECK_FOR_INTERRUPTS(); + + /* + * state->memtupcount is "int", but we use "unsigned int" for i, j, n. + * This prevents overflow in the "2 * i + 1" calculation, since at the top + * of the loop we must have i < n <= INT_MAX <= UINT_MAX/2. + */ + n = state->memtupcount; + i = 0; /* i is where the "hole" is */ + for (;;) + { + unsigned int j = 2 * i + 1; + + if (j >= n) + break; + if (j + 1 < n && + HEAPCOMPARE(&memtuples[j], &memtuples[j + 1]) > 0) + j++; + if (HEAPCOMPARE(tuple, &memtuples[j]) <= 0) + break; + memtuples[i] = memtuples[j]; + i = j; + } + memtuples[i] = *tuple; +} + +/* + * Function to reverse the sort direction from its current state + * + * It is not safe to call this when performing hash tuplesorts + */ +static void +reversedirection(Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + int nkey; + + for (nkey = 0; nkey < state->nKeys; nkey++, sortKey++) + { + sortKey->ssup_reverse = !sortKey->ssup_reverse; + sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first; + } +} + + +/* + * Tape interface routines + */ + +static unsigned int +getlen(Tuplesortstate *state, int tapenum, bool eofOK) +{ + unsigned int len; + + if (LogicalTapeRead(state->tapeset, tapenum, + &len, sizeof(len)) != sizeof(len)) + elog(ERROR, "unexpected end of tape"); + if (len == 0 && !eofOK) + elog(ERROR, "unexpected end of data"); + return len; +} + +static void +markrunend(Tuplesortstate *state, int tapenum) +{ + unsigned int len = 0; + + LogicalTapeWrite(state->tapeset, tapenum, (void *) &len, sizeof(len)); +} + +/* + * Get memory for tuple from within READTUP() routine. + * + * We use next free slot from the slab allocator, or palloc() if the tuple + * is too large for that. 
+ */ +static void * +readtup_alloc(Tuplesortstate *state, Size tuplen) +{ + SlabSlot *buf; + + /* + * We pre-allocate enough slots in the slab arena that we should never run + * out. + */ + Assert(state->slabFreeHead); + + if (tuplen > SLAB_SLOT_SIZE || !state->slabFreeHead) + return MemoryContextAlloc(state->sortcontext, tuplen); + else + { + buf = state->slabFreeHead; + /* Reuse this slot */ + state->slabFreeHead = buf->nextfree; + + return buf; + } +} + + +/* + * Routines specialized for HeapTuple (actually MinimalTuple) case + */ + +static int +comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + HeapTupleData ltup; + HeapTupleData rtup; + TupleDesc tupDesc; + int nkey; + int32 compare; + AttrNumber attno; + Datum datum1, + datum2; + bool isnull1, + isnull2; + + + /* Compare the leading sort key */ + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + /* Compare additional sort keys */ + ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET; + ltup.t_data = (HeapTupleHeader) ((char *) a->tuple - MINIMAL_TUPLE_OFFSET); + rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET; + rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET); + tupDesc = state->tupDesc; + + if (sortKey->abbrev_converter) + { + attno = sortKey->ssup_attno; + + datum1 = heap_getattr(<up, attno, tupDesc, &isnull1); + datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + sortKey++; + for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++) + { + attno = sortKey->ssup_attno; + + datum1 = heap_getattr(<up, attno, tupDesc, &isnull1); + datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + return 0; +} + +static void +copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* + * We expect the passed "tup" to be a TupleTableSlot, and form a + * MinimalTuple using the exported interface for that. + */ + TupleTableSlot *slot = (TupleTableSlot *) tup; + Datum original; + MinimalTuple tuple; + HeapTupleData htup; + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + + /* copy the tuple into sort storage */ + tuple = ExecCopySlotMinimalTuple(slot); + stup->tuple = (void *) tuple; + USEMEM(state, GetMemoryChunkSpace(tuple)); + /* set up first-column key value */ + htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + original = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &stup->isnull1); + + MemoryContextSwitchTo(oldcontext); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). 
+ */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + htup.t_len = ((MinimalTuple) mtup->tuple)->t_len + + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) mtup->tuple - + MINIMAL_TUPLE_OFFSET); + + mtup->datum1 = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &mtup->isnull1); + } + } +} + +static void +writetup_heap(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + MinimalTuple tuple = (MinimalTuple) stup->tuple; + + /* the part of the MinimalTuple we'll write: */ + char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; + unsigned int tupbodylen = tuple->t_len - MINIMAL_TUPLE_DATA_OFFSET; + + /* total on-disk footprint: */ + unsigned int tuplen = tupbodylen + sizeof(int); + + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) tupbody, tupbodylen); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + heap_free_minimal_tuple(tuple); + } +} + +static void +readtup_heap(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tupbodylen = len - sizeof(int); + unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET; + MinimalTuple tuple = (MinimalTuple) readtup_alloc(state, tuplen); + char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; + HeapTupleData htup; + + /* read in the tuple proper */ + tuple->t_len = tuplen; + LogicalTapeReadExact(state->tapeset, tapenum, + tupbody, tupbodylen); + if (state->randomAccess) /* need trailing length word? 
*/ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value */ + htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + stup->datum1 = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &stup->isnull1); +} + +/* + * Routines specialized for the CLUSTER case (HeapTuple data, with + * comparisons per a btree index definition) + */ + +static int +comparetup_cluster(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + HeapTuple ltup; + HeapTuple rtup; + TupleDesc tupDesc; + int nkey; + int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + AttrNumber leading = state->indexInfo->ii_KeyAttrNumbers[0]; + + /* Be prepared to compare additional sort keys */ + ltup = (HeapTuple) a->tuple; + rtup = (HeapTuple) b->tuple; + tupDesc = state->tupDesc; + + /* Compare the leading sort key, if it's simple */ + if (leading != 0) + { + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + if (sortKey->abbrev_converter) + { + datum1 = heap_getattr(ltup, leading, tupDesc, &isnull1); + datum2 = heap_getattr(rtup, leading, tupDesc, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + } + if (compare != 0 || state->nKeys == 1) + return compare; + /* Compare additional columns the hard way */ + sortKey++; + nkey = 1; + } + else + { + /* Must compare all keys the hard way */ + nkey = 0; + } + + if (state->indexInfo->ii_Expressions == NULL) + { + /* If not expression index, just compare the proper heap attrs */ + + for (; nkey < state->nKeys; nkey++, sortKey++) + { + AttrNumber attno = state->indexInfo->ii_KeyAttrNumbers[nkey]; + + datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1); + datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + } + else + { + /* + * In the expression index case, compute the whole index tuple and + * then compare values. It would perhaps be faster to compute only as + * many columns as we need to compare, but that would require + * duplicating all the logic in FormIndexDatum. 
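+ *
+ * (Illustrative example: for an expression index such as one on
+ * (lower(name), id), FormIndexDatum evaluates lower(name) for both heap
+ * tuples, and the resulting index values are then compared column by
+ * column below.)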
+ */ + Datum l_index_values[INDEX_MAX_KEYS]; + bool l_index_isnull[INDEX_MAX_KEYS]; + Datum r_index_values[INDEX_MAX_KEYS]; + bool r_index_isnull[INDEX_MAX_KEYS]; + TupleTableSlot *ecxt_scantuple; + + /* Reset context each time to prevent memory leakage */ + ResetPerTupleExprContext(state->estate); + + ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple; + + ExecStoreTuple(ltup, ecxt_scantuple, InvalidBuffer, false); + FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, + l_index_values, l_index_isnull); + + ExecStoreTuple(rtup, ecxt_scantuple, InvalidBuffer, false); + FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, + r_index_values, r_index_isnull); + + for (; nkey < state->nKeys; nkey++, sortKey++) + { + compare = ApplySortComparator(l_index_values[nkey], + l_index_isnull[nkey], + r_index_values[nkey], + r_index_isnull[nkey], + sortKey); + if (compare != 0) + return compare; + } + } + + return 0; +} + +static void +copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + HeapTuple tuple = (HeapTuple) tup; + Datum original; + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + + /* copy the tuple into sort storage */ + tuple = heap_copytuple(tuple); + stup->tuple = (void *) tuple; + USEMEM(state, GetMemoryChunkSpace(tuple)); + + MemoryContextSwitchTo(oldcontext); + + /* + * set up first-column key value, and potentially abbreviate, if it's a + * simple column + */ + if (state->indexInfo->ii_KeyAttrNumbers[0] == 0) + return; + + original = heap_getattr(tuple, + state->indexInfo->ii_KeyAttrNumbers[0], + state->tupDesc, + &stup->isnull1); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). 
+ */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = (HeapTuple) mtup->tuple; + mtup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_KeyAttrNumbers[0], + state->tupDesc, + &mtup->isnull1); + } + } +} + +static void +writetup_cluster(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + HeapTuple tuple = (HeapTuple) stup->tuple; + unsigned int tuplen = tuple->t_len + sizeof(ItemPointerData) + sizeof(int); + + /* We need to store t_self, but not other fields of HeapTupleData */ + LogicalTapeWrite(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + &tuple->t_self, sizeof(ItemPointerData)); + LogicalTapeWrite(state->tapeset, tapenum, + tuple->t_data, tuple->t_len); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + heap_freetuple(tuple); + } +} + +static void +readtup_cluster(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int tuplen) +{ + unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int); + HeapTuple tuple = (HeapTuple) readtup_alloc(state, + t_len + HEAPTUPLESIZE); + + /* Reconstruct the HeapTupleData header */ + tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE); + tuple->t_len = t_len; + LogicalTapeReadExact(state->tapeset, tapenum, + &tuple->t_self, sizeof(ItemPointerData)); + /* We don't currently bother to reconstruct t_tableOid */ + tuple->t_tableOid = InvalidOid; + /* Read in the tuple body */ + LogicalTapeReadExact(state->tapeset, tapenum, + tuple->t_data, tuple->t_len); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value, if it's a simple column */ + if (state->indexInfo->ii_KeyAttrNumbers[0] != 0) + stup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_KeyAttrNumbers[0], + state->tupDesc, + &stup->isnull1); +} + +/* + * Routines specialized for IndexTuple case + * + * The btree and hash cases require separate comparison functions, but the + * IndexTuple representation is the same so the copy/write/read support + * functions can be shared. + */ + +static int +comparetup_index_btree(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + /* + * This is similar to comparetup_heap(), but expects index tuples. There + * is also special handling for enforcing uniqueness, and special + * treatment for equal keys at the end. 
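+ *
+ * (Specifically, as implemented below: fully equal keys raise a unique-
+ * violation error when enforceUnique is set and no key column is NULL;
+ * otherwise ties are broken on the heap TID, block number first and then
+ * offset, so that the finished index tends to follow physical heap order.)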
+ */ + SortSupport sortKey = state->sortKeys; + IndexTuple tuple1; + IndexTuple tuple2; + int keysz; + TupleDesc tupDes; + bool equal_hasnull = false; + int nkey; + int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + + + /* Compare the leading sort key */ + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + /* Compare additional sort keys */ + tuple1 = (IndexTuple) a->tuple; + tuple2 = (IndexTuple) b->tuple; + keysz = state->nKeys; + tupDes = RelationGetDescr(state->indexRel); + + if (sortKey->abbrev_converter) + { + datum1 = index_getattr(tuple1, 1, tupDes, &isnull1); + datum2 = index_getattr(tuple2, 1, tupDes, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + /* they are equal, so we only need to examine one null flag */ + if (a->isnull1) + equal_hasnull = true; + + sortKey++; + for (nkey = 2; nkey <= keysz; nkey++, sortKey++) + { + datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1); + datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; /* done when we find unequal attributes */ + + /* they are equal, so we only need to examine one null flag */ + if (isnull1) + equal_hasnull = true; + } + + /* + * If btree has asked us to enforce uniqueness, complain if two equal + * tuples are detected (unless there was at least one NULL field). + * + * It is sufficient to make the test here, because if two tuples are equal + * they *must* get compared at some stage of the sort --- otherwise the + * sort algorithm wouldn't have checked whether one must appear before the + * other. + */ + if (state->enforceUnique && !equal_hasnull) + { + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + char *key_desc; + + /* + * Some rather brain-dead implementations of qsort (such as the one in + * QNX 4) will sometimes call the comparison routine to compare a + * value to itself, but we always use our own implementation, which + * does not. + */ + Assert(tuple1 != tuple2); + + index_deform_tuple(tuple1, tupDes, values, isnull); + + key_desc = BuildIndexValueDescription(state->indexRel, values, isnull); + + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("could not create unique index \"%s\"", + RelationGetRelationName(state->indexRel)), + key_desc ? errdetail("Key %s is duplicated.", key_desc) : + errdetail("Duplicate keys exist."), + errtableconstraint(state->heapRel, + RelationGetRelationName(state->indexRel)))); + } + + /* + * If key values are equal, we sort on ItemPointer. This does not affect + * validity of the finished index, but it may be useful to have index + * scans in physical order. + */ + { + BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); + BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); + + if (pos1 != pos2) + return (pos1 < pos2) ? 
-1 : 1; + } + + /* ItemPointer values should never be equal */ + Assert(false); + + return 0; +} + +static int +comparetup_index_hash(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + Bucket bucket1; + Bucket bucket2; + IndexTuple tuple1; + IndexTuple tuple2; + + /* + * Fetch hash keys and mask off bits we don't want to sort by. We know + * that the first column of the index tuple is the hash key. + */ + Assert(!a->isnull1); + bucket1 = _hash_hashkey2bucket(DatumGetUInt32(a->datum1), + state->max_buckets, state->high_mask, + state->low_mask); + Assert(!b->isnull1); + bucket2 = _hash_hashkey2bucket(DatumGetUInt32(b->datum1), + state->max_buckets, state->high_mask, + state->low_mask); + if (bucket1 > bucket2) + return 1; + else if (bucket1 < bucket2) + return -1; + + /* + * If hash values are equal, we sort on ItemPointer. This does not affect + * validity of the finished index, but it may be useful to have index + * scans in physical order. + */ + tuple1 = (IndexTuple) a->tuple; + tuple2 = (IndexTuple) b->tuple; + + { + BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); + BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); + + if (pos1 != pos2) + return (pos1 < pos2) ? -1 : 1; + } + + /* ItemPointer values should never be equal */ + Assert(false); + + return 0; +} + +static void +copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + IndexTuple tuple = (IndexTuple) tup; + unsigned int tuplen = IndexTupleSize(tuple); + IndexTuple newtuple; + Datum original; + + /* copy the tuple into sort storage */ + newtuple = (IndexTuple) MemoryContextAlloc(state->tuplecontext, tuplen); + memcpy(newtuple, tuple, tuplen); + USEMEM(state, GetMemoryChunkSpace(newtuple)); + stup->tuple = (void *) newtuple; + /* set up first-column key value */ + original = index_getattr(newtuple, + 1, + RelationGetDescr(state->indexRel), + &stup->isnull1); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). 
+ */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = (IndexTuple) mtup->tuple; + mtup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &mtup->isnull1); + } + } +} + +static void +writetup_index(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + IndexTuple tuple = (IndexTuple) stup->tuple; + unsigned int tuplen; + + tuplen = IndexTupleSize(tuple) + sizeof(tuplen); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) tuple, IndexTupleSize(tuple)); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + pfree(tuple); + } +} + +static void +readtup_index(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tuplen = len - sizeof(unsigned int); + IndexTuple tuple = (IndexTuple) readtup_alloc(state, tuplen); + + LogicalTapeReadExact(state->tapeset, tapenum, + tuple, tuplen); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value */ + stup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup->isnull1); +} + +/* + * Routines specialized for DatumTuple case + */ + +static int +comparetup_datum(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) +{ + int compare; + + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + state->sortKeys); + if (compare != 0) + return compare; + + /* if we have abbreviations, then "tuple" has the original value */ + + if (state->sortKeys->abbrev_converter) + compare = ApplySortAbbrevFullComparator(PointerGetDatum(a->tuple), a->isnull1, + PointerGetDatum(b->tuple), b->isnull1, + state->sortKeys); + + return compare; +} + +static void +copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* Not currently needed */ + elog(ERROR, "copytup_datum() should not be called"); +} + +static void +writetup_datum(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + void *waddr; + unsigned int tuplen; + unsigned int writtenlen; + + if (stup->isnull1) + { + waddr = NULL; + tuplen = 0; + } + else if (!state->tuples) + { + waddr = &stup->datum1; + tuplen = sizeof(Datum); + } + else + { + waddr = stup->tuple; + tuplen = datumGetSize(PointerGetDatum(stup->tuple), false, state->datumTypeLen); + Assert(tuplen != 0); + } + + writtenlen = tuplen + sizeof(unsigned int); + + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &writtenlen, sizeof(writtenlen)); + LogicalTapeWrite(state->tapeset, tapenum, + waddr, tuplen); + if (state->randomAccess) /* need trailing length word? 
*/ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &writtenlen, sizeof(writtenlen)); + + if (!state->slabAllocatorUsed && stup->tuple) + { + FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); + pfree(stup->tuple); + } +} + +static void +readtup_datum(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tuplen = len - sizeof(unsigned int); + + if (tuplen == 0) + { + /* it's NULL */ + stup->datum1 = (Datum) 0; + stup->isnull1 = true; + stup->tuple = NULL; + } + else if (!state->tuples) + { + Assert(tuplen == sizeof(Datum)); + LogicalTapeReadExact(state->tapeset, tapenum, + &stup->datum1, tuplen); + stup->isnull1 = false; + stup->tuple = NULL; + } + else + { + void *raddr = readtup_alloc(state, tuplen); + + LogicalTapeReadExact(state->tapeset, tapenum, + raddr, tuplen); + stup->datum1 = PointerGetDatum(raddr); + stup->isnull1 = false; + stup->tuple = raddr; + } + + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); +} + +/* + * Convenience routine to free a tuple previously loaded into sort memory + */ +static void +free_sort_tuple(Tuplesortstate *state, SortTuple *stup) +{ + if (stup->tuple) + { + FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); + pfree(stup->tuple); + stup->tuple = NULL; + } +} diff --git a/src/tuplesort11.c b/src/tuplesort11.c new file mode 100644 index 0000000000..23de559073 --- /dev/null +++ b/src/tuplesort11.c @@ -0,0 +1,4595 @@ +/*------------------------------------------------------------------------- + * + * tuplesort.c + * Generalized tuple sorting routines. + * + * This module handles sorting of heap tuples, index tuples, or single + * Datums (and could easily support other kinds of sortable objects, + * if necessary). It works efficiently for both small and large amounts + * of data. Small amounts are sorted in-memory using qsort(). Large + * amounts are sorted using temporary files and a standard external sort + * algorithm. + * + * See Knuth, volume 3, for more than you want to know about the external + * sorting algorithm. Historically, we divided the input into sorted runs + * using replacement selection, in the form of a priority tree implemented + * as a heap (essentially his Algorithm 5.2.3H), but now we always use + * quicksort for run generation. We merge the runs using polyphase merge, + * Knuth's Algorithm 5.4.2D. The logical "tapes" used by Algorithm D are + * implemented by logtape.c, which avoids space wastage by recycling disk + * space as soon as each block is read from its "tape". + * + * The approximate amount of memory allowed for any one sort operation + * is specified in kilobytes by the caller (most pass work_mem). Initially, + * we absorb tuples and simply store them in an unsorted array as long as + * we haven't exceeded workMem. If we reach the end of the input without + * exceeding workMem, we sort the array using qsort() and subsequently return + * tuples just by scanning the tuple array sequentially. If we do exceed + * workMem, we begin to emit tuples into sorted runs in temporary tapes. + * When tuples are dumped in batch after quicksorting, we begin a new run + * with a new output tape (selected per Algorithm D). After the end of the + * input is reached, we dump out remaining tuples in memory into a final run, + * then merge the runs using Algorithm D. 
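+ *
+ * (Rough illustration, not an exact model: with workMem = 64MB and about
+ * 1GB of input, run generation leaves on the order of 16 quicksorted runs
+ * on tape; since tuplesort_merge_order() allows far more than 16 tapes at
+ * that memory budget, all of them are merged in a single final pass.)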
+ * + * When merging runs, we use a heap containing just the frontmost tuple from + * each source run; we repeatedly output the smallest tuple and replace it + * with the next tuple from its source tape (if any). When the heap empties, + * the merge is complete. The basic merge algorithm thus needs very little + * memory --- only M tuples for an M-way merge, and M is constrained to a + * small number. However, we can still make good use of our full workMem + * allocation by pre-reading additional blocks from each source tape. Without + * prereading, our access pattern to the temporary file would be very erratic; + * on average we'd read one block from each of M source tapes during the same + * time that we're writing M blocks to the output tape, so there is no + * sequentiality of access at all, defeating the read-ahead methods used by + * most Unix kernels. Worse, the output tape gets written into a very random + * sequence of blocks of the temp file, ensuring that things will be even + * worse when it comes time to read that tape. A straightforward merge pass + * thus ends up doing a lot of waiting for disk seeks. We can improve matters + * by prereading from each source tape sequentially, loading about workMem/M + * bytes from each tape in turn, and making the sequential blocks immediately + * available for reuse. This approach helps to localize both read and write + * accesses. The pre-reading is handled by logtape.c, we just tell it how + * much memory to use for the buffers. + * + * When the caller requests random access to the sort result, we form + * the final sorted run on a logical tape which is then "frozen", so + * that we can access it randomly. When the caller does not need random + * access, we return from tuplesort_performsort() as soon as we are down + * to one run per logical tape. The final merge is then performed + * on-the-fly as the caller repeatedly calls tuplesort_getXXX; this + * saves one cycle of writing all the data out to disk and reading it in. + * + * Before Postgres 8.2, we always used a seven-tape polyphase merge, on the + * grounds that 7 is the "sweet spot" on the tapes-to-passes curve according + * to Knuth's figure 70 (section 5.4.2). However, Knuth is assuming that + * tape drives are expensive beasts, and in particular that there will always + * be many more runs than tape drives. In our implementation a "tape drive" + * doesn't cost much more than a few Kb of memory buffers, so we can afford + * to have lots of them. In particular, if we can have as many tape drives + * as sorted runs, we can eliminate any repeated I/O at all. In the current + * code we determine the number of tapes M on the basis of workMem: we want + * workMem/M to be large enough that we read a fair amount of data each time + * we preread from a tape, so as to maintain the locality of access described + * above. Nonetheless, with large workMem we can have many tapes (but not + * too many -- see the comments in tuplesort_merge_order). + * + * This module supports parallel sorting. Parallel sorts involve coordination + * among one or more worker processes, and a leader process, each with its own + * tuplesort state. The leader process (or, more accurately, the + * Tuplesortstate associated with a leader process) creates a full tapeset + * consisting of worker tapes with one run to merge; a run for every + * worker process. This is then merged. Worker processes are guaranteed to + * produce exactly one output run from their partial input. 
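+ *
+ * (Illustrative example: a parallel index build with two launched workers,
+ * plus the leader participating as a worker, produces three single-run
+ * tapes; the leader's tapeset is assembled from those frozen tapes and the
+ * final merge then proceeds exactly as in the serial multi-run case.)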
+ * + * + * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/utils/sort/tuplesort.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <limits.h> + +#include "access/htup_details.h" +#include "access/nbtree.h" +#include "access/hash.h" +#include "catalog/index.h" +#include "catalog/pg_am.h" +#include "commands/tablespace.h" +#include "executor/executor.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "utils/datum.h" +#include "utils/logtape.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/pg_rusage.h" +#include "utils/rel.h" +#include "utils/sortsupport.h" +#include "utils/tuplesort.h" + +/* Should be the last include */ +#include "disable_core_macro.h" + +/* sort-type codes for sort__start probes */ +#define HEAP_SORT 0 +#define INDEX_SORT 1 +#define DATUM_SORT 2 +#define CLUSTER_SORT 3 + +/* Sort parallel code from state for sort__start probes */ +#define PARALLEL_SORT(state) ((state)->shared == NULL ? 0 : \ + (state)->worker >= 0 ? 1 : 2) + +/* GUC variables */ +#ifdef TRACE_SORT +bool trace_sort = false; +#endif + +#ifdef DEBUG_BOUNDED_SORT +bool optimize_bounded_sort = true; +#endif + + +/* + * The objects we actually sort are SortTuple structs. These contain + * a pointer to the tuple proper (might be a MinimalTuple or IndexTuple), + * which is a separate palloc chunk --- we assume it is just one chunk and + * can be freed by a simple pfree() (except during merge, when we use a + * simple slab allocator). SortTuples also contain the tuple's first key + * column in Datum/nullflag format, and an index integer. + * + * Storing the first key column lets us save heap_getattr or index_getattr + * calls during tuple comparisons. We could extract and save all the key + * columns not just the first, but this would increase code complexity and + * overhead, and wouldn't actually save any comparison cycles in the common + * case where the first key determines the comparison result. Note that + * for a pass-by-reference datatype, datum1 points into the "tuple" storage. + * + * There is one special case: when the sort support infrastructure provides an + * "abbreviated key" representation, where the key is (typically) a pass by + * value proxy for a pass by reference type. In this case, the abbreviated key + * is stored in datum1 in place of the actual first key column. + * + * When sorting single Datums, the data value is represented directly by + * datum1/isnull1 for pass by value types (or null values). If the datatype is + * pass-by-reference and isnull1 is false, then "tuple" points to a separately + * palloc'd data value, otherwise "tuple" is NULL. The value of datum1 is then + * either the same pointer as "tuple", or is an abbreviated key value as + * described above. Accordingly, "tuple" is always used in preference to + * datum1 as the authoritative value for pass-by-reference cases. + * + * tupindex holds the input tape number that each tuple in the heap was read + * from during merge passes. + */ +typedef struct +{ + void *tuple; /* the tuple itself */ + Datum datum1; /* value of first key column */ + bool isnull1; /* is first key column NULL? */ + int tupindex; /* see notes above */ +} SortTuple; + +/* + * During merge, we use a pre-allocated set of fixed-size slots to hold + * tuples. To avoid palloc/pfree overhead.
+ * + * Merge doesn't require a lot of memory, so we can afford to waste some, + * by using gratuitously-sized slots. If a tuple is larger than 1 kB, the + * palloc() overhead is not significant anymore. + * + * 'nextfree' is valid when this chunk is in the free list. When in use, the + * slot holds a tuple. + */ +#define SLAB_SLOT_SIZE 1024 + +typedef union SlabSlot +{ + union SlabSlot *nextfree; + char buffer[SLAB_SLOT_SIZE]; +} SlabSlot; + +/* + * Possible states of a Tuplesort object. These denote the states that + * persist between calls of Tuplesort routines. + */ +typedef enum +{ + TSS_INITIAL, /* Loading tuples; still within memory limit */ + TSS_BOUNDED, /* Loading tuples into bounded-size heap */ + TSS_BUILDRUNS, /* Loading tuples; writing to tape */ + TSS_SORTEDINMEM, /* Sort completed entirely in memory */ + TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ + TSS_FINALMERGE /* Performing final merge on-the-fly */ +} TupSortStatus; + +/* + * Parameters for calculation of number of tapes to use --- see inittapes() + * and tuplesort_merge_order(). + * + * In this calculation we assume that each tape will cost us about 1 blocks + * worth of buffer space. This ignores the overhead of all the other data + * structures needed for each tape, but it's probably close enough. + * + * MERGE_BUFFER_SIZE is how much data we'd like to read from each input + * tape during a preread cycle (see discussion at top of file). + */ +#define MINORDER 6 /* minimum merge order */ +#define MAXORDER 500 /* maximum merge order */ +#define TAPE_BUFFER_OVERHEAD BLCKSZ +#define MERGE_BUFFER_SIZE (BLCKSZ * 32) + +typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); + +/* + * Private state of a Tuplesort operation. + */ +struct Tuplesortstate +{ + TupSortStatus status; /* enumerated value as shown above */ + int nKeys; /* number of columns in sort key */ + bool randomAccess; /* did caller request random access? */ + bool bounded; /* did caller specify a maximum number of + * tuples to return? */ + bool boundUsed; /* true if we made use of a bounded heap */ + int bound; /* if bounded, the maximum number of tuples */ + bool tuples; /* Can SortTuple.tuple ever be set? */ + int64 availMem; /* remaining memory available, in bytes */ + int64 allowedMem; /* total memory allowed, in bytes */ + int maxTapes; /* number of tapes (Knuth's T) */ + int tapeRange; /* maxTapes-1 (Knuth's P) */ + MemoryContext sortcontext; /* memory context holding most sort data */ + MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */ + LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ + + /* + * These function pointers decouple the routines that must know what kind + * of tuple we are sorting from the routines that don't need to know it. + * They are set up by the tuplesort_begin_xxx routines. + * + * Function to compare two tuples; result is per qsort() convention, ie: + * <0, 0, >0 according as a<b, a=b, a>b. The API must match + * qsort_arg_comparator. + */ + SortTupleComparator comparetup; + + /* + * Function to copy a supplied input tuple into palloc'd space and set up + * its SortTuple representation (ie, set tuple/datum1/isnull1). Also, + * state->availMem must be decreased by the amount of space used for the + * tuple copy (note the SortTuple struct itself is not counted). + */ + void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup); + + /* + * Function to write a stored tuple onto tape.
The representation of the + * tuple on tape need not be the same as it is in memory; requirements on + * the tape representation are given below. Unless the slab allocator is + * used, after writing the tuple, pfree() the out-of-line data (not the + * SortTuple struct!), and increase state->availMem by the amount of + * memory space thereby released. + */ + void (*writetup) (Tuplesortstate *state, int tapenum, + SortTuple *stup); + + /* + * Function to read a stored tuple from tape back into memory. 'len' is + * the already-read length of the stored tuple. The tuple is allocated + * from the slab memory arena, or is palloc'd, see readtup_alloc(). + */ + void (*readtup) (Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); + + /* + * This array holds the tuples now in sort memory. If we are in state + * INITIAL, the tuples are in no particular order; if we are in state + * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS + * and FINALMERGE, the tuples are organized in "heap" order per Algorithm + * H. In state SORTEDONTAPE, the array is not used. + */ + SortTuple *memtuples; /* array of SortTuple structs */ + int memtupcount; /* number of tuples currently present */ + int memtupsize; /* allocated length of memtuples array */ + bool growmemtuples; /* memtuples' growth still underway? */ + + /* + * Memory for tuples is sometimes allocated using a simple slab allocator, + * rather than with palloc(). Currently, we switch to slab allocation + * when we start merging. Merging only needs to keep a small, fixed + * number of tuples in memory at any time, so we can avoid the + * palloc/pfree overhead by recycling a fixed number of fixed-size slots + * to hold the tuples. + * + * For the slab, we use one large allocation, divided into SLAB_SLOT_SIZE + * slots. The allocation is sized to have one slot per tape, plus one + * additional slot. We need that many slots to hold all the tuples kept + * in the heap during merge, plus the one we have last returned from the + * sort, with tuplesort_gettuple. + * + * Initially, all the slots are kept in a linked list of free slots. When + * a tuple is read from a tape, it is put to the next available slot, if + * it fits. If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd + * instead. + * + * When we're done processing a tuple, we return the slot back to the free + * list, or pfree() if it was palloc'd. We know that a tuple was + * allocated from the slab, if its pointer value is between + * slabMemoryBegin and -End. + * + * When the slab allocator is used, the USEMEM/LACKMEM mechanism of + * tracking memory usage is not used. + */ + bool slabAllocatorUsed; + + char *slabMemoryBegin; /* beginning of slab memory arena */ + char *slabMemoryEnd; /* end of slab memory arena */ + SlabSlot *slabFreeHead; /* head of free list */ + + /* Buffer size to use for reading input tapes, during merge. */ + size_t read_buffer_size; + + /* + * When we return a tuple to the caller in tuplesort_gettuple_XXX, that + * came from a tape (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE + * modes), we remember the tuple in 'lastReturnedTuple', so that we can + * recycle the memory on next gettuple call. + */ + void *lastReturnedTuple; + + /* + * While building initial runs, this is the current output run number. + * Afterwards, it is the number of initial runs we made. + */ + int currentRun; + + /* + * Unless otherwise noted, all pointer variables below are pointers to + * arrays of length maxTapes, holding per-tape data. 
+ */ + + /* + * This variable is only used during merge passes. mergeactive[i] is true + * if we are reading an input run from (actual) tape number i and have not + * yet exhausted that run. + */ + bool *mergeactive; /* active input run source? */ + + /* + * Variables for Algorithm D. Note that destTape is a "logical" tape + * number, ie, an index into the tp_xxx[] arrays. Be careful to keep + * "logical" and "actual" tape numbers straight! + */ + int Level; /* Knuth's l */ + int destTape; /* current output tape (Knuth's j, less 1) */ + int *tp_fib; /* Target Fibonacci run counts (A[]) */ + int *tp_runs; /* # of real runs on each tape */ + int *tp_dummy; /* # of dummy runs for each tape (D[]) */ + int *tp_tapenum; /* Actual tape numbers (TAPE[]) */ + int activeTapes; /* # of active input tapes in merge pass */ + + /* + * These variables are used after completion of sorting to keep track of + * the next tuple to return. (In the tape case, the tape's current read + * position is also critical state.) + */ + int result_tape; /* actual tape number of finished output */ + int current; /* array index (only used if SORTEDINMEM) */ + bool eof_reached; /* reached EOF (needed for cursors) */ + + /* markpos_xxx holds marked position for mark and restore */ + long markpos_block; /* tape block# (only used if SORTEDONTAPE) */ + int markpos_offset; /* saved "current", or offset in tape block */ + bool markpos_eof; /* saved "eof_reached" */ + + /* + * These variables are used during parallel sorting. + * + * worker is our worker identifier. Follows the general convention that + * -1 value relates to a leader tuplesort, and values >= 0 worker + * tuplesorts. (-1 can also be a serial tuplesort.) + * + * shared is mutable shared memory state, which is used to coordinate + * parallel sorts. + * + * nParticipants is the number of worker Tuplesortstates known by the + * leader to have actually been launched, which implies that they must + * finish a run leader can merge. Typically includes a worker state held + * by the leader process itself. Set in the leader Tuplesortstate only. + */ + int worker; + Sharedsort *shared; + int nParticipants; + + /* + * The sortKeys variable is used by every case other than the hash index + * case; it is set by tuplesort_begin_xxx. tupDesc is only used by the + * MinimalTuple and CLUSTER routines, though. + */ + TupleDesc tupDesc; + SortSupport sortKeys; /* array of length nKeys */ + + /* + * This variable is shared by the single-key MinimalTuple case and the + * Datum case (which both use qsort_ssup()). Otherwise it's NULL. + */ + SortSupport onlyKey; + + /* + * Additional state for managing "abbreviated key" sortsupport routines + * (which currently may be used by all cases except the hash index case). + * Tracks the intervals at which the optimization's effectiveness is + * tested. + */ + int64 abbrevNext; /* Tuple # at which to next check + * applicability */ + + /* + * These variables are specific to the CLUSTER case; they are set by + * tuplesort_begin_cluster. + */ + IndexInfo *indexInfo; /* info about index being used for reference */ + EState *estate; /* for evaluating index expressions */ + + /* + * These variables are specific to the IndexTuple case; they are set by + * tuplesort_begin_index_xxx and used only by the IndexTuple routines. 
+ */ + Relation heapRel; /* table the index is being built on */ + Relation indexRel; /* index being built */ + + /* These are specific to the index_btree subcase: */ + bool enforceUnique; /* complain if we find duplicate tuples */ + + /* These are specific to the index_hash subcase: */ + uint32 high_mask; /* masks for sortable part of hash code */ + uint32 low_mask; + uint32 max_buckets; + + /* + * These variables are specific to the Datum case; they are set by + * tuplesort_begin_datum and used only by the DatumTuple routines. + */ + Oid datumType; + /* we need typelen in order to know how to copy the Datums. */ + int datumTypeLen; + + /* + * Resource snapshot for time of sort start. + */ +#ifdef TRACE_SORT + PGRUsage ru_start; +#endif +}; + +/* + * Private mutable state of tuplesort-parallel-operation. This is allocated + * in shared memory. + */ +struct Sharedsort +{ + /* mutex protects all fields prior to tapes */ + slock_t mutex; + + /* + * currentWorker generates ordinal identifier numbers for parallel sort + * workers. These start from 0, and are always gapless. + * + * Workers increment workersFinished to indicate having finished. If this + * is equal to state.nParticipants within the leader, leader is ready to + * merge worker runs. + */ + int currentWorker; + int workersFinished; + + /* Temporary file space */ + SharedFileSet fileset; + + /* Size of tapes flexible array */ + int nTapes; + + /* + * Tapes array used by workers to report back information needed by the + * leader to concatenate all worker tapes into one for merging + */ + TapeShare tapes[FLEXIBLE_ARRAY_MEMBER]; +}; + +/* + * Is the given tuple allocated from the slab memory arena? + */ +#define IS_SLAB_SLOT(state, tuple) \ + ((char *) (tuple) >= (state)->slabMemoryBegin && \ + (char *) (tuple) < (state)->slabMemoryEnd) + +/* + * Return the given tuple to the slab memory free list, or free it + * if it was palloc'd. + */ +#define RELEASE_SLAB_SLOT(state, tuple) \ + do { \ + SlabSlot *buf = (SlabSlot *) tuple; \ + \ + if (IS_SLAB_SLOT((state), buf)) \ + { \ + buf->nextfree = (state)->slabFreeHead; \ + (state)->slabFreeHead = buf; \ + } else \ + pfree(buf); \ + } while(0) + +#define COMPARETUP(state,a,b) ((*(state)->comparetup) (a, b, state)) +#define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup)) +#define WRITETUP(state,tape,stup) ((*(state)->writetup) (state, tape, stup)) +#define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len)) +#define LACKMEM(state) ((state)->availMem < 0 && !(state)->slabAllocatorUsed) +#define USEMEM(state,amt) ((state)->availMem -= (amt)) +#define FREEMEM(state,amt) ((state)->availMem += (amt)) +#define SERIAL(state) ((state)->shared == NULL) +#define WORKER(state) ((state)->shared && (state)->worker != -1) +#define LEADER(state) ((state)->shared && (state)->worker == -1) + +/* + * NOTES about on-tape representation of tuples: + * + * We require the first "unsigned int" of a stored tuple to be the total size + * on-tape of the tuple, including itself (so it is never zero; an all-zero + * unsigned int is used to delimit runs). The remainder of the stored tuple + * may or may not match the in-memory representation of the tuple --- + * any conversion needed is the job of the writetup and readtup routines. 
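+ *
+ * Schematically, each stored tuple therefore looks like
+ *
+ *     [total length (unsigned int)][tuple body][trailing copy of length]
+ *
+ * where the trailing copy is only present in the randomAccess case
+ * described next.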
+ * + * If state->randomAccess is true, then the stored representation of the + * tuple must be followed by another "unsigned int" that is a copy of the + * length --- so the total tape space used is actually sizeof(unsigned int) + * more than the stored length value. This allows read-backwards. When + * randomAccess is not true, the write/read routines may omit the extra + * length word. + * + * writetup is expected to write both length words as well as the tuple + * data. When readtup is called, the tape is positioned just after the + * front length word; readtup must read the tuple data and advance past + * the back length word (if present). + * + * The write/read routines can make use of the tuple description data + * stored in the Tuplesortstate record, if needed. They are also expected + * to adjust state->availMem by the amount of memory space (not tape space!) + * released or consumed. There is no error return from either writetup + * or readtup; they should ereport() on failure. + * + * + * NOTES about memory consumption calculations: + * + * We count space allocated for tuples against the workMem limit, plus + * the space used by the variable-size memtuples array. Fixed-size space + * is not counted; it's small enough to not be interesting. + * + * Note that we count actual space used (as shown by GetMemoryChunkSpace) + * rather than the originally-requested size. This is important since + * palloc can add substantial overhead. It's not a complete answer since + * we won't count any wasted space in palloc allocation blocks, but it's + * a lot better than what we were doing before 7.3. As of 9.6, a + * separate memory context is used for caller passed tuples. Resetting + * it at certain key increments significantly ameliorates fragmentation. + * Note that this places a responsibility on readtup and copytup routines + * to use the right memory context for these tuples (and to not use the + * reset context for anything whose lifetime needs to span multiple + * external sort runs). 
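+ *
+ * (Descriptive note: the accounting itself is done with the USEMEM/FREEMEM
+ * macros; for instance copytup_heap() charges GetMemoryChunkSpace() of each
+ * copied tuple against availMem, and once LACKMEM() reports that the budget
+ * is exhausted the sort switches from keeping everything in memtuples[] to
+ * writing out runs, as described at the top of this file.)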
+ */ + +/* When using this macro, beware of double evaluation of len */ +#define LogicalTapeReadExact(tapeset, tapenum, ptr, len) \ + do { \ + if (LogicalTapeRead(tapeset, tapenum, ptr, len) != (size_t) (len)) \ + elog(ERROR, "unexpected end of data"); \ + } while(0) + + +static Tuplesortstate *tuplesort_begin_common(int workMem, + SortCoordinate coordinate, + bool randomAccess); +static void puttuple_common(Tuplesortstate *state, SortTuple *tuple); +static bool consider_abort_common(Tuplesortstate *state); +static void inittapes(Tuplesortstate *state, bool mergeruns); +static void inittapestate(Tuplesortstate *state, int maxTapes); +static void selectnewtape(Tuplesortstate *state); +static void init_slab_allocator(Tuplesortstate *state, int numSlots); +static void mergeruns(Tuplesortstate *state); +static void mergeonerun(Tuplesortstate *state); +static void beginmerge(Tuplesortstate *state); +static bool mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup); +static void dumptuples(Tuplesortstate *state, bool alltuples); +static void make_bounded_heap(Tuplesortstate *state); +static void sort_bounded_heap(Tuplesortstate *state); +static void tuplesort_sort_memtuples(Tuplesortstate *state); +static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple); +static void tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple); +static void tuplesort_heap_delete_top(Tuplesortstate *state); +static void reversedirection(Tuplesortstate *state); +static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK); +static void markrunend(Tuplesortstate *state, int tapenum); +static void *readtup_alloc(Tuplesortstate *state, Size tuplen); +static int comparetup_heap(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_heap(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_heap(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int comparetup_cluster(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_cluster(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_cluster(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int comparetup_index_btree(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static int comparetup_index_hash(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_index(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_index(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int comparetup_datum(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_datum(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_datum(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int worker_get_identifier(Tuplesortstate *state); +static void worker_freeze_result_tape(Tuplesortstate *state); +static void worker_nomergeruns(Tuplesortstate *state); +static void leader_takeover_tapes(Tuplesortstate *state); +static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup); 
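+
+/*
+ * Illustrative note on LogicalTapeReadExact: since the macro evaluates its
+ * "len" argument twice, callers must pass an expression that is free of
+ * side effects, e.g.
+ *
+ *		LogicalTapeReadExact(state->tapeset, tapenum, &tuplen, sizeof(tuplen));
+ *
+ * Passing something like "remaining -= chunk" as the length would be
+ * evaluated twice and misbehave.
+ */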
+ +/* + * Special versions of qsort just for SortTuple objects. qsort_tuple() sorts + * any variant of SortTuples, using the appropriate comparetup function. + * qsort_ssup() is specialized for the case where the comparetup function + * reduces to ApplySortComparator(), that is single-key MinimalTuple sorts + * and Datum sorts. + */ +#include "qsort_tuple.c" + + +/* + * tuplesort_begin_xxx + * + * Initialize for a tuple sort operation. + * + * After calling tuplesort_begin, the caller should call tuplesort_putXXX + * zero or more times, then call tuplesort_performsort when all the tuples + * have been supplied. After performsort, retrieve the tuples in sorted + * order by calling tuplesort_getXXX until it returns false/NULL. (If random + * access was requested, rescan, markpos, and restorepos can also be called.) + * Call tuplesort_end to terminate the operation and release memory/disk space. + * + * Each variant of tuplesort_begin has a workMem parameter specifying the + * maximum number of kilobytes of RAM to use before spilling data to disk. + * (The normal value of this parameter is work_mem, but some callers use + * other values.) Each variant also has a randomAccess parameter specifying + * whether the caller needs non-sequential access to the sort result. + */ + +static Tuplesortstate * +tuplesort_begin_common(int workMem, SortCoordinate coordinate, + bool randomAccess) +{ + Tuplesortstate *state; + MemoryContext sortcontext; + MemoryContext tuplecontext; + MemoryContext oldcontext; + + /* See leader_takeover_tapes() remarks on randomAccess support */ + if (coordinate && randomAccess) + elog(ERROR, "random access disallowed under parallel sort"); + + /* + * Create a working memory context for this sort operation. All data + * needed by the sort will live inside this context. + */ + sortcontext = AllocSetContextCreate(CurrentMemoryContext, + "TupleSort main", + ALLOCSET_DEFAULT_SIZES); + + /* + * Caller tuple (e.g. IndexTuple) memory context. + * + * A dedicated child context used exclusively for caller passed tuples + * eases memory management. Resetting at key points reduces + * fragmentation. Note that the memtuples array of SortTuples is allocated + * in the parent context, not this context, because there is no need to + * free memtuples early. + */ + tuplecontext = AllocSetContextCreate(sortcontext, + "Caller tuples", + ALLOCSET_DEFAULT_SIZES); + + /* + * Make the Tuplesortstate within the per-sort context. This way, we + * don't need a separate pfree() operation for it at shutdown. + */ + oldcontext = MemoryContextSwitchTo(sortcontext); + + state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate)); + +#ifdef TRACE_SORT + if (trace_sort) + pg_rusage_init(&state->ru_start); +#endif + + state->status = TSS_INITIAL; + state->randomAccess = randomAccess; + state->bounded = false; + state->tuples = true; + state->boundUsed = false; + + /* + * workMem is forced to be at least 64KB, the current minimum valid value + * for the work_mem GUC. This is a defense against parallel sort callers + * that divide out memory among many workers in a way that leaves each + * with very little memory. + */ + state->allowedMem = Max(workMem, 64) * (int64) 1024; + state->availMem = state->allowedMem; + state->sortcontext = sortcontext; + state->tuplecontext = tuplecontext; + state->tapeset = NULL; + + state->memtupcount = 0; + + /* + * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD; + * see comments in grow_memtuples(). 
+ */ + state->memtupsize = Max(1024, + ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1); + + state->growmemtuples = true; + state->slabAllocatorUsed = false; + state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple)); + + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + + /* workMem must be large enough for the minimal memtuples array */ + if (LACKMEM(state)) + elog(ERROR, "insufficient memory allowed for sort"); + + state->currentRun = 0; + + /* + * maxTapes, tapeRange, and Algorithm D variables will be initialized by + * inittapes(), if needed + */ + + state->result_tape = -1; /* flag that result tape has not been formed */ + + /* + * Initialize parallel-related state based on coordination information + * from caller + */ + if (!coordinate) + { + /* Serial sort */ + state->shared = NULL; + state->worker = -1; + state->nParticipants = -1; + } + else if (coordinate->isWorker) + { + /* Parallel worker produces exactly one final run from all input */ + state->shared = coordinate->sharedsort; + state->worker = worker_get_identifier(state); + state->nParticipants = -1; + } + else + { + /* Parallel leader state only used for final merge */ + state->shared = coordinate->sharedsort; + state->worker = -1; + state->nParticipants = coordinate->nParticipants; + Assert(state->nParticipants >= 1); + } + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_heap(TupleDesc tupDesc, + int nkeys, AttrNumber *attNums, + Oid *sortOperators, Oid *sortCollations, + bool *nullsFirstFlags, + int workMem, SortCoordinate coordinate, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + + AssertArg(nkeys > 0); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", + nkeys, workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = nkeys; + + TRACE_POSTGRESQL_SORT_START(HEAP_SORT, + false, /* no unique check */ + nkeys, + workMem, + randomAccess, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_heap; + state->copytup = copytup_heap; + state->writetup = writetup_heap; + state->readtup = readtup_heap; + + state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + state->abbrevNext = 10; + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData)); + + for (i = 0; i < nkeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + + AssertArg(attNums[i] != 0); + AssertArg(sortOperators[i] != 0); + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = sortCollations[i]; + sortKey->ssup_nulls_first = nullsFirstFlags[i]; + sortKey->ssup_attno = attNums[i]; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey); + } + + /* + * The "onlyKey" optimization cannot be used with abbreviated keys, since + * tie-breaker comparisons may be required. Typically, the optimization + * is only of value to pass-by-value types anyway, whereas abbreviated + * keys are typically only of value to pass-by-reference types. 
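+ *
+ * (Illustrative example: a single-column sort on an int4 key sets onlyKey
+ * and is sorted with qsort_ssup(), whereas a single text key normally has
+ * an abbreviation-capable sortsupport routine, so onlyKey stays unset and
+ * comparetup_heap() can fall back to the full comparator on ties.)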
+ */ + if (nkeys == 1 && !state->sortKeys->abbrev_converter) + state->onlyKey = state->sortKeys; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_cluster(TupleDesc tupDesc, + Relation indexRel, + int workMem, + SortCoordinate coordinate, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + ScanKey indexScanKey; + MemoryContext oldcontext; + int i; + + Assert(indexRel->rd_rel->relam == BTREE_AM_OID); + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", + RelationGetNumberOfAttributes(indexRel), + workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel); + + TRACE_POSTGRESQL_SORT_START(CLUSTER_SORT, + false, /* no unique check */ + state->nKeys, + workMem, + randomAccess, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_cluster; + state->copytup = copytup_cluster; + state->writetup = writetup_cluster; + state->readtup = readtup_cluster; + state->abbrevNext = 10; + + state->indexInfo = BuildIndexInfo(indexRel); + + state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + + indexScanKey = _bt_mkscankey_nodata(indexRel); + + if (state->indexInfo->ii_Expressions != NULL) + { + TupleTableSlot *slot; + ExprContext *econtext; + + /* + * We will need to use FormIndexDatum to evaluate the index + * expressions. To do that, we need an EState, as well as a + * TupleTableSlot to put the table tuples into. The econtext's + * scantuple has to point to that slot, too. + */ + state->estate = CreateExecutorState(); + slot = MakeSingleTupleTableSlot(tupDesc); + econtext = GetPerTupleExprContext(state->estate); + econtext->ecxt_scantuple = slot; + } + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + ScanKey scanKey = indexScanKey + i; + int16 strategy; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = scanKey->sk_collation; + sortKey->ssup_nulls_first = + (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; + sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + AssertState(sortKey->ssup_attno != 0); + + strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? + BTGreaterStrategyNumber : BTLessStrategyNumber; + + PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); + } + + _bt_freeskey(indexScanKey); + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_btree(Relation heapRel, + Relation indexRel, + bool enforceUnique, + int workMem, + SortCoordinate coordinate, + bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + ScanKey indexScanKey; + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: unique = %c, workMem = %d, randomAccess = %c", + enforceUnique ? 't' : 'f', + workMem, randomAccess ? 
't' : 'f'); +#endif + + state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel); + + TRACE_POSTGRESQL_SORT_START(INDEX_SORT, + enforceUnique, + state->nKeys, + workMem, + randomAccess, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_index_btree; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + state->abbrevNext = 10; + + state->heapRel = heapRel; + state->indexRel = indexRel; + state->enforceUnique = enforceUnique; + + indexScanKey = _bt_mkscankey_nodata(indexRel); + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + ScanKey scanKey = indexScanKey + i; + int16 strategy; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = scanKey->sk_collation; + sortKey->ssup_nulls_first = + (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; + sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + AssertState(sortKey->ssup_attno != 0); + + strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? + BTGreaterStrategyNumber : BTLessStrategyNumber; + + PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); + } + + _bt_freeskey(indexScanKey); + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_hash(Relation heapRel, + Relation indexRel, + uint32 high_mask, + uint32 low_mask, + uint32 max_buckets, + int workMem, + SortCoordinate coordinate, + bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: high_mask = 0x%x, low_mask = 0x%x, " + "max_buckets = 0x%x, workMem = %d, randomAccess = %c", + high_mask, + low_mask, + max_buckets, + workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = 1; /* Only one sort column, the hash code */ + + state->comparetup = comparetup_index_hash; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + + state->heapRel = heapRel; + state->indexRel = indexRel; + + state->high_mask = high_mask; + state->low_mask = low_mask; + state->max_buckets = max_buckets; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, + bool nullsFirstFlag, int workMem, + SortCoordinate coordinate, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + MemoryContext oldcontext; + int16 typlen; + bool typbyval; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin datum sort: workMem = %d, randomAccess = %c", + workMem, randomAccess ? 
't' : 'f'); +#endif + + state->nKeys = 1; /* always a one-column sort */ + + TRACE_POSTGRESQL_SORT_START(DATUM_SORT, + false, /* no unique check */ + 1, + workMem, + randomAccess, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_datum; + state->copytup = copytup_datum; + state->writetup = writetup_datum; + state->readtup = readtup_datum; + state->abbrevNext = 10; + + state->datumType = datumType; + + /* lookup necessary attributes of the datum type */ + get_typlenbyval(datumType, &typlen, &typbyval); + state->datumTypeLen = typlen; + state->tuples = !typbyval; + + /* Prepare SortSupport data */ + state->sortKeys = (SortSupport) palloc0(sizeof(SortSupportData)); + + state->sortKeys->ssup_cxt = CurrentMemoryContext; + state->sortKeys->ssup_collation = sortCollation; + state->sortKeys->ssup_nulls_first = nullsFirstFlag; + + /* + * Abbreviation is possible here only for by-reference types. In theory, + * a pass-by-value datatype could have an abbreviated form that is cheaper + * to compare. In a tuple sort, we could support that, because we can + * always extract the original datum from the tuple is needed. Here, we + * can't, because a datum sort only stores a single copy of the datum; the + * "tuple" field of each sortTuple is NULL. + */ + state->sortKeys->abbreviate = !typbyval; + + PrepareSortSupportFromOrderingOp(sortOperator, state->sortKeys); + + /* + * The "onlyKey" optimization cannot be used with abbreviated keys, since + * tie-breaker comparisons may be required. Typically, the optimization + * is only of value to pass-by-value types anyway, whereas abbreviated + * keys are typically only of value to pass-by-reference types. + */ + if (!state->sortKeys->abbrev_converter) + state->onlyKey = state->sortKeys; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +/* + * tuplesort_set_bound + * + * Advise tuplesort that at most the first N result tuples are required. + * + * Must be called before inserting any tuples. (Actually, we could allow it + * as long as the sort hasn't spilled to disk, but there seems no need for + * delayed calls at the moment.) + * + * This is a hint only. The tuplesort may still return more tuples than + * requested. Parallel leader tuplesorts will always ignore the hint. + */ +void +tuplesort_set_bound(Tuplesortstate *state, int64 bound) +{ + /* Assert we're called before loading any tuples */ + Assert(state->status == TSS_INITIAL); + Assert(state->memtupcount == 0); + Assert(!state->bounded); + Assert(!WORKER(state)); + +#ifdef DEBUG_BOUNDED_SORT + /* Honor GUC setting that disables the feature (for easy testing) */ + if (!optimize_bounded_sort) + return; +#endif + + /* Parallel leader ignores hint */ + if (LEADER(state)) + return; + + /* We want to be able to compute bound * 2, so limit the setting */ + if (bound > (int64) (INT_MAX / 2)) + return; + + state->bounded = true; + state->bound = (int) bound; + + /* + * Bounded sorts are not an effective target for abbreviated key + * optimization. Disable by setting state to be consistent with no + * abbreviation support. + */ + state->sortKeys->abbrev_converter = NULL; + if (state->sortKeys->abbrev_full_comparator) + state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator; + + /* Not strictly necessary, but be tidy */ + state->sortKeys->abbrev_abort = NULL; + state->sortKeys->abbrev_full_comparator = NULL; +} + +/* + * tuplesort_end + * + * Release resources and clean up. + * + * NOTE: after calling this, any pointers returned by tuplesort_getXXX are + * pointing to garbage. 
Be careful not to attempt to use or free such + * pointers afterwards! + */ +void +tuplesort_end(Tuplesortstate *state) +{ + /* context swap probably not needed, but let's be safe */ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + long spaceUsed; + + if (state->tapeset) + spaceUsed = LogicalTapeSetBlocks(state->tapeset); + else + spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; +#endif + + /* + * Delete temporary "tape" files, if any. + * + * Note: want to include this in reported total cost of sort, hence need + * for two #ifdef TRACE_SORT sections. + */ + if (state->tapeset) + LogicalTapeSetClose(state->tapeset); + +#ifdef TRACE_SORT + if (trace_sort) + { + if (state->tapeset) + elog(LOG, "%s of worker %d ended, %ld disk blocks used: %s", + SERIAL(state) ? "external sort" : "parallel external sort", + state->worker, spaceUsed, pg_rusage_show(&state->ru_start)); + else + elog(LOG, "%s of worker %d ended, %ld KB used: %s", + SERIAL(state) ? "internal sort" : "unperformed parallel sort", + state->worker, spaceUsed, pg_rusage_show(&state->ru_start)); + } + + TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, spaceUsed); +#else + + /* + * If you disabled TRACE_SORT, you can still probe sort__done, but you + * ain't getting space-used stats. + */ + TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, 0L); +#endif + + /* Free any execution state created for CLUSTER case */ + if (state->estate != NULL) + { + ExprContext *econtext = GetPerTupleExprContext(state->estate); + + ExecDropSingleTupleTableSlot(econtext->ecxt_scantuple); + FreeExecutorState(state->estate); + } + + MemoryContextSwitchTo(oldcontext); + + /* + * Free the per-sort memory context, thereby releasing all working memory, + * including the Tuplesortstate struct itself. + */ + MemoryContextDelete(state->sortcontext); +} + +/* + * Grow the memtuples[] array, if possible within our memory constraint. We + * must not exceed INT_MAX tuples in memory or the caller-provided memory + * limit. Return true if we were able to enlarge the array, false if not. + * + * Normally, at each increment we double the size of the array. When doing + * that would exceed a limit, we attempt one last, smaller increase (and then + * clear the growmemtuples flag so we don't try any more). That allows us to + * use memory as fully as permitted; sticking to the pure doubling rule could + * result in almost half going unused. Because availMem moves around with + * tuple addition/removal, we need some rule to prevent making repeated small + * increases in memtupsize, which would just be useless thrashing. The + * growmemtuples flag accomplishes that and also prevents useless + * recalculations in this function. + */ +static bool +grow_memtuples(Tuplesortstate *state) +{ + int newmemtupsize; + int memtupsize = state->memtupsize; + int64 memNowUsed = state->allowedMem - state->availMem; + + /* Forget it if we've already maxed out memtuples, per comment above */ + if (!state->growmemtuples) + return false; + + /* Select new value of memtupsize */ + if (memNowUsed <= state->availMem) + { + /* + * We've used no more than half of allowedMem; double our usage, + * clamping at INT_MAX tuples. + */ + if (memtupsize < INT_MAX / 2) + newmemtupsize = memtupsize * 2; + else + { + newmemtupsize = INT_MAX; + state->growmemtuples = false; + } + } + else + { + /* + * This will be the last increment of memtupsize. Abandon doubling + * strategy and instead increase as much as we safely can. 
+ * + * To stay within allowedMem, we can't increase memtupsize by more + * than availMem / sizeof(SortTuple) elements. In practice, we want + * to increase it by considerably less, because we need to leave some + * space for the tuples to which the new array slots will refer. We + * assume the new tuples will be about the same size as the tuples + * we've already seen, and thus we can extrapolate from the space + * consumption so far to estimate an appropriate new size for the + * memtuples array. The optimal value might be higher or lower than + * this estimate, but it's hard to know that in advance. We again + * clamp at INT_MAX tuples. + * + * This calculation is safe against enlarging the array so much that + * LACKMEM becomes true, because the memory currently used includes + * the present array; thus, there would be enough allowedMem for the + * new array elements even if no other memory were currently used. + * + * We do the arithmetic in float8, because otherwise the product of + * memtupsize and allowedMem could overflow. Any inaccuracy in the + * result should be insignificant; but even if we computed a + * completely insane result, the checks below will prevent anything + * really bad from happening. + */ + double grow_ratio; + + grow_ratio = (double) state->allowedMem / (double) memNowUsed; + if (memtupsize * grow_ratio < INT_MAX) + newmemtupsize = (int) (memtupsize * grow_ratio); + else + newmemtupsize = INT_MAX; + + /* We won't make any further enlargement attempts */ + state->growmemtuples = false; + } + + /* Must enlarge array by at least one element, else report failure */ + if (newmemtupsize <= memtupsize) + goto noalloc; + + /* + * On a 32-bit machine, allowedMem could exceed MaxAllocHugeSize. Clamp + * to ensure our request won't be rejected. Note that we can easily + * exhaust address space before facing this outcome. (This is presently + * impossible due to guc.c's MAX_KILOBYTES limitation on work_mem, but + * don't rely on that at this distance.) + */ + if ((Size) newmemtupsize >= MaxAllocHugeSize / sizeof(SortTuple)) + { + newmemtupsize = (int) (MaxAllocHugeSize / sizeof(SortTuple)); + state->growmemtuples = false; /* can't grow any more */ + } + + /* + * We need to be sure that we do not cause LACKMEM to become true, else + * the space management algorithm will go nuts. The code above should + * never generate a dangerous request, but to be safe, check explicitly + * that the array growth fits within availMem. (We could still cause + * LACKMEM if the memory chunk overhead associated with the memtuples + * array were to increase. That shouldn't happen because we chose the + * initial array size large enough to ensure that palloc will be treating + * both old and new arrays as separate chunks. But we'll check LACKMEM + * explicitly below just in case.) + */ + if (state->availMem < (int64) ((newmemtupsize - memtupsize) * sizeof(SortTuple))) + goto noalloc; + + /* OK, do it */ + FREEMEM(state, GetMemoryChunkSpace(state->memtuples)); + state->memtupsize = newmemtupsize; + state->memtuples = (SortTuple *) + repalloc_huge(state->memtuples, + state->memtupsize * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + if (LACKMEM(state)) + elog(ERROR, "unexpected out-of-memory situation in tuplesort"); + return true; + +noalloc: + /* If for any reason we didn't realloc, shut off future attempts */ + state->growmemtuples = false; + return false; +} + +/* + * Accept one tuple while collecting input data for sort. 
+ * + * Note that the input data is always copied; the caller need not save it. + */ +void +tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + /* + * Copy the given tuple into memory we control, and decrease availMem. + * Then call the common code. + */ + COPYTUP(state, &stup, (void *) slot); + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Accept one tuple while collecting input data for sort. + * + * Note that the input data is always copied; the caller need not save it. + */ +void +tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + /* + * Copy the given tuple into memory we control, and decrease availMem. + * Then call the common code. + */ + COPYTUP(state, &stup, (void *) tup); + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Collect one index tuple while collecting input data for sort, building + * it from caller-supplied values. + */ +void +tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel, + ItemPointer self, Datum *values, + bool *isnull) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + SortTuple stup; + Datum original; + IndexTuple tuple; + + stup.tuple = index_form_tuple(RelationGetDescr(rel), values, isnull); + tuple = ((IndexTuple) stup.tuple); + tuple->t_tid = *self; + USEMEM(state, GetMemoryChunkSpace(stup.tuple)); + /* set up first-column key value */ + original = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup.isnull1); + + MemoryContextSwitchTo(state->sortcontext); + + if (!state->sortKeys || !state->sortKeys->abbrev_converter || stup.isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup.datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup.datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup.datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = mtup->tuple; + mtup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &mtup->isnull1); + } + } + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Accept one Datum while collecting input data for sort. + * + * If the Datum is pass-by-ref type, the value will be copied. 
+ */ +void +tuplesort_putdatum(Tuplesortstate *state, Datum val, bool isNull) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + SortTuple stup; + + /* + * Pass-by-value types or null values are just stored directly in + * stup.datum1 (and stup.tuple is not used and set to NULL). + * + * Non-null pass-by-reference values need to be copied into memory we + * control, and possibly abbreviated. The copied value is pointed to by + * stup.tuple and is treated as the canonical copy (e.g. to return via + * tuplesort_getdatum or when writing to tape); stup.datum1 gets the + * abbreviated value if abbreviation is happening, otherwise it's + * identical to stup.tuple. + */ + + if (isNull || !state->tuples) + { + /* + * Set datum1 to zeroed representation for NULLs (to be consistent, + * and to support cheap inequality tests for NULL abbreviated keys). + */ + stup.datum1 = !isNull ? val : (Datum) 0; + stup.isnull1 = isNull; + stup.tuple = NULL; /* no separate storage */ + MemoryContextSwitchTo(state->sortcontext); + } + else + { + Datum original = datumCopy(val, false, state->datumTypeLen); + + stup.isnull1 = false; + stup.tuple = DatumGetPointer(original); + USEMEM(state, GetMemoryChunkSpace(stup.tuple)); + MemoryContextSwitchTo(state->sortcontext); + + if (!state->sortKeys->abbrev_converter) + { + stup.datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup.datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup.datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any + * case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + mtup->datum1 = PointerGetDatum(mtup->tuple); + } + } + } + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Shared code for tuple and datum cases. + */ +static void +puttuple_common(Tuplesortstate *state, SortTuple *tuple) +{ + Assert(!LEADER(state)); + + switch (state->status) + { + case TSS_INITIAL: + + /* + * Save the tuple into the unsorted array. First, grow the array + * as needed. Note that we try to grow the array when there is + * still one free slot remaining --- if we fail, there'll still be + * room to store the incoming tuple, and then we'll switch to + * tape-based operation. + */ + if (state->memtupcount >= state->memtupsize - 1) + { + (void) grow_memtuples(state); + Assert(state->memtupcount < state->memtupsize); + } + state->memtuples[state->memtupcount++] = *tuple; + + /* + * Check if it's time to switch over to a bounded heapsort. We do + * so if the input tuple count exceeds twice the desired tuple + * count (this is a heuristic for where heapsort becomes cheaper + * than a quicksort), or if we've just filled workMem and have + * enough tuples to meet the bound. + * + * Note that once we enter TSS_BOUNDED state we will always try to + * complete the sort that way. In the worst case, if later input + * tuples are larger than earlier ones, this might cause us to + * exceed workMem significantly. 
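+ *
+ * For example, with a bound of 100 the switch happens once the 201st
+ * tuple has been stored, or as soon as more than 100 tuples are in
+ * memory and workMem is exhausted.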
+ */ + if (state->bounded && + (state->memtupcount > state->bound * 2 || + (state->memtupcount > state->bound && LACKMEM(state)))) + { +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "switching to bounded heapsort at %d tuples: %s", + state->memtupcount, + pg_rusage_show(&state->ru_start)); +#endif + make_bounded_heap(state); + return; + } + + /* + * Done if we still fit in available memory and have array slots. + */ + if (state->memtupcount < state->memtupsize && !LACKMEM(state)) + return; + + /* + * Nope; time to switch to tape-based operation. + */ + inittapes(state, true); + + /* + * Dump all tuples. + */ + dumptuples(state, false); + break; + + case TSS_BOUNDED: + + /* + * We don't want to grow the array here, so check whether the new + * tuple can be discarded before putting it in. This should be a + * good speed optimization, too, since when there are many more + * input tuples than the bound, most input tuples can be discarded + * with just this one comparison. Note that because we currently + * have the sort direction reversed, we must check for <= not >=. + */ + if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0) + { + /* new tuple <= top of the heap, so we can discard it */ + free_sort_tuple(state, tuple); + CHECK_FOR_INTERRUPTS(); + } + else + { + /* discard top of heap, replacing it with the new tuple */ + free_sort_tuple(state, &state->memtuples[0]); + tuplesort_heap_replace_top(state, tuple); + } + break; + + case TSS_BUILDRUNS: + + /* + * Save the tuple into the unsorted array (there must be space) + */ + state->memtuples[state->memtupcount++] = *tuple; + + /* + * If we are over the memory limit, dump all tuples. + */ + dumptuples(state, false); + break; + + default: + elog(ERROR, "invalid tuplesort state"); + break; + } +} + +static bool +consider_abort_common(Tuplesortstate *state) +{ + Assert(state->sortKeys[0].abbrev_converter != NULL); + Assert(state->sortKeys[0].abbrev_abort != NULL); + Assert(state->sortKeys[0].abbrev_full_comparator != NULL); + + /* + * Check effectiveness of abbreviation optimization. Consider aborting + * when still within memory limit. + */ + if (state->status == TSS_INITIAL && + state->memtupcount >= state->abbrevNext) + { + state->abbrevNext *= 2; + + /* + * Check opclass-supplied abbreviation abort routine. It may indicate + * that abbreviation should not proceed. + */ + if (!state->sortKeys->abbrev_abort(state->memtupcount, + state->sortKeys)) + return false; + + /* + * Finally, restore authoritative comparator, and indicate that + * abbreviation is not in play by setting abbrev_converter to NULL + */ + state->sortKeys[0].comparator = state->sortKeys[0].abbrev_full_comparator; + state->sortKeys[0].abbrev_converter = NULL; + /* Not strictly necessary, but be tidy */ + state->sortKeys[0].abbrev_abort = NULL; + state->sortKeys[0].abbrev_full_comparator = NULL; + + /* Give up - expect original pass-by-value representation */ + return true; + } + + return false; +} + +/* + * All tuples have been provided; finish the sort. 
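+ *
+ * Depending on how input accumulation ended, this either quicksorts the
+ * in-memory array (TSS_INITIAL), unwinds the bounded heap into sorted
+ * order (TSS_BOUNDED), or dumps the remaining tuples and merges the
+ * on-tape runs (TSS_BUILDRUNS).  Parallel workers and the leader take
+ * their own paths through the TSS_INITIAL case.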
+ */ +void +tuplesort_performsort(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "performsort of worker %d starting: %s", + state->worker, pg_rusage_show(&state->ru_start)); +#endif + + switch (state->status) + { + case TSS_INITIAL: + + /* + * We were able to accumulate all the tuples within the allowed + * amount of memory, or leader to take over worker tapes + */ + if (SERIAL(state)) + { + /* Just qsort 'em and we're done */ + tuplesort_sort_memtuples(state); + state->status = TSS_SORTEDINMEM; + } + else if (WORKER(state)) + { + /* + * Parallel workers must still dump out tuples to tape. No + * merge is required to produce single output run, though. + */ + inittapes(state, false); + dumptuples(state, true); + worker_nomergeruns(state); + state->status = TSS_SORTEDONTAPE; + } + else + { + /* + * Leader will take over worker tapes and merge worker runs. + * Note that mergeruns sets the correct state->status. + */ + leader_takeover_tapes(state); + mergeruns(state); + } + state->current = 0; + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + + case TSS_BOUNDED: + + /* + * We were able to accumulate all the tuples required for output + * in memory, using a heap to eliminate excess tuples. Now we + * have to transform the heap to a properly-sorted array. + */ + sort_bounded_heap(state); + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + state->status = TSS_SORTEDINMEM; + break; + + case TSS_BUILDRUNS: + + /* + * Finish tape-based sort. First, flush all tuples remaining in + * memory out to tape; then merge until we have a single remaining + * run (or, if !randomAccess and !WORKER(), one run per tape). + * Note that mergeruns sets the correct state->status. + */ + dumptuples(state, true); + mergeruns(state); + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + +#ifdef TRACE_SORT + if (trace_sort) + { + if (state->status == TSS_FINALMERGE) + elog(LOG, "performsort of worker %d done (except %d-way final merge): %s", + state->worker, state->activeTapes, + pg_rusage_show(&state->ru_start)); + else + elog(LOG, "performsort of worker %d done: %s", + state->worker, pg_rusage_show(&state->ru_start)); + } +#endif + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Internal routine to fetch the next tuple in either forward or back + * direction into *stup. Returns false if no more tuples. + * Returned tuple belongs to tuplesort memory context, and must not be freed + * by caller. Note that fetched tuple is stored in memory that may be + * recycled by any future fetch. + */ +static bool +tuplesort_gettuple_common(Tuplesortstate *state, bool forward, + SortTuple *stup) +{ + unsigned int tuplen; + size_t nmoved; + + Assert(!WORKER(state)); + + switch (state->status) + { + case TSS_SORTEDINMEM: + Assert(forward || state->randomAccess); + Assert(!state->slabAllocatorUsed); + if (forward) + { + if (state->current < state->memtupcount) + { + *stup = state->memtuples[state->current++]; + return true; + } + state->eof_reached = true; + + /* + * Complain if caller tries to retrieve more tuples than + * originally asked for in a bounded sort. This is because + * returning EOF here might be the wrong thing. 
+ */ + if (state->bounded && state->current >= state->bound) + elog(ERROR, "retrieved too many tuples in a bounded sort"); + + return false; + } + else + { + if (state->current <= 0) + return false; + + /* + * if all tuples are fetched already then we return last + * tuple, else - tuple before last returned. + */ + if (state->eof_reached) + state->eof_reached = false; + else + { + state->current--; /* last returned tuple */ + if (state->current <= 0) + return false; + } + *stup = state->memtuples[state->current - 1]; + return true; + } + break; + + case TSS_SORTEDONTAPE: + Assert(forward || state->randomAccess); + Assert(state->slabAllocatorUsed); + + /* + * The slot that held the tuple that we returned in previous + * gettuple call can now be reused. + */ + if (state->lastReturnedTuple) + { + RELEASE_SLAB_SLOT(state, state->lastReturnedTuple); + state->lastReturnedTuple = NULL; + } + + if (forward) + { + if (state->eof_reached) + return false; + + if ((tuplen = getlen(state, state->result_tape, true)) != 0) + { + READTUP(state, stup, state->result_tape, tuplen); + + /* + * Remember the tuple we return, so that we can recycle + * its memory on next call. (This can be NULL, in the + * !state->tuples case). + */ + state->lastReturnedTuple = stup->tuple; + + return true; + } + else + { + state->eof_reached = true; + return false; + } + } + + /* + * Backward. + * + * if all tuples are fetched already then we return last tuple, + * else - tuple before last returned. + */ + if (state->eof_reached) + { + /* + * Seek position is pointing just past the zero tuplen at the + * end of file; back up to fetch last tuple's ending length + * word. If seek fails we must have a completely empty file. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + 2 * sizeof(unsigned int)); + if (nmoved == 0) + return false; + else if (nmoved != 2 * sizeof(unsigned int)) + elog(ERROR, "unexpected tape position"); + state->eof_reached = false; + } + else + { + /* + * Back up and fetch previously-returned tuple's ending length + * word. If seek fails, assume we are at start of file. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + sizeof(unsigned int)); + if (nmoved == 0) + return false; + else if (nmoved != sizeof(unsigned int)) + elog(ERROR, "unexpected tape position"); + tuplen = getlen(state, state->result_tape, false); + + /* + * Back up to get ending length word of tuple before it. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + tuplen + 2 * sizeof(unsigned int)); + if (nmoved == tuplen + sizeof(unsigned int)) + { + /* + * We backed up over the previous tuple, but there was no + * ending length word before it. That means that the prev + * tuple is the first tuple in the file. It is now the + * next to read in forward direction (not obviously right, + * but that is what in-memory case does). + */ + return false; + } + else if (nmoved != tuplen + 2 * sizeof(unsigned int)) + elog(ERROR, "bogus tuple length in backward scan"); + } + + tuplen = getlen(state, state->result_tape, false); + + /* + * Now we have the length of the prior tuple, back up and read it. + * Note: READTUP expects we are positioned after the initial + * length word of the tuple, so back up to that point. 
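+ *
+ * (Backward scans work because, for randomAccess sorts, each tuple was
+ * written to tape as a leading length word, the tuple body, and a
+ * trailing copy of the length word; the backspace arithmetic in this
+ * branch steps over those trailing length words.)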
+ */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + tuplen); + if (nmoved != tuplen) + elog(ERROR, "bogus tuple length in backward scan"); + READTUP(state, stup, state->result_tape, tuplen); + + /* + * Remember the tuple we return, so that we can recycle its memory + * on next call. (This can be NULL, in the Datum case). + */ + state->lastReturnedTuple = stup->tuple; + + return true; + + case TSS_FINALMERGE: + Assert(forward); + /* We are managing memory ourselves, with the slab allocator. */ + Assert(state->slabAllocatorUsed); + + /* + * The slab slot holding the tuple that we returned in previous + * gettuple call can now be reused. + */ + if (state->lastReturnedTuple) + { + RELEASE_SLAB_SLOT(state, state->lastReturnedTuple); + state->lastReturnedTuple = NULL; + } + + /* + * This code should match the inner loop of mergeonerun(). + */ + if (state->memtupcount > 0) + { + int srcTape = state->memtuples[0].tupindex; + SortTuple newtup; + + *stup = state->memtuples[0]; + + /* + * Remember the tuple we return, so that we can recycle its + * memory on next call. (This can be NULL, in the Datum case). + */ + state->lastReturnedTuple = stup->tuple; + + /* + * Pull next tuple from tape, and replace the returned tuple + * at top of the heap with it. + */ + if (!mergereadnext(state, srcTape, &newtup)) + { + /* + * If no more data, we've reached end of run on this tape. + * Remove the top node from the heap. + */ + tuplesort_heap_delete_top(state); + + /* + * Rewind to free the read buffer. It'd go away at the + * end of the sort anyway, but better to release the + * memory early. + */ + LogicalTapeRewindForWrite(state->tapeset, srcTape); + return true; + } + newtup.tupindex = srcTape; + tuplesort_heap_replace_top(state, &newtup); + return true; + } + return false; + + default: + elog(ERROR, "invalid tuplesort state"); + return false; /* keep compiler quiet */ + } +} + +/* + * Fetch the next tuple in either forward or back direction. + * If successful, put tuple in slot and return true; else, clear the slot + * and return false. + * + * Caller may optionally be passed back abbreviated value (on true return + * value) when abbreviation was used, which can be used to cheaply avoid + * equality checks that might otherwise be required. Caller can safely make a + * determination of "non-equal tuple" based on simple binary inequality. A + * NULL value in leading attribute will set abbreviated value to zeroed + * representation, which caller may rely on in abbreviated inequality check. + * + * If copy is true, the slot receives a tuple that's been copied into the + * caller's memory context, so that it will stay valid regardless of future + * manipulations of the tuplesort's state (up to and including deleting the + * tuplesort). If copy is false, the slot will just receive a pointer to a + * tuple held within the tuplesort, which is more efficient, but only safe for + * callers that are prepared to have any subsequent manipulation of the + * tuplesort's state invalidate slot contents. 
+ */ +bool +tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy, + TupleTableSlot *slot, Datum *abbrev) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + if (stup.tuple) + { + /* Record abbreviated key for caller */ + if (state->sortKeys->abbrev_converter && abbrev) + *abbrev = stup.datum1; + + if (copy) + stup.tuple = heap_copy_minimal_tuple((MinimalTuple) stup.tuple); + + ExecStoreMinimalTuple((MinimalTuple) stup.tuple, slot, copy); + return true; + } + else + { + ExecClearTuple(slot); + return false; + } +} + +/* + * Fetch the next tuple in either forward or back direction. + * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory + * context, and must not be freed by caller. Caller may not rely on tuple + * remaining valid after any further manipulation of tuplesort. + */ +HeapTuple +tuplesort_getheaptuple(Tuplesortstate *state, bool forward) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + return stup.tuple; +} + +/* + * Fetch the next index tuple in either forward or back direction. + * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory + * context, and must not be freed by caller. Caller may not rely on tuple + * remaining valid after any further manipulation of tuplesort. + */ +IndexTuple +tuplesort_getindextuple(Tuplesortstate *state, bool forward) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + return (IndexTuple) stup.tuple; +} + +/* + * Fetch the next Datum in either forward or back direction. + * Returns false if no more datums. + * + * If the Datum is pass-by-ref type, the returned value is freshly palloc'd + * in caller's context, and is now owned by the caller (this differs from + * similar routines for other types of tuplesorts). + * + * Caller may optionally be passed back abbreviated value (on true return + * value) when abbreviation was used, which can be used to cheaply avoid + * equality checks that might otherwise be required. Caller can safely make a + * determination of "non-equal tuple" based on simple binary inequality. A + * NULL value will have a zeroed abbreviated value representation, which caller + * may rely on in abbreviated inequality check. 
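+ *
+ * A typical caller, in outline: feed values with tuplesort_putdatum(),
+ * call tuplesort_performsort(), then loop
+ *
+ *     while (tuplesort_getdatum(state, true, &val, &isnull, NULL))
+ *         ... consume val ...
+ *
+ * and finish with tuplesort_end().  Pass-by-reference values fetched
+ * this way remain valid after tuplesort_end(), since they were copied
+ * into the caller's context as described above.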
+ */ +bool +tuplesort_getdatum(Tuplesortstate *state, bool forward, + Datum *val, bool *isNull, Datum *abbrev) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + { + MemoryContextSwitchTo(oldcontext); + return false; + } + + /* Ensure we copy into caller's memory context */ + MemoryContextSwitchTo(oldcontext); + + /* Record abbreviated key for caller */ + if (state->sortKeys->abbrev_converter && abbrev) + *abbrev = stup.datum1; + + if (stup.isnull1 || !state->tuples) + { + *val = stup.datum1; + *isNull = stup.isnull1; + } + else + { + /* use stup.tuple because stup.datum1 may be an abbreviation */ + *val = datumCopy(PointerGetDatum(stup.tuple), false, state->datumTypeLen); + *isNull = false; + } + + return true; +} + +/* + * Advance over N tuples in either forward or back direction, + * without returning any data. N==0 is a no-op. + * Returns true if successful, false if ran out of tuples. + */ +bool +tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, bool forward) +{ + MemoryContext oldcontext; + + /* + * We don't actually support backwards skip yet, because no callers need + * it. The API is designed to allow for that later, though. + */ + Assert(forward); + Assert(ntuples >= 0); + Assert(!WORKER(state)); + + switch (state->status) + { + case TSS_SORTEDINMEM: + if (state->memtupcount - state->current >= ntuples) + { + state->current += ntuples; + return true; + } + state->current = state->memtupcount; + state->eof_reached = true; + + /* + * Complain if caller tries to retrieve more tuples than + * originally asked for in a bounded sort. This is because + * returning EOF here might be the wrong thing. + */ + if (state->bounded && state->current >= state->bound) + elog(ERROR, "retrieved too many tuples in a bounded sort"); + + return false; + + case TSS_SORTEDONTAPE: + case TSS_FINALMERGE: + + /* + * We could probably optimize these cases better, but for now it's + * not worth the trouble. + */ + oldcontext = MemoryContextSwitchTo(state->sortcontext); + while (ntuples-- > 0) + { + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + { + MemoryContextSwitchTo(oldcontext); + return false; + } + CHECK_FOR_INTERRUPTS(); + } + MemoryContextSwitchTo(oldcontext); + return true; + + default: + elog(ERROR, "invalid tuplesort state"); + return false; /* keep compiler quiet */ + } +} + +/* + * tuplesort_merge_order - report merge order we'll use for given memory + * (note: "merge order" just means the number of input tapes in the merge). + * + * This is exported for use by the planner. allowedMem is in bytes. + */ +int +tuplesort_merge_order(int64 allowedMem) +{ + int mOrder; + + /* + * We need one tape for each merge input, plus another one for the output, + * and each of these tapes needs buffer space. In addition we want + * MERGE_BUFFER_SIZE workspace per input tape (but the output tape doesn't + * count). + * + * Note: you might be thinking we need to account for the memtuples[] + * array in this calculation, but we effectively treat that as part of the + * MERGE_BUFFER_SIZE workspace. + */ + mOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) / + (MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD); + + /* + * Even in minimum memory, use at least a MINORDER merge. On the other + * hand, even when we have lots of memory, do not use more than a MAXORDER + * merge. Tapes are pretty cheap, but they're not entirely free. 
Each + * additional tape reduces the amount of memory available to build runs, + * which in turn can cause the same sort to need more runs, which makes + * merging slower even if it can still be done in a single pass. Also, + * high order merges are quite slow due to CPU cache effects; it can be + * faster to pay the I/O cost of a polyphase merge than to perform a + * single merge pass across many hundreds of tapes. + */ + mOrder = Max(mOrder, MINORDER); + mOrder = Min(mOrder, MAXORDER); + + return mOrder; +} + +/* + * inittapes - initialize for tape sorting. + * + * This is called only if we have found we won't sort in memory. + */ +static void +inittapes(Tuplesortstate *state, bool mergeruns) +{ + int maxTapes, + j; + + Assert(!LEADER(state)); + + if (mergeruns) + { + /* Compute number of tapes to use: merge order plus 1 */ + maxTapes = tuplesort_merge_order(state->allowedMem) + 1; + } + else + { + /* Workers can sometimes produce single run, output without merge */ + Assert(WORKER(state)); + maxTapes = MINORDER + 1; + } + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d switching to external sort with %d tapes: %s", + state->worker, maxTapes, pg_rusage_show(&state->ru_start)); +#endif + + /* Create the tape set and allocate the per-tape data arrays */ + inittapestate(state, maxTapes); + state->tapeset = + LogicalTapeSetCreate(maxTapes, NULL, + state->shared ? &state->shared->fileset : NULL, + state->worker); + + state->currentRun = 0; + + /* + * Initialize variables of Algorithm D (step D1). + */ + for (j = 0; j < maxTapes; j++) + { + state->tp_fib[j] = 1; + state->tp_runs[j] = 0; + state->tp_dummy[j] = 1; + state->tp_tapenum[j] = j; + } + state->tp_fib[state->tapeRange] = 0; + state->tp_dummy[state->tapeRange] = 0; + + state->Level = 1; + state->destTape = 0; + + state->status = TSS_BUILDRUNS; +} + +/* + * inittapestate - initialize generic tape management state + */ +static void +inittapestate(Tuplesortstate *state, int maxTapes) +{ + int64 tapeSpace; + + /* + * Decrease availMem to reflect the space needed for tape buffers; but + * don't decrease it to the point that we have no room for tuples. (That + * case is only likely to occur if sorting pass-by-value Datums; in all + * other scenarios the memtuples[] array is unlikely to occupy more than + * half of allowedMem. In the pass-by-value case it's not important to + * account for tuple space, so we don't care if LACKMEM becomes + * inaccurate.) + */ + tapeSpace = (int64) maxTapes * TAPE_BUFFER_OVERHEAD; + + if (tapeSpace + GetMemoryChunkSpace(state->memtuples) < state->allowedMem) + USEMEM(state, tapeSpace); + + /* + * Make sure that the temp file(s) underlying the tape set are created in + * suitable temp tablespaces. For parallel sorts, this should have been + * called already, but it doesn't matter if it is called a second time. + */ + PrepareTempTablespaces(); + + state->mergeactive = (bool *) palloc0(maxTapes * sizeof(bool)); + state->tp_fib = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_runs = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_dummy = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int)); + + /* Record # of tapes allocated (for duration of sort) */ + state->maxTapes = maxTapes; + /* Record maximum # of tapes usable as inputs when merging */ + state->tapeRange = maxTapes - 1; +} + +/* + * selectnewtape -- select new tape for new initial run. + * + * This is called after finishing a run when we know another run + * must be started. 
This implements steps D3, D4 of Algorithm D. + */ +static void +selectnewtape(Tuplesortstate *state) +{ + int j; + int a; + + /* Step D3: advance j (destTape) */ + if (state->tp_dummy[state->destTape] < state->tp_dummy[state->destTape + 1]) + { + state->destTape++; + return; + } + if (state->tp_dummy[state->destTape] != 0) + { + state->destTape = 0; + return; + } + + /* Step D4: increase level */ + state->Level++; + a = state->tp_fib[0]; + for (j = 0; j < state->tapeRange; j++) + { + state->tp_dummy[j] = a + state->tp_fib[j + 1] - state->tp_fib[j]; + state->tp_fib[j] = a + state->tp_fib[j + 1]; + } + state->destTape = 0; +} + +/* + * Initialize the slab allocation arena, for the given number of slots. + */ +static void +init_slab_allocator(Tuplesortstate *state, int numSlots) +{ + if (numSlots > 0) + { + char *p; + int i; + + state->slabMemoryBegin = palloc(numSlots * SLAB_SLOT_SIZE); + state->slabMemoryEnd = state->slabMemoryBegin + + numSlots * SLAB_SLOT_SIZE; + state->slabFreeHead = (SlabSlot *) state->slabMemoryBegin; + USEMEM(state, numSlots * SLAB_SLOT_SIZE); + + p = state->slabMemoryBegin; + for (i = 0; i < numSlots - 1; i++) + { + ((SlabSlot *) p)->nextfree = (SlabSlot *) (p + SLAB_SLOT_SIZE); + p += SLAB_SLOT_SIZE; + } + ((SlabSlot *) p)->nextfree = NULL; + } + else + { + state->slabMemoryBegin = state->slabMemoryEnd = NULL; + state->slabFreeHead = NULL; + } + state->slabAllocatorUsed = true; +} + +/* + * mergeruns -- merge all the completed initial runs. + * + * This implements steps D5, D6 of Algorithm D. All input data has + * already been written to initial runs on tape (see dumptuples). + */ +static void +mergeruns(Tuplesortstate *state) +{ + int tapenum, + svTape, + svRuns, + svDummy; + int numTapes; + int numInputTapes; + + Assert(state->status == TSS_BUILDRUNS); + Assert(state->memtupcount == 0); + + if (state->sortKeys != NULL && state->sortKeys->abbrev_converter != NULL) + { + /* + * If there are multiple runs to be merged, when we go to read back + * tuples from disk, abbreviated keys will not have been stored, and + * we don't care to regenerate them. Disable abbreviation from this + * point on. + */ + state->sortKeys->abbrev_converter = NULL; + state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator; + + /* Not strictly necessary, but be tidy */ + state->sortKeys->abbrev_abort = NULL; + state->sortKeys->abbrev_full_comparator = NULL; + } + + /* + * Reset tuple memory. We've freed all the tuples that we previously + * allocated. We will use the slab allocator from now on. + */ + MemoryContextDelete(state->tuplecontext); + state->tuplecontext = NULL; + + /* + * We no longer need a large memtuples array. (We will allocate a smaller + * one for the heap later.) + */ + FREEMEM(state, GetMemoryChunkSpace(state->memtuples)); + pfree(state->memtuples); + state->memtuples = NULL; + + /* + * If we had fewer runs than tapes, refund the memory that we imagined we + * would need for the tape buffers of the unused tapes. + * + * numTapes and numInputTapes reflect the actual number of tapes we will + * use. Note that the output tape's tape number is maxTapes - 1, so the + * tape numbers of the used tapes are not consecutive, and you cannot just + * loop from 0 to numTapes to visit all used tapes! 
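+ *
+ * (state->Level is still 1 exactly when selectnewtape() never reached
+ * step D4, i.e. there were no more initial runs than input tapes; in
+ * that case each used tape holds a single run and state->currentRun is
+ * the number of tapes that actually received data.)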
+ */ + if (state->Level == 1) + { + numInputTapes = state->currentRun; + numTapes = numInputTapes + 1; + FREEMEM(state, (state->maxTapes - numTapes) * TAPE_BUFFER_OVERHEAD); + } + else + { + numInputTapes = state->tapeRange; + numTapes = state->maxTapes; + } + + /* + * Initialize the slab allocator. We need one slab slot per input tape, + * for the tuples in the heap, plus one to hold the tuple last returned + * from tuplesort_gettuple. (If we're sorting pass-by-val Datums, + * however, we don't need to do allocate anything.) + * + * From this point on, we no longer use the USEMEM()/LACKMEM() mechanism + * to track memory usage of individual tuples. + */ + if (state->tuples) + init_slab_allocator(state, numInputTapes + 1); + else + init_slab_allocator(state, 0); + + /* + * Allocate a new 'memtuples' array, for the heap. It will hold one tuple + * from each input tape. + */ + state->memtupsize = numInputTapes; + state->memtuples = (SortTuple *) palloc(numInputTapes * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + + /* + * Use all the remaining memory we have available for read buffers among + * the input tapes. + * + * We don't try to "rebalance" the memory among tapes, when we start a new + * merge phase, even if some tapes are inactive in the new phase. That + * would be hard, because logtape.c doesn't know where one run ends and + * another begins. When a new merge phase begins, and a tape doesn't + * participate in it, its buffer nevertheless already contains tuples from + * the next run on same tape, so we cannot release the buffer. That's OK + * in practice, merge performance isn't that sensitive to the amount of + * buffers used, and most merge phases use all or almost all tapes, + * anyway. + */ +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d using " INT64_FORMAT " KB of memory for read buffers among %d input tapes", + state->worker, state->availMem / 1024, numInputTapes); +#endif + + state->read_buffer_size = Max(state->availMem / numInputTapes, 0); + USEMEM(state, state->read_buffer_size * numInputTapes); + + /* End of step D2: rewind all output tapes to prepare for merging */ + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + LogicalTapeRewindForRead(state->tapeset, tapenum, state->read_buffer_size); + + for (;;) + { + /* + * At this point we know that tape[T] is empty. If there's just one + * (real or dummy) run left on each input tape, then only one merge + * pass remains. If we don't have to produce a materialized sorted + * tape, we can stop at this point and do the final merge on-the-fly. 
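+ *
+ * ("On-the-fly" means entering TSS_FINALMERGE: instead of writing out a
+ * final materialized run, tuplesort_gettuple_common() keeps the merge
+ * heap loaded and pulls replacement tuples from the input tapes as the
+ * caller fetches results.)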
+ */ + if (!state->randomAccess && !WORKER(state)) + { + bool allOneRun = true; + + Assert(state->tp_runs[state->tapeRange] == 0); + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_runs[tapenum] + state->tp_dummy[tapenum] != 1) + { + allOneRun = false; + break; + } + } + if (allOneRun) + { + /* Tell logtape.c we won't be writing anymore */ + LogicalTapeSetForgetFreeSpace(state->tapeset); + /* Initialize for the final merge pass */ + beginmerge(state); + state->status = TSS_FINALMERGE; + return; + } + } + + /* Step D5: merge runs onto tape[T] until tape[P] is empty */ + while (state->tp_runs[state->tapeRange - 1] || + state->tp_dummy[state->tapeRange - 1]) + { + bool allDummy = true; + + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_dummy[tapenum] == 0) + { + allDummy = false; + break; + } + } + + if (allDummy) + { + state->tp_dummy[state->tapeRange]++; + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + state->tp_dummy[tapenum]--; + } + else + mergeonerun(state); + } + + /* Step D6: decrease level */ + if (--state->Level == 0) + break; + /* rewind output tape T to use as new input */ + LogicalTapeRewindForRead(state->tapeset, state->tp_tapenum[state->tapeRange], + state->read_buffer_size); + /* rewind used-up input tape P, and prepare it for write pass */ + LogicalTapeRewindForWrite(state->tapeset, state->tp_tapenum[state->tapeRange - 1]); + state->tp_runs[state->tapeRange - 1] = 0; + + /* + * reassign tape units per step D6; note we no longer care about A[] + */ + svTape = state->tp_tapenum[state->tapeRange]; + svDummy = state->tp_dummy[state->tapeRange]; + svRuns = state->tp_runs[state->tapeRange]; + for (tapenum = state->tapeRange; tapenum > 0; tapenum--) + { + state->tp_tapenum[tapenum] = state->tp_tapenum[tapenum - 1]; + state->tp_dummy[tapenum] = state->tp_dummy[tapenum - 1]; + state->tp_runs[tapenum] = state->tp_runs[tapenum - 1]; + } + state->tp_tapenum[0] = svTape; + state->tp_dummy[0] = svDummy; + state->tp_runs[0] = svRuns; + } + + /* + * Done. Knuth says that the result is on TAPE[1], but since we exited + * the loop without performing the last iteration of step D6, we have not + * rearranged the tape unit assignment, and therefore the result is on + * TAPE[T]. We need to do it this way so that we can freeze the final + * output tape while rewinding it. The last iteration of step D6 would be + * a waste of cycles anyway... + */ + state->result_tape = state->tp_tapenum[state->tapeRange]; + if (!WORKER(state)) + LogicalTapeFreeze(state->tapeset, state->result_tape, NULL); + else + worker_freeze_result_tape(state); + state->status = TSS_SORTEDONTAPE; + + /* Release the read buffers of all the other tapes, by rewinding them. */ + for (tapenum = 0; tapenum < state->maxTapes; tapenum++) + { + if (tapenum != state->result_tape) + LogicalTapeRewindForWrite(state->tapeset, tapenum); + } +} + +/* + * Merge one run from each input tape, except ones with dummy runs. + * + * This is the inner loop of Algorithm D step D5. We know that the + * output tape is TAPE[T]. + */ +static void +mergeonerun(Tuplesortstate *state) +{ + int destTape = state->tp_tapenum[state->tapeRange]; + int srcTape; + + /* + * Start the merge by loading one tuple from each active source tape into + * the heap. We can also decrease the input run/dummy run counts. 
+ */ + beginmerge(state); + + /* + * Execute merge by repeatedly extracting lowest tuple in heap, writing it + * out, and replacing it with next tuple from same tape (if there is + * another one). + */ + while (state->memtupcount > 0) + { + SortTuple stup; + + /* write the tuple to destTape */ + srcTape = state->memtuples[0].tupindex; + WRITETUP(state, destTape, &state->memtuples[0]); + + /* recycle the slot of the tuple we just wrote out, for the next read */ + if (state->memtuples[0].tuple) + RELEASE_SLAB_SLOT(state, state->memtuples[0].tuple); + + /* + * pull next tuple from the tape, and replace the written-out tuple in + * the heap with it. + */ + if (mergereadnext(state, srcTape, &stup)) + { + stup.tupindex = srcTape; + tuplesort_heap_replace_top(state, &stup); + + } + else + tuplesort_heap_delete_top(state); + } + + /* + * When the heap empties, we're done. Write an end-of-run marker on the + * output tape, and increment its count of real runs. + */ + markrunend(state, destTape); + state->tp_runs[state->tapeRange]++; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d finished %d-way merge step: %s", state->worker, + state->activeTapes, pg_rusage_show(&state->ru_start)); +#endif +} + +/* + * beginmerge - initialize for a merge pass + * + * We decrease the counts of real and dummy runs for each tape, and mark + * which tapes contain active input runs in mergeactive[]. Then, fill the + * merge heap with the first tuple from each active tape. + */ +static void +beginmerge(Tuplesortstate *state) +{ + int activeTapes; + int tapenum; + int srcTape; + + /* Heap should be empty here */ + Assert(state->memtupcount == 0); + + /* Adjust run counts and mark the active tapes */ + memset(state->mergeactive, 0, + state->maxTapes * sizeof(*state->mergeactive)); + activeTapes = 0; + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_dummy[tapenum] > 0) + state->tp_dummy[tapenum]--; + else + { + Assert(state->tp_runs[tapenum] > 0); + state->tp_runs[tapenum]--; + srcTape = state->tp_tapenum[tapenum]; + state->mergeactive[srcTape] = true; + activeTapes++; + } + } + Assert(activeTapes > 0); + state->activeTapes = activeTapes; + + /* Load the merge heap with the first tuple from each input tape */ + for (srcTape = 0; srcTape < state->maxTapes; srcTape++) + { + SortTuple tup; + + if (mergereadnext(state, srcTape, &tup)) + { + tup.tupindex = srcTape; + tuplesort_heap_insert(state, &tup); + } + } +} + +/* + * mergereadnext - read next tuple from one merge input tape + * + * Returns false on EOF. + */ +static bool +mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup) +{ + unsigned int tuplen; + + if (!state->mergeactive[srcTape]) + return false; /* tape's run is already exhausted */ + + /* read next tuple, if any */ + if ((tuplen = getlen(state, srcTape, true)) == 0) + { + state->mergeactive[srcTape] = false; + return false; + } + READTUP(state, stup, srcTape, tuplen); + + return true; +} + +/* + * dumptuples - remove tuples from memtuples and write initial run to tape + * + * When alltuples = true, dump everything currently in memory. (This case is + * only used at end of input data.) + */ +static void +dumptuples(Tuplesortstate *state, bool alltuples) +{ + int memtupwrite; + int i; + + /* + * Nothing to do if we still fit in available memory and have array slots, + * unless this is the final call during initial run generation. 
+ */ + if (state->memtupcount < state->memtupsize && !LACKMEM(state) && + !alltuples) + return; + + /* + * Final call might require no sorting, in rare cases where we just so + * happen to have previously LACKMEM()'d at the point where exactly all + * remaining tuples are loaded into memory, just before input was + * exhausted. + * + * In general, short final runs are quite possible. Rather than allowing + * a special case where there was a superfluous selectnewtape() call (i.e. + * a call with no subsequent run actually written to destTape), we prefer + * to write out a 0 tuple run. + * + * mergereadnext() is prepared for 0 tuple runs, and will reliably mark + * the tape inactive for the merge when called from beginmerge(). This + * case is therefore similar to the case where mergeonerun() finds a dummy + * run for the tape, and so doesn't need to merge a run from the tape (or + * conceptually "merges" the dummy run, if you prefer). According to + * Knuth, Algorithm D "isn't strictly optimal" in its method of + * distribution and dummy run assignment; this edge case seems very + * unlikely to make that appreciably worse. + */ + Assert(state->status == TSS_BUILDRUNS); + + /* + * It seems unlikely that this limit will ever be exceeded, but take no + * chances + */ + if (state->currentRun == INT_MAX) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot have more than %d runs for an external sort", + INT_MAX))); + + state->currentRun++; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d starting quicksort of run %d: %s", + state->worker, state->currentRun, + pg_rusage_show(&state->ru_start)); +#endif + + /* + * Sort all tuples accumulated within the allowed amount of memory for + * this run using quicksort + */ + tuplesort_sort_memtuples(state); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d finished quicksort of run %d: %s", + state->worker, state->currentRun, + pg_rusage_show(&state->ru_start)); +#endif + + memtupwrite = state->memtupcount; + for (i = 0; i < memtupwrite; i++) + { + WRITETUP(state, state->tp_tapenum[state->destTape], + &state->memtuples[i]); + state->memtupcount--; + } + + /* + * Reset tuple memory. We've freed all of the tuples that we previously + * allocated. It's important to avoid fragmentation when there is a stark + * change in the sizes of incoming tuples. Fragmentation due to + * AllocSetFree's bucketing by size class might be particularly bad if + * this step wasn't taken. 
+ */ + MemoryContextReset(state->tuplecontext); + + markrunend(state, state->tp_tapenum[state->destTape]); + state->tp_runs[state->destTape]++; + state->tp_dummy[state->destTape]--; /* per Alg D step D2 */ + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d finished writing run %d to tape %d: %s", + state->worker, state->currentRun, state->destTape, + pg_rusage_show(&state->ru_start)); +#endif + + if (!alltuples) + selectnewtape(state); +} + +/* + * tuplesort_rescan - rewind and replay the scan + */ +void +tuplesort_rescan(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + case TSS_SORTEDONTAPE: + LogicalTapeRewindForRead(state->tapeset, + state->result_tape, + 0); + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_markpos - saves current position in the merged sort file + */ +void +tuplesort_markpos(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->markpos_offset = state->current; + state->markpos_eof = state->eof_reached; + break; + case TSS_SORTEDONTAPE: + LogicalTapeTell(state->tapeset, + state->result_tape, + &state->markpos_block, + &state->markpos_offset); + state->markpos_eof = state->eof_reached; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_restorepos - restores current position in merged sort file to + * last saved position + */ +void +tuplesort_restorepos(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->current = state->markpos_offset; + state->eof_reached = state->markpos_eof; + break; + case TSS_SORTEDONTAPE: + LogicalTapeSeek(state->tapeset, + state->result_tape, + state->markpos_block, + state->markpos_offset); + state->eof_reached = state->markpos_eof; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_get_stats - extract summary statistics + * + * This can be called after tuplesort_performsort() finishes to obtain + * printable summary information about how the sort was performed. + */ +void +tuplesort_get_stats(Tuplesortstate *state, + TuplesortInstrumentation *stats) +{ + /* + * Note: it might seem we should provide both memory and disk usage for a + * disk-based sort. However, the current code doesn't track memory space + * accurately once we have begun to return tuples to the caller (since we + * don't account for pfree's the caller is expected to do), so we cannot + * rely on availMem in a disk sort. This does not seem worth the overhead + * to fix. Is it worth creating an API for the memory context code to + * tell us how much is actually used in sortcontext? 
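tuplesort_markpos() and tuplesort_restorepos() above save and replay a read position, either an in-memory array index or a (block, offset) pair obtained from LogicalTapeTell() and later passed to LogicalTapeSeek(). A rough standalone analogue of that mark/restore pattern, using ftell()/fseek() on a temporary stdio stream (names invented; this is not how the tape code itself works):

#include <stdio.h>

int main(void)
{
    FILE *f = tmpfile();
    long  mark;
    int   v;

    for (int i = 1; i <= 5; i++)
        fwrite(&i, sizeof(int), 1, f);
    rewind(f);

    fread(&v, sizeof(int), 1, f);          /* consume 1 */
    fread(&v, sizeof(int), 1, f);          /* consume 2 */

    mark = ftell(f);                       /* "markpos": remember where we are */

    fread(&v, sizeof(int), 1, f);
    printf("read %d past the mark\n", v);  /* prints 3 */

    fseek(f, mark, SEEK_SET);              /* "restorepos": rewind to the mark */
    fread(&v, sizeof(int), 1, f);
    printf("read %d again\n", v);          /* prints 3 again */

    fclose(f);
    return 0;
}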
+ */ + if (state->tapeset) + { + stats->spaceType = SORT_SPACE_TYPE_DISK; + stats->spaceUsed = LogicalTapeSetBlocks(state->tapeset) * (BLCKSZ / 1024); + } + else + { + stats->spaceType = SORT_SPACE_TYPE_MEMORY; + stats->spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; + } + + switch (state->status) + { + case TSS_SORTEDINMEM: + if (state->boundUsed) + stats->sortMethod = SORT_TYPE_TOP_N_HEAPSORT; + else + stats->sortMethod = SORT_TYPE_QUICKSORT; + break; + case TSS_SORTEDONTAPE: + stats->sortMethod = SORT_TYPE_EXTERNAL_SORT; + break; + case TSS_FINALMERGE: + stats->sortMethod = SORT_TYPE_EXTERNAL_MERGE; + break; + default: + stats->sortMethod = SORT_TYPE_STILL_IN_PROGRESS; + break; + } +} + +/* + * Convert TuplesortMethod to a string. + */ +const char * +tuplesort_method_name(TuplesortMethod m) +{ + switch (m) + { + case SORT_TYPE_STILL_IN_PROGRESS: + return "still in progress"; + case SORT_TYPE_TOP_N_HEAPSORT: + return "top-N heapsort"; + case SORT_TYPE_QUICKSORT: + return "quicksort"; + case SORT_TYPE_EXTERNAL_SORT: + return "external sort"; + case SORT_TYPE_EXTERNAL_MERGE: + return "external merge"; + } + + return "unknown"; +} + +/* + * Convert TuplesortSpaceType to a string. + */ +const char * +tuplesort_space_type_name(TuplesortSpaceType t) +{ + Assert(t == SORT_SPACE_TYPE_DISK || t == SORT_SPACE_TYPE_MEMORY); + return t == SORT_SPACE_TYPE_DISK ? "Disk" : "Memory"; +} + + +/* + * Heap manipulation routines, per Knuth's Algorithm 5.2.3H. + */ + +/* + * Convert the existing unordered array of SortTuples to a bounded heap, + * discarding all but the smallest "state->bound" tuples. + * + * When working with a bounded heap, we want to keep the largest entry + * at the root (array entry zero), instead of the smallest as in the normal + * sort case. This allows us to discard the largest entry cheaply. + * Therefore, we temporarily reverse the sort direction. + */ +static void +make_bounded_heap(Tuplesortstate *state) +{ + int tupcount = state->memtupcount; + int i; + + Assert(state->status == TSS_INITIAL); + Assert(state->bounded); + Assert(tupcount >= state->bound); + Assert(SERIAL(state)); + + /* Reverse sort direction so largest entry will be at root */ + reversedirection(state); + + state->memtupcount = 0; /* make the heap empty */ + for (i = 0; i < tupcount; i++) + { + if (state->memtupcount < state->bound) + { + /* Insert next tuple into heap */ + /* Must copy source tuple to avoid possible overwrite */ + SortTuple stup = state->memtuples[i]; + + tuplesort_heap_insert(state, &stup); + } + else + { + /* + * The heap is full. Replace the largest entry with the new + * tuple, or just discard it, if it's larger than anything already + * in the heap. + */ + if (COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0) + { + free_sort_tuple(state, &state->memtuples[i]); + CHECK_FOR_INTERRUPTS(); + } + else + tuplesort_heap_replace_top(state, &state->memtuples[i]); + } + } + + Assert(state->memtupcount == state->bound); + state->status = TSS_BOUNDED; +} + +/* + * Convert the bounded heap to a properly-sorted array + */ +static void +sort_bounded_heap(Tuplesortstate *state) +{ + int tupcount = state->memtupcount; + + Assert(state->status == TSS_BOUNDED); + Assert(state->bounded); + Assert(tupcount == state->bound); + Assert(SERIAL(state)); + + /* + * We can unheapify in place because each delete-top call will remove the + * largest entry, which we can promptly store in the newly freed slot at + * the end. Once we're down to a single-entry heap, we're done. 
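make_bounded_heap() above keeps only the `bound` smallest tuples by holding them in a heap with the sort direction reversed, so the largest retained tuple sits at the root and can be compared against or replaced cheaply. A small self-contained sketch of that top-N technique on plain ints (BOUND and all names are invented for illustration):

#include <stdio.h>

#define BOUND 3   /* keep the 3 smallest values seen */

/* Max-heap on a plain int array: the largest kept value sits at the root. */
static void sift_down_max(int *h, int n, int i)
{
    for (;;)
    {
        int j = 2 * i + 1;
        if (j >= n) break;
        if (j + 1 < n && h[j + 1] > h[j]) j++;
        if (h[i] >= h[j]) break;
        int tmp = h[i]; h[i] = h[j]; h[j] = tmp;
        i = j;
    }
}

int main(void)
{
    int input[] = {8, 3, 9, 1, 7, 4, 6};
    int heap[BOUND];
    int n = 0;

    for (int k = 0; k < 7; k++)
    {
        if (n < BOUND)
        {
            /* Heap not full yet: sift the new value up. */
            int i = n++;
            heap[i] = input[k];
            while (i > 0 && heap[(i - 1) / 2] < heap[i])
            {
                int p = (i - 1) / 2;
                int tmp = heap[p]; heap[p] = heap[i]; heap[i] = tmp;
                i = p;
            }
        }
        else if (input[k] < heap[0])
        {
            /* Smaller than the largest kept value: replace the root. */
            heap[0] = input[k];
            sift_down_max(heap, BOUND, 0);
        }
        /* else: discard, it cannot be among the BOUND smallest */
    }

    for (int i = 0; i < n; i++)
        printf("%d ", heap[i]);    /* the 3 smallest, in heap order */
    putchar('\n');
    return 0;
}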
+ */ + while (state->memtupcount > 1) + { + SortTuple stup = state->memtuples[0]; + + /* this sifts-up the next-largest entry and decreases memtupcount */ + tuplesort_heap_delete_top(state); + state->memtuples[state->memtupcount] = stup; + } + state->memtupcount = tupcount; + + /* + * Reverse sort direction back to the original state. This is not + * actually necessary but seems like a good idea for tidiness. + */ + reversedirection(state); + + state->status = TSS_SORTEDINMEM; + state->boundUsed = true; +} + +/* + * Sort all memtuples using specialized qsort() routines. + * + * Quicksort is used for small in-memory sorts, and external sort runs. + */ +static void +tuplesort_sort_memtuples(Tuplesortstate *state) +{ + Assert(!LEADER(state)); + + if (state->memtupcount > 1) + { + /* Can we use the single-key sort function? */ + if (state->onlyKey != NULL) + qsort_ssup(state->memtuples, state->memtupcount, + state->onlyKey); + else + qsort_tuple(state->memtuples, + state->memtupcount, + state->comparetup, + state); + } +} + +/* + * Insert a new tuple into an empty or existing heap, maintaining the + * heap invariant. Caller is responsible for ensuring there's room. + * + * Note: For some callers, tuple points to a memtuples[] entry above the + * end of the heap. This is safe as long as it's not immediately adjacent + * to the end of the heap (ie, in the [memtupcount] array entry) --- if it + * is, it might get overwritten before being moved into the heap! + */ +static void +tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple) +{ + SortTuple *memtuples; + int j; + + memtuples = state->memtuples; + Assert(state->memtupcount < state->memtupsize); + + CHECK_FOR_INTERRUPTS(); + + /* + * Sift-up the new entry, per Knuth 5.2.3 exercise 16. Note that Knuth is + * using 1-based array indexes, not 0-based. + */ + j = state->memtupcount++; + while (j > 0) + { + int i = (j - 1) >> 1; + + if (COMPARETUP(state, tuple, &memtuples[i]) >= 0) + break; + memtuples[j] = memtuples[i]; + j = i; + } + memtuples[j] = *tuple; +} + +/* + * Remove the tuple at state->memtuples[0] from the heap. Decrement + * memtupcount, and sift up to maintain the heap invariant. + * + * The caller has already free'd the tuple the top node points to, + * if necessary. + */ +static void +tuplesort_heap_delete_top(Tuplesortstate *state) +{ + SortTuple *memtuples = state->memtuples; + SortTuple *tuple; + + if (--state->memtupcount <= 0) + return; + + /* + * Remove the last tuple in the heap, and re-insert it, by replacing the + * current top node with it. + */ + tuple = &memtuples[state->memtupcount]; + tuplesort_heap_replace_top(state, tuple); +} + +/* + * Replace the tuple at state->memtuples[0] with a new tuple. Sift up to + * maintain the heap invariant. + * + * This corresponds to Knuth's "sift-up" algorithm (Algorithm 5.2.3H, + * Heapsort, steps H3-H8). + */ +static void +tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple) +{ + SortTuple *memtuples = state->memtuples; + unsigned int i, + n; + + Assert(state->memtupcount >= 1); + + CHECK_FOR_INTERRUPTS(); + + /* + * state->memtupcount is "int", but we use "unsigned int" for i, j, n. + * This prevents overflow in the "2 * i + 1" calculation, since at the top + * of the loop we must have i < n <= INT_MAX <= UINT_MAX/2. 
+ */ + n = state->memtupcount; + i = 0; /* i is where the "hole" is */ + for (;;) + { + unsigned int j = 2 * i + 1; + + if (j >= n) + break; + if (j + 1 < n && + COMPARETUP(state, &memtuples[j], &memtuples[j + 1]) > 0) + j++; + if (COMPARETUP(state, tuple, &memtuples[j]) <= 0) + break; + memtuples[i] = memtuples[j]; + i = j; + } + memtuples[i] = *tuple; +} + +/* + * Function to reverse the sort direction from its current state + * + * It is not safe to call this when performing hash tuplesorts + */ +static void +reversedirection(Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + int nkey; + + for (nkey = 0; nkey < state->nKeys; nkey++, sortKey++) + { + sortKey->ssup_reverse = !sortKey->ssup_reverse; + sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first; + } +} + + +/* + * Tape interface routines + */ + +static unsigned int +getlen(Tuplesortstate *state, int tapenum, bool eofOK) +{ + unsigned int len; + + if (LogicalTapeRead(state->tapeset, tapenum, + &len, sizeof(len)) != sizeof(len)) + elog(ERROR, "unexpected end of tape"); + if (len == 0 && !eofOK) + elog(ERROR, "unexpected end of data"); + return len; +} + +static void +markrunend(Tuplesortstate *state, int tapenum) +{ + unsigned int len = 0; + + LogicalTapeWrite(state->tapeset, tapenum, (void *) &len, sizeof(len)); +} + +/* + * Get memory for tuple from within READTUP() routine. + * + * We use next free slot from the slab allocator, or palloc() if the tuple + * is too large for that. + */ +static void * +readtup_alloc(Tuplesortstate *state, Size tuplen) +{ + SlabSlot *buf; + + /* + * We pre-allocate enough slots in the slab arena that we should never run + * out. + */ + Assert(state->slabFreeHead); + + if (tuplen > SLAB_SLOT_SIZE || !state->slabFreeHead) + return MemoryContextAlloc(state->sortcontext, tuplen); + else + { + buf = state->slabFreeHead; + /* Reuse this slot */ + state->slabFreeHead = buf->nextfree; + + return buf; + } +} + + +/* + * Routines specialized for HeapTuple (actually MinimalTuple) case + */ + +static int +comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + HeapTupleData ltup; + HeapTupleData rtup; + TupleDesc tupDesc; + int nkey; + int32 compare; + AttrNumber attno; + Datum datum1, + datum2; + bool isnull1, + isnull2; + + + /* Compare the leading sort key */ + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + /* Compare additional sort keys */ + ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET; + ltup.t_data = (HeapTupleHeader) ((char *) a->tuple - MINIMAL_TUPLE_OFFSET); + rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET; + rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET); + tupDesc = state->tupDesc; + + if (sortKey->abbrev_converter) + { + attno = sortKey->ssup_attno; + + datum1 = heap_getattr(<up, attno, tupDesc, &isnull1); + datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + sortKey++; + for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++) + { + attno = sortKey->ssup_attno; + + datum1 = heap_getattr(<up, attno, tupDesc, &isnull1); + datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + return 0; +} + 
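comparetup_heap() above is typical of the comparators in this file: the leading key cached in datum1 (possibly abbreviated) settles most comparisons, and the tuple is only deformed for the remaining keys when the leading keys tie. A compact standalone illustration of that two-stage comparator shape, with an int standing in for the cached leading key and a string standing in for the rest of the tuple (names invented):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * A row with two sort keys. "key1" is cached in the header the way datum1
 * caches the leading key; "payload" stands in for the full tuple that only
 * gets examined on ties.
 */
typedef struct
{
    int         key1;       /* cached leading key */
    const char *payload;    /* second key, fetched only on ties */
} Row;

static int cmp_rows(const void *pa, const void *pb)
{
    const Row *a = pa;
    const Row *b = pb;

    /* Fast path: the cached leading key decides most comparisons. */
    if (a->key1 != b->key1)
        return (a->key1 < b->key1) ? -1 : 1;

    /* Leading keys tie: fall back to the expensive "deform the tuple" path. */
    return strcmp(a->payload, b->payload);
}

int main(void)
{
    Row rows[] = {{2, "b"}, {1, "z"}, {2, "a"}, {1, "a"}};

    qsort(rows, 4, sizeof(Row), cmp_rows);
    for (int i = 0; i < 4; i++)
        printf("(%d,%s) ", rows[i].key1, rows[i].payload);
    putchar('\n');
    return 0;
}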
+static void +copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* + * We expect the passed "tup" to be a TupleTableSlot, and form a + * MinimalTuple using the exported interface for that. + */ + TupleTableSlot *slot = (TupleTableSlot *) tup; + Datum original; + MinimalTuple tuple; + HeapTupleData htup; + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + + /* copy the tuple into sort storage */ + tuple = ExecCopySlotMinimalTuple(slot); + stup->tuple = (void *) tuple; + USEMEM(state, GetMemoryChunkSpace(tuple)); + /* set up first-column key value */ + htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + original = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &stup->isnull1); + + MemoryContextSwitchTo(oldcontext); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + htup.t_len = ((MinimalTuple) mtup->tuple)->t_len + + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) mtup->tuple - + MINIMAL_TUPLE_OFFSET); + + mtup->datum1 = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &mtup->isnull1); + } + } +} + +static void +writetup_heap(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + MinimalTuple tuple = (MinimalTuple) stup->tuple; + + /* the part of the MinimalTuple we'll write: */ + char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; + unsigned int tupbodylen = tuple->t_len - MINIMAL_TUPLE_DATA_OFFSET; + + /* total on-disk footprint: */ + unsigned int tuplen = tupbodylen + sizeof(int); + + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) tupbody, tupbodylen); + if (state->randomAccess) /* need trailing length word? 
*/ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + heap_free_minimal_tuple(tuple); + } +} + +static void +readtup_heap(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tupbodylen = len - sizeof(int); + unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET; + MinimalTuple tuple = (MinimalTuple) readtup_alloc(state, tuplen); + char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; + HeapTupleData htup; + + /* read in the tuple proper */ + tuple->t_len = tuplen; + LogicalTapeReadExact(state->tapeset, tapenum, + tupbody, tupbodylen); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value */ + htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + stup->datum1 = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &stup->isnull1); +} + +/* + * Routines specialized for the CLUSTER case (HeapTuple data, with + * comparisons per a btree index definition) + */ + +static int +comparetup_cluster(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + HeapTuple ltup; + HeapTuple rtup; + TupleDesc tupDesc; + int nkey; + int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + AttrNumber leading = state->indexInfo->ii_IndexAttrNumbers[0]; + + /* Be prepared to compare additional sort keys */ + ltup = (HeapTuple) a->tuple; + rtup = (HeapTuple) b->tuple; + tupDesc = state->tupDesc; + + /* Compare the leading sort key, if it's simple */ + if (leading != 0) + { + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + if (sortKey->abbrev_converter) + { + datum1 = heap_getattr(ltup, leading, tupDesc, &isnull1); + datum2 = heap_getattr(rtup, leading, tupDesc, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + } + if (compare != 0 || state->nKeys == 1) + return compare; + /* Compare additional columns the hard way */ + sortKey++; + nkey = 1; + } + else + { + /* Must compare all keys the hard way */ + nkey = 0; + } + + if (state->indexInfo->ii_Expressions == NULL) + { + /* If not expression index, just compare the proper heap attrs */ + + for (; nkey < state->nKeys; nkey++, sortKey++) + { + AttrNumber attno = state->indexInfo->ii_IndexAttrNumbers[nkey]; + + datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1); + datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + } + else + { + /* + * In the expression index case, compute the whole index tuple and + * then compare values. It would perhaps be faster to compute only as + * many columns as we need to compare, but that would require + * duplicating all the logic in FormIndexDatum. 
+ */ + Datum l_index_values[INDEX_MAX_KEYS]; + bool l_index_isnull[INDEX_MAX_KEYS]; + Datum r_index_values[INDEX_MAX_KEYS]; + bool r_index_isnull[INDEX_MAX_KEYS]; + TupleTableSlot *ecxt_scantuple; + + /* Reset context each time to prevent memory leakage */ + ResetPerTupleExprContext(state->estate); + + ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple; + + ExecStoreTuple(ltup, ecxt_scantuple, InvalidBuffer, false); + FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, + l_index_values, l_index_isnull); + + ExecStoreTuple(rtup, ecxt_scantuple, InvalidBuffer, false); + FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, + r_index_values, r_index_isnull); + + for (; nkey < state->nKeys; nkey++, sortKey++) + { + compare = ApplySortComparator(l_index_values[nkey], + l_index_isnull[nkey], + r_index_values[nkey], + r_index_isnull[nkey], + sortKey); + if (compare != 0) + return compare; + } + } + + return 0; +} + +static void +copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + HeapTuple tuple = (HeapTuple) tup; + Datum original; + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + + /* copy the tuple into sort storage */ + tuple = heap_copytuple(tuple); + stup->tuple = (void *) tuple; + USEMEM(state, GetMemoryChunkSpace(tuple)); + + MemoryContextSwitchTo(oldcontext); + + /* + * set up first-column key value, and potentially abbreviate, if it's a + * simple column + */ + if (state->indexInfo->ii_IndexAttrNumbers[0] == 0) + return; + + original = heap_getattr(tuple, + state->indexInfo->ii_IndexAttrNumbers[0], + state->tupDesc, + &stup->isnull1); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). 
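The abbreviated-key handling in these copytup routines follows the usual sortsupport contract: an unequal comparison of abbreviated keys is authoritative, an equal one must fall back to the full comparator, and abbreviation can be abandoned entirely if it stops paying off. A rough sketch of that contract for strings, packing the first four bytes into an integer; the packing scheme here is invented for illustration and is not the converter any particular type actually uses:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Pack up to 4 leading bytes of a string into an integer, high byte first. */
static uint32_t abbrev_key(const char *s)
{
    uint32_t k = 0;
    for (int i = 0; i < 4; i++)
    {
        k <<= 8;
        if (*s)
            k |= (unsigned char) *s++;
    }
    return k;
}

/* Abbreviated comparison: only unequal abbreviations are authoritative. */
static int cmp_abbrev(const char *a, const char *b)
{
    uint32_t ka = abbrev_key(a);
    uint32_t kb = abbrev_key(b);

    if (ka != kb)
        return (ka < kb) ? -1 : 1;   /* decided without touching the strings */
    return strcmp(a, b);             /* prefixes tie: full comparison */
}

int main(void)
{
    printf("%d\n", cmp_abbrev("apple", "banana"));   /* negative, via prefixes */
    printf("%d\n", cmp_abbrev("abcdef", "abcdxy"));  /* prefixes tie, full cmp */
    return 0;
}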
+ */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = (HeapTuple) mtup->tuple; + mtup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_IndexAttrNumbers[0], + state->tupDesc, + &mtup->isnull1); + } + } +} + +static void +writetup_cluster(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + HeapTuple tuple = (HeapTuple) stup->tuple; + unsigned int tuplen = tuple->t_len + sizeof(ItemPointerData) + sizeof(int); + + /* We need to store t_self, but not other fields of HeapTupleData */ + LogicalTapeWrite(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + &tuple->t_self, sizeof(ItemPointerData)); + LogicalTapeWrite(state->tapeset, tapenum, + tuple->t_data, tuple->t_len); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + heap_freetuple(tuple); + } +} + +static void +readtup_cluster(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int tuplen) +{ + unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int); + HeapTuple tuple = (HeapTuple) readtup_alloc(state, + t_len + HEAPTUPLESIZE); + + /* Reconstruct the HeapTupleData header */ + tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE); + tuple->t_len = t_len; + LogicalTapeReadExact(state->tapeset, tapenum, + &tuple->t_self, sizeof(ItemPointerData)); + /* We don't currently bother to reconstruct t_tableOid */ + tuple->t_tableOid = InvalidOid; + /* Read in the tuple body */ + LogicalTapeReadExact(state->tapeset, tapenum, + tuple->t_data, tuple->t_len); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value, if it's a simple column */ + if (state->indexInfo->ii_IndexAttrNumbers[0] != 0) + stup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_IndexAttrNumbers[0], + state->tupDesc, + &stup->isnull1); +} + +/* + * Routines specialized for IndexTuple case + * + * The btree and hash cases require separate comparison functions, but the + * IndexTuple representation is the same so the copy/write/read support + * functions can be shared. + */ + +static int +comparetup_index_btree(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + /* + * This is similar to comparetup_heap(), but expects index tuples. There + * is also special handling for enforcing uniqueness, and special + * treatment for equal keys at the end. 
+ */ + SortSupport sortKey = state->sortKeys; + IndexTuple tuple1; + IndexTuple tuple2; + int keysz; + TupleDesc tupDes; + bool equal_hasnull = false; + int nkey; + int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + + + /* Compare the leading sort key */ + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + /* Compare additional sort keys */ + tuple1 = (IndexTuple) a->tuple; + tuple2 = (IndexTuple) b->tuple; + keysz = state->nKeys; + tupDes = RelationGetDescr(state->indexRel); + + if (sortKey->abbrev_converter) + { + datum1 = index_getattr(tuple1, 1, tupDes, &isnull1); + datum2 = index_getattr(tuple2, 1, tupDes, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + /* they are equal, so we only need to examine one null flag */ + if (a->isnull1) + equal_hasnull = true; + + sortKey++; + for (nkey = 2; nkey <= keysz; nkey++, sortKey++) + { + datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1); + datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; /* done when we find unequal attributes */ + + /* they are equal, so we only need to examine one null flag */ + if (isnull1) + equal_hasnull = true; + } + + /* + * If btree has asked us to enforce uniqueness, complain if two equal + * tuples are detected (unless there was at least one NULL field). + * + * It is sufficient to make the test here, because if two tuples are equal + * they *must* get compared at some stage of the sort --- otherwise the + * sort algorithm wouldn't have checked whether one must appear before the + * other. + */ + if (state->enforceUnique && !equal_hasnull) + { + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + char *key_desc; + + /* + * Some rather brain-dead implementations of qsort (such as the one in + * QNX 4) will sometimes call the comparison routine to compare a + * value to itself, but we always use our own implementation, which + * does not. + */ + Assert(tuple1 != tuple2); + + index_deform_tuple(tuple1, tupDes, values, isnull); + + key_desc = BuildIndexValueDescription(state->indexRel, values, isnull); + + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("could not create unique index \"%s\"", + RelationGetRelationName(state->indexRel)), + key_desc ? errdetail("Key %s is duplicated.", key_desc) : + errdetail("Duplicate keys exist."), + errtableconstraint(state->heapRel, + RelationGetRelationName(state->indexRel)))); + } + + /* + * If key values are equal, we sort on ItemPointer. This does not affect + * validity of the finished index, but it may be useful to have index + * scans in physical order. + */ + { + BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); + BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); + + if (pos1 != pos2) + return (pos1 < pos2) ? 
-1 : 1; + } + + /* ItemPointer values should never be equal */ + Assert(false); + + return 0; +} + +static int +comparetup_index_hash(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + Bucket bucket1; + Bucket bucket2; + IndexTuple tuple1; + IndexTuple tuple2; + + /* + * Fetch hash keys and mask off bits we don't want to sort by. We know + * that the first column of the index tuple is the hash key. + */ + Assert(!a->isnull1); + bucket1 = _hash_hashkey2bucket(DatumGetUInt32(a->datum1), + state->max_buckets, state->high_mask, + state->low_mask); + Assert(!b->isnull1); + bucket2 = _hash_hashkey2bucket(DatumGetUInt32(b->datum1), + state->max_buckets, state->high_mask, + state->low_mask); + if (bucket1 > bucket2) + return 1; + else if (bucket1 < bucket2) + return -1; + + /* + * If hash values are equal, we sort on ItemPointer. This does not affect + * validity of the finished index, but it may be useful to have index + * scans in physical order. + */ + tuple1 = (IndexTuple) a->tuple; + tuple2 = (IndexTuple) b->tuple; + + { + BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); + BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); + + if (pos1 != pos2) + return (pos1 < pos2) ? -1 : 1; + } + + /* ItemPointer values should never be equal */ + Assert(false); + + return 0; +} + +static void +copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + IndexTuple tuple = (IndexTuple) tup; + unsigned int tuplen = IndexTupleSize(tuple); + IndexTuple newtuple; + Datum original; + + /* copy the tuple into sort storage */ + newtuple = (IndexTuple) MemoryContextAlloc(state->tuplecontext, tuplen); + memcpy(newtuple, tuple, tuplen); + USEMEM(state, GetMemoryChunkSpace(newtuple)); + stup->tuple = (void *) newtuple; + /* set up first-column key value */ + original = index_getattr(newtuple, + 1, + RelationGetDescr(state->indexRel), + &stup->isnull1); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). 
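The index-tuple comparators above break ties on the heap item pointer, ordering equal-keyed entries by block number and then by offset so that the finished index tends to be scanned in physical order. A minimal standalone sketch of that tie-break rule (the Entry struct and its values are invented for illustration):

#include <stdio.h>
#include <stdlib.h>

/* A toy index entry: a key plus the (block, offset) of the heap row. */
typedef struct
{
    int      key;
    unsigned block;
    unsigned offset;
} Entry;

static int cmp_entry(const void *pa, const void *pb)
{
    const Entry *a = pa;
    const Entry *b = pb;

    if (a->key != b->key)
        return (a->key < b->key) ? -1 : 1;

    /* Keys tie: fall back to physical position, block first, then offset. */
    if (a->block != b->block)
        return (a->block < b->block) ? -1 : 1;
    if (a->offset != b->offset)
        return (a->offset < b->offset) ? -1 : 1;
    return 0;
}

int main(void)
{
    Entry e[] = {{5, 10, 3}, {5, 2, 7}, {3, 99, 1}, {5, 2, 4}};

    qsort(e, 4, sizeof(Entry), cmp_entry);
    for (int i = 0; i < 4; i++)
        printf("key=%d tid=(%u,%u)\n", e[i].key, e[i].block, e[i].offset);
    return 0;
}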
+ */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = (IndexTuple) mtup->tuple; + mtup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &mtup->isnull1); + } + } +} + +static void +writetup_index(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + IndexTuple tuple = (IndexTuple) stup->tuple; + unsigned int tuplen; + + tuplen = IndexTupleSize(tuple) + sizeof(tuplen); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) tuple, IndexTupleSize(tuple)); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + pfree(tuple); + } +} + +static void +readtup_index(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tuplen = len - sizeof(unsigned int); + IndexTuple tuple = (IndexTuple) readtup_alloc(state, tuplen); + + LogicalTapeReadExact(state->tapeset, tapenum, + tuple, tuplen); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value */ + stup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup->isnull1); +} + +/* + * Routines specialized for DatumTuple case + */ + +static int +comparetup_datum(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) +{ + int compare; + + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + state->sortKeys); + if (compare != 0) + return compare; + + /* if we have abbreviations, then "tuple" has the original value */ + + if (state->sortKeys->abbrev_converter) + compare = ApplySortAbbrevFullComparator(PointerGetDatum(a->tuple), a->isnull1, + PointerGetDatum(b->tuple), b->isnull1, + state->sortKeys); + + return compare; +} + +static void +copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* Not currently needed */ + elog(ERROR, "copytup_datum() should not be called"); +} + +static void +writetup_datum(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + void *waddr; + unsigned int tuplen; + unsigned int writtenlen; + + if (stup->isnull1) + { + waddr = NULL; + tuplen = 0; + } + else if (!state->tuples) + { + waddr = &stup->datum1; + tuplen = sizeof(Datum); + } + else + { + waddr = stup->tuple; + tuplen = datumGetSize(PointerGetDatum(stup->tuple), false, state->datumTypeLen); + Assert(tuplen != 0); + } + + writtenlen = tuplen + sizeof(unsigned int); + + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &writtenlen, sizeof(writtenlen)); + LogicalTapeWrite(state->tapeset, tapenum, + waddr, tuplen); + if (state->randomAccess) /* need trailing length word? 
*/ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &writtenlen, sizeof(writtenlen)); + + if (!state->slabAllocatorUsed && stup->tuple) + { + FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); + pfree(stup->tuple); + } +} + +static void +readtup_datum(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tuplen = len - sizeof(unsigned int); + + if (tuplen == 0) + { + /* it's NULL */ + stup->datum1 = (Datum) 0; + stup->isnull1 = true; + stup->tuple = NULL; + } + else if (!state->tuples) + { + Assert(tuplen == sizeof(Datum)); + LogicalTapeReadExact(state->tapeset, tapenum, + &stup->datum1, tuplen); + stup->isnull1 = false; + stup->tuple = NULL; + } + else + { + void *raddr = readtup_alloc(state, tuplen); + + LogicalTapeReadExact(state->tapeset, tapenum, + raddr, tuplen); + stup->datum1 = PointerGetDatum(raddr); + stup->isnull1 = false; + stup->tuple = raddr; + } + + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); +} + +/* + * Parallel sort routines + */ + +/* + * tuplesort_estimate_shared - estimate required shared memory allocation + * + * nWorkers is an estimate of the number of workers (it's the number that + * will be requested). + */ +Size +tuplesort_estimate_shared(int nWorkers) +{ + Size tapesSize; + + Assert(nWorkers > 0); + + /* Make sure that BufFile shared state is MAXALIGN'd */ + tapesSize = mul_size(sizeof(TapeShare), nWorkers); + tapesSize = MAXALIGN(add_size(tapesSize, offsetof(Sharedsort, tapes))); + + return tapesSize; +} + +/* + * tuplesort_initialize_shared - initialize shared tuplesort state + * + * Must be called from leader process before workers are launched, to + * establish state needed up-front for worker tuplesortstates. nWorkers + * should match the argument passed to tuplesort_estimate_shared(). + */ +void +tuplesort_initialize_shared(Sharedsort *shared, int nWorkers, dsm_segment *seg) +{ + int i; + + Assert(nWorkers > 0); + + SpinLockInit(&shared->mutex); + shared->currentWorker = 0; + shared->workersFinished = 0; + SharedFileSetInit(&shared->fileset, seg); + shared->nTapes = nWorkers; + for (i = 0; i < nWorkers; i++) + { + shared->tapes[i].firstblocknumber = 0L; + } +} + +/* + * tuplesort_attach_shared - attach to shared tuplesort state + * + * Must be called by all worker processes. + */ +void +tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg) +{ + /* Attach to SharedFileSet */ + SharedFileSetAttach(&shared->fileset, seg); +} + +/* + * worker_get_identifier - Assign and return ordinal identifier for worker + * + * The order in which these are assigned is not well defined, and should not + * matter; worker numbers across parallel sort participants need only be + * distinct and gapless. logtape.c requires this. + * + * Note that the identifiers assigned from here have no relation to + * ParallelWorkerNumber number, to avoid making any assumption about + * caller's requirements. However, we do follow the ParallelWorkerNumber + * convention of representing a non-worker with worker number -1. This + * includes the leader, as well as serial Tuplesort processes. 
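Worker identifiers only need to be distinct and gapless, so they are handed out from a shared counter under a lock. A small self-contained sketch of the same hand-out pattern, with a pthread mutex standing in for the spinlock used here (compile with -pthread; all names are invented):

#include <stdio.h>
#include <pthread.h>

/* Shared state: a mutex standing in for the spinlock, plus the counter. */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int next_worker = 0;

/* Each participant takes the next ordinal; ids come out distinct and gapless. */
static void *claim_identifier(void *arg)
{
    int *my_id = arg;

    pthread_mutex_lock(&lock);
    *my_id = next_worker++;
    pthread_mutex_unlock(&lock);
    return NULL;
}

int main(void)
{
    pthread_t threads[4];
    int ids[4];

    for (int i = 0; i < 4; i++)
        pthread_create(&threads[i], NULL, claim_identifier, &ids[i]);
    for (int i = 0; i < 4; i++)
        pthread_join(threads[i], NULL);

    for (int i = 0; i < 4; i++)
        printf("thread %d got worker id %d\n", i, ids[i]);
    return 0;
}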
+ */ +static int +worker_get_identifier(Tuplesortstate *state) +{ + Sharedsort *shared = state->shared; + int worker; + + Assert(WORKER(state)); + + SpinLockAcquire(&shared->mutex); + worker = shared->currentWorker++; + SpinLockRelease(&shared->mutex); + + return worker; +} + +/* + * worker_freeze_result_tape - freeze worker's result tape for leader + * + * This is called by workers just after the result tape has been determined, + * instead of calling LogicalTapeFreeze() directly. They do so because + * workers require a few additional steps over similar serial + * TSS_SORTEDONTAPE external sort cases, which also happen here. The extra + * steps are around freeing now unneeded resources, and representing to + * leader that worker's input run is available for its merge. + * + * There should only be one final output run for each worker, which consists + * of all tuples that were originally input into worker. + */ +static void +worker_freeze_result_tape(Tuplesortstate *state) +{ + Sharedsort *shared = state->shared; + TapeShare output; + + Assert(WORKER(state)); + Assert(state->result_tape != -1); + Assert(state->memtupcount == 0); + + /* + * Free most remaining memory, in case caller is sensitive to our holding + * on to it. memtuples may not be a tiny merge heap at this point. + */ + pfree(state->memtuples); + /* Be tidy */ + state->memtuples = NULL; + state->memtupsize = 0; + + /* + * Parallel worker requires result tape metadata, which is to be stored in + * shared memory for leader + */ + LogicalTapeFreeze(state->tapeset, state->result_tape, &output); + + /* Store properties of output tape, and update finished worker count */ + SpinLockAcquire(&shared->mutex); + shared->tapes[state->worker] = output; + shared->workersFinished++; + SpinLockRelease(&shared->mutex); +} + +/* + * worker_nomergeruns - dump memtuples in worker, without merging + * + * This called as an alternative to mergeruns() with a worker when no + * merging is required. + */ +static void +worker_nomergeruns(Tuplesortstate *state) +{ + Assert(WORKER(state)); + Assert(state->result_tape == -1); + + state->result_tape = state->tp_tapenum[state->destTape]; + worker_freeze_result_tape(state); +} + +/* + * leader_takeover_tapes - create tapeset for leader from worker tapes + * + * So far, leader Tuplesortstate has performed no actual sorting. By now, all + * sorting has occurred in workers, all of which must have already returned + * from tuplesort_performsort(). + * + * When this returns, leader process is left in a state that is virtually + * indistinguishable from it having generated runs as a serial external sort + * might have. + */ +static void +leader_takeover_tapes(Tuplesortstate *state) +{ + Sharedsort *shared = state->shared; + int nParticipants = state->nParticipants; + int workersFinished; + int j; + + Assert(LEADER(state)); + Assert(nParticipants >= 1); + + SpinLockAcquire(&shared->mutex); + workersFinished = shared->workersFinished; + SpinLockRelease(&shared->mutex); + + if (nParticipants != workersFinished) + elog(ERROR, "cannot take over tapes before all workers finish"); + + /* + * Create the tapeset from worker tapes, including a leader-owned tape at + * the end. Parallel workers are far more expensive than logical tapes, + * so the number of tapes allocated here should never be excessive. + * + * We still have a leader tape, though it's not possible to write to it + * due to restrictions in the shared fileset infrastructure used by + * logtape.c. 
It will never be written to in practice because + * randomAccess is disallowed for parallel sorts. + */ + inittapestate(state, nParticipants + 1); + state->tapeset = LogicalTapeSetCreate(nParticipants + 1, shared->tapes, + &shared->fileset, state->worker); + + /* mergeruns() relies on currentRun for # of runs (in one-pass cases) */ + state->currentRun = nParticipants; + + /* + * Initialize variables of Algorithm D to be consistent with runs from + * workers having been generated in the leader. + * + * There will always be exactly 1 run per worker, and exactly one input + * tape per run, because workers always output exactly 1 run, even when + * there were no input tuples for workers to sort. + */ + for (j = 0; j < state->maxTapes; j++) + { + /* One real run; no dummy runs for worker tapes */ + state->tp_fib[j] = 1; + state->tp_runs[j] = 1; + state->tp_dummy[j] = 0; + state->tp_tapenum[j] = j; + } + /* Leader tape gets one dummy run, and no real runs */ + state->tp_fib[state->tapeRange] = 0; + state->tp_runs[state->tapeRange] = 0; + state->tp_dummy[state->tapeRange] = 1; + + state->Level = 1; + state->destTape = 0; + + state->status = TSS_BUILDRUNS; +} + +/* + * Convenience routine to free a tuple previously loaded into sort memory + */ +static void +free_sort_tuple(Tuplesortstate *state, SortTuple *stup) +{ + if (stup->tuple) + { + FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); + pfree(stup->tuple); + stup->tuple = NULL; + } +} diff --git a/src/tuplesort12.c b/src/tuplesort12.c new file mode 100644 index 0000000000..796c1b8392 --- /dev/null +++ b/src/tuplesort12.c @@ -0,0 +1,4596 @@ +/*------------------------------------------------------------------------- + * + * tuplesort.c + * Generalized tuple sorting routines. + * + * This module handles sorting of heap tuples, index tuples, or single + * Datums (and could easily support other kinds of sortable objects, + * if necessary). It works efficiently for both small and large amounts + * of data. Small amounts are sorted in-memory using qsort(). Large + * amounts are sorted using temporary files and a standard external sort + * algorithm. + * + * See Knuth, volume 3, for more than you want to know about the external + * sorting algorithm. Historically, we divided the input into sorted runs + * using replacement selection, in the form of a priority tree implemented + * as a heap (essentially his Algorithm 5.2.3H), but now we always use + * quicksort for run generation. We merge the runs using polyphase merge, + * Knuth's Algorithm 5.4.2D. The logical "tapes" used by Algorithm D are + * implemented by logtape.c, which avoids space wastage by recycling disk + * space as soon as each block is read from its "tape". + * + * The approximate amount of memory allowed for any one sort operation + * is specified in kilobytes by the caller (most pass work_mem). Initially, + * we absorb tuples and simply store them in an unsorted array as long as + * we haven't exceeded workMem. If we reach the end of the input without + * exceeding workMem, we sort the array using qsort() and subsequently return + * tuples just by scanning the tuple array sequentially. If we do exceed + * workMem, we begin to emit tuples into sorted runs in temporary tapes. + * When tuples are dumped in batch after quicksorting, we begin a new run + * with a new output tape (selected per Algorithm D). After the end of the + * input is reached, we dump out remaining tuples in memory into a final run, + * then merge the runs using Algorithm D. 
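The flow sketched in this paragraph (buffer input in memory, quicksort and spill a sorted run whenever the memory budget is exhausted, then merge the runs) can be illustrated with a tiny standalone external sort of ints. Temporary stdio files stand in for logical tapes, a fixed RUN_LEN stands in for workMem, and the merge is a simple linear scan of run fronts rather than a heap or a polyphase schedule; everything here is invented for illustration:

#include <stdio.h>
#include <stdlib.h>

#define RUN_LEN 4                 /* stand-in for the workMem limit */

static int cmp_int(const void *a, const void *b)
{
    int x = *(const int *) a, y = *(const int *) b;
    return (x > y) - (x < y);
}

/* Sort the buffered values and write them to a fresh temporary "tape". */
static FILE *spill_run(int *buf, int n)
{
    FILE *run = tmpfile();

    if (!run) exit(1);
    qsort(buf, n, sizeof(int), cmp_int);
    fwrite(buf, sizeof(int), n, run);
    rewind(run);
    return run;
}

int main(void)
{
    int input[] = {9, 1, 8, 4, 7, 3, 6, 2, 5, 0};
    int buf[RUN_LEN], nbuf = 0;
    FILE *runs[8];
    int nruns = 0;

    /* Run generation: buffer until "memory" is full, then spill a sorted run. */
    for (int i = 0; i < 10; i++)
    {
        buf[nbuf++] = input[i];
        if (nbuf == RUN_LEN)
        {
            runs[nruns++] = spill_run(buf, nbuf);
            nbuf = 0;
        }
    }
    if (nbuf > 0)
        runs[nruns++] = spill_run(buf, nbuf);   /* final, possibly short run */

    /* Merge: repeatedly pick the smallest front value across all runs. */
    int front[8];
    int have[8];
    for (int r = 0; r < nruns; r++)
        have[r] = (fread(&front[r], sizeof(int), 1, runs[r]) == 1);

    for (;;)
    {
        int best = -1;
        for (int r = 0; r < nruns; r++)
            if (have[r] && (best < 0 || front[r] < front[best]))
                best = r;
        if (best < 0)
            break;
        printf("%d ", front[best]);
        have[best] = (fread(&front[best], sizeof(int), 1, runs[best]) == 1);
    }
    putchar('\n');
    return 0;
}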
+ * + * When merging runs, we use a heap containing just the frontmost tuple from + * each source run; we repeatedly output the smallest tuple and replace it + * with the next tuple from its source tape (if any). When the heap empties, + * the merge is complete. The basic merge algorithm thus needs very little + * memory --- only M tuples for an M-way merge, and M is constrained to a + * small number. However, we can still make good use of our full workMem + * allocation by pre-reading additional blocks from each source tape. Without + * prereading, our access pattern to the temporary file would be very erratic; + * on average we'd read one block from each of M source tapes during the same + * time that we're writing M blocks to the output tape, so there is no + * sequentiality of access at all, defeating the read-ahead methods used by + * most Unix kernels. Worse, the output tape gets written into a very random + * sequence of blocks of the temp file, ensuring that things will be even + * worse when it comes time to read that tape. A straightforward merge pass + * thus ends up doing a lot of waiting for disk seeks. We can improve matters + * by prereading from each source tape sequentially, loading about workMem/M + * bytes from each tape in turn, and making the sequential blocks immediately + * available for reuse. This approach helps to localize both read and write + * accesses. The pre-reading is handled by logtape.c, we just tell it how + * much memory to use for the buffers. + * + * When the caller requests random access to the sort result, we form + * the final sorted run on a logical tape which is then "frozen", so + * that we can access it randomly. When the caller does not need random + * access, we return from tuplesort_performsort() as soon as we are down + * to one run per logical tape. The final merge is then performed + * on-the-fly as the caller repeatedly calls tuplesort_getXXX; this + * saves one cycle of writing all the data out to disk and reading it in. + * + * Before Postgres 8.2, we always used a seven-tape polyphase merge, on the + * grounds that 7 is the "sweet spot" on the tapes-to-passes curve according + * to Knuth's figure 70 (section 5.4.2). However, Knuth is assuming that + * tape drives are expensive beasts, and in particular that there will always + * be many more runs than tape drives. In our implementation a "tape drive" + * doesn't cost much more than a few Kb of memory buffers, so we can afford + * to have lots of them. In particular, if we can have as many tape drives + * as sorted runs, we can eliminate any repeated I/O at all. In the current + * code we determine the number of tapes M on the basis of workMem: we want + * workMem/M to be large enough that we read a fair amount of data each time + * we preread from a tape, so as to maintain the locality of access described + * above. Nonetheless, with large workMem we can have many tapes (but not + * too many -- see the comments in tuplesort_merge_order). + * + * This module supports parallel sorting. Parallel sorts involve coordination + * among one or more worker processes, and a leader process, each with its own + * tuplesort state. The leader process (or, more accurately, the + * Tuplesortstate associated with a leader process) creates a full tapeset + * consisting of worker tapes with one run to merge; a run for every + * worker process. This is then merged. Worker processes are guaranteed to + * produce exactly one output run from their partial input. 
+ * + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/utils/sort/tuplesort.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include + +#include "access/hash.h" +#include "access/htup_details.h" +#include "access/nbtree.h" +#include "catalog/index.h" +#include "catalog/pg_am.h" +#include "commands/tablespace.h" +#include "executor/executor.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "utils/datum.h" +#include "utils/logtape.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/pg_rusage.h" +#include "utils/rel.h" +#include "utils/sortsupport.h" +#include "utils/tuplesort.h" + +/* Should be the last include */ +#include "disable_core_macro.h" + +/* sort-type codes for sort__start probes */ +#define HEAP_SORT 0 +#define INDEX_SORT 1 +#define DATUM_SORT 2 +#define CLUSTER_SORT 3 + +/* Sort parallel code from state for sort__start probes */ +#define PARALLEL_SORT(state) ((state)->shared == NULL ? 0 : \ + (state)->worker >= 0 ? 1 : 2) + +/* GUC variables */ +#ifdef TRACE_SORT +bool trace_sort = false; +#endif + +#ifdef DEBUG_BOUNDED_SORT +bool optimize_bounded_sort = true; +#endif + + +/* + * The objects we actually sort are SortTuple structs. These contain + * a pointer to the tuple proper (might be a MinimalTuple or IndexTuple), + * which is a separate palloc chunk --- we assume it is just one chunk and + * can be freed by a simple pfree() (except during merge, when we use a + * simple slab allocator). SortTuples also contain the tuple's first key + * column in Datum/nullflag format, and an index integer. + * + * Storing the first key column lets us save heap_getattr or index_getattr + * calls during tuple comparisons. We could extract and save all the key + * columns not just the first, but this would increase code complexity and + * overhead, and wouldn't actually save any comparison cycles in the common + * case where the first key determines the comparison result. Note that + * for a pass-by-reference datatype, datum1 points into the "tuple" storage. + * + * There is one special case: when the sort support infrastructure provides an + * "abbreviated key" representation, where the key is (typically) a pass by + * value proxy for a pass by reference type. In this case, the abbreviated key + * is stored in datum1 in place of the actual first key column. + * + * When sorting single Datums, the data value is represented directly by + * datum1/isnull1 for pass by value types (or null values). If the datatype is + * pass-by-reference and isnull1 is false, then "tuple" points to a separately + * palloc'd data value, otherwise "tuple" is NULL. The value of datum1 is then + * either the same pointer as "tuple", or is an abbreviated key value as + * described above. Accordingly, "tuple" is always used in preference to + * datum1 as the authoritative value for pass-by-reference cases. + * + * tupindex holds the input tape number that each tuple in the heap was read + * from during merge passes. + */ +typedef struct +{ + void *tuple; /* the tuple itself */ + Datum datum1; /* value of first key column */ + bool isnull1; /* is first key column NULL? */ + int tupindex; /* see notes above */ +} SortTuple; + +/* + * During merge, we use a pre-allocated set of fixed-size slots to hold + * tuples. To avoid palloc/pfree overhead. 
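The slab described here is simply a pre-carved arena of equal-sized slots chained on a free list, with a fallback to ordinary allocation for tuples that do not fit in a slot. A self-contained sketch of that allocator shape follows; the slot size, arena size, and names are invented for illustration (the real arena is sized to one slot per tape plus one, as described below):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SLOT_SIZE  64      /* stand-in for SLAB_SLOT_SIZE */
#define NUM_SLOTS  4

/* A slot is either on the free list (nextfree valid) or holds user data. */
typedef union Slot
{
    union Slot *nextfree;
    char        buffer[SLOT_SIZE];
} Slot;

static Slot  arena[NUM_SLOTS];
static Slot *free_head;

static void slab_init(void)
{
    /* Chain every slot onto the free list. */
    for (int i = 0; i < NUM_SLOTS - 1; i++)
        arena[i].nextfree = &arena[i + 1];
    arena[NUM_SLOTS - 1].nextfree = NULL;
    free_head = &arena[0];
}

static void *slab_alloc(size_t len)
{
    /* Oversized requests (or an exhausted arena) fall back to malloc(). */
    if (len > SLOT_SIZE || free_head == NULL)
        return malloc(len);

    Slot *slot = free_head;
    free_head = slot->nextfree;
    return slot;
}

static void slab_release(void *p)
{
    /* Slots go back on the free list (recognized by their address range). */
    if ((char *) p >= (char *) arena && (char *) p < (char *) (arena + NUM_SLOTS))
    {
        Slot *slot = p;
        slot->nextfree = free_head;
        free_head = slot;
    }
    else
        free(p);
}

int main(void)
{
    slab_init();

    char *a = slab_alloc(16);      /* comes from the arena */
    char *b = slab_alloc(200);     /* too big: falls back to malloc */

    strcpy(a, "slab slot");
    strcpy(b, "heap chunk");
    printf("%s / %s\n", a, b);

    slab_release(a);
    slab_release(b);
    return 0;
}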
+ * + * Merge doesn't require a lot of memory, so we can afford to waste some, + * by using gratuitously-sized slots. If a tuple is larger than 1 kB, the + * palloc() overhead is not significant anymore. + * + * 'nextfree' is valid when this chunk is in the free list. When in use, the + * slot holds a tuple. + */ +#define SLAB_SLOT_SIZE 1024 + +typedef union SlabSlot +{ + union SlabSlot *nextfree; + char buffer[SLAB_SLOT_SIZE]; +} SlabSlot; + +/* + * Possible states of a Tuplesort object. These denote the states that + * persist between calls of Tuplesort routines. + */ +typedef enum +{ + TSS_INITIAL, /* Loading tuples; still within memory limit */ + TSS_BOUNDED, /* Loading tuples into bounded-size heap */ + TSS_BUILDRUNS, /* Loading tuples; writing to tape */ + TSS_SORTEDINMEM, /* Sort completed entirely in memory */ + TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ + TSS_FINALMERGE /* Performing final merge on-the-fly */ +} TupSortStatus; + +/* + * Parameters for calculation of number of tapes to use --- see inittapes() + * and tuplesort_merge_order(). + * + * In this calculation we assume that each tape will cost us about 1 blocks + * worth of buffer space. This ignores the overhead of all the other data + * structures needed for each tape, but it's probably close enough. + * + * MERGE_BUFFER_SIZE is how much data we'd like to read from each input + * tape during a preread cycle (see discussion at top of file). + */ +#define MINORDER 6 /* minimum merge order */ +#define MAXORDER 500 /* maximum merge order */ +#define TAPE_BUFFER_OVERHEAD BLCKSZ +#define MERGE_BUFFER_SIZE (BLCKSZ * 32) + +typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); + +/* + * Private state of a Tuplesort operation. + */ +struct Tuplesortstate +{ + TupSortStatus status; /* enumerated value as shown above */ + int nKeys; /* number of columns in sort key */ + bool randomAccess; /* did caller request random access? */ + bool bounded; /* did caller specify a maximum number of + * tuples to return? */ + bool boundUsed; /* true if we made use of a bounded heap */ + int bound; /* if bounded, the maximum number of tuples */ + bool tuples; /* Can SortTuple.tuple ever be set? */ + int64 availMem; /* remaining memory available, in bytes */ + int64 allowedMem; /* total memory allowed, in bytes */ + int maxTapes; /* number of tapes (Knuth's T) */ + int tapeRange; /* maxTapes-1 (Knuth's P) */ + MemoryContext sortcontext; /* memory context holding most sort data */ + MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */ + LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ + + /* + * These function pointers decouple the routines that must know what kind + * of tuple we are sorting from the routines that don't need to know it. + * They are set up by the tuplesort_begin_xxx routines. + * + * Function to compare two tuples; result is per qsort() convention, ie: + * <0, 0, >0 according as ab. The API must match + * qsort_arg_comparator. + */ + SortTupleComparator comparetup; + + /* + * Function to copy a supplied input tuple into palloc'd space and set up + * its SortTuple representation (ie, set tuple/datum1/isnull1). Also, + * state->availMem must be decreased by the amount of space used for the + * tuple copy (note the SortTuple struct itself is not counted). + */ + void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup); + + /* + * Function to write a stored tuple onto tape. 
The representation of the + * tuple on tape need not be the same as it is in memory; requirements on + * the tape representation are given below. Unless the slab allocator is + * used, after writing the tuple, pfree() the out-of-line data (not the + * SortTuple struct!), and increase state->availMem by the amount of + * memory space thereby released. + */ + void (*writetup) (Tuplesortstate *state, int tapenum, + SortTuple *stup); + + /* + * Function to read a stored tuple from tape back into memory. 'len' is + * the already-read length of the stored tuple. The tuple is allocated + * from the slab memory arena, or is palloc'd, see readtup_alloc(). + */ + void (*readtup) (Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); + + /* + * This array holds the tuples now in sort memory. If we are in state + * INITIAL, the tuples are in no particular order; if we are in state + * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS + * and FINALMERGE, the tuples are organized in "heap" order per Algorithm + * H. In state SORTEDONTAPE, the array is not used. + */ + SortTuple *memtuples; /* array of SortTuple structs */ + int memtupcount; /* number of tuples currently present */ + int memtupsize; /* allocated length of memtuples array */ + bool growmemtuples; /* memtuples' growth still underway? */ + + /* + * Memory for tuples is sometimes allocated using a simple slab allocator, + * rather than with palloc(). Currently, we switch to slab allocation + * when we start merging. Merging only needs to keep a small, fixed + * number of tuples in memory at any time, so we can avoid the + * palloc/pfree overhead by recycling a fixed number of fixed-size slots + * to hold the tuples. + * + * For the slab, we use one large allocation, divided into SLAB_SLOT_SIZE + * slots. The allocation is sized to have one slot per tape, plus one + * additional slot. We need that many slots to hold all the tuples kept + * in the heap during merge, plus the one we have last returned from the + * sort, with tuplesort_gettuple. + * + * Initially, all the slots are kept in a linked list of free slots. When + * a tuple is read from a tape, it is put to the next available slot, if + * it fits. If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd + * instead. + * + * When we're done processing a tuple, we return the slot back to the free + * list, or pfree() if it was palloc'd. We know that a tuple was + * allocated from the slab, if its pointer value is between + * slabMemoryBegin and -End. + * + * When the slab allocator is used, the USEMEM/LACKMEM mechanism of + * tracking memory usage is not used. + */ + bool slabAllocatorUsed; + + char *slabMemoryBegin; /* beginning of slab memory arena */ + char *slabMemoryEnd; /* end of slab memory arena */ + SlabSlot *slabFreeHead; /* head of free list */ + + /* Buffer size to use for reading input tapes, during merge. */ + size_t read_buffer_size; + + /* + * When we return a tuple to the caller in tuplesort_gettuple_XXX, that + * came from a tape (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE + * modes), we remember the tuple in 'lastReturnedTuple', so that we can + * recycle the memory on next gettuple call. + */ + void *lastReturnedTuple; + + /* + * While building initial runs, this is the current output run number. + * Afterwards, it is the number of initial runs we made. + */ + int currentRun; + + /* + * Unless otherwise noted, all pointer variables below are pointers to + * arrays of length maxTapes, holding per-tape data. 
+ */ + + /* + * This variable is only used during merge passes. mergeactive[i] is true + * if we are reading an input run from (actual) tape number i and have not + * yet exhausted that run. + */ + bool *mergeactive; /* active input run source? */ + + /* + * Variables for Algorithm D. Note that destTape is a "logical" tape + * number, ie, an index into the tp_xxx[] arrays. Be careful to keep + * "logical" and "actual" tape numbers straight! + */ + int Level; /* Knuth's l */ + int destTape; /* current output tape (Knuth's j, less 1) */ + int *tp_fib; /* Target Fibonacci run counts (A[]) */ + int *tp_runs; /* # of real runs on each tape */ + int *tp_dummy; /* # of dummy runs for each tape (D[]) */ + int *tp_tapenum; /* Actual tape numbers (TAPE[]) */ + int activeTapes; /* # of active input tapes in merge pass */ + + /* + * These variables are used after completion of sorting to keep track of + * the next tuple to return. (In the tape case, the tape's current read + * position is also critical state.) + */ + int result_tape; /* actual tape number of finished output */ + int current; /* array index (only used if SORTEDINMEM) */ + bool eof_reached; /* reached EOF (needed for cursors) */ + + /* markpos_xxx holds marked position for mark and restore */ + long markpos_block; /* tape block# (only used if SORTEDONTAPE) */ + int markpos_offset; /* saved "current", or offset in tape block */ + bool markpos_eof; /* saved "eof_reached" */ + + /* + * These variables are used during parallel sorting. + * + * worker is our worker identifier. Follows the general convention that + * -1 value relates to a leader tuplesort, and values >= 0 worker + * tuplesorts. (-1 can also be a serial tuplesort.) + * + * shared is mutable shared memory state, which is used to coordinate + * parallel sorts. + * + * nParticipants is the number of worker Tuplesortstates known by the + * leader to have actually been launched, which implies that they must + * finish a run leader can merge. Typically includes a worker state held + * by the leader process itself. Set in the leader Tuplesortstate only. + */ + int worker; + Sharedsort *shared; + int nParticipants; + + /* + * The sortKeys variable is used by every case other than the hash index + * case; it is set by tuplesort_begin_xxx. tupDesc is only used by the + * MinimalTuple and CLUSTER routines, though. + */ + TupleDesc tupDesc; + SortSupport sortKeys; /* array of length nKeys */ + + /* + * This variable is shared by the single-key MinimalTuple case and the + * Datum case (which both use qsort_ssup()). Otherwise it's NULL. + */ + SortSupport onlyKey; + + /* + * Additional state for managing "abbreviated key" sortsupport routines + * (which currently may be used by all cases except the hash index case). + * Tracks the intervals at which the optimization's effectiveness is + * tested. + */ + int64 abbrevNext; /* Tuple # at which to next check + * applicability */ + + /* + * These variables are specific to the CLUSTER case; they are set by + * tuplesort_begin_cluster. + */ + IndexInfo *indexInfo; /* info about index being used for reference */ + EState *estate; /* for evaluating index expressions */ + + /* + * These variables are specific to the IndexTuple case; they are set by + * tuplesort_begin_index_xxx and used only by the IndexTuple routines. 
+ */ + Relation heapRel; /* table the index is being built on */ + Relation indexRel; /* index being built */ + + /* These are specific to the index_btree subcase: */ + bool enforceUnique; /* complain if we find duplicate tuples */ + + /* These are specific to the index_hash subcase: */ + uint32 high_mask; /* masks for sortable part of hash code */ + uint32 low_mask; + uint32 max_buckets; + + /* + * These variables are specific to the Datum case; they are set by + * tuplesort_begin_datum and used only by the DatumTuple routines. + */ + Oid datumType; + /* we need typelen in order to know how to copy the Datums. */ + int datumTypeLen; + + /* + * Resource snapshot for time of sort start. + */ +#ifdef TRACE_SORT + PGRUsage ru_start; +#endif +}; + +/* + * Private mutable state of tuplesort-parallel-operation. This is allocated + * in shared memory. + */ +struct Sharedsort +{ + /* mutex protects all fields prior to tapes */ + slock_t mutex; + + /* + * currentWorker generates ordinal identifier numbers for parallel sort + * workers. These start from 0, and are always gapless. + * + * Workers increment workersFinished to indicate having finished. If this + * is equal to state.nParticipants within the leader, leader is ready to + * merge worker runs. + */ + int currentWorker; + int workersFinished; + + /* Temporary file space */ + SharedFileSet fileset; + + /* Size of tapes flexible array */ + int nTapes; + + /* + * Tapes array used by workers to report back information needed by the + * leader to concatenate all worker tapes into one for merging + */ + TapeShare tapes[FLEXIBLE_ARRAY_MEMBER]; +}; + +/* + * Is the given tuple allocated from the slab memory arena? + */ +#define IS_SLAB_SLOT(state, tuple) \ + ((char *) (tuple) >= (state)->slabMemoryBegin && \ + (char *) (tuple) < (state)->slabMemoryEnd) + +/* + * Return the given tuple to the slab memory free list, or free it + * if it was palloc'd. + */ +#define RELEASE_SLAB_SLOT(state, tuple) \ + do { \ + SlabSlot *buf = (SlabSlot *) tuple; \ + \ + if (IS_SLAB_SLOT((state), buf)) \ + { \ + buf->nextfree = (state)->slabFreeHead; \ + (state)->slabFreeHead = buf; \ + } else \ + pfree(buf); \ + } while(0) + +#define COMPARETUP(state,a,b) ((*(state)->comparetup) (a, b, state)) +#define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup)) +#define WRITETUP(state,tape,stup) ((*(state)->writetup) (state, tape, stup)) +#define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len)) +#define LACKMEM(state) ((state)->availMem < 0 && !(state)->slabAllocatorUsed) +#define USEMEM(state,amt) ((state)->availMem -= (amt)) +#define FREEMEM(state,amt) ((state)->availMem += (amt)) +#define SERIAL(state) ((state)->shared == NULL) +#define WORKER(state) ((state)->shared && (state)->worker != -1) +#define LEADER(state) ((state)->shared && (state)->worker == -1) + +/* + * NOTES about on-tape representation of tuples: + * + * We require the first "unsigned int" of a stored tuple to be the total size + * on-tape of the tuple, including itself (so it is never zero; an all-zero + * unsigned int is used to delimit runs). The remainder of the stored tuple + * may or may not match the in-memory representation of the tuple --- + * any conversion needed is the job of the writetup and readtup routines. 
+ * + * If state->randomAccess is true, then the stored representation of the + * tuple must be followed by another "unsigned int" that is a copy of the + * length --- so the total tape space used is actually sizeof(unsigned int) + * more than the stored length value. This allows read-backwards. When + * randomAccess is not true, the write/read routines may omit the extra + * length word. + * + * writetup is expected to write both length words as well as the tuple + * data. When readtup is called, the tape is positioned just after the + * front length word; readtup must read the tuple data and advance past + * the back length word (if present). + * + * The write/read routines can make use of the tuple description data + * stored in the Tuplesortstate record, if needed. They are also expected + * to adjust state->availMem by the amount of memory space (not tape space!) + * released or consumed. There is no error return from either writetup + * or readtup; they should ereport() on failure. + * + * + * NOTES about memory consumption calculations: + * + * We count space allocated for tuples against the workMem limit, plus + * the space used by the variable-size memtuples array. Fixed-size space + * is not counted; it's small enough to not be interesting. + * + * Note that we count actual space used (as shown by GetMemoryChunkSpace) + * rather than the originally-requested size. This is important since + * palloc can add substantial overhead. It's not a complete answer since + * we won't count any wasted space in palloc allocation blocks, but it's + * a lot better than what we were doing before 7.3. As of 9.6, a + * separate memory context is used for caller passed tuples. Resetting + * it at certain key increments significantly ameliorates fragmentation. + * Note that this places a responsibility on readtup and copytup routines + * to use the right memory context for these tuples (and to not use the + * reset context for anything whose lifetime needs to span multiple + * external sort runs). 
+ */ + +/* When using this macro, beware of double evaluation of len */ +#define LogicalTapeReadExact(tapeset, tapenum, ptr, len) \ + do { \ + if (LogicalTapeRead(tapeset, tapenum, ptr, len) != (size_t) (len)) \ + elog(ERROR, "unexpected end of data"); \ + } while(0) + + +static Tuplesortstate *tuplesort_begin_common(int workMem, + SortCoordinate coordinate, + bool randomAccess); +static void puttuple_common(Tuplesortstate *state, SortTuple *tuple); +static bool consider_abort_common(Tuplesortstate *state); +static void inittapes(Tuplesortstate *state, bool mergeruns); +static void inittapestate(Tuplesortstate *state, int maxTapes); +static void selectnewtape(Tuplesortstate *state); +static void init_slab_allocator(Tuplesortstate *state, int numSlots); +static void mergeruns(Tuplesortstate *state); +static void mergeonerun(Tuplesortstate *state); +static void beginmerge(Tuplesortstate *state); +static bool mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup); +static void dumptuples(Tuplesortstate *state, bool alltuples); +static void make_bounded_heap(Tuplesortstate *state); +static void sort_bounded_heap(Tuplesortstate *state); +static void tuplesort_sort_memtuples(Tuplesortstate *state); +static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple); +static void tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple); +static void tuplesort_heap_delete_top(Tuplesortstate *state); +static void reversedirection(Tuplesortstate *state); +static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK); +static void markrunend(Tuplesortstate *state, int tapenum); +static void *readtup_alloc(Tuplesortstate *state, Size tuplen); +static int comparetup_heap(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_heap(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_heap(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int comparetup_cluster(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_cluster(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_cluster(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int comparetup_index_btree(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static int comparetup_index_hash(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_index(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_index(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int comparetup_datum(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_datum(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_datum(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int worker_get_identifier(Tuplesortstate *state); +static void worker_freeze_result_tape(Tuplesortstate *state); +static void worker_nomergeruns(Tuplesortstate *state); +static void leader_takeover_tapes(Tuplesortstate *state); +static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup); 
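+
+/*
+ * Editor's sketch (compiled out; not part of the original tuplesort.c):
+ * the NOTES above pin down the on-tape format every writetup routine must
+ * produce -- a leading unsigned int holding the total stored size (which
+ * includes the length word itself, so it is never zero; an all-zero word
+ * delimits runs), then the tuple body, then, when randomAccess was
+ * requested, a trailing copy of the length word so the tape can be read
+ * backwards.  tuple_body() and tuple_body_size() are hypothetical helpers
+ * standing in for the per-tuple-kind details of the real writetup_*
+ * routines, which additionally pfree() the written tuple and return its
+ * memory via FREEMEM(); that part is omitted here for brevity.
+ */
+#if 0
+static void
+writetup_sketch(Tuplesortstate *state, int tapenum, SortTuple *stup)
+{
+	unsigned int bodylen = tuple_body_size(stup);	/* hypothetical helper */
+	unsigned int tuplen = bodylen + sizeof(tuplen); /* counts the leading word */
+
+	LogicalTapeWrite(state->tapeset, tapenum, &tuplen, sizeof(tuplen));
+	LogicalTapeWrite(state->tapeset, tapenum, tuple_body(stup), bodylen);
+	if (state->randomAccess)
+		LogicalTapeWrite(state->tapeset, tapenum, &tuplen, sizeof(tuplen));
+}
+#endif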
+ +/* + * Special versions of qsort just for SortTuple objects. qsort_tuple() sorts + * any variant of SortTuples, using the appropriate comparetup function. + * qsort_ssup() is specialized for the case where the comparetup function + * reduces to ApplySortComparator(), that is single-key MinimalTuple sorts + * and Datum sorts. + */ +#include "qsort_tuple.c" + + +/* + * tuplesort_begin_xxx + * + * Initialize for a tuple sort operation. + * + * After calling tuplesort_begin, the caller should call tuplesort_putXXX + * zero or more times, then call tuplesort_performsort when all the tuples + * have been supplied. After performsort, retrieve the tuples in sorted + * order by calling tuplesort_getXXX until it returns false/NULL. (If random + * access was requested, rescan, markpos, and restorepos can also be called.) + * Call tuplesort_end to terminate the operation and release memory/disk space. + * + * Each variant of tuplesort_begin has a workMem parameter specifying the + * maximum number of kilobytes of RAM to use before spilling data to disk. + * (The normal value of this parameter is work_mem, but some callers use + * other values.) Each variant also has a randomAccess parameter specifying + * whether the caller needs non-sequential access to the sort result. + */ + +static Tuplesortstate * +tuplesort_begin_common(int workMem, SortCoordinate coordinate, + bool randomAccess) +{ + Tuplesortstate *state; + MemoryContext sortcontext; + MemoryContext tuplecontext; + MemoryContext oldcontext; + + /* See leader_takeover_tapes() remarks on randomAccess support */ + if (coordinate && randomAccess) + elog(ERROR, "random access disallowed under parallel sort"); + + /* + * Create a working memory context for this sort operation. All data + * needed by the sort will live inside this context. + */ + sortcontext = AllocSetContextCreate(CurrentMemoryContext, + "TupleSort main", + ALLOCSET_DEFAULT_SIZES); + + /* + * Caller tuple (e.g. IndexTuple) memory context. + * + * A dedicated child context used exclusively for caller passed tuples + * eases memory management. Resetting at key points reduces + * fragmentation. Note that the memtuples array of SortTuples is allocated + * in the parent context, not this context, because there is no need to + * free memtuples early. + */ + tuplecontext = AllocSetContextCreate(sortcontext, + "Caller tuples", + ALLOCSET_DEFAULT_SIZES); + + /* + * Make the Tuplesortstate within the per-sort context. This way, we + * don't need a separate pfree() operation for it at shutdown. + */ + oldcontext = MemoryContextSwitchTo(sortcontext); + + state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate)); + +#ifdef TRACE_SORT + if (trace_sort) + pg_rusage_init(&state->ru_start); +#endif + + state->status = TSS_INITIAL; + state->randomAccess = randomAccess; + state->bounded = false; + state->tuples = true; + state->boundUsed = false; + + /* + * workMem is forced to be at least 64KB, the current minimum valid value + * for the work_mem GUC. This is a defense against parallel sort callers + * that divide out memory among many workers in a way that leaves each + * with very little memory. + */ + state->allowedMem = Max(workMem, 64) * (int64) 1024; + state->availMem = state->allowedMem; + state->sortcontext = sortcontext; + state->tuplecontext = tuplecontext; + state->tapeset = NULL; + + state->memtupcount = 0; + + /* + * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD; + * see comments in grow_memtuples(). 
+ */ + state->memtupsize = Max(1024, + ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1); + + state->growmemtuples = true; + state->slabAllocatorUsed = false; + state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple)); + + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + + /* workMem must be large enough for the minimal memtuples array */ + if (LACKMEM(state)) + elog(ERROR, "insufficient memory allowed for sort"); + + state->currentRun = 0; + + /* + * maxTapes, tapeRange, and Algorithm D variables will be initialized by + * inittapes(), if needed + */ + + state->result_tape = -1; /* flag that result tape has not been formed */ + + /* + * Initialize parallel-related state based on coordination information + * from caller + */ + if (!coordinate) + { + /* Serial sort */ + state->shared = NULL; + state->worker = -1; + state->nParticipants = -1; + } + else if (coordinate->isWorker) + { + /* Parallel worker produces exactly one final run from all input */ + state->shared = coordinate->sharedsort; + state->worker = worker_get_identifier(state); + state->nParticipants = -1; + } + else + { + /* Parallel leader state only used for final merge */ + state->shared = coordinate->sharedsort; + state->worker = -1; + state->nParticipants = coordinate->nParticipants; + Assert(state->nParticipants >= 1); + } + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_heap(TupleDesc tupDesc, + int nkeys, AttrNumber *attNums, + Oid *sortOperators, Oid *sortCollations, + bool *nullsFirstFlags, + int workMem, SortCoordinate coordinate, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + + AssertArg(nkeys > 0); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", + nkeys, workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = nkeys; + + TRACE_POSTGRESQL_SORT_START(HEAP_SORT, + false, /* no unique check */ + nkeys, + workMem, + randomAccess, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_heap; + state->copytup = copytup_heap; + state->writetup = writetup_heap; + state->readtup = readtup_heap; + + state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + state->abbrevNext = 10; + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData)); + + for (i = 0; i < nkeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + + AssertArg(attNums[i] != 0); + AssertArg(sortOperators[i] != 0); + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = sortCollations[i]; + sortKey->ssup_nulls_first = nullsFirstFlags[i]; + sortKey->ssup_attno = attNums[i]; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey); + } + + /* + * The "onlyKey" optimization cannot be used with abbreviated keys, since + * tie-breaker comparisons may be required. Typically, the optimization + * is only of value to pass-by-value types anyway, whereas abbreviated + * keys are typically only of value to pass-by-reference types. 
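+ *
+ * (When onlyKey is set, the in-memory sort can take the specialized
+ * qsort_ssup() path from qsort_tuple.c instead of calling through the
+ * comparetup pointer; see the comment above that #include.)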
+ */ + if (nkeys == 1 && !state->sortKeys->abbrev_converter) + state->onlyKey = state->sortKeys; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_cluster(TupleDesc tupDesc, + Relation indexRel, + int workMem, + SortCoordinate coordinate, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + BTScanInsert indexScanKey; + MemoryContext oldcontext; + int i; + + Assert(indexRel->rd_rel->relam == BTREE_AM_OID); + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", + RelationGetNumberOfAttributes(indexRel), + workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel); + + TRACE_POSTGRESQL_SORT_START(CLUSTER_SORT, + false, /* no unique check */ + state->nKeys, + workMem, + randomAccess, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_cluster; + state->copytup = copytup_cluster; + state->writetup = writetup_cluster; + state->readtup = readtup_cluster; + state->abbrevNext = 10; + + state->indexInfo = BuildIndexInfo(indexRel); + + state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + + indexScanKey = _bt_mkscankey(indexRel, NULL); + + if (state->indexInfo->ii_Expressions != NULL) + { + TupleTableSlot *slot; + ExprContext *econtext; + + /* + * We will need to use FormIndexDatum to evaluate the index + * expressions. To do that, we need an EState, as well as a + * TupleTableSlot to put the table tuples into. The econtext's + * scantuple has to point to that slot, too. + */ + state->estate = CreateExecutorState(); + slot = MakeSingleTupleTableSlot(tupDesc, &TTSOpsHeapTuple); + econtext = GetPerTupleExprContext(state->estate); + econtext->ecxt_scantuple = slot; + } + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + ScanKey scanKey = indexScanKey->scankeys + i; + int16 strategy; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = scanKey->sk_collation; + sortKey->ssup_nulls_first = + (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; + sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + AssertState(sortKey->ssup_attno != 0); + + strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? + BTGreaterStrategyNumber : BTLessStrategyNumber; + + PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); + } + + pfree(indexScanKey); + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_btree(Relation heapRel, + Relation indexRel, + bool enforceUnique, + int workMem, + SortCoordinate coordinate, + bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + BTScanInsert indexScanKey; + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: unique = %c, workMem = %d, randomAccess = %c", + enforceUnique ? 't' : 'f', + workMem, randomAccess ? 
't' : 'f'); +#endif + + state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel); + + TRACE_POSTGRESQL_SORT_START(INDEX_SORT, + enforceUnique, + state->nKeys, + workMem, + randomAccess, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_index_btree; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + state->abbrevNext = 10; + + state->heapRel = heapRel; + state->indexRel = indexRel; + state->enforceUnique = enforceUnique; + + indexScanKey = _bt_mkscankey(indexRel, NULL); + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + ScanKey scanKey = indexScanKey->scankeys + i; + int16 strategy; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = scanKey->sk_collation; + sortKey->ssup_nulls_first = + (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; + sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + AssertState(sortKey->ssup_attno != 0); + + strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? + BTGreaterStrategyNumber : BTLessStrategyNumber; + + PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); + } + + pfree(indexScanKey); + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_hash(Relation heapRel, + Relation indexRel, + uint32 high_mask, + uint32 low_mask, + uint32 max_buckets, + int workMem, + SortCoordinate coordinate, + bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: high_mask = 0x%x, low_mask = 0x%x, " + "max_buckets = 0x%x, workMem = %d, randomAccess = %c", + high_mask, + low_mask, + max_buckets, + workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = 1; /* Only one sort column, the hash code */ + + state->comparetup = comparetup_index_hash; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + + state->heapRel = heapRel; + state->indexRel = indexRel; + + state->high_mask = high_mask; + state->low_mask = low_mask; + state->max_buckets = max_buckets; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, + bool nullsFirstFlag, int workMem, + SortCoordinate coordinate, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + MemoryContext oldcontext; + int16 typlen; + bool typbyval; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin datum sort: workMem = %d, randomAccess = %c", + workMem, randomAccess ? 
't' : 'f'); +#endif + + state->nKeys = 1; /* always a one-column sort */ + + TRACE_POSTGRESQL_SORT_START(DATUM_SORT, + false, /* no unique check */ + 1, + workMem, + randomAccess, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_datum; + state->copytup = copytup_datum; + state->writetup = writetup_datum; + state->readtup = readtup_datum; + state->abbrevNext = 10; + + state->datumType = datumType; + + /* lookup necessary attributes of the datum type */ + get_typlenbyval(datumType, &typlen, &typbyval); + state->datumTypeLen = typlen; + state->tuples = !typbyval; + + /* Prepare SortSupport data */ + state->sortKeys = (SortSupport) palloc0(sizeof(SortSupportData)); + + state->sortKeys->ssup_cxt = CurrentMemoryContext; + state->sortKeys->ssup_collation = sortCollation; + state->sortKeys->ssup_nulls_first = nullsFirstFlag; + + /* + * Abbreviation is possible here only for by-reference types. In theory, + * a pass-by-value datatype could have an abbreviated form that is cheaper + * to compare. In a tuple sort, we could support that, because we can + * always extract the original datum from the tuple is needed. Here, we + * can't, because a datum sort only stores a single copy of the datum; the + * "tuple" field of each sortTuple is NULL. + */ + state->sortKeys->abbreviate = !typbyval; + + PrepareSortSupportFromOrderingOp(sortOperator, state->sortKeys); + + /* + * The "onlyKey" optimization cannot be used with abbreviated keys, since + * tie-breaker comparisons may be required. Typically, the optimization + * is only of value to pass-by-value types anyway, whereas abbreviated + * keys are typically only of value to pass-by-reference types. + */ + if (!state->sortKeys->abbrev_converter) + state->onlyKey = state->sortKeys; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +/* + * tuplesort_set_bound + * + * Advise tuplesort that at most the first N result tuples are required. + * + * Must be called before inserting any tuples. (Actually, we could allow it + * as long as the sort hasn't spilled to disk, but there seems no need for + * delayed calls at the moment.) + * + * This is a hint only. The tuplesort may still return more tuples than + * requested. Parallel leader tuplesorts will always ignore the hint. + */ +void +tuplesort_set_bound(Tuplesortstate *state, int64 bound) +{ + /* Assert we're called before loading any tuples */ + Assert(state->status == TSS_INITIAL); + Assert(state->memtupcount == 0); + Assert(!state->bounded); + Assert(!WORKER(state)); + +#ifdef DEBUG_BOUNDED_SORT + /* Honor GUC setting that disables the feature (for easy testing) */ + if (!optimize_bounded_sort) + return; +#endif + + /* Parallel leader ignores hint */ + if (LEADER(state)) + return; + + /* We want to be able to compute bound * 2, so limit the setting */ + if (bound > (int64) (INT_MAX / 2)) + return; + + state->bounded = true; + state->bound = (int) bound; + + /* + * Bounded sorts are not an effective target for abbreviated key + * optimization. Disable by setting state to be consistent with no + * abbreviation support. + */ + state->sortKeys->abbrev_converter = NULL; + if (state->sortKeys->abbrev_full_comparator) + state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator; + + /* Not strictly necessary, but be tidy */ + state->sortKeys->abbrev_abort = NULL; + state->sortKeys->abbrev_full_comparator = NULL; +} + +/* + * tuplesort_end + * + * Release resources and clean up. + * + * NOTE: after calling this, any pointers returned by tuplesort_getXXX are + * pointing to garbage. 
Be careful not to attempt to use or free such + * pointers afterwards! + */ +void +tuplesort_end(Tuplesortstate *state) +{ + /* context swap probably not needed, but let's be safe */ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + long spaceUsed; + + if (state->tapeset) + spaceUsed = LogicalTapeSetBlocks(state->tapeset); + else + spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; +#endif + + /* + * Delete temporary "tape" files, if any. + * + * Note: want to include this in reported total cost of sort, hence need + * for two #ifdef TRACE_SORT sections. + */ + if (state->tapeset) + LogicalTapeSetClose(state->tapeset); + +#ifdef TRACE_SORT + if (trace_sort) + { + if (state->tapeset) + elog(LOG, "%s of worker %d ended, %ld disk blocks used: %s", + SERIAL(state) ? "external sort" : "parallel external sort", + state->worker, spaceUsed, pg_rusage_show(&state->ru_start)); + else + elog(LOG, "%s of worker %d ended, %ld KB used: %s", + SERIAL(state) ? "internal sort" : "unperformed parallel sort", + state->worker, spaceUsed, pg_rusage_show(&state->ru_start)); + } + + TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, spaceUsed); +#else + + /* + * If you disabled TRACE_SORT, you can still probe sort__done, but you + * ain't getting space-used stats. + */ + TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, 0L); +#endif + + /* Free any execution state created for CLUSTER case */ + if (state->estate != NULL) + { + ExprContext *econtext = GetPerTupleExprContext(state->estate); + + ExecDropSingleTupleTableSlot(econtext->ecxt_scantuple); + FreeExecutorState(state->estate); + } + + MemoryContextSwitchTo(oldcontext); + + /* + * Free the per-sort memory context, thereby releasing all working memory, + * including the Tuplesortstate struct itself. + */ + MemoryContextDelete(state->sortcontext); +} + +/* + * Grow the memtuples[] array, if possible within our memory constraint. We + * must not exceed INT_MAX tuples in memory or the caller-provided memory + * limit. Return true if we were able to enlarge the array, false if not. + * + * Normally, at each increment we double the size of the array. When doing + * that would exceed a limit, we attempt one last, smaller increase (and then + * clear the growmemtuples flag so we don't try any more). That allows us to + * use memory as fully as permitted; sticking to the pure doubling rule could + * result in almost half going unused. Because availMem moves around with + * tuple addition/removal, we need some rule to prevent making repeated small + * increases in memtupsize, which would just be useless thrashing. The + * growmemtuples flag accomplishes that and also prevents useless + * recalculations in this function. + */ +static bool +grow_memtuples(Tuplesortstate *state) +{ + int newmemtupsize; + int memtupsize = state->memtupsize; + int64 memNowUsed = state->allowedMem - state->availMem; + + /* Forget it if we've already maxed out memtuples, per comment above */ + if (!state->growmemtuples) + return false; + + /* Select new value of memtupsize */ + if (memNowUsed <= state->availMem) + { + /* + * We've used no more than half of allowedMem; double our usage, + * clamping at INT_MAX tuples. + */ + if (memtupsize < INT_MAX / 2) + newmemtupsize = memtupsize * 2; + else + { + newmemtupsize = INT_MAX; + state->growmemtuples = false; + } + } + else + { + /* + * This will be the last increment of memtupsize. Abandon doubling + * strategy and instead increase as much as we safely can. 
+ * + * To stay within allowedMem, we can't increase memtupsize by more + * than availMem / sizeof(SortTuple) elements. In practice, we want + * to increase it by considerably less, because we need to leave some + * space for the tuples to which the new array slots will refer. We + * assume the new tuples will be about the same size as the tuples + * we've already seen, and thus we can extrapolate from the space + * consumption so far to estimate an appropriate new size for the + * memtuples array. The optimal value might be higher or lower than + * this estimate, but it's hard to know that in advance. We again + * clamp at INT_MAX tuples. + * + * This calculation is safe against enlarging the array so much that + * LACKMEM becomes true, because the memory currently used includes + * the present array; thus, there would be enough allowedMem for the + * new array elements even if no other memory were currently used. + * + * We do the arithmetic in float8, because otherwise the product of + * memtupsize and allowedMem could overflow. Any inaccuracy in the + * result should be insignificant; but even if we computed a + * completely insane result, the checks below will prevent anything + * really bad from happening. + */ + double grow_ratio; + + grow_ratio = (double) state->allowedMem / (double) memNowUsed; + if (memtupsize * grow_ratio < INT_MAX) + newmemtupsize = (int) (memtupsize * grow_ratio); + else + newmemtupsize = INT_MAX; + + /* We won't make any further enlargement attempts */ + state->growmemtuples = false; + } + + /* Must enlarge array by at least one element, else report failure */ + if (newmemtupsize <= memtupsize) + goto noalloc; + + /* + * On a 32-bit machine, allowedMem could exceed MaxAllocHugeSize. Clamp + * to ensure our request won't be rejected. Note that we can easily + * exhaust address space before facing this outcome. (This is presently + * impossible due to guc.c's MAX_KILOBYTES limitation on work_mem, but + * don't rely on that at this distance.) + */ + if ((Size) newmemtupsize >= MaxAllocHugeSize / sizeof(SortTuple)) + { + newmemtupsize = (int) (MaxAllocHugeSize / sizeof(SortTuple)); + state->growmemtuples = false; /* can't grow any more */ + } + + /* + * We need to be sure that we do not cause LACKMEM to become true, else + * the space management algorithm will go nuts. The code above should + * never generate a dangerous request, but to be safe, check explicitly + * that the array growth fits within availMem. (We could still cause + * LACKMEM if the memory chunk overhead associated with the memtuples + * array were to increase. That shouldn't happen because we chose the + * initial array size large enough to ensure that palloc will be treating + * both old and new arrays as separate chunks. But we'll check LACKMEM + * explicitly below just in case.) + */ + if (state->availMem < (int64) ((newmemtupsize - memtupsize) * sizeof(SortTuple))) + goto noalloc; + + /* OK, do it */ + FREEMEM(state, GetMemoryChunkSpace(state->memtuples)); + state->memtupsize = newmemtupsize; + state->memtuples = (SortTuple *) + repalloc_huge(state->memtuples, + state->memtupsize * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + if (LACKMEM(state)) + elog(ERROR, "unexpected out-of-memory situation in tuplesort"); + return true; + +noalloc: + /* If for any reason we didn't realloc, shut off future attempts */ + state->growmemtuples = false; + return false; +} + +/* + * Accept one tuple while collecting input data for sort. 
+ * + * Note that the input data is always copied; the caller need not save it. + */ +void +tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + /* + * Copy the given tuple into memory we control, and decrease availMem. + * Then call the common code. + */ + COPYTUP(state, &stup, (void *) slot); + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Accept one tuple while collecting input data for sort. + * + * Note that the input data is always copied; the caller need not save it. + */ +void +tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + /* + * Copy the given tuple into memory we control, and decrease availMem. + * Then call the common code. + */ + COPYTUP(state, &stup, (void *) tup); + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Collect one index tuple while collecting input data for sort, building + * it from caller-supplied values. + */ +void +tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel, + ItemPointer self, Datum *values, + bool *isnull) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + SortTuple stup; + Datum original; + IndexTuple tuple; + + stup.tuple = index_form_tuple(RelationGetDescr(rel), values, isnull); + tuple = ((IndexTuple) stup.tuple); + tuple->t_tid = *self; + USEMEM(state, GetMemoryChunkSpace(stup.tuple)); + /* set up first-column key value */ + original = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup.isnull1); + + MemoryContextSwitchTo(state->sortcontext); + + if (!state->sortKeys || !state->sortKeys->abbrev_converter || stup.isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup.datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup.datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup.datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = mtup->tuple; + mtup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &mtup->isnull1); + } + } + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Accept one Datum while collecting input data for sort. + * + * If the Datum is pass-by-ref type, the value will be copied. 
+ */ +void +tuplesort_putdatum(Tuplesortstate *state, Datum val, bool isNull) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + SortTuple stup; + + /* + * Pass-by-value types or null values are just stored directly in + * stup.datum1 (and stup.tuple is not used and set to NULL). + * + * Non-null pass-by-reference values need to be copied into memory we + * control, and possibly abbreviated. The copied value is pointed to by + * stup.tuple and is treated as the canonical copy (e.g. to return via + * tuplesort_getdatum or when writing to tape); stup.datum1 gets the + * abbreviated value if abbreviation is happening, otherwise it's + * identical to stup.tuple. + */ + + if (isNull || !state->tuples) + { + /* + * Set datum1 to zeroed representation for NULLs (to be consistent, + * and to support cheap inequality tests for NULL abbreviated keys). + */ + stup.datum1 = !isNull ? val : (Datum) 0; + stup.isnull1 = isNull; + stup.tuple = NULL; /* no separate storage */ + MemoryContextSwitchTo(state->sortcontext); + } + else + { + Datum original = datumCopy(val, false, state->datumTypeLen); + + stup.isnull1 = false; + stup.tuple = DatumGetPointer(original); + USEMEM(state, GetMemoryChunkSpace(stup.tuple)); + MemoryContextSwitchTo(state->sortcontext); + + if (!state->sortKeys->abbrev_converter) + { + stup.datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup.datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup.datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any + * case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + mtup->datum1 = PointerGetDatum(mtup->tuple); + } + } + } + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Shared code for tuple and datum cases. + */ +static void +puttuple_common(Tuplesortstate *state, SortTuple *tuple) +{ + Assert(!LEADER(state)); + + switch (state->status) + { + case TSS_INITIAL: + + /* + * Save the tuple into the unsorted array. First, grow the array + * as needed. Note that we try to grow the array when there is + * still one free slot remaining --- if we fail, there'll still be + * room to store the incoming tuple, and then we'll switch to + * tape-based operation. + */ + if (state->memtupcount >= state->memtupsize - 1) + { + (void) grow_memtuples(state); + Assert(state->memtupcount < state->memtupsize); + } + state->memtuples[state->memtupcount++] = *tuple; + + /* + * Check if it's time to switch over to a bounded heapsort. We do + * so if the input tuple count exceeds twice the desired tuple + * count (this is a heuristic for where heapsort becomes cheaper + * than a quicksort), or if we've just filled workMem and have + * enough tuples to meet the bound. + * + * Note that once we enter TSS_BOUNDED state we will always try to + * complete the sort that way. In the worst case, if later input + * tuples are larger than earlier ones, this might cause us to + * exceed workMem significantly. 
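+ *
+ * (Concretely: with a bound of 100 tuples, the switch below fires once
+ * the 201st tuple has been loaded, or as soon as workMem is exhausted
+ * any time after the 101st.)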
+ */ + if (state->bounded && + (state->memtupcount > state->bound * 2 || + (state->memtupcount > state->bound && LACKMEM(state)))) + { +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "switching to bounded heapsort at %d tuples: %s", + state->memtupcount, + pg_rusage_show(&state->ru_start)); +#endif + make_bounded_heap(state); + return; + } + + /* + * Done if we still fit in available memory and have array slots. + */ + if (state->memtupcount < state->memtupsize && !LACKMEM(state)) + return; + + /* + * Nope; time to switch to tape-based operation. + */ + inittapes(state, true); + + /* + * Dump all tuples. + */ + dumptuples(state, false); + break; + + case TSS_BOUNDED: + + /* + * We don't want to grow the array here, so check whether the new + * tuple can be discarded before putting it in. This should be a + * good speed optimization, too, since when there are many more + * input tuples than the bound, most input tuples can be discarded + * with just this one comparison. Note that because we currently + * have the sort direction reversed, we must check for <= not >=. + */ + if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0) + { + /* new tuple <= top of the heap, so we can discard it */ + free_sort_tuple(state, tuple); + CHECK_FOR_INTERRUPTS(); + } + else + { + /* discard top of heap, replacing it with the new tuple */ + free_sort_tuple(state, &state->memtuples[0]); + tuplesort_heap_replace_top(state, tuple); + } + break; + + case TSS_BUILDRUNS: + + /* + * Save the tuple into the unsorted array (there must be space) + */ + state->memtuples[state->memtupcount++] = *tuple; + + /* + * If we are over the memory limit, dump all tuples. + */ + dumptuples(state, false); + break; + + default: + elog(ERROR, "invalid tuplesort state"); + break; + } +} + +static bool +consider_abort_common(Tuplesortstate *state) +{ + Assert(state->sortKeys[0].abbrev_converter != NULL); + Assert(state->sortKeys[0].abbrev_abort != NULL); + Assert(state->sortKeys[0].abbrev_full_comparator != NULL); + + /* + * Check effectiveness of abbreviation optimization. Consider aborting + * when still within memory limit. + */ + if (state->status == TSS_INITIAL && + state->memtupcount >= state->abbrevNext) + { + state->abbrevNext *= 2; + + /* + * Check opclass-supplied abbreviation abort routine. It may indicate + * that abbreviation should not proceed. + */ + if (!state->sortKeys->abbrev_abort(state->memtupcount, + state->sortKeys)) + return false; + + /* + * Finally, restore authoritative comparator, and indicate that + * abbreviation is not in play by setting abbrev_converter to NULL + */ + state->sortKeys[0].comparator = state->sortKeys[0].abbrev_full_comparator; + state->sortKeys[0].abbrev_converter = NULL; + /* Not strictly necessary, but be tidy */ + state->sortKeys[0].abbrev_abort = NULL; + state->sortKeys[0].abbrev_full_comparator = NULL; + + /* Give up - expect original pass-by-value representation */ + return true; + } + + return false; +} + +/* + * All tuples have been provided; finish the sort. 
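+ *
+ * After this returns, results are fetched with the tuplesort_get* routines
+ * (or tuplesort_skiptuples); no further tuplesort_put* calls are allowed.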
+ */ +void +tuplesort_performsort(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "performsort of worker %d starting: %s", + state->worker, pg_rusage_show(&state->ru_start)); +#endif + + switch (state->status) + { + case TSS_INITIAL: + + /* + * We were able to accumulate all the tuples within the allowed + * amount of memory, or leader to take over worker tapes + */ + if (SERIAL(state)) + { + /* Just qsort 'em and we're done */ + tuplesort_sort_memtuples(state); + state->status = TSS_SORTEDINMEM; + } + else if (WORKER(state)) + { + /* + * Parallel workers must still dump out tuples to tape. No + * merge is required to produce single output run, though. + */ + inittapes(state, false); + dumptuples(state, true); + worker_nomergeruns(state); + state->status = TSS_SORTEDONTAPE; + } + else + { + /* + * Leader will take over worker tapes and merge worker runs. + * Note that mergeruns sets the correct state->status. + */ + leader_takeover_tapes(state); + mergeruns(state); + } + state->current = 0; + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + + case TSS_BOUNDED: + + /* + * We were able to accumulate all the tuples required for output + * in memory, using a heap to eliminate excess tuples. Now we + * have to transform the heap to a properly-sorted array. + */ + sort_bounded_heap(state); + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + state->status = TSS_SORTEDINMEM; + break; + + case TSS_BUILDRUNS: + + /* + * Finish tape-based sort. First, flush all tuples remaining in + * memory out to tape; then merge until we have a single remaining + * run (or, if !randomAccess and !WORKER(), one run per tape). + * Note that mergeruns sets the correct state->status. + */ + dumptuples(state, true); + mergeruns(state); + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + +#ifdef TRACE_SORT + if (trace_sort) + { + if (state->status == TSS_FINALMERGE) + elog(LOG, "performsort of worker %d done (except %d-way final merge): %s", + state->worker, state->activeTapes, + pg_rusage_show(&state->ru_start)); + else + elog(LOG, "performsort of worker %d done: %s", + state->worker, pg_rusage_show(&state->ru_start)); + } +#endif + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Internal routine to fetch the next tuple in either forward or back + * direction into *stup. Returns false if no more tuples. + * Returned tuple belongs to tuplesort memory context, and must not be freed + * by caller. Note that fetched tuple is stored in memory that may be + * recycled by any future fetch. + */ +static bool +tuplesort_gettuple_common(Tuplesortstate *state, bool forward, + SortTuple *stup) +{ + unsigned int tuplen; + size_t nmoved; + + Assert(!WORKER(state)); + + switch (state->status) + { + case TSS_SORTEDINMEM: + Assert(forward || state->randomAccess); + Assert(!state->slabAllocatorUsed); + if (forward) + { + if (state->current < state->memtupcount) + { + *stup = state->memtuples[state->current++]; + return true; + } + state->eof_reached = true; + + /* + * Complain if caller tries to retrieve more tuples than + * originally asked for in a bounded sort. This is because + * returning EOF here might be the wrong thing. 
+ */ + if (state->bounded && state->current >= state->bound) + elog(ERROR, "retrieved too many tuples in a bounded sort"); + + return false; + } + else + { + if (state->current <= 0) + return false; + + /* + * if all tuples are fetched already then we return last + * tuple, else - tuple before last returned. + */ + if (state->eof_reached) + state->eof_reached = false; + else + { + state->current--; /* last returned tuple */ + if (state->current <= 0) + return false; + } + *stup = state->memtuples[state->current - 1]; + return true; + } + break; + + case TSS_SORTEDONTAPE: + Assert(forward || state->randomAccess); + Assert(state->slabAllocatorUsed); + + /* + * The slot that held the tuple that we returned in previous + * gettuple call can now be reused. + */ + if (state->lastReturnedTuple) + { + RELEASE_SLAB_SLOT(state, state->lastReturnedTuple); + state->lastReturnedTuple = NULL; + } + + if (forward) + { + if (state->eof_reached) + return false; + + if ((tuplen = getlen(state, state->result_tape, true)) != 0) + { + READTUP(state, stup, state->result_tape, tuplen); + + /* + * Remember the tuple we return, so that we can recycle + * its memory on next call. (This can be NULL, in the + * !state->tuples case). + */ + state->lastReturnedTuple = stup->tuple; + + return true; + } + else + { + state->eof_reached = true; + return false; + } + } + + /* + * Backward. + * + * if all tuples are fetched already then we return last tuple, + * else - tuple before last returned. + */ + if (state->eof_reached) + { + /* + * Seek position is pointing just past the zero tuplen at the + * end of file; back up to fetch last tuple's ending length + * word. If seek fails we must have a completely empty file. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + 2 * sizeof(unsigned int)); + if (nmoved == 0) + return false; + else if (nmoved != 2 * sizeof(unsigned int)) + elog(ERROR, "unexpected tape position"); + state->eof_reached = false; + } + else + { + /* + * Back up and fetch previously-returned tuple's ending length + * word. If seek fails, assume we are at start of file. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + sizeof(unsigned int)); + if (nmoved == 0) + return false; + else if (nmoved != sizeof(unsigned int)) + elog(ERROR, "unexpected tape position"); + tuplen = getlen(state, state->result_tape, false); + + /* + * Back up to get ending length word of tuple before it. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + tuplen + 2 * sizeof(unsigned int)); + if (nmoved == tuplen + sizeof(unsigned int)) + { + /* + * We backed up over the previous tuple, but there was no + * ending length word before it. That means that the prev + * tuple is the first tuple in the file. It is now the + * next to read in forward direction (not obviously right, + * but that is what in-memory case does). + */ + return false; + } + else if (nmoved != tuplen + 2 * sizeof(unsigned int)) + elog(ERROR, "bogus tuple length in backward scan"); + } + + tuplen = getlen(state, state->result_tape, false); + + /* + * Now we have the length of the prior tuple, back up and read it. + * Note: READTUP expects we are positioned after the initial + * length word of the tuple, so back up to that point. 
+ */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + tuplen); + if (nmoved != tuplen) + elog(ERROR, "bogus tuple length in backward scan"); + READTUP(state, stup, state->result_tape, tuplen); + + /* + * Remember the tuple we return, so that we can recycle its memory + * on next call. (This can be NULL, in the Datum case). + */ + state->lastReturnedTuple = stup->tuple; + + return true; + + case TSS_FINALMERGE: + Assert(forward); + /* We are managing memory ourselves, with the slab allocator. */ + Assert(state->slabAllocatorUsed); + + /* + * The slab slot holding the tuple that we returned in previous + * gettuple call can now be reused. + */ + if (state->lastReturnedTuple) + { + RELEASE_SLAB_SLOT(state, state->lastReturnedTuple); + state->lastReturnedTuple = NULL; + } + + /* + * This code should match the inner loop of mergeonerun(). + */ + if (state->memtupcount > 0) + { + int srcTape = state->memtuples[0].tupindex; + SortTuple newtup; + + *stup = state->memtuples[0]; + + /* + * Remember the tuple we return, so that we can recycle its + * memory on next call. (This can be NULL, in the Datum case). + */ + state->lastReturnedTuple = stup->tuple; + + /* + * Pull next tuple from tape, and replace the returned tuple + * at top of the heap with it. + */ + if (!mergereadnext(state, srcTape, &newtup)) + { + /* + * If no more data, we've reached end of run on this tape. + * Remove the top node from the heap. + */ + tuplesort_heap_delete_top(state); + + /* + * Rewind to free the read buffer. It'd go away at the + * end of the sort anyway, but better to release the + * memory early. + */ + LogicalTapeRewindForWrite(state->tapeset, srcTape); + return true; + } + newtup.tupindex = srcTape; + tuplesort_heap_replace_top(state, &newtup); + return true; + } + return false; + + default: + elog(ERROR, "invalid tuplesort state"); + return false; /* keep compiler quiet */ + } +} + +/* + * Fetch the next tuple in either forward or back direction. + * If successful, put tuple in slot and return true; else, clear the slot + * and return false. + * + * Caller may optionally be passed back abbreviated value (on true return + * value) when abbreviation was used, which can be used to cheaply avoid + * equality checks that might otherwise be required. Caller can safely make a + * determination of "non-equal tuple" based on simple binary inequality. A + * NULL value in leading attribute will set abbreviated value to zeroed + * representation, which caller may rely on in abbreviated inequality check. + * + * If copy is true, the slot receives a tuple that's been copied into the + * caller's memory context, so that it will stay valid regardless of future + * manipulations of the tuplesort's state (up to and including deleting the + * tuplesort). If copy is false, the slot will just receive a pointer to a + * tuple held within the tuplesort, which is more efficient, but only safe for + * callers that are prepared to have any subsequent manipulation of the + * tuplesort's state invalidate slot contents. 
+ */ +bool +tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy, + TupleTableSlot *slot, Datum *abbrev) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + if (stup.tuple) + { + /* Record abbreviated key for caller */ + if (state->sortKeys->abbrev_converter && abbrev) + *abbrev = stup.datum1; + + if (copy) + stup.tuple = heap_copy_minimal_tuple((MinimalTuple) stup.tuple); + + ExecStoreMinimalTuple((MinimalTuple) stup.tuple, slot, copy); + return true; + } + else + { + ExecClearTuple(slot); + return false; + } +} + +/* + * Fetch the next tuple in either forward or back direction. + * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory + * context, and must not be freed by caller. Caller may not rely on tuple + * remaining valid after any further manipulation of tuplesort. + */ +HeapTuple +tuplesort_getheaptuple(Tuplesortstate *state, bool forward) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + return stup.tuple; +} + +/* + * Fetch the next index tuple in either forward or back direction. + * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory + * context, and must not be freed by caller. Caller may not rely on tuple + * remaining valid after any further manipulation of tuplesort. + */ +IndexTuple +tuplesort_getindextuple(Tuplesortstate *state, bool forward) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + return (IndexTuple) stup.tuple; +} + +/* + * Fetch the next Datum in either forward or back direction. + * Returns false if no more datums. + * + * If the Datum is pass-by-ref type, the returned value is freshly palloc'd + * in caller's context, and is now owned by the caller (this differs from + * similar routines for other types of tuplesorts). + * + * Caller may optionally be passed back abbreviated value (on true return + * value) when abbreviation was used, which can be used to cheaply avoid + * equality checks that might otherwise be required. Caller can safely make a + * determination of "non-equal tuple" based on simple binary inequality. A + * NULL value will have a zeroed abbreviated value representation, which caller + * may rely on in abbreviated inequality check. 
+ */ +bool +tuplesort_getdatum(Tuplesortstate *state, bool forward, + Datum *val, bool *isNull, Datum *abbrev) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + { + MemoryContextSwitchTo(oldcontext); + return false; + } + + /* Ensure we copy into caller's memory context */ + MemoryContextSwitchTo(oldcontext); + + /* Record abbreviated key for caller */ + if (state->sortKeys->abbrev_converter && abbrev) + *abbrev = stup.datum1; + + if (stup.isnull1 || !state->tuples) + { + *val = stup.datum1; + *isNull = stup.isnull1; + } + else + { + /* use stup.tuple because stup.datum1 may be an abbreviation */ + *val = datumCopy(PointerGetDatum(stup.tuple), false, state->datumTypeLen); + *isNull = false; + } + + return true; +} + +/* + * Advance over N tuples in either forward or back direction, + * without returning any data. N==0 is a no-op. + * Returns true if successful, false if ran out of tuples. + */ +bool +tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, bool forward) +{ + MemoryContext oldcontext; + + /* + * We don't actually support backwards skip yet, because no callers need + * it. The API is designed to allow for that later, though. + */ + Assert(forward); + Assert(ntuples >= 0); + Assert(!WORKER(state)); + + switch (state->status) + { + case TSS_SORTEDINMEM: + if (state->memtupcount - state->current >= ntuples) + { + state->current += ntuples; + return true; + } + state->current = state->memtupcount; + state->eof_reached = true; + + /* + * Complain if caller tries to retrieve more tuples than + * originally asked for in a bounded sort. This is because + * returning EOF here might be the wrong thing. + */ + if (state->bounded && state->current >= state->bound) + elog(ERROR, "retrieved too many tuples in a bounded sort"); + + return false; + + case TSS_SORTEDONTAPE: + case TSS_FINALMERGE: + + /* + * We could probably optimize these cases better, but for now it's + * not worth the trouble. + */ + oldcontext = MemoryContextSwitchTo(state->sortcontext); + while (ntuples-- > 0) + { + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + { + MemoryContextSwitchTo(oldcontext); + return false; + } + CHECK_FOR_INTERRUPTS(); + } + MemoryContextSwitchTo(oldcontext); + return true; + + default: + elog(ERROR, "invalid tuplesort state"); + return false; /* keep compiler quiet */ + } +} + +/* + * tuplesort_merge_order - report merge order we'll use for given memory + * (note: "merge order" just means the number of input tapes in the merge). + * + * This is exported for use by the planner. allowedMem is in bytes. + */ +int +tuplesort_merge_order(int64 allowedMem) +{ + int mOrder; + + /* + * We need one tape for each merge input, plus another one for the output, + * and each of these tapes needs buffer space. In addition we want + * MERGE_BUFFER_SIZE workspace per input tape (but the output tape doesn't + * count). + * + * Note: you might be thinking we need to account for the memtuples[] + * array in this calculation, but we effectively treat that as part of the + * MERGE_BUFFER_SIZE workspace. + */ + mOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) / + (MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD); + + /* + * Even in minimum memory, use at least a MINORDER merge. On the other + * hand, even when we have lots of memory, do not use more than a MAXORDER + * merge. Tapes are pretty cheap, but they're not entirely free. 
Each + * additional tape reduces the amount of memory available to build runs, + * which in turn can cause the same sort to need more runs, which makes + * merging slower even if it can still be done in a single pass. Also, + * high order merges are quite slow due to CPU cache effects; it can be + * faster to pay the I/O cost of a polyphase merge than to perform a + * single merge pass across many hundreds of tapes. + */ + mOrder = Max(mOrder, MINORDER); + mOrder = Min(mOrder, MAXORDER); + + return mOrder; +} + +/* + * inittapes - initialize for tape sorting. + * + * This is called only if we have found we won't sort in memory. + */ +static void +inittapes(Tuplesortstate *state, bool mergeruns) +{ + int maxTapes, + j; + + Assert(!LEADER(state)); + + if (mergeruns) + { + /* Compute number of tapes to use: merge order plus 1 */ + maxTapes = tuplesort_merge_order(state->allowedMem) + 1; + } + else + { + /* Workers can sometimes produce single run, output without merge */ + Assert(WORKER(state)); + maxTapes = MINORDER + 1; + } + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d switching to external sort with %d tapes: %s", + state->worker, maxTapes, pg_rusage_show(&state->ru_start)); +#endif + + /* Create the tape set and allocate the per-tape data arrays */ + inittapestate(state, maxTapes); + state->tapeset = + LogicalTapeSetCreate(maxTapes, NULL, + state->shared ? &state->shared->fileset : NULL, + state->worker); + + state->currentRun = 0; + + /* + * Initialize variables of Algorithm D (step D1). + */ + for (j = 0; j < maxTapes; j++) + { + state->tp_fib[j] = 1; + state->tp_runs[j] = 0; + state->tp_dummy[j] = 1; + state->tp_tapenum[j] = j; + } + state->tp_fib[state->tapeRange] = 0; + state->tp_dummy[state->tapeRange] = 0; + + state->Level = 1; + state->destTape = 0; + + state->status = TSS_BUILDRUNS; +} + +/* + * inittapestate - initialize generic tape management state + */ +static void +inittapestate(Tuplesortstate *state, int maxTapes) +{ + int64 tapeSpace; + + /* + * Decrease availMem to reflect the space needed for tape buffers; but + * don't decrease it to the point that we have no room for tuples. (That + * case is only likely to occur if sorting pass-by-value Datums; in all + * other scenarios the memtuples[] array is unlikely to occupy more than + * half of allowedMem. In the pass-by-value case it's not important to + * account for tuple space, so we don't care if LACKMEM becomes + * inaccurate.) + */ + tapeSpace = (int64) maxTapes * TAPE_BUFFER_OVERHEAD; + + if (tapeSpace + GetMemoryChunkSpace(state->memtuples) < state->allowedMem) + USEMEM(state, tapeSpace); + + /* + * Make sure that the temp file(s) underlying the tape set are created in + * suitable temp tablespaces. For parallel sorts, this should have been + * called already, but it doesn't matter if it is called a second time. + */ + PrepareTempTablespaces(); + + state->mergeactive = (bool *) palloc0(maxTapes * sizeof(bool)); + state->tp_fib = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_runs = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_dummy = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int)); + + /* Record # of tapes allocated (for duration of sort) */ + state->maxTapes = maxTapes; + /* Record maximum # of tapes usable as inputs when merging */ + state->tapeRange = maxTapes - 1; +} + +/* + * selectnewtape -- select new tape for new initial run. + * + * This is called after finishing a run when we know another run + * must be started. 
This implements steps D3, D4 of Algorithm D. + */ +static void +selectnewtape(Tuplesortstate *state) +{ + int j; + int a; + + /* Step D3: advance j (destTape) */ + if (state->tp_dummy[state->destTape] < state->tp_dummy[state->destTape + 1]) + { + state->destTape++; + return; + } + if (state->tp_dummy[state->destTape] != 0) + { + state->destTape = 0; + return; + } + + /* Step D4: increase level */ + state->Level++; + a = state->tp_fib[0]; + for (j = 0; j < state->tapeRange; j++) + { + state->tp_dummy[j] = a + state->tp_fib[j + 1] - state->tp_fib[j]; + state->tp_fib[j] = a + state->tp_fib[j + 1]; + } + state->destTape = 0; +} + +/* + * Initialize the slab allocation arena, for the given number of slots. + */ +static void +init_slab_allocator(Tuplesortstate *state, int numSlots) +{ + if (numSlots > 0) + { + char *p; + int i; + + state->slabMemoryBegin = palloc(numSlots * SLAB_SLOT_SIZE); + state->slabMemoryEnd = state->slabMemoryBegin + + numSlots * SLAB_SLOT_SIZE; + state->slabFreeHead = (SlabSlot *) state->slabMemoryBegin; + USEMEM(state, numSlots * SLAB_SLOT_SIZE); + + p = state->slabMemoryBegin; + for (i = 0; i < numSlots - 1; i++) + { + ((SlabSlot *) p)->nextfree = (SlabSlot *) (p + SLAB_SLOT_SIZE); + p += SLAB_SLOT_SIZE; + } + ((SlabSlot *) p)->nextfree = NULL; + } + else + { + state->slabMemoryBegin = state->slabMemoryEnd = NULL; + state->slabFreeHead = NULL; + } + state->slabAllocatorUsed = true; +} + +/* + * mergeruns -- merge all the completed initial runs. + * + * This implements steps D5, D6 of Algorithm D. All input data has + * already been written to initial runs on tape (see dumptuples). + */ +static void +mergeruns(Tuplesortstate *state) +{ + int tapenum, + svTape, + svRuns, + svDummy; + int numTapes; + int numInputTapes; + + Assert(state->status == TSS_BUILDRUNS); + Assert(state->memtupcount == 0); + + if (state->sortKeys != NULL && state->sortKeys->abbrev_converter != NULL) + { + /* + * If there are multiple runs to be merged, when we go to read back + * tuples from disk, abbreviated keys will not have been stored, and + * we don't care to regenerate them. Disable abbreviation from this + * point on. + */ + state->sortKeys->abbrev_converter = NULL; + state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator; + + /* Not strictly necessary, but be tidy */ + state->sortKeys->abbrev_abort = NULL; + state->sortKeys->abbrev_full_comparator = NULL; + } + + /* + * Reset tuple memory. We've freed all the tuples that we previously + * allocated. We will use the slab allocator from now on. + */ + MemoryContextDelete(state->tuplecontext); + state->tuplecontext = NULL; + + /* + * We no longer need a large memtuples array. (We will allocate a smaller + * one for the heap later.) + */ + FREEMEM(state, GetMemoryChunkSpace(state->memtuples)); + pfree(state->memtuples); + state->memtuples = NULL; + + /* + * If we had fewer runs than tapes, refund the memory that we imagined we + * would need for the tape buffers of the unused tapes. + * + * numTapes and numInputTapes reflect the actual number of tapes we will + * use. Note that the output tape's tape number is maxTapes - 1, so the + * tape numbers of the used tapes are not consecutive, and you cannot just + * loop from 0 to numTapes to visit all used tapes! 
+ */ + if (state->Level == 1) + { + numInputTapes = state->currentRun; + numTapes = numInputTapes + 1; + FREEMEM(state, (state->maxTapes - numTapes) * TAPE_BUFFER_OVERHEAD); + } + else + { + numInputTapes = state->tapeRange; + numTapes = state->maxTapes; + } + + /* + * Initialize the slab allocator. We need one slab slot per input tape, + * for the tuples in the heap, plus one to hold the tuple last returned + * from tuplesort_gettuple. (If we're sorting pass-by-val Datums, + * however, we don't need to do allocate anything.) + * + * From this point on, we no longer use the USEMEM()/LACKMEM() mechanism + * to track memory usage of individual tuples. + */ + if (state->tuples) + init_slab_allocator(state, numInputTapes + 1); + else + init_slab_allocator(state, 0); + + /* + * Allocate a new 'memtuples' array, for the heap. It will hold one tuple + * from each input tape. + */ + state->memtupsize = numInputTapes; + state->memtuples = (SortTuple *) palloc(numInputTapes * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + + /* + * Use all the remaining memory we have available for read buffers among + * the input tapes. + * + * We don't try to "rebalance" the memory among tapes, when we start a new + * merge phase, even if some tapes are inactive in the new phase. That + * would be hard, because logtape.c doesn't know where one run ends and + * another begins. When a new merge phase begins, and a tape doesn't + * participate in it, its buffer nevertheless already contains tuples from + * the next run on same tape, so we cannot release the buffer. That's OK + * in practice, merge performance isn't that sensitive to the amount of + * buffers used, and most merge phases use all or almost all tapes, + * anyway. + */ +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d using " INT64_FORMAT " KB of memory for read buffers among %d input tapes", + state->worker, state->availMem / 1024, numInputTapes); +#endif + + state->read_buffer_size = Max(state->availMem / numInputTapes, 0); + USEMEM(state, state->read_buffer_size * numInputTapes); + + /* End of step D2: rewind all output tapes to prepare for merging */ + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + LogicalTapeRewindForRead(state->tapeset, tapenum, state->read_buffer_size); + + for (;;) + { + /* + * At this point we know that tape[T] is empty. If there's just one + * (real or dummy) run left on each input tape, then only one merge + * pass remains. If we don't have to produce a materialized sorted + * tape, we can stop at this point and do the final merge on-the-fly. 
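+	 * (The on-the-fly final merge is the TSS_FINALMERGE path in
+	 * tuplesort_gettuple_common(), which pulls each tuple from the merge
+	 * heap only when the caller asks for it.)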
+ */ + if (!state->randomAccess && !WORKER(state)) + { + bool allOneRun = true; + + Assert(state->tp_runs[state->tapeRange] == 0); + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_runs[tapenum] + state->tp_dummy[tapenum] != 1) + { + allOneRun = false; + break; + } + } + if (allOneRun) + { + /* Tell logtape.c we won't be writing anymore */ + LogicalTapeSetForgetFreeSpace(state->tapeset); + /* Initialize for the final merge pass */ + beginmerge(state); + state->status = TSS_FINALMERGE; + return; + } + } + + /* Step D5: merge runs onto tape[T] until tape[P] is empty */ + while (state->tp_runs[state->tapeRange - 1] || + state->tp_dummy[state->tapeRange - 1]) + { + bool allDummy = true; + + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_dummy[tapenum] == 0) + { + allDummy = false; + break; + } + } + + if (allDummy) + { + state->tp_dummy[state->tapeRange]++; + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + state->tp_dummy[tapenum]--; + } + else + mergeonerun(state); + } + + /* Step D6: decrease level */ + if (--state->Level == 0) + break; + /* rewind output tape T to use as new input */ + LogicalTapeRewindForRead(state->tapeset, state->tp_tapenum[state->tapeRange], + state->read_buffer_size); + /* rewind used-up input tape P, and prepare it for write pass */ + LogicalTapeRewindForWrite(state->tapeset, state->tp_tapenum[state->tapeRange - 1]); + state->tp_runs[state->tapeRange - 1] = 0; + + /* + * reassign tape units per step D6; note we no longer care about A[] + */ + svTape = state->tp_tapenum[state->tapeRange]; + svDummy = state->tp_dummy[state->tapeRange]; + svRuns = state->tp_runs[state->tapeRange]; + for (tapenum = state->tapeRange; tapenum > 0; tapenum--) + { + state->tp_tapenum[tapenum] = state->tp_tapenum[tapenum - 1]; + state->tp_dummy[tapenum] = state->tp_dummy[tapenum - 1]; + state->tp_runs[tapenum] = state->tp_runs[tapenum - 1]; + } + state->tp_tapenum[0] = svTape; + state->tp_dummy[0] = svDummy; + state->tp_runs[0] = svRuns; + } + + /* + * Done. Knuth says that the result is on TAPE[1], but since we exited + * the loop without performing the last iteration of step D6, we have not + * rearranged the tape unit assignment, and therefore the result is on + * TAPE[T]. We need to do it this way so that we can freeze the final + * output tape while rewinding it. The last iteration of step D6 would be + * a waste of cycles anyway... + */ + state->result_tape = state->tp_tapenum[state->tapeRange]; + if (!WORKER(state)) + LogicalTapeFreeze(state->tapeset, state->result_tape, NULL); + else + worker_freeze_result_tape(state); + state->status = TSS_SORTEDONTAPE; + + /* Release the read buffers of all the other tapes, by rewinding them. */ + for (tapenum = 0; tapenum < state->maxTapes; tapenum++) + { + if (tapenum != state->result_tape) + LogicalTapeRewindForWrite(state->tapeset, tapenum); + } +} + +/* + * Merge one run from each input tape, except ones with dummy runs. + * + * This is the inner loop of Algorithm D step D5. We know that the + * output tape is TAPE[T]. + */ +static void +mergeonerun(Tuplesortstate *state) +{ + int destTape = state->tp_tapenum[state->tapeRange]; + int srcTape; + + /* + * Start the merge by loading one tuple from each active source tape into + * the heap. We can also decrease the input run/dummy run counts. 
+ */ + beginmerge(state); + + /* + * Execute merge by repeatedly extracting lowest tuple in heap, writing it + * out, and replacing it with next tuple from same tape (if there is + * another one). + */ + while (state->memtupcount > 0) + { + SortTuple stup; + + /* write the tuple to destTape */ + srcTape = state->memtuples[0].tupindex; + WRITETUP(state, destTape, &state->memtuples[0]); + + /* recycle the slot of the tuple we just wrote out, for the next read */ + if (state->memtuples[0].tuple) + RELEASE_SLAB_SLOT(state, state->memtuples[0].tuple); + + /* + * pull next tuple from the tape, and replace the written-out tuple in + * the heap with it. + */ + if (mergereadnext(state, srcTape, &stup)) + { + stup.tupindex = srcTape; + tuplesort_heap_replace_top(state, &stup); + + } + else + tuplesort_heap_delete_top(state); + } + + /* + * When the heap empties, we're done. Write an end-of-run marker on the + * output tape, and increment its count of real runs. + */ + markrunend(state, destTape); + state->tp_runs[state->tapeRange]++; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d finished %d-way merge step: %s", state->worker, + state->activeTapes, pg_rusage_show(&state->ru_start)); +#endif +} + +/* + * beginmerge - initialize for a merge pass + * + * We decrease the counts of real and dummy runs for each tape, and mark + * which tapes contain active input runs in mergeactive[]. Then, fill the + * merge heap with the first tuple from each active tape. + */ +static void +beginmerge(Tuplesortstate *state) +{ + int activeTapes; + int tapenum; + int srcTape; + + /* Heap should be empty here */ + Assert(state->memtupcount == 0); + + /* Adjust run counts and mark the active tapes */ + memset(state->mergeactive, 0, + state->maxTapes * sizeof(*state->mergeactive)); + activeTapes = 0; + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_dummy[tapenum] > 0) + state->tp_dummy[tapenum]--; + else + { + Assert(state->tp_runs[tapenum] > 0); + state->tp_runs[tapenum]--; + srcTape = state->tp_tapenum[tapenum]; + state->mergeactive[srcTape] = true; + activeTapes++; + } + } + Assert(activeTapes > 0); + state->activeTapes = activeTapes; + + /* Load the merge heap with the first tuple from each input tape */ + for (srcTape = 0; srcTape < state->maxTapes; srcTape++) + { + SortTuple tup; + + if (mergereadnext(state, srcTape, &tup)) + { + tup.tupindex = srcTape; + tuplesort_heap_insert(state, &tup); + } + } +} + +/* + * mergereadnext - read next tuple from one merge input tape + * + * Returns false on EOF. + */ +static bool +mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup) +{ + unsigned int tuplen; + + if (!state->mergeactive[srcTape]) + return false; /* tape's run is already exhausted */ + + /* read next tuple, if any */ + if ((tuplen = getlen(state, srcTape, true)) == 0) + { + state->mergeactive[srcTape] = false; + return false; + } + READTUP(state, stup, srcTape, tuplen); + + return true; +} + +/* + * dumptuples - remove tuples from memtuples and write initial run to tape + * + * When alltuples = true, dump everything currently in memory. (This case is + * only used at end of input data.) + */ +static void +dumptuples(Tuplesortstate *state, bool alltuples) +{ + int memtupwrite; + int i; + + /* + * Nothing to do if we still fit in available memory and have array slots, + * unless this is the final call during initial run generation. 
+ */ + if (state->memtupcount < state->memtupsize && !LACKMEM(state) && + !alltuples) + return; + + /* + * Final call might require no sorting, in rare cases where we just so + * happen to have previously LACKMEM()'d at the point where exactly all + * remaining tuples are loaded into memory, just before input was + * exhausted. + * + * In general, short final runs are quite possible. Rather than allowing + * a special case where there was a superfluous selectnewtape() call (i.e. + * a call with no subsequent run actually written to destTape), we prefer + * to write out a 0 tuple run. + * + * mergereadnext() is prepared for 0 tuple runs, and will reliably mark + * the tape inactive for the merge when called from beginmerge(). This + * case is therefore similar to the case where mergeonerun() finds a dummy + * run for the tape, and so doesn't need to merge a run from the tape (or + * conceptually "merges" the dummy run, if you prefer). According to + * Knuth, Algorithm D "isn't strictly optimal" in its method of + * distribution and dummy run assignment; this edge case seems very + * unlikely to make that appreciably worse. + */ + Assert(state->status == TSS_BUILDRUNS); + + /* + * It seems unlikely that this limit will ever be exceeded, but take no + * chances + */ + if (state->currentRun == INT_MAX) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot have more than %d runs for an external sort", + INT_MAX))); + + state->currentRun++; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d starting quicksort of run %d: %s", + state->worker, state->currentRun, + pg_rusage_show(&state->ru_start)); +#endif + + /* + * Sort all tuples accumulated within the allowed amount of memory for + * this run using quicksort + */ + tuplesort_sort_memtuples(state); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d finished quicksort of run %d: %s", + state->worker, state->currentRun, + pg_rusage_show(&state->ru_start)); +#endif + + memtupwrite = state->memtupcount; + for (i = 0; i < memtupwrite; i++) + { + WRITETUP(state, state->tp_tapenum[state->destTape], + &state->memtuples[i]); + state->memtupcount--; + } + + /* + * Reset tuple memory. We've freed all of the tuples that we previously + * allocated. It's important to avoid fragmentation when there is a stark + * change in the sizes of incoming tuples. Fragmentation due to + * AllocSetFree's bucketing by size class might be particularly bad if + * this step wasn't taken. 
+ */ + MemoryContextReset(state->tuplecontext); + + markrunend(state, state->tp_tapenum[state->destTape]); + state->tp_runs[state->destTape]++; + state->tp_dummy[state->destTape]--; /* per Alg D step D2 */ + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d finished writing run %d to tape %d: %s", + state->worker, state->currentRun, state->destTape, + pg_rusage_show(&state->ru_start)); +#endif + + if (!alltuples) + selectnewtape(state); +} + +/* + * tuplesort_rescan - rewind and replay the scan + */ +void +tuplesort_rescan(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + case TSS_SORTEDONTAPE: + LogicalTapeRewindForRead(state->tapeset, + state->result_tape, + 0); + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_markpos - saves current position in the merged sort file + */ +void +tuplesort_markpos(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->markpos_offset = state->current; + state->markpos_eof = state->eof_reached; + break; + case TSS_SORTEDONTAPE: + LogicalTapeTell(state->tapeset, + state->result_tape, + &state->markpos_block, + &state->markpos_offset); + state->markpos_eof = state->eof_reached; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_restorepos - restores current position in merged sort file to + * last saved position + */ +void +tuplesort_restorepos(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->current = state->markpos_offset; + state->eof_reached = state->markpos_eof; + break; + case TSS_SORTEDONTAPE: + LogicalTapeSeek(state->tapeset, + state->result_tape, + state->markpos_block, + state->markpos_offset); + state->eof_reached = state->markpos_eof; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_get_stats - extract summary statistics + * + * This can be called after tuplesort_performsort() finishes to obtain + * printable summary information about how the sort was performed. + */ +void +tuplesort_get_stats(Tuplesortstate *state, + TuplesortInstrumentation *stats) +{ + /* + * Note: it might seem we should provide both memory and disk usage for a + * disk-based sort. However, the current code doesn't track memory space + * accurately once we have begun to return tuples to the caller (since we + * don't account for pfree's the caller is expected to do), so we cannot + * rely on availMem in a disk sort. This does not seem worth the overhead + * to fix. Is it worth creating an API for the memory context code to + * tell us how much is actually used in sortcontext? 
+ */ + if (state->tapeset) + { + stats->spaceType = SORT_SPACE_TYPE_DISK; + stats->spaceUsed = LogicalTapeSetBlocks(state->tapeset) * (BLCKSZ / 1024); + } + else + { + stats->spaceType = SORT_SPACE_TYPE_MEMORY; + stats->spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; + } + + switch (state->status) + { + case TSS_SORTEDINMEM: + if (state->boundUsed) + stats->sortMethod = SORT_TYPE_TOP_N_HEAPSORT; + else + stats->sortMethod = SORT_TYPE_QUICKSORT; + break; + case TSS_SORTEDONTAPE: + stats->sortMethod = SORT_TYPE_EXTERNAL_SORT; + break; + case TSS_FINALMERGE: + stats->sortMethod = SORT_TYPE_EXTERNAL_MERGE; + break; + default: + stats->sortMethod = SORT_TYPE_STILL_IN_PROGRESS; + break; + } +} + +/* + * Convert TuplesortMethod to a string. + */ +const char * +tuplesort_method_name(TuplesortMethod m) +{ + switch (m) + { + case SORT_TYPE_STILL_IN_PROGRESS: + return "still in progress"; + case SORT_TYPE_TOP_N_HEAPSORT: + return "top-N heapsort"; + case SORT_TYPE_QUICKSORT: + return "quicksort"; + case SORT_TYPE_EXTERNAL_SORT: + return "external sort"; + case SORT_TYPE_EXTERNAL_MERGE: + return "external merge"; + } + + return "unknown"; +} + +/* + * Convert TuplesortSpaceType to a string. + */ +const char * +tuplesort_space_type_name(TuplesortSpaceType t) +{ + Assert(t == SORT_SPACE_TYPE_DISK || t == SORT_SPACE_TYPE_MEMORY); + return t == SORT_SPACE_TYPE_DISK ? "Disk" : "Memory"; +} + + +/* + * Heap manipulation routines, per Knuth's Algorithm 5.2.3H. + */ + +/* + * Convert the existing unordered array of SortTuples to a bounded heap, + * discarding all but the smallest "state->bound" tuples. + * + * When working with a bounded heap, we want to keep the largest entry + * at the root (array entry zero), instead of the smallest as in the normal + * sort case. This allows us to discard the largest entry cheaply. + * Therefore, we temporarily reverse the sort direction. + */ +static void +make_bounded_heap(Tuplesortstate *state) +{ + int tupcount = state->memtupcount; + int i; + + Assert(state->status == TSS_INITIAL); + Assert(state->bounded); + Assert(tupcount >= state->bound); + Assert(SERIAL(state)); + + /* Reverse sort direction so largest entry will be at root */ + reversedirection(state); + + state->memtupcount = 0; /* make the heap empty */ + for (i = 0; i < tupcount; i++) + { + if (state->memtupcount < state->bound) + { + /* Insert next tuple into heap */ + /* Must copy source tuple to avoid possible overwrite */ + SortTuple stup = state->memtuples[i]; + + tuplesort_heap_insert(state, &stup); + } + else + { + /* + * The heap is full. Replace the largest entry with the new + * tuple, or just discard it, if it's larger than anything already + * in the heap. + */ + if (COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0) + { + free_sort_tuple(state, &state->memtuples[i]); + CHECK_FOR_INTERRUPTS(); + } + else + tuplesort_heap_replace_top(state, &state->memtuples[i]); + } + } + + Assert(state->memtupcount == state->bound); + state->status = TSS_BOUNDED; +} + +/* + * Convert the bounded heap to a properly-sorted array + */ +static void +sort_bounded_heap(Tuplesortstate *state) +{ + int tupcount = state->memtupcount; + + Assert(state->status == TSS_BOUNDED); + Assert(state->bounded); + Assert(tupcount == state->bound); + Assert(SERIAL(state)); + + /* + * We can unheapify in place because each delete-top call will remove the + * largest entry, which we can promptly store in the newly freed slot at + * the end. Once we're down to a single-entry heap, we're done. 
+ */ + while (state->memtupcount > 1) + { + SortTuple stup = state->memtuples[0]; + + /* this sifts-up the next-largest entry and decreases memtupcount */ + tuplesort_heap_delete_top(state); + state->memtuples[state->memtupcount] = stup; + } + state->memtupcount = tupcount; + + /* + * Reverse sort direction back to the original state. This is not + * actually necessary but seems like a good idea for tidiness. + */ + reversedirection(state); + + state->status = TSS_SORTEDINMEM; + state->boundUsed = true; +} + +/* + * Sort all memtuples using specialized qsort() routines. + * + * Quicksort is used for small in-memory sorts, and external sort runs. + */ +static void +tuplesort_sort_memtuples(Tuplesortstate *state) +{ + Assert(!LEADER(state)); + + if (state->memtupcount > 1) + { + /* Can we use the single-key sort function? */ + if (state->onlyKey != NULL) + qsort_ssup(state->memtuples, state->memtupcount, + state->onlyKey); + else + qsort_tuple(state->memtuples, + state->memtupcount, + state->comparetup, + state); + } +} + +/* + * Insert a new tuple into an empty or existing heap, maintaining the + * heap invariant. Caller is responsible for ensuring there's room. + * + * Note: For some callers, tuple points to a memtuples[] entry above the + * end of the heap. This is safe as long as it's not immediately adjacent + * to the end of the heap (ie, in the [memtupcount] array entry) --- if it + * is, it might get overwritten before being moved into the heap! + */ +static void +tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple) +{ + SortTuple *memtuples; + int j; + + memtuples = state->memtuples; + Assert(state->memtupcount < state->memtupsize); + + CHECK_FOR_INTERRUPTS(); + + /* + * Sift-up the new entry, per Knuth 5.2.3 exercise 16. Note that Knuth is + * using 1-based array indexes, not 0-based. + */ + j = state->memtupcount++; + while (j > 0) + { + int i = (j - 1) >> 1; + + if (COMPARETUP(state, tuple, &memtuples[i]) >= 0) + break; + memtuples[j] = memtuples[i]; + j = i; + } + memtuples[j] = *tuple; +} + +/* + * Remove the tuple at state->memtuples[0] from the heap. Decrement + * memtupcount, and sift up to maintain the heap invariant. + * + * The caller has already free'd the tuple the top node points to, + * if necessary. + */ +static void +tuplesort_heap_delete_top(Tuplesortstate *state) +{ + SortTuple *memtuples = state->memtuples; + SortTuple *tuple; + + if (--state->memtupcount <= 0) + return; + + /* + * Remove the last tuple in the heap, and re-insert it, by replacing the + * current top node with it. + */ + tuple = &memtuples[state->memtupcount]; + tuplesort_heap_replace_top(state, tuple); +} + +/* + * Replace the tuple at state->memtuples[0] with a new tuple. Sift up to + * maintain the heap invariant. + * + * This corresponds to Knuth's "sift-up" algorithm (Algorithm 5.2.3H, + * Heapsort, steps H3-H8). + */ +static void +tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple) +{ + SortTuple *memtuples = state->memtuples; + unsigned int i, + n; + + Assert(state->memtupcount >= 1); + + CHECK_FOR_INTERRUPTS(); + + /* + * state->memtupcount is "int", but we use "unsigned int" for i, j, n. + * This prevents overflow in the "2 * i + 1" calculation, since at the top + * of the loop we must have i < n <= INT_MAX <= UINT_MAX/2. 
+ */ + n = state->memtupcount; + i = 0; /* i is where the "hole" is */ + for (;;) + { + unsigned int j = 2 * i + 1; + + if (j >= n) + break; + if (j + 1 < n && + COMPARETUP(state, &memtuples[j], &memtuples[j + 1]) > 0) + j++; + if (COMPARETUP(state, tuple, &memtuples[j]) <= 0) + break; + memtuples[i] = memtuples[j]; + i = j; + } + memtuples[i] = *tuple; +} + +/* + * Function to reverse the sort direction from its current state + * + * It is not safe to call this when performing hash tuplesorts + */ +static void +reversedirection(Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + int nkey; + + for (nkey = 0; nkey < state->nKeys; nkey++, sortKey++) + { + sortKey->ssup_reverse = !sortKey->ssup_reverse; + sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first; + } +} + + +/* + * Tape interface routines + */ + +static unsigned int +getlen(Tuplesortstate *state, int tapenum, bool eofOK) +{ + unsigned int len; + + if (LogicalTapeRead(state->tapeset, tapenum, + &len, sizeof(len)) != sizeof(len)) + elog(ERROR, "unexpected end of tape"); + if (len == 0 && !eofOK) + elog(ERROR, "unexpected end of data"); + return len; +} + +static void +markrunend(Tuplesortstate *state, int tapenum) +{ + unsigned int len = 0; + + LogicalTapeWrite(state->tapeset, tapenum, (void *) &len, sizeof(len)); +} + +/* + * Get memory for tuple from within READTUP() routine. + * + * We use next free slot from the slab allocator, or palloc() if the tuple + * is too large for that. + */ +static void * +readtup_alloc(Tuplesortstate *state, Size tuplen) +{ + SlabSlot *buf; + + /* + * We pre-allocate enough slots in the slab arena that we should never run + * out. + */ + Assert(state->slabFreeHead); + + if (tuplen > SLAB_SLOT_SIZE || !state->slabFreeHead) + return MemoryContextAlloc(state->sortcontext, tuplen); + else + { + buf = state->slabFreeHead; + /* Reuse this slot */ + state->slabFreeHead = buf->nextfree; + + return buf; + } +} + + +/* + * Routines specialized for HeapTuple (actually MinimalTuple) case + */ + +static int +comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + HeapTupleData ltup; + HeapTupleData rtup; + TupleDesc tupDesc; + int nkey; + int32 compare; + AttrNumber attno; + Datum datum1, + datum2; + bool isnull1, + isnull2; + + + /* Compare the leading sort key */ + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + /* Compare additional sort keys */ + ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET; + ltup.t_data = (HeapTupleHeader) ((char *) a->tuple - MINIMAL_TUPLE_OFFSET); + rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET; + rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET); + tupDesc = state->tupDesc; + + if (sortKey->abbrev_converter) + { + attno = sortKey->ssup_attno; + + datum1 = heap_getattr(<up, attno, tupDesc, &isnull1); + datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + sortKey++; + for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++) + { + attno = sortKey->ssup_attno; + + datum1 = heap_getattr(<up, attno, tupDesc, &isnull1); + datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + return 0; +} + 
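+/*
+ * The copytup_heap() routine below, like copytup_cluster() (for a simple
+ * leading index column) and copytup_index() further down, handles the
+ * leading sort key the same way: store the plain Datum when there is no
+ * abbreviation converter or the value is NULL, store the abbreviated key
+ * otherwise, and if abbreviation is aborted mid-load, rewrite datum1 of
+ * every tuple already copied into memtuples[] so that all in-memory
+ * tuples share one representation.
+ */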
+static void +copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* + * We expect the passed "tup" to be a TupleTableSlot, and form a + * MinimalTuple using the exported interface for that. + */ + TupleTableSlot *slot = (TupleTableSlot *) tup; + Datum original; + MinimalTuple tuple; + HeapTupleData htup; + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + + /* copy the tuple into sort storage */ + tuple = ExecCopySlotMinimalTuple(slot); + stup->tuple = (void *) tuple; + USEMEM(state, GetMemoryChunkSpace(tuple)); + /* set up first-column key value */ + htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + original = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &stup->isnull1); + + MemoryContextSwitchTo(oldcontext); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + htup.t_len = ((MinimalTuple) mtup->tuple)->t_len + + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) mtup->tuple - + MINIMAL_TUPLE_OFFSET); + + mtup->datum1 = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &mtup->isnull1); + } + } +} + +static void +writetup_heap(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + MinimalTuple tuple = (MinimalTuple) stup->tuple; + + /* the part of the MinimalTuple we'll write: */ + char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; + unsigned int tupbodylen = tuple->t_len - MINIMAL_TUPLE_DATA_OFFSET; + + /* total on-disk footprint: */ + unsigned int tuplen = tupbodylen + sizeof(int); + + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) tupbody, tupbodylen); + if (state->randomAccess) /* need trailing length word? 
*/ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + heap_free_minimal_tuple(tuple); + } +} + +static void +readtup_heap(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tupbodylen = len - sizeof(int); + unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET; + MinimalTuple tuple = (MinimalTuple) readtup_alloc(state, tuplen); + char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; + HeapTupleData htup; + + /* read in the tuple proper */ + tuple->t_len = tuplen; + LogicalTapeReadExact(state->tapeset, tapenum, + tupbody, tupbodylen); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value */ + htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + stup->datum1 = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &stup->isnull1); +} + +/* + * Routines specialized for the CLUSTER case (HeapTuple data, with + * comparisons per a btree index definition) + */ + +static int +comparetup_cluster(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + HeapTuple ltup; + HeapTuple rtup; + TupleDesc tupDesc; + int nkey; + int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + AttrNumber leading = state->indexInfo->ii_IndexAttrNumbers[0]; + + /* Be prepared to compare additional sort keys */ + ltup = (HeapTuple) a->tuple; + rtup = (HeapTuple) b->tuple; + tupDesc = state->tupDesc; + + /* Compare the leading sort key, if it's simple */ + if (leading != 0) + { + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + if (sortKey->abbrev_converter) + { + datum1 = heap_getattr(ltup, leading, tupDesc, &isnull1); + datum2 = heap_getattr(rtup, leading, tupDesc, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + } + if (compare != 0 || state->nKeys == 1) + return compare; + /* Compare additional columns the hard way */ + sortKey++; + nkey = 1; + } + else + { + /* Must compare all keys the hard way */ + nkey = 0; + } + + if (state->indexInfo->ii_Expressions == NULL) + { + /* If not expression index, just compare the proper heap attrs */ + + for (; nkey < state->nKeys; nkey++, sortKey++) + { + AttrNumber attno = state->indexInfo->ii_IndexAttrNumbers[nkey]; + + datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1); + datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + } + else + { + /* + * In the expression index case, compute the whole index tuple and + * then compare values. It would perhaps be faster to compute only as + * many columns as we need to compare, but that would require + * duplicating all the logic in FormIndexDatum. 
+ */ + Datum l_index_values[INDEX_MAX_KEYS]; + bool l_index_isnull[INDEX_MAX_KEYS]; + Datum r_index_values[INDEX_MAX_KEYS]; + bool r_index_isnull[INDEX_MAX_KEYS]; + TupleTableSlot *ecxt_scantuple; + + /* Reset context each time to prevent memory leakage */ + ResetPerTupleExprContext(state->estate); + + ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple; + + ExecStoreHeapTuple(ltup, ecxt_scantuple, false); + FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, + l_index_values, l_index_isnull); + + ExecStoreHeapTuple(rtup, ecxt_scantuple, false); + FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, + r_index_values, r_index_isnull); + + for (; nkey < state->nKeys; nkey++, sortKey++) + { + compare = ApplySortComparator(l_index_values[nkey], + l_index_isnull[nkey], + r_index_values[nkey], + r_index_isnull[nkey], + sortKey); + if (compare != 0) + return compare; + } + } + + return 0; +} + +static void +copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + HeapTuple tuple = (HeapTuple) tup; + Datum original; + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + + /* copy the tuple into sort storage */ + tuple = heap_copytuple(tuple); + stup->tuple = (void *) tuple; + USEMEM(state, GetMemoryChunkSpace(tuple)); + + MemoryContextSwitchTo(oldcontext); + + /* + * set up first-column key value, and potentially abbreviate, if it's a + * simple column + */ + if (state->indexInfo->ii_IndexAttrNumbers[0] == 0) + return; + + original = heap_getattr(tuple, + state->indexInfo->ii_IndexAttrNumbers[0], + state->tupDesc, + &stup->isnull1); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). 
+ */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = (HeapTuple) mtup->tuple; + mtup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_IndexAttrNumbers[0], + state->tupDesc, + &mtup->isnull1); + } + } +} + +static void +writetup_cluster(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + HeapTuple tuple = (HeapTuple) stup->tuple; + unsigned int tuplen = tuple->t_len + sizeof(ItemPointerData) + sizeof(int); + + /* We need to store t_self, but not other fields of HeapTupleData */ + LogicalTapeWrite(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + &tuple->t_self, sizeof(ItemPointerData)); + LogicalTapeWrite(state->tapeset, tapenum, + tuple->t_data, tuple->t_len); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + heap_freetuple(tuple); + } +} + +static void +readtup_cluster(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int tuplen) +{ + unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int); + HeapTuple tuple = (HeapTuple) readtup_alloc(state, + t_len + HEAPTUPLESIZE); + + /* Reconstruct the HeapTupleData header */ + tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE); + tuple->t_len = t_len; + LogicalTapeReadExact(state->tapeset, tapenum, + &tuple->t_self, sizeof(ItemPointerData)); + /* We don't currently bother to reconstruct t_tableOid */ + tuple->t_tableOid = InvalidOid; + /* Read in the tuple body */ + LogicalTapeReadExact(state->tapeset, tapenum, + tuple->t_data, tuple->t_len); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value, if it's a simple column */ + if (state->indexInfo->ii_IndexAttrNumbers[0] != 0) + stup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_IndexAttrNumbers[0], + state->tupDesc, + &stup->isnull1); +} + +/* + * Routines specialized for IndexTuple case + * + * The btree and hash cases require separate comparison functions, but the + * IndexTuple representation is the same so the copy/write/read support + * functions can be shared. + */ + +static int +comparetup_index_btree(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + /* + * This is similar to comparetup_heap(), but expects index tuples. There + * is also special handling for enforcing uniqueness, and special + * treatment for equal keys at the end. 
+ */ + SortSupport sortKey = state->sortKeys; + IndexTuple tuple1; + IndexTuple tuple2; + int keysz; + TupleDesc tupDes; + bool equal_hasnull = false; + int nkey; + int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + + + /* Compare the leading sort key */ + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + /* Compare additional sort keys */ + tuple1 = (IndexTuple) a->tuple; + tuple2 = (IndexTuple) b->tuple; + keysz = state->nKeys; + tupDes = RelationGetDescr(state->indexRel); + + if (sortKey->abbrev_converter) + { + datum1 = index_getattr(tuple1, 1, tupDes, &isnull1); + datum2 = index_getattr(tuple2, 1, tupDes, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + /* they are equal, so we only need to examine one null flag */ + if (a->isnull1) + equal_hasnull = true; + + sortKey++; + for (nkey = 2; nkey <= keysz; nkey++, sortKey++) + { + datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1); + datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; /* done when we find unequal attributes */ + + /* they are equal, so we only need to examine one null flag */ + if (isnull1) + equal_hasnull = true; + } + + /* + * If btree has asked us to enforce uniqueness, complain if two equal + * tuples are detected (unless there was at least one NULL field). + * + * It is sufficient to make the test here, because if two tuples are equal + * they *must* get compared at some stage of the sort --- otherwise the + * sort algorithm wouldn't have checked whether one must appear before the + * other. + */ + if (state->enforceUnique && !equal_hasnull) + { + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + char *key_desc; + + /* + * Some rather brain-dead implementations of qsort (such as the one in + * QNX 4) will sometimes call the comparison routine to compare a + * value to itself, but we always use our own implementation, which + * does not. + */ + Assert(tuple1 != tuple2); + + index_deform_tuple(tuple1, tupDes, values, isnull); + + key_desc = BuildIndexValueDescription(state->indexRel, values, isnull); + + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("could not create unique index \"%s\"", + RelationGetRelationName(state->indexRel)), + key_desc ? errdetail("Key %s is duplicated.", key_desc) : + errdetail("Duplicate keys exist."), + errtableconstraint(state->heapRel, + RelationGetRelationName(state->indexRel)))); + } + + /* + * If key values are equal, we sort on ItemPointer. This is required for + * btree indexes, since heap TID is treated as an implicit last key + * attribute in order to ensure that all keys in the index are physically + * unique. + */ + { + BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); + BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); + + if (pos1 != pos2) + return (pos1 < pos2) ? 
-1 : 1; + } + + /* ItemPointer values should never be equal */ + Assert(false); + + return 0; +} + +static int +comparetup_index_hash(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + Bucket bucket1; + Bucket bucket2; + IndexTuple tuple1; + IndexTuple tuple2; + + /* + * Fetch hash keys and mask off bits we don't want to sort by. We know + * that the first column of the index tuple is the hash key. + */ + Assert(!a->isnull1); + bucket1 = _hash_hashkey2bucket(DatumGetUInt32(a->datum1), + state->max_buckets, state->high_mask, + state->low_mask); + Assert(!b->isnull1); + bucket2 = _hash_hashkey2bucket(DatumGetUInt32(b->datum1), + state->max_buckets, state->high_mask, + state->low_mask); + if (bucket1 > bucket2) + return 1; + else if (bucket1 < bucket2) + return -1; + + /* + * If hash values are equal, we sort on ItemPointer. This does not affect + * validity of the finished index, but it may be useful to have index + * scans in physical order. + */ + tuple1 = (IndexTuple) a->tuple; + tuple2 = (IndexTuple) b->tuple; + + { + BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); + BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); + + if (pos1 != pos2) + return (pos1 < pos2) ? -1 : 1; + } + + /* ItemPointer values should never be equal */ + Assert(false); + + return 0; +} + +static void +copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + IndexTuple tuple = (IndexTuple) tup; + unsigned int tuplen = IndexTupleSize(tuple); + IndexTuple newtuple; + Datum original; + + /* copy the tuple into sort storage */ + newtuple = (IndexTuple) MemoryContextAlloc(state->tuplecontext, tuplen); + memcpy(newtuple, tuple, tuplen); + USEMEM(state, GetMemoryChunkSpace(newtuple)); + stup->tuple = (void *) newtuple; + /* set up first-column key value */ + original = index_getattr(newtuple, + 1, + RelationGetDescr(state->indexRel), + &stup->isnull1); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). 
+ */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = (IndexTuple) mtup->tuple; + mtup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &mtup->isnull1); + } + } +} + +static void +writetup_index(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + IndexTuple tuple = (IndexTuple) stup->tuple; + unsigned int tuplen; + + tuplen = IndexTupleSize(tuple) + sizeof(tuplen); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) tuple, IndexTupleSize(tuple)); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + pfree(tuple); + } +} + +static void +readtup_index(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tuplen = len - sizeof(unsigned int); + IndexTuple tuple = (IndexTuple) readtup_alloc(state, tuplen); + + LogicalTapeReadExact(state->tapeset, tapenum, + tuple, tuplen); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value */ + stup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup->isnull1); +} + +/* + * Routines specialized for DatumTuple case + */ + +static int +comparetup_datum(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) +{ + int compare; + + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + state->sortKeys); + if (compare != 0) + return compare; + + /* if we have abbreviations, then "tuple" has the original value */ + + if (state->sortKeys->abbrev_converter) + compare = ApplySortAbbrevFullComparator(PointerGetDatum(a->tuple), a->isnull1, + PointerGetDatum(b->tuple), b->isnull1, + state->sortKeys); + + return compare; +} + +static void +copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* Not currently needed */ + elog(ERROR, "copytup_datum() should not be called"); +} + +static void +writetup_datum(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + void *waddr; + unsigned int tuplen; + unsigned int writtenlen; + + if (stup->isnull1) + { + waddr = NULL; + tuplen = 0; + } + else if (!state->tuples) + { + waddr = &stup->datum1; + tuplen = sizeof(Datum); + } + else + { + waddr = stup->tuple; + tuplen = datumGetSize(PointerGetDatum(stup->tuple), false, state->datumTypeLen); + Assert(tuplen != 0); + } + + writtenlen = tuplen + sizeof(unsigned int); + + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &writtenlen, sizeof(writtenlen)); + LogicalTapeWrite(state->tapeset, tapenum, + waddr, tuplen); + if (state->randomAccess) /* need trailing length word? 
*/ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &writtenlen, sizeof(writtenlen)); + + if (!state->slabAllocatorUsed && stup->tuple) + { + FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); + pfree(stup->tuple); + } +} + +static void +readtup_datum(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tuplen = len - sizeof(unsigned int); + + if (tuplen == 0) + { + /* it's NULL */ + stup->datum1 = (Datum) 0; + stup->isnull1 = true; + stup->tuple = NULL; + } + else if (!state->tuples) + { + Assert(tuplen == sizeof(Datum)); + LogicalTapeReadExact(state->tapeset, tapenum, + &stup->datum1, tuplen); + stup->isnull1 = false; + stup->tuple = NULL; + } + else + { + void *raddr = readtup_alloc(state, tuplen); + + LogicalTapeReadExact(state->tapeset, tapenum, + raddr, tuplen); + stup->datum1 = PointerGetDatum(raddr); + stup->isnull1 = false; + stup->tuple = raddr; + } + + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); +} + +/* + * Parallel sort routines + */ + +/* + * tuplesort_estimate_shared - estimate required shared memory allocation + * + * nWorkers is an estimate of the number of workers (it's the number that + * will be requested). + */ +Size +tuplesort_estimate_shared(int nWorkers) +{ + Size tapesSize; + + Assert(nWorkers > 0); + + /* Make sure that BufFile shared state is MAXALIGN'd */ + tapesSize = mul_size(sizeof(TapeShare), nWorkers); + tapesSize = MAXALIGN(add_size(tapesSize, offsetof(Sharedsort, tapes))); + + return tapesSize; +} + +/* + * tuplesort_initialize_shared - initialize shared tuplesort state + * + * Must be called from leader process before workers are launched, to + * establish state needed up-front for worker tuplesortstates. nWorkers + * should match the argument passed to tuplesort_estimate_shared(). + */ +void +tuplesort_initialize_shared(Sharedsort *shared, int nWorkers, dsm_segment *seg) +{ + int i; + + Assert(nWorkers > 0); + + SpinLockInit(&shared->mutex); + shared->currentWorker = 0; + shared->workersFinished = 0; + SharedFileSetInit(&shared->fileset, seg); + shared->nTapes = nWorkers; + for (i = 0; i < nWorkers; i++) + { + shared->tapes[i].firstblocknumber = 0L; + } +} + +/* + * tuplesort_attach_shared - attach to shared tuplesort state + * + * Must be called by all worker processes. + */ +void +tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg) +{ + /* Attach to SharedFileSet */ + SharedFileSetAttach(&shared->fileset, seg); +} + +/* + * worker_get_identifier - Assign and return ordinal identifier for worker + * + * The order in which these are assigned is not well defined, and should not + * matter; worker numbers across parallel sort participants need only be + * distinct and gapless. logtape.c requires this. + * + * Note that the identifiers assigned from here have no relation to + * ParallelWorkerNumber number, to avoid making any assumption about + * caller's requirements. However, we do follow the ParallelWorkerNumber + * convention of representing a non-worker with worker number -1. This + * includes the leader, as well as serial Tuplesort processes. 
+ */ +static int +worker_get_identifier(Tuplesortstate *state) +{ + Sharedsort *shared = state->shared; + int worker; + + Assert(WORKER(state)); + + SpinLockAcquire(&shared->mutex); + worker = shared->currentWorker++; + SpinLockRelease(&shared->mutex); + + return worker; +} + +/* + * worker_freeze_result_tape - freeze worker's result tape for leader + * + * This is called by workers just after the result tape has been determined, + * instead of calling LogicalTapeFreeze() directly. They do so because + * workers require a few additional steps over similar serial + * TSS_SORTEDONTAPE external sort cases, which also happen here. The extra + * steps are around freeing now unneeded resources, and representing to + * leader that worker's input run is available for its merge. + * + * There should only be one final output run for each worker, which consists + * of all tuples that were originally input into worker. + */ +static void +worker_freeze_result_tape(Tuplesortstate *state) +{ + Sharedsort *shared = state->shared; + TapeShare output; + + Assert(WORKER(state)); + Assert(state->result_tape != -1); + Assert(state->memtupcount == 0); + + /* + * Free most remaining memory, in case caller is sensitive to our holding + * on to it. memtuples may not be a tiny merge heap at this point. + */ + pfree(state->memtuples); + /* Be tidy */ + state->memtuples = NULL; + state->memtupsize = 0; + + /* + * Parallel worker requires result tape metadata, which is to be stored in + * shared memory for leader + */ + LogicalTapeFreeze(state->tapeset, state->result_tape, &output); + + /* Store properties of output tape, and update finished worker count */ + SpinLockAcquire(&shared->mutex); + shared->tapes[state->worker] = output; + shared->workersFinished++; + SpinLockRelease(&shared->mutex); +} + +/* + * worker_nomergeruns - dump memtuples in worker, without merging + * + * This called as an alternative to mergeruns() with a worker when no + * merging is required. + */ +static void +worker_nomergeruns(Tuplesortstate *state) +{ + Assert(WORKER(state)); + Assert(state->result_tape == -1); + + state->result_tape = state->tp_tapenum[state->destTape]; + worker_freeze_result_tape(state); +} + +/* + * leader_takeover_tapes - create tapeset for leader from worker tapes + * + * So far, leader Tuplesortstate has performed no actual sorting. By now, all + * sorting has occurred in workers, all of which must have already returned + * from tuplesort_performsort(). + * + * When this returns, leader process is left in a state that is virtually + * indistinguishable from it having generated runs as a serial external sort + * might have. + */ +static void +leader_takeover_tapes(Tuplesortstate *state) +{ + Sharedsort *shared = state->shared; + int nParticipants = state->nParticipants; + int workersFinished; + int j; + + Assert(LEADER(state)); + Assert(nParticipants >= 1); + + SpinLockAcquire(&shared->mutex); + workersFinished = shared->workersFinished; + SpinLockRelease(&shared->mutex); + + if (nParticipants != workersFinished) + elog(ERROR, "cannot take over tapes before all workers finish"); + + /* + * Create the tapeset from worker tapes, including a leader-owned tape at + * the end. Parallel workers are far more expensive than logical tapes, + * so the number of tapes allocated here should never be excessive. + * + * We still have a leader tape, though it's not possible to write to it + * due to restrictions in the shared fileset infrastructure used by + * logtape.c. 
It will never be written to in practice because + * randomAccess is disallowed for parallel sorts. + */ + inittapestate(state, nParticipants + 1); + state->tapeset = LogicalTapeSetCreate(nParticipants + 1, shared->tapes, + &shared->fileset, state->worker); + + /* mergeruns() relies on currentRun for # of runs (in one-pass cases) */ + state->currentRun = nParticipants; + + /* + * Initialize variables of Algorithm D to be consistent with runs from + * workers having been generated in the leader. + * + * There will always be exactly 1 run per worker, and exactly one input + * tape per run, because workers always output exactly 1 run, even when + * there were no input tuples for workers to sort. + */ + for (j = 0; j < state->maxTapes; j++) + { + /* One real run; no dummy runs for worker tapes */ + state->tp_fib[j] = 1; + state->tp_runs[j] = 1; + state->tp_dummy[j] = 0; + state->tp_tapenum[j] = j; + } + /* Leader tape gets one dummy run, and no real runs */ + state->tp_fib[state->tapeRange] = 0; + state->tp_runs[state->tapeRange] = 0; + state->tp_dummy[state->tapeRange] = 1; + + state->Level = 1; + state->destTape = 0; + + state->status = TSS_BUILDRUNS; +} + +/* + * Convenience routine to free a tuple previously loaded into sort memory + */ +static void +free_sort_tuple(Tuplesortstate *state, SortTuple *stup) +{ + if (stup->tuple) + { + FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); + pfree(stup->tuple); + stup->tuple = NULL; + } +} diff --git a/src/tuplesort13.c b/src/tuplesort13.c new file mode 100644 index 0000000000..87354a38b4 --- /dev/null +++ b/src/tuplesort13.c @@ -0,0 +1,4708 @@ +/*------------------------------------------------------------------------- + * + * tuplesort.c + * Generalized tuple sorting routines. + * + * This module handles sorting of heap tuples, index tuples, or single + * Datums (and could easily support other kinds of sortable objects, + * if necessary). It works efficiently for both small and large amounts + * of data. Small amounts are sorted in-memory using qsort(). Large + * amounts are sorted using temporary files and a standard external sort + * algorithm. + * + * See Knuth, volume 3, for more than you want to know about the external + * sorting algorithm. Historically, we divided the input into sorted runs + * using replacement selection, in the form of a priority tree implemented + * as a heap (essentially his Algorithm 5.2.3H), but now we always use + * quicksort for run generation. We merge the runs using polyphase merge, + * Knuth's Algorithm 5.4.2D. The logical "tapes" used by Algorithm D are + * implemented by logtape.c, which avoids space wastage by recycling disk + * space as soon as each block is read from its "tape". + * + * The approximate amount of memory allowed for any one sort operation + * is specified in kilobytes by the caller (most pass work_mem). Initially, + * we absorb tuples and simply store them in an unsorted array as long as + * we haven't exceeded workMem. If we reach the end of the input without + * exceeding workMem, we sort the array using qsort() and subsequently return + * tuples just by scanning the tuple array sequentially. If we do exceed + * workMem, we begin to emit tuples into sorted runs in temporary tapes. + * When tuples are dumped in batch after quicksorting, we begin a new run + * with a new output tape (selected per Algorithm D). After the end of the + * input is reached, we dump out remaining tuples in memory into a final run, + * then merge the runs using Algorithm D. 
+ * + * When merging runs, we use a heap containing just the frontmost tuple from + * each source run; we repeatedly output the smallest tuple and replace it + * with the next tuple from its source tape (if any). When the heap empties, + * the merge is complete. The basic merge algorithm thus needs very little + * memory --- only M tuples for an M-way merge, and M is constrained to a + * small number. However, we can still make good use of our full workMem + * allocation by pre-reading additional blocks from each source tape. Without + * prereading, our access pattern to the temporary file would be very erratic; + * on average we'd read one block from each of M source tapes during the same + * time that we're writing M blocks to the output tape, so there is no + * sequentiality of access at all, defeating the read-ahead methods used by + * most Unix kernels. Worse, the output tape gets written into a very random + * sequence of blocks of the temp file, ensuring that things will be even + * worse when it comes time to read that tape. A straightforward merge pass + * thus ends up doing a lot of waiting for disk seeks. We can improve matters + * by prereading from each source tape sequentially, loading about workMem/M + * bytes from each tape in turn, and making the sequential blocks immediately + * available for reuse. This approach helps to localize both read and write + * accesses. The pre-reading is handled by logtape.c, we just tell it how + * much memory to use for the buffers. + * + * When the caller requests random access to the sort result, we form + * the final sorted run on a logical tape which is then "frozen", so + * that we can access it randomly. When the caller does not need random + * access, we return from tuplesort_performsort() as soon as we are down + * to one run per logical tape. The final merge is then performed + * on-the-fly as the caller repeatedly calls tuplesort_getXXX; this + * saves one cycle of writing all the data out to disk and reading it in. + * + * Before Postgres 8.2, we always used a seven-tape polyphase merge, on the + * grounds that 7 is the "sweet spot" on the tapes-to-passes curve according + * to Knuth's figure 70 (section 5.4.2). However, Knuth is assuming that + * tape drives are expensive beasts, and in particular that there will always + * be many more runs than tape drives. In our implementation a "tape drive" + * doesn't cost much more than a few Kb of memory buffers, so we can afford + * to have lots of them. In particular, if we can have as many tape drives + * as sorted runs, we can eliminate any repeated I/O at all. In the current + * code we determine the number of tapes M on the basis of workMem: we want + * workMem/M to be large enough that we read a fair amount of data each time + * we preread from a tape, so as to maintain the locality of access described + * above. Nonetheless, with large workMem we can have many tapes (but not + * too many -- see the comments in tuplesort_merge_order). + * + * This module supports parallel sorting. Parallel sorts involve coordination + * among one or more worker processes, and a leader process, each with its own + * tuplesort state. The leader process (or, more accurately, the + * Tuplesortstate associated with a leader process) creates a full tapeset + * consisting of worker tapes with one run to merge; a run for every + * worker process. This is then merged. Worker processes are guaranteed to + * produce exactly one output run from their partial input. 
+ * + * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/utils/sort/tuplesort.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include + +#include "access/hash.h" +#include "access/htup_details.h" +#include "access/nbtree.h" +#include "catalog/index.h" +#include "catalog/pg_am.h" +#include "commands/tablespace.h" +#include "executor/executor.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "utils/datum.h" +#include "utils/logtape.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/pg_rusage.h" +#include "utils/rel.h" +#include "utils/sortsupport.h" +#include "utils/tuplesort.h" + +/* Should be the last include */ +#include "disable_core_macro.h" + +/* sort-type codes for sort__start probes */ +#define HEAP_SORT 0 +#define INDEX_SORT 1 +#define DATUM_SORT 2 +#define CLUSTER_SORT 3 + +/* Sort parallel code from state for sort__start probes */ +#define PARALLEL_SORT(state) ((state)->shared == NULL ? 0 : \ + (state)->worker >= 0 ? 1 : 2) + +/* + * Initial size of memtuples array. We're trying to select this size so that + * array doesn't exceed ALLOCSET_SEPARATE_THRESHOLD and so that the overhead of + * allocation might possibly be lowered. However, we don't consider array sizes + * less than 1024. + * + */ +#define INITIAL_MEMTUPSIZE Max(1024, \ + ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1) + +/* GUC variables */ +#ifdef TRACE_SORT +bool trace_sort = false; +#endif + +#ifdef DEBUG_BOUNDED_SORT +bool optimize_bounded_sort = true; +#endif + + +/* + * The objects we actually sort are SortTuple structs. These contain + * a pointer to the tuple proper (might be a MinimalTuple or IndexTuple), + * which is a separate palloc chunk --- we assume it is just one chunk and + * can be freed by a simple pfree() (except during merge, when we use a + * simple slab allocator). SortTuples also contain the tuple's first key + * column in Datum/nullflag format, and a source/input tape number that + * tracks which tape each heap element/slot belongs to during merging. + * + * Storing the first key column lets us save heap_getattr or index_getattr + * calls during tuple comparisons. We could extract and save all the key + * columns not just the first, but this would increase code complexity and + * overhead, and wouldn't actually save any comparison cycles in the common + * case where the first key determines the comparison result. Note that + * for a pass-by-reference datatype, datum1 points into the "tuple" storage. + * + * There is one special case: when the sort support infrastructure provides an + * "abbreviated key" representation, where the key is (typically) a pass by + * value proxy for a pass by reference type. In this case, the abbreviated key + * is stored in datum1 in place of the actual first key column. + * + * When sorting single Datums, the data value is represented directly by + * datum1/isnull1 for pass by value types (or null values). If the datatype is + * pass-by-reference and isnull1 is false, then "tuple" points to a separately + * palloc'd data value, otherwise "tuple" is NULL. The value of datum1 is then + * either the same pointer as "tuple", or is an abbreviated key value as + * described above. Accordingly, "tuple" is always used in preference to + * datum1 as the authoritative value for pass-by-reference cases. 
+ */ +typedef struct +{ + void *tuple; /* the tuple itself */ + Datum datum1; /* value of first key column */ + bool isnull1; /* is first key column NULL? */ + int srctape; /* source tape number */ +} SortTuple; + +/* + * During merge, we use a pre-allocated set of fixed-size slots to hold + * tuples. To avoid palloc/pfree overhead. + * + * Merge doesn't require a lot of memory, so we can afford to waste some, + * by using gratuitously-sized slots. If a tuple is larger than 1 kB, the + * palloc() overhead is not significant anymore. + * + * 'nextfree' is valid when this chunk is in the free list. When in use, the + * slot holds a tuple. + */ +#define SLAB_SLOT_SIZE 1024 + +typedef union SlabSlot +{ + union SlabSlot *nextfree; + char buffer[SLAB_SLOT_SIZE]; +} SlabSlot; + +/* + * Possible states of a Tuplesort object. These denote the states that + * persist between calls of Tuplesort routines. + */ +typedef enum +{ + TSS_INITIAL, /* Loading tuples; still within memory limit */ + TSS_BOUNDED, /* Loading tuples into bounded-size heap */ + TSS_BUILDRUNS, /* Loading tuples; writing to tape */ + TSS_SORTEDINMEM, /* Sort completed entirely in memory */ + TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ + TSS_FINALMERGE /* Performing final merge on-the-fly */ +} TupSortStatus; + +/* + * Parameters for calculation of number of tapes to use --- see inittapes() + * and tuplesort_merge_order(). + * + * In this calculation we assume that each tape will cost us about 1 blocks + * worth of buffer space. This ignores the overhead of all the other data + * structures needed for each tape, but it's probably close enough. + * + * MERGE_BUFFER_SIZE is how much data we'd like to read from each input + * tape during a preread cycle (see discussion at top of file). + */ +#define MINORDER 6 /* minimum merge order */ +#define MAXORDER 500 /* maximum merge order */ +#define TAPE_BUFFER_OVERHEAD BLCKSZ +#define MERGE_BUFFER_SIZE (BLCKSZ * 32) + +typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); + +/* + * Private state of a Tuplesort operation. + */ +struct Tuplesortstate +{ + TupSortStatus status; /* enumerated value as shown above */ + int nKeys; /* number of columns in sort key */ + bool randomAccess; /* did caller request random access? */ + bool bounded; /* did caller specify a maximum number of + * tuples to return? */ + bool boundUsed; /* true if we made use of a bounded heap */ + int bound; /* if bounded, the maximum number of tuples */ + bool tuples; /* Can SortTuple.tuple ever be set? 
*/ + int64 availMem; /* remaining memory available, in bytes */ + int64 allowedMem; /* total memory allowed, in bytes */ + int maxTapes; /* number of tapes (Knuth's T) */ + int tapeRange; /* maxTapes-1 (Knuth's P) */ + int64 maxSpace; /* maximum amount of space occupied among sort + * of groups, either in-memory or on-disk */ + bool isMaxSpaceDisk; /* true when maxSpace is value for on-disk + * space, false when it's value for in-memory + * space */ + TupSortStatus maxSpaceStatus; /* sort status when maxSpace was reached */ + MemoryContext maincontext; /* memory context for tuple sort metadata that + * persists across multiple batches */ + MemoryContext sortcontext; /* memory context holding most sort data */ + MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */ + LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ + + /* + * These function pointers decouple the routines that must know what kind + * of tuple we are sorting from the routines that don't need to know it. + * They are set up by the tuplesort_begin_xxx routines. + * + * Function to compare two tuples; result is per qsort() convention, ie: + * <0, 0, >0 according as ab. The API must match + * qsort_arg_comparator. + */ + SortTupleComparator comparetup; + + /* + * Function to copy a supplied input tuple into palloc'd space and set up + * its SortTuple representation (ie, set tuple/datum1/isnull1). Also, + * state->availMem must be decreased by the amount of space used for the + * tuple copy (note the SortTuple struct itself is not counted). + */ + void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup); + + /* + * Function to write a stored tuple onto tape. The representation of the + * tuple on tape need not be the same as it is in memory; requirements on + * the tape representation are given below. Unless the slab allocator is + * used, after writing the tuple, pfree() the out-of-line data (not the + * SortTuple struct!), and increase state->availMem by the amount of + * memory space thereby released. + */ + void (*writetup) (Tuplesortstate *state, int tapenum, + SortTuple *stup); + + /* + * Function to read a stored tuple from tape back into memory. 'len' is + * the already-read length of the stored tuple. The tuple is allocated + * from the slab memory arena, or is palloc'd, see readtup_alloc(). + */ + void (*readtup) (Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); + + /* + * This array holds the tuples now in sort memory. If we are in state + * INITIAL, the tuples are in no particular order; if we are in state + * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS + * and FINALMERGE, the tuples are organized in "heap" order per Algorithm + * H. In state SORTEDONTAPE, the array is not used. + */ + SortTuple *memtuples; /* array of SortTuple structs */ + int memtupcount; /* number of tuples currently present */ + int memtupsize; /* allocated length of memtuples array */ + bool growmemtuples; /* memtuples' growth still underway? */ + + /* + * Memory for tuples is sometimes allocated using a simple slab allocator, + * rather than with palloc(). Currently, we switch to slab allocation + * when we start merging. Merging only needs to keep a small, fixed + * number of tuples in memory at any time, so we can avoid the + * palloc/pfree overhead by recycling a fixed number of fixed-size slots + * to hold the tuples. + * + * For the slab, we use one large allocation, divided into SLAB_SLOT_SIZE + * slots. 
The allocation is sized to have one slot per tape, plus one + * additional slot. We need that many slots to hold all the tuples kept + * in the heap during merge, plus the one we have last returned from the + * sort, with tuplesort_gettuple. + * + * Initially, all the slots are kept in a linked list of free slots. When + * a tuple is read from a tape, it is put to the next available slot, if + * it fits. If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd + * instead. + * + * When we're done processing a tuple, we return the slot back to the free + * list, or pfree() if it was palloc'd. We know that a tuple was + * allocated from the slab, if its pointer value is between + * slabMemoryBegin and -End. + * + * When the slab allocator is used, the USEMEM/LACKMEM mechanism of + * tracking memory usage is not used. + */ + bool slabAllocatorUsed; + + char *slabMemoryBegin; /* beginning of slab memory arena */ + char *slabMemoryEnd; /* end of slab memory arena */ + SlabSlot *slabFreeHead; /* head of free list */ + + /* Buffer size to use for reading input tapes, during merge. */ + size_t read_buffer_size; + + /* + * When we return a tuple to the caller in tuplesort_gettuple_XXX, that + * came from a tape (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE + * modes), we remember the tuple in 'lastReturnedTuple', so that we can + * recycle the memory on next gettuple call. + */ + void *lastReturnedTuple; + + /* + * While building initial runs, this is the current output run number. + * Afterwards, it is the number of initial runs we made. + */ + int currentRun; + + /* + * Unless otherwise noted, all pointer variables below are pointers to + * arrays of length maxTapes, holding per-tape data. + */ + + /* + * This variable is only used during merge passes. mergeactive[i] is true + * if we are reading an input run from (actual) tape number i and have not + * yet exhausted that run. + */ + bool *mergeactive; /* active input run source? */ + + /* + * Variables for Algorithm D. Note that destTape is a "logical" tape + * number, ie, an index into the tp_xxx[] arrays. Be careful to keep + * "logical" and "actual" tape numbers straight! + */ + int Level; /* Knuth's l */ + int destTape; /* current output tape (Knuth's j, less 1) */ + int *tp_fib; /* Target Fibonacci run counts (A[]) */ + int *tp_runs; /* # of real runs on each tape */ + int *tp_dummy; /* # of dummy runs for each tape (D[]) */ + int *tp_tapenum; /* Actual tape numbers (TAPE[]) */ + int activeTapes; /* # of active input tapes in merge pass */ + + /* + * These variables are used after completion of sorting to keep track of + * the next tuple to return. (In the tape case, the tape's current read + * position is also critical state.) + */ + int result_tape; /* actual tape number of finished output */ + int current; /* array index (only used if SORTEDINMEM) */ + bool eof_reached; /* reached EOF (needed for cursors) */ + + /* markpos_xxx holds marked position for mark and restore */ + long markpos_block; /* tape block# (only used if SORTEDONTAPE) */ + int markpos_offset; /* saved "current", or offset in tape block */ + bool markpos_eof; /* saved "eof_reached" */ + + /* + * These variables are used during parallel sorting. + * + * worker is our worker identifier. Follows the general convention that + * -1 value relates to a leader tuplesort, and values >= 0 worker + * tuplesorts. (-1 can also be a serial tuplesort.) + * + * shared is mutable shared memory state, which is used to coordinate + * parallel sorts. 
+ * + * nParticipants is the number of worker Tuplesortstates known by the + * leader to have actually been launched, which implies that they must + * finish a run leader can merge. Typically includes a worker state held + * by the leader process itself. Set in the leader Tuplesortstate only. + */ + int worker; + Sharedsort *shared; + int nParticipants; + + /* + * The sortKeys variable is used by every case other than the hash index + * case; it is set by tuplesort_begin_xxx. tupDesc is only used by the + * MinimalTuple and CLUSTER routines, though. + */ + TupleDesc tupDesc; + SortSupport sortKeys; /* array of length nKeys */ + + /* + * This variable is shared by the single-key MinimalTuple case and the + * Datum case (which both use qsort_ssup()). Otherwise it's NULL. + */ + SortSupport onlyKey; + + /* + * Additional state for managing "abbreviated key" sortsupport routines + * (which currently may be used by all cases except the hash index case). + * Tracks the intervals at which the optimization's effectiveness is + * tested. + */ + int64 abbrevNext; /* Tuple # at which to next check + * applicability */ + + /* + * These variables are specific to the CLUSTER case; they are set by + * tuplesort_begin_cluster. + */ + IndexInfo *indexInfo; /* info about index being used for reference */ + EState *estate; /* for evaluating index expressions */ + + /* + * These variables are specific to the IndexTuple case; they are set by + * tuplesort_begin_index_xxx and used only by the IndexTuple routines. + */ + Relation heapRel; /* table the index is being built on */ + Relation indexRel; /* index being built */ + + /* These are specific to the index_btree subcase: */ + bool enforceUnique; /* complain if we find duplicate tuples */ + + /* These are specific to the index_hash subcase: */ + uint32 high_mask; /* masks for sortable part of hash code */ + uint32 low_mask; + uint32 max_buckets; + + /* + * These variables are specific to the Datum case; they are set by + * tuplesort_begin_datum and used only by the DatumTuple routines. + */ + Oid datumType; + /* we need typelen in order to know how to copy the Datums. */ + int datumTypeLen; + + /* + * Resource snapshot for time of sort start. + */ +#ifdef TRACE_SORT + PGRUsage ru_start; +#endif +}; + +/* + * Private mutable state of tuplesort-parallel-operation. This is allocated + * in shared memory. + */ +struct Sharedsort +{ + /* mutex protects all fields prior to tapes */ + slock_t mutex; + + /* + * currentWorker generates ordinal identifier numbers for parallel sort + * workers. These start from 0, and are always gapless. + * + * Workers increment workersFinished to indicate having finished. If this + * is equal to state.nParticipants within the leader, leader is ready to + * merge worker runs. + */ + int currentWorker; + int workersFinished; + + /* Temporary file space */ + SharedFileSet fileset; + + /* Size of tapes flexible array */ + int nTapes; + + /* + * Tapes array used by workers to report back information needed by the + * leader to concatenate all worker tapes into one for merging + */ + TapeShare tapes[FLEXIBLE_ARRAY_MEMBER]; +}; + +/* + * Is the given tuple allocated from the slab memory arena? + */ +#define IS_SLAB_SLOT(state, tuple) \ + ((char *) (tuple) >= (state)->slabMemoryBegin && \ + (char *) (tuple) < (state)->slabMemoryEnd) + +/* + * Return the given tuple to the slab memory free list, or free it + * if it was palloc'd. 
+ */ +#define RELEASE_SLAB_SLOT(state, tuple) \ + do { \ + SlabSlot *buf = (SlabSlot *) tuple; \ + \ + if (IS_SLAB_SLOT((state), buf)) \ + { \ + buf->nextfree = (state)->slabFreeHead; \ + (state)->slabFreeHead = buf; \ + } else \ + pfree(buf); \ + } while(0) + +#define COMPARETUP(state,a,b) ((*(state)->comparetup) (a, b, state)) +#define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup)) +#define WRITETUP(state,tape,stup) ((*(state)->writetup) (state, tape, stup)) +#define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len)) +#define LACKMEM(state) ((state)->availMem < 0 && !(state)->slabAllocatorUsed) +#define USEMEM(state,amt) ((state)->availMem -= (amt)) +#define FREEMEM(state,amt) ((state)->availMem += (amt)) +#define SERIAL(state) ((state)->shared == NULL) +#define WORKER(state) ((state)->shared && (state)->worker != -1) +#define LEADER(state) ((state)->shared && (state)->worker == -1) + +/* + * NOTES about on-tape representation of tuples: + * + * We require the first "unsigned int" of a stored tuple to be the total size + * on-tape of the tuple, including itself (so it is never zero; an all-zero + * unsigned int is used to delimit runs). The remainder of the stored tuple + * may or may not match the in-memory representation of the tuple --- + * any conversion needed is the job of the writetup and readtup routines. + * + * If state->randomAccess is true, then the stored representation of the + * tuple must be followed by another "unsigned int" that is a copy of the + * length --- so the total tape space used is actually sizeof(unsigned int) + * more than the stored length value. This allows read-backwards. When + * randomAccess is not true, the write/read routines may omit the extra + * length word. + * + * writetup is expected to write both length words as well as the tuple + * data. When readtup is called, the tape is positioned just after the + * front length word; readtup must read the tuple data and advance past + * the back length word (if present). + * + * The write/read routines can make use of the tuple description data + * stored in the Tuplesortstate record, if needed. They are also expected + * to adjust state->availMem by the amount of memory space (not tape space!) + * released or consumed. There is no error return from either writetup + * or readtup; they should ereport() on failure. + * + * + * NOTES about memory consumption calculations: + * + * We count space allocated for tuples against the workMem limit, plus + * the space used by the variable-size memtuples array. Fixed-size space + * is not counted; it's small enough to not be interesting. + * + * Note that we count actual space used (as shown by GetMemoryChunkSpace) + * rather than the originally-requested size. This is important since + * palloc can add substantial overhead. It's not a complete answer since + * we won't count any wasted space in palloc allocation blocks, but it's + * a lot better than what we were doing before 7.3. As of 9.6, a + * separate memory context is used for caller passed tuples. Resetting + * it at certain key increments significantly ameliorates fragmentation. + * Note that this places a responsibility on copytup routines to use the + * correct memory context for these tuples (and to not use the reset + * context for anything whose lifetime needs to span multiple external + * sort runs). readtup routines use the slab allocator (they cannot use + * the reset context because it gets deleted at the point that merging + * begins). 
+ */ + +/* When using this macro, beware of double evaluation of len */ +#define LogicalTapeReadExact(tapeset, tapenum, ptr, len) \ + do { \ + if (LogicalTapeRead(tapeset, tapenum, ptr, len) != (size_t) (len)) \ + elog(ERROR, "unexpected end of data"); \ + } while(0) + + +static Tuplesortstate *tuplesort_begin_common(int workMem, + SortCoordinate coordinate, + bool randomAccess); +static void tuplesort_begin_batch(Tuplesortstate *state); +static void puttuple_common(Tuplesortstate *state, SortTuple *tuple); +static bool consider_abort_common(Tuplesortstate *state); +static void inittapes(Tuplesortstate *state, bool mergeruns); +static void inittapestate(Tuplesortstate *state, int maxTapes); +static void selectnewtape(Tuplesortstate *state); +static void init_slab_allocator(Tuplesortstate *state, int numSlots); +static void mergeruns(Tuplesortstate *state); +static void mergeonerun(Tuplesortstate *state); +static void beginmerge(Tuplesortstate *state); +static bool mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup); +static void dumptuples(Tuplesortstate *state, bool alltuples); +static void make_bounded_heap(Tuplesortstate *state); +static void sort_bounded_heap(Tuplesortstate *state); +static void tuplesort_sort_memtuples(Tuplesortstate *state); +static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple); +static void tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple); +static void tuplesort_heap_delete_top(Tuplesortstate *state); +static void reversedirection(Tuplesortstate *state); +static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK); +static void markrunend(Tuplesortstate *state, int tapenum); +static void *readtup_alloc(Tuplesortstate *state, Size tuplen); +static int comparetup_heap(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_heap(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_heap(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int comparetup_cluster(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_cluster(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_cluster(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int comparetup_index_btree(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static int comparetup_index_hash(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_index(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_index(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int comparetup_datum(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_datum(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_datum(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int worker_get_identifier(Tuplesortstate *state); +static void worker_freeze_result_tape(Tuplesortstate *state); +static void worker_nomergeruns(Tuplesortstate *state); +static void leader_takeover_tapes(Tuplesortstate *state); +static 
void free_sort_tuple(Tuplesortstate *state, SortTuple *stup); +static void tuplesort_free(Tuplesortstate *state); +static void tuplesort_updatemax(Tuplesortstate *state); + +/* + * Special versions of qsort just for SortTuple objects. qsort_tuple() sorts + * any variant of SortTuples, using the appropriate comparetup function. + * qsort_ssup() is specialized for the case where the comparetup function + * reduces to ApplySortComparator(), that is single-key MinimalTuple sorts + * and Datum sorts. + */ +#include "qsort_tuple.c" + + +/* + * tuplesort_begin_xxx + * + * Initialize for a tuple sort operation. + * + * After calling tuplesort_begin, the caller should call tuplesort_putXXX + * zero or more times, then call tuplesort_performsort when all the tuples + * have been supplied. After performsort, retrieve the tuples in sorted + * order by calling tuplesort_getXXX until it returns false/NULL. (If random + * access was requested, rescan, markpos, and restorepos can also be called.) + * Call tuplesort_end to terminate the operation and release memory/disk space. + * + * Each variant of tuplesort_begin has a workMem parameter specifying the + * maximum number of kilobytes of RAM to use before spilling data to disk. + * (The normal value of this parameter is work_mem, but some callers use + * other values.) Each variant also has a randomAccess parameter specifying + * whether the caller needs non-sequential access to the sort result. + */ + +static Tuplesortstate * +tuplesort_begin_common(int workMem, SortCoordinate coordinate, + bool randomAccess) +{ + Tuplesortstate *state; + MemoryContext maincontext; + MemoryContext sortcontext; + MemoryContext oldcontext; + + /* See leader_takeover_tapes() remarks on randomAccess support */ + if (coordinate && randomAccess) + elog(ERROR, "random access disallowed under parallel sort"); + + /* + * Memory context surviving tuplesort_reset. This memory context holds + * data which is useful to keep while sorting multiple similar batches. + */ + maincontext = AllocSetContextCreate(CurrentMemoryContext, + "TupleSort main", + ALLOCSET_DEFAULT_SIZES); + + /* + * Create a working memory context for one sort operation. The content of + * this context is deleted by tuplesort_reset. + */ + sortcontext = AllocSetContextCreate(maincontext, + "TupleSort sort", + ALLOCSET_DEFAULT_SIZES); + + /* + * Additionally a working memory context for tuples is setup in + * tuplesort_begin_batch. + */ + + /* + * Make the Tuplesortstate within the per-sortstate context. This way, we + * don't need a separate pfree() operation for it at shutdown. + */ + oldcontext = MemoryContextSwitchTo(maincontext); + + state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate)); + +#ifdef TRACE_SORT + if (trace_sort) + pg_rusage_init(&state->ru_start); +#endif + + state->randomAccess = randomAccess; + state->tuples = true; + + /* + * workMem is forced to be at least 64KB, the current minimum valid value + * for the work_mem GUC. This is a defense against parallel sort callers + * that divide out memory among many workers in a way that leaves each + * with very little memory. + */ + state->allowedMem = Max(workMem, 64) * (int64) 1024; + state->sortcontext = sortcontext; + state->maincontext = maincontext; + + /* + * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD; + * see comments in grow_memtuples(). 
+ */ + state->memtupsize = INITIAL_MEMTUPSIZE; + state->memtuples = NULL; + + /* + * After all of the other non-parallel-related state, we setup all of the + * state needed for each batch. + */ + tuplesort_begin_batch(state); + + /* + * Initialize parallel-related state based on coordination information + * from caller + */ + if (!coordinate) + { + /* Serial sort */ + state->shared = NULL; + state->worker = -1; + state->nParticipants = -1; + } + else if (coordinate->isWorker) + { + /* Parallel worker produces exactly one final run from all input */ + state->shared = coordinate->sharedsort; + state->worker = worker_get_identifier(state); + state->nParticipants = -1; + } + else + { + /* Parallel leader state only used for final merge */ + state->shared = coordinate->sharedsort; + state->worker = -1; + state->nParticipants = coordinate->nParticipants; + Assert(state->nParticipants >= 1); + } + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +/* + * tuplesort_begin_batch + * + * Setup, or reset, all state need for processing a new set of tuples with this + * sort state. Called both from tuplesort_begin_common (the first time sorting + * with this sort state) and tuplesort_reset (for subsequent usages). + */ +static void +tuplesort_begin_batch(Tuplesortstate *state) +{ + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(state->maincontext); + + /* + * Caller tuple (e.g. IndexTuple) memory context. + * + * A dedicated child context used exclusively for caller passed tuples + * eases memory management. Resetting at key points reduces + * fragmentation. Note that the memtuples array of SortTuples is allocated + * in the parent context, not this context, because there is no need to + * free memtuples early. + */ + state->tuplecontext = AllocSetContextCreate(state->sortcontext, + "Caller tuples", + ALLOCSET_DEFAULT_SIZES); + + state->status = TSS_INITIAL; + state->bounded = false; + state->boundUsed = false; + + state->availMem = state->allowedMem; + + state->tapeset = NULL; + + state->memtupcount = 0; + + /* + * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD; + * see comments in grow_memtuples(). 
+ */ + state->growmemtuples = true; + state->slabAllocatorUsed = false; + if (state->memtuples != NULL && state->memtupsize != INITIAL_MEMTUPSIZE) + { + pfree(state->memtuples); + state->memtuples = NULL; + state->memtupsize = INITIAL_MEMTUPSIZE; + } + if (state->memtuples == NULL) + { + state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + } + + /* workMem must be large enough for the minimal memtuples array */ + if (LACKMEM(state)) + elog(ERROR, "insufficient memory allowed for sort"); + + state->currentRun = 0; + + /* + * maxTapes, tapeRange, and Algorithm D variables will be initialized by + * inittapes(), if needed + */ + + state->result_tape = -1; /* flag that result tape has not been formed */ + + MemoryContextSwitchTo(oldcontext); +} + +Tuplesortstate * +tuplesort_begin_heap(TupleDesc tupDesc, + int nkeys, AttrNumber *attNums, + Oid *sortOperators, Oid *sortCollations, + bool *nullsFirstFlags, + int workMem, SortCoordinate coordinate, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->maincontext); + + AssertArg(nkeys > 0); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", + nkeys, workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = nkeys; + + TRACE_POSTGRESQL_SORT_START(HEAP_SORT, + false, /* no unique check */ + nkeys, + workMem, + randomAccess, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_heap; + state->copytup = copytup_heap; + state->writetup = writetup_heap; + state->readtup = readtup_heap; + + state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + state->abbrevNext = 10; + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData)); + + for (i = 0; i < nkeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + + AssertArg(attNums[i] != 0); + AssertArg(sortOperators[i] != 0); + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = sortCollations[i]; + sortKey->ssup_nulls_first = nullsFirstFlags[i]; + sortKey->ssup_attno = attNums[i]; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey); + } + + /* + * The "onlyKey" optimization cannot be used with abbreviated keys, since + * tie-breaker comparisons may be required. Typically, the optimization + * is only of value to pass-by-value types anyway, whereas abbreviated + * keys are typically only of value to pass-by-reference types. + */ + if (nkeys == 1 && !state->sortKeys->abbrev_converter) + state->onlyKey = state->sortKeys; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_cluster(TupleDesc tupDesc, + Relation indexRel, + int workMem, + SortCoordinate coordinate, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + BTScanInsert indexScanKey; + MemoryContext oldcontext; + int i; + + Assert(indexRel->rd_rel->relam == BTREE_AM_OID); + + oldcontext = MemoryContextSwitchTo(state->maincontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", + RelationGetNumberOfAttributes(indexRel), + workMem, randomAccess ? 
't' : 'f'); +#endif + + state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel); + + TRACE_POSTGRESQL_SORT_START(CLUSTER_SORT, + false, /* no unique check */ + state->nKeys, + workMem, + randomAccess, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_cluster; + state->copytup = copytup_cluster; + state->writetup = writetup_cluster; + state->readtup = readtup_cluster; + state->abbrevNext = 10; + + state->indexInfo = BuildIndexInfo(indexRel); + + state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + + indexScanKey = _bt_mkscankey(indexRel, NULL); + + if (state->indexInfo->ii_Expressions != NULL) + { + TupleTableSlot *slot; + ExprContext *econtext; + + /* + * We will need to use FormIndexDatum to evaluate the index + * expressions. To do that, we need an EState, as well as a + * TupleTableSlot to put the table tuples into. The econtext's + * scantuple has to point to that slot, too. + */ + state->estate = CreateExecutorState(); + slot = MakeSingleTupleTableSlot(tupDesc, &TTSOpsHeapTuple); + econtext = GetPerTupleExprContext(state->estate); + econtext->ecxt_scantuple = slot; + } + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + ScanKey scanKey = indexScanKey->scankeys + i; + int16 strategy; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = scanKey->sk_collation; + sortKey->ssup_nulls_first = + (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; + sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + AssertState(sortKey->ssup_attno != 0); + + strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? + BTGreaterStrategyNumber : BTLessStrategyNumber; + + PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); + } + + pfree(indexScanKey); + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_btree(Relation heapRel, + Relation indexRel, + bool enforceUnique, + int workMem, + SortCoordinate coordinate, + bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + BTScanInsert indexScanKey; + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->maincontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: unique = %c, workMem = %d, randomAccess = %c", + enforceUnique ? 't' : 'f', + workMem, randomAccess ? 
't' : 'f'); +#endif + + state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel); + + TRACE_POSTGRESQL_SORT_START(INDEX_SORT, + enforceUnique, + state->nKeys, + workMem, + randomAccess, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_index_btree; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + state->abbrevNext = 10; + + state->heapRel = heapRel; + state->indexRel = indexRel; + state->enforceUnique = enforceUnique; + + indexScanKey = _bt_mkscankey(indexRel, NULL); + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + ScanKey scanKey = indexScanKey->scankeys + i; + int16 strategy; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = scanKey->sk_collation; + sortKey->ssup_nulls_first = + (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; + sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + AssertState(sortKey->ssup_attno != 0); + + strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? + BTGreaterStrategyNumber : BTLessStrategyNumber; + + PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); + } + + pfree(indexScanKey); + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_hash(Relation heapRel, + Relation indexRel, + uint32 high_mask, + uint32 low_mask, + uint32 max_buckets, + int workMem, + SortCoordinate coordinate, + bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(state->maincontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: high_mask = 0x%x, low_mask = 0x%x, " + "max_buckets = 0x%x, workMem = %d, randomAccess = %c", + high_mask, + low_mask, + max_buckets, + workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = 1; /* Only one sort column, the hash code */ + + state->comparetup = comparetup_index_hash; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + + state->heapRel = heapRel; + state->indexRel = indexRel; + + state->high_mask = high_mask; + state->low_mask = low_mask; + state->max_buckets = max_buckets; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, + bool nullsFirstFlag, int workMem, + SortCoordinate coordinate, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + MemoryContext oldcontext; + int16 typlen; + bool typbyval; + + oldcontext = MemoryContextSwitchTo(state->maincontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin datum sort: workMem = %d, randomAccess = %c", + workMem, randomAccess ? 
't' : 'f'); +#endif + + state->nKeys = 1; /* always a one-column sort */ + + TRACE_POSTGRESQL_SORT_START(DATUM_SORT, + false, /* no unique check */ + 1, + workMem, + randomAccess, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_datum; + state->copytup = copytup_datum; + state->writetup = writetup_datum; + state->readtup = readtup_datum; + state->abbrevNext = 10; + + state->datumType = datumType; + + /* lookup necessary attributes of the datum type */ + get_typlenbyval(datumType, &typlen, &typbyval); + state->datumTypeLen = typlen; + state->tuples = !typbyval; + + /* Prepare SortSupport data */ + state->sortKeys = (SortSupport) palloc0(sizeof(SortSupportData)); + + state->sortKeys->ssup_cxt = CurrentMemoryContext; + state->sortKeys->ssup_collation = sortCollation; + state->sortKeys->ssup_nulls_first = nullsFirstFlag; + + /* + * Abbreviation is possible here only for by-reference types. In theory, + * a pass-by-value datatype could have an abbreviated form that is cheaper + * to compare. In a tuple sort, we could support that, because we can + * always extract the original datum from the tuple as needed. Here, we + * can't, because a datum sort only stores a single copy of the datum; the + * "tuple" field of each SortTuple is NULL. + */ + state->sortKeys->abbreviate = !typbyval; + + PrepareSortSupportFromOrderingOp(sortOperator, state->sortKeys); + + /* + * The "onlyKey" optimization cannot be used with abbreviated keys, since + * tie-breaker comparisons may be required. Typically, the optimization + * is only of value to pass-by-value types anyway, whereas abbreviated + * keys are typically only of value to pass-by-reference types. + */ + if (!state->sortKeys->abbrev_converter) + state->onlyKey = state->sortKeys; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +/* + * tuplesort_set_bound + * + * Advise tuplesort that at most the first N result tuples are required. + * + * Must be called before inserting any tuples. (Actually, we could allow it + * as long as the sort hasn't spilled to disk, but there seems no need for + * delayed calls at the moment.) + * + * This is a hint only. The tuplesort may still return more tuples than + * requested. Parallel leader tuplesorts will always ignore the hint. + */ +void +tuplesort_set_bound(Tuplesortstate *state, int64 bound) +{ + /* Assert we're called before loading any tuples */ + Assert(state->status == TSS_INITIAL && state->memtupcount == 0); + /* Can't set the bound twice, either */ + Assert(!state->bounded); + /* Also, this shouldn't be called in a parallel worker */ + Assert(!WORKER(state)); + + /* Parallel leader allows but ignores hint */ + if (LEADER(state)) + return; + +#ifdef DEBUG_BOUNDED_SORT + /* Honor GUC setting that disables the feature (for easy testing) */ + if (!optimize_bounded_sort) + return; +#endif + + /* We want to be able to compute bound * 2, so limit the setting */ + if (bound > (int64) (INT_MAX / 2)) + return; + + state->bounded = true; + state->bound = (int) bound; + + /* + * Bounded sorts are not an effective target for abbreviated key + * optimization. Disable by setting state to be consistent with no + * abbreviation support. 
+ */ + state->sortKeys->abbrev_converter = NULL; + if (state->sortKeys->abbrev_full_comparator) + state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator; + + /* Not strictly necessary, but be tidy */ + state->sortKeys->abbrev_abort = NULL; + state->sortKeys->abbrev_full_comparator = NULL; +} + +/* + * tuplesort_used_bound + * + * Allow callers to find out if the sort state was able to use a bound. + */ +bool +tuplesort_used_bound(Tuplesortstate *state) +{ + return state->boundUsed; +} + +/* + * tuplesort_free + * + * Internal routine for freeing resources of tuplesort. + */ +static void +tuplesort_free(Tuplesortstate *state) +{ + /* context swap probably not needed, but let's be safe */ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + long spaceUsed; + + if (state->tapeset) + spaceUsed = LogicalTapeSetBlocks(state->tapeset); + else + spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; +#endif + + /* + * Delete temporary "tape" files, if any. + * + * Note: want to include this in reported total cost of sort, hence need + * for two #ifdef TRACE_SORT sections. + */ + if (state->tapeset) + LogicalTapeSetClose(state->tapeset); + +#ifdef TRACE_SORT + if (trace_sort) + { + if (state->tapeset) + elog(LOG, "%s of worker %d ended, %ld disk blocks used: %s", + SERIAL(state) ? "external sort" : "parallel external sort", + state->worker, spaceUsed, pg_rusage_show(&state->ru_start)); + else + elog(LOG, "%s of worker %d ended, %ld KB used: %s", + SERIAL(state) ? "internal sort" : "unperformed parallel sort", + state->worker, spaceUsed, pg_rusage_show(&state->ru_start)); + } + + TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, spaceUsed); +#else + + /* + * If you disabled TRACE_SORT, you can still probe sort__done, but you + * ain't getting space-used stats. + */ + TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, 0L); +#endif + + /* Free any execution state created for CLUSTER case */ + if (state->estate != NULL) + { + ExprContext *econtext = GetPerTupleExprContext(state->estate); + + ExecDropSingleTupleTableSlot(econtext->ecxt_scantuple); + FreeExecutorState(state->estate); + } + + MemoryContextSwitchTo(oldcontext); + + /* + * Free the per-sort memory context, thereby releasing all working memory. + */ + MemoryContextReset(state->sortcontext); +} + +/* + * tuplesort_end + * + * Release resources and clean up. + * + * NOTE: after calling this, any pointers returned by tuplesort_getXXX are + * pointing to garbage. Be careful not to attempt to use or free such + * pointers afterwards! + */ +void +tuplesort_end(Tuplesortstate *state) +{ + tuplesort_free(state); + + /* + * Free the main memory context, including the Tuplesortstate struct + * itself. + */ + MemoryContextDelete(state->maincontext); +} + +/* + * tuplesort_updatemax + * + * Update maximum resource usage statistics. + */ +static void +tuplesort_updatemax(Tuplesortstate *state) +{ + int64 spaceUsed; + bool isSpaceDisk; + + /* + * Note: it might seem we should provide both memory and disk usage for a + * disk-based sort. However, the current code doesn't track memory space + * accurately once we have begun to return tuples to the caller (since we + * don't account for pfree's the caller is expected to do), so we cannot + * rely on availMem in a disk sort. This does not seem worth the overhead + * to fix. Is it worth creating an API for the memory context code to + * tell us how much is actually used in sortcontext? 
+ */ + if (state->tapeset) + { + isSpaceDisk = true; + spaceUsed = LogicalTapeSetBlocks(state->tapeset) * BLCKSZ; + } + else + { + isSpaceDisk = false; + spaceUsed = state->allowedMem - state->availMem; + } + + /* + * Sort evicts data to the disk when it wasn't able to fit that data into + * main memory. This is why we assume space used on the disk to be more + * important for tracking resource usage than space used in memory. Note + * that the amount of space occupied by some tupleset on the disk might be + * less than amount of space occupied by the same tupleset in memory due + * to more compact representation. + */ + if ((isSpaceDisk && !state->isMaxSpaceDisk) || + (isSpaceDisk == state->isMaxSpaceDisk && spaceUsed > state->maxSpace)) + { + state->maxSpace = spaceUsed; + state->isMaxSpaceDisk = isSpaceDisk; + state->maxSpaceStatus = state->status; + } +} + +/* + * tuplesort_reset + * + * Reset the tuplesort. Reset all the data in the tuplesort, but leave the + * meta-information in. After tuplesort_reset, tuplesort is ready to start + * a new sort. This allows avoiding recreation of tuple sort states (and + * save resources) when sorting multiple small batches. + */ +void +tuplesort_reset(Tuplesortstate *state) +{ + tuplesort_updatemax(state); + tuplesort_free(state); + + /* + * After we've freed up per-batch memory, re-setup all of the state common + * to both the first batch and any subsequent batch. + */ + tuplesort_begin_batch(state); + + state->lastReturnedTuple = NULL; + state->slabMemoryBegin = NULL; + state->slabMemoryEnd = NULL; + state->slabFreeHead = NULL; +} + +/* + * Grow the memtuples[] array, if possible within our memory constraint. We + * must not exceed INT_MAX tuples in memory or the caller-provided memory + * limit. Return true if we were able to enlarge the array, false if not. + * + * Normally, at each increment we double the size of the array. When doing + * that would exceed a limit, we attempt one last, smaller increase (and then + * clear the growmemtuples flag so we don't try any more). That allows us to + * use memory as fully as permitted; sticking to the pure doubling rule could + * result in almost half going unused. Because availMem moves around with + * tuple addition/removal, we need some rule to prevent making repeated small + * increases in memtupsize, which would just be useless thrashing. The + * growmemtuples flag accomplishes that and also prevents useless + * recalculations in this function. + */ +static bool +grow_memtuples(Tuplesortstate *state) +{ + int newmemtupsize; + int memtupsize = state->memtupsize; + int64 memNowUsed = state->allowedMem - state->availMem; + + /* Forget it if we've already maxed out memtuples, per comment above */ + if (!state->growmemtuples) + return false; + + /* Select new value of memtupsize */ + if (memNowUsed <= state->availMem) + { + /* + * We've used no more than half of allowedMem; double our usage, + * clamping at INT_MAX tuples. + */ + if (memtupsize < INT_MAX / 2) + newmemtupsize = memtupsize * 2; + else + { + newmemtupsize = INT_MAX; + state->growmemtuples = false; + } + } + else + { + /* + * This will be the last increment of memtupsize. Abandon doubling + * strategy and instead increase as much as we safely can. + * + * To stay within allowedMem, we can't increase memtupsize by more + * than availMem / sizeof(SortTuple) elements. In practice, we want + * to increase it by considerably less, because we need to leave some + * space for the tuples to which the new array slots will refer. 
We + * assume the new tuples will be about the same size as the tuples + * we've already seen, and thus we can extrapolate from the space + * consumption so far to estimate an appropriate new size for the + * memtuples array. The optimal value might be higher or lower than + * this estimate, but it's hard to know that in advance. We again + * clamp at INT_MAX tuples. + * + * This calculation is safe against enlarging the array so much that + * LACKMEM becomes true, because the memory currently used includes + * the present array; thus, there would be enough allowedMem for the + * new array elements even if no other memory were currently used. + * + * We do the arithmetic in float8, because otherwise the product of + * memtupsize and allowedMem could overflow. Any inaccuracy in the + * result should be insignificant; but even if we computed a + * completely insane result, the checks below will prevent anything + * really bad from happening. + */ + double grow_ratio; + + grow_ratio = (double) state->allowedMem / (double) memNowUsed; + if (memtupsize * grow_ratio < INT_MAX) + newmemtupsize = (int) (memtupsize * grow_ratio); + else + newmemtupsize = INT_MAX; + + /* We won't make any further enlargement attempts */ + state->growmemtuples = false; + } + + /* Must enlarge array by at least one element, else report failure */ + if (newmemtupsize <= memtupsize) + goto noalloc; + + /* + * On a 32-bit machine, allowedMem could exceed MaxAllocHugeSize. Clamp + * to ensure our request won't be rejected. Note that we can easily + * exhaust address space before facing this outcome. (This is presently + * impossible due to guc.c's MAX_KILOBYTES limitation on work_mem, but + * don't rely on that at this distance.) + */ + if ((Size) newmemtupsize >= MaxAllocHugeSize / sizeof(SortTuple)) + { + newmemtupsize = (int) (MaxAllocHugeSize / sizeof(SortTuple)); + state->growmemtuples = false; /* can't grow any more */ + } + + /* + * We need to be sure that we do not cause LACKMEM to become true, else + * the space management algorithm will go nuts. The code above should + * never generate a dangerous request, but to be safe, check explicitly + * that the array growth fits within availMem. (We could still cause + * LACKMEM if the memory chunk overhead associated with the memtuples + * array were to increase. That shouldn't happen because we chose the + * initial array size large enough to ensure that palloc will be treating + * both old and new arrays as separate chunks. But we'll check LACKMEM + * explicitly below just in case.) + */ + if (state->availMem < (int64) ((newmemtupsize - memtupsize) * sizeof(SortTuple))) + goto noalloc; + + /* OK, do it */ + FREEMEM(state, GetMemoryChunkSpace(state->memtuples)); + state->memtupsize = newmemtupsize; + state->memtuples = (SortTuple *) + repalloc_huge(state->memtuples, + state->memtupsize * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + if (LACKMEM(state)) + elog(ERROR, "unexpected out-of-memory situation in tuplesort"); + return true; + +noalloc: + /* If for any reason we didn't realloc, shut off future attempts */ + state->growmemtuples = false; + return false; +} + +/* + * Accept one tuple while collecting input data for sort. + * + * Note that the input data is always copied; the caller need not save it. 
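The growth policy grow_memtuples() implements above condenses to a few lines: double while no more than half the budget is in use, otherwise make one last increase extrapolated from the bytes consumed per slot so far and stop growing. A standalone sketch of that policy (the names and the 48-byte per-slot cost are illustrative, not taken from the file):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ARRAY_MAX 0x7fffffff            /* stand-in for the INT_MAX clamp */

static int
next_capacity(int capacity, int64_t memUsed, int64_t memBudget, bool *keepGrowing)
{
    double grow_ratio;

    if (memUsed * 2 <= memBudget)
    {
        /* Cheap case: at most half the budget is used, just double. */
        if (capacity < ARRAY_MAX / 2)
            return capacity * 2;
        *keepGrowing = false;
        return ARRAY_MAX;
    }

    /*
     * Last increase: extrapolate from the average bytes consumed per slot so
     * far, then stop growing for good.
     */
    *keepGrowing = false;
    grow_ratio = (double) memBudget / (double) memUsed;
    if (capacity * grow_ratio < ARRAY_MAX)
        return (int) (capacity * grow_ratio);
    return ARRAY_MAX;
}

int main(void)
{
    bool    grow = true;
    int     cap = 1024;
    int64_t used = 48 * 1024;           /* pretend each slot costs ~48 bytes */
    int64_t budget = 4 * 1024 * 1024;

    while (grow)
    {
        cap = next_capacity(cap, used, budget, &grow);
        used = (int64_t) cap * 48;
        printf("capacity=%d used=%lld\n", cap, (long long) used);
    }
    return 0;
}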
+ */ +void +tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + /* + * Copy the given tuple into memory we control, and decrease availMem. + * Then call the common code. + */ + COPYTUP(state, &stup, (void *) slot); + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Accept one tuple while collecting input data for sort. + * + * Note that the input data is always copied; the caller need not save it. + */ +void +tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + /* + * Copy the given tuple into memory we control, and decrease availMem. + * Then call the common code. + */ + COPYTUP(state, &stup, (void *) tup); + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Collect one index tuple while collecting input data for sort, building + * it from caller-supplied values. + */ +void +tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel, + ItemPointer self, Datum *values, + bool *isnull) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + SortTuple stup; + Datum original; + IndexTuple tuple; + + stup.tuple = index_form_tuple(RelationGetDescr(rel), values, isnull); + tuple = ((IndexTuple) stup.tuple); + tuple->t_tid = *self; + USEMEM(state, GetMemoryChunkSpace(stup.tuple)); + /* set up first-column key value */ + original = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup.isnull1); + + MemoryContextSwitchTo(state->sortcontext); + + if (!state->sortKeys || !state->sortKeys->abbrev_converter || stup.isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup.datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup.datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup.datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = mtup->tuple; + mtup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &mtup->isnull1); + } + } + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Accept one Datum while collecting input data for sort. + * + * If the Datum is pass-by-ref type, the value will be copied. 
+ */ +void +tuplesort_putdatum(Tuplesortstate *state, Datum val, bool isNull) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + SortTuple stup; + + /* + * Pass-by-value types or null values are just stored directly in + * stup.datum1 (and stup.tuple is not used and set to NULL). + * + * Non-null pass-by-reference values need to be copied into memory we + * control, and possibly abbreviated. The copied value is pointed to by + * stup.tuple and is treated as the canonical copy (e.g. to return via + * tuplesort_getdatum or when writing to tape); stup.datum1 gets the + * abbreviated value if abbreviation is happening, otherwise it's + * identical to stup.tuple. + */ + + if (isNull || !state->tuples) + { + /* + * Set datum1 to zeroed representation for NULLs (to be consistent, + * and to support cheap inequality tests for NULL abbreviated keys). + */ + stup.datum1 = !isNull ? val : (Datum) 0; + stup.isnull1 = isNull; + stup.tuple = NULL; /* no separate storage */ + MemoryContextSwitchTo(state->sortcontext); + } + else + { + Datum original = datumCopy(val, false, state->datumTypeLen); + + stup.isnull1 = false; + stup.tuple = DatumGetPointer(original); + USEMEM(state, GetMemoryChunkSpace(stup.tuple)); + MemoryContextSwitchTo(state->sortcontext); + + if (!state->sortKeys->abbrev_converter) + { + stup.datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup.datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup.datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any + * case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + mtup->datum1 = PointerGetDatum(mtup->tuple); + } + } + } + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Shared code for tuple and datum cases. + */ +static void +puttuple_common(Tuplesortstate *state, SortTuple *tuple) +{ + Assert(!LEADER(state)); + + switch (state->status) + { + case TSS_INITIAL: + + /* + * Save the tuple into the unsorted array. First, grow the array + * as needed. Note that we try to grow the array when there is + * still one free slot remaining --- if we fail, there'll still be + * room to store the incoming tuple, and then we'll switch to + * tape-based operation. + */ + if (state->memtupcount >= state->memtupsize - 1) + { + (void) grow_memtuples(state); + Assert(state->memtupcount < state->memtupsize); + } + state->memtuples[state->memtupcount++] = *tuple; + + /* + * Check if it's time to switch over to a bounded heapsort. We do + * so if the input tuple count exceeds twice the desired tuple + * count (this is a heuristic for where heapsort becomes cheaper + * than a quicksort), or if we've just filled workMem and have + * enough tuples to meet the bound. + * + * Note that once we enter TSS_BOUNDED state we will always try to + * complete the sort that way. In the worst case, if later input + * tuples are larger than earlier ones, this might cause us to + * exceed workMem significantly. 
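Earlier in this hunk, tuplesort_putdatum() stores pass-by-value data directly in datum1 while pass-by-reference data is copied into memory the sort owns, with datum1 holding the pointer (or an abbreviation of it). A toy model of those two storage shapes, outside the patch and with illustrative struct and field names:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Pass-by-value data lives directly in datum1 and owns no extra memory;
 * pass-by-reference data is copied and datum1 points at (or abbreviates)
 * that owned copy. */
typedef struct
{
    uintptr_t datum1;       /* the value itself, or a pointer/abbreviation */
    bool      isnull1;
    void     *tuple;        /* owned copy for by-reference data, else NULL */
} SortEntry;

static SortEntry store_byval(long val)
{
    SortEntry e = {(uintptr_t) val, false, NULL};
    return e;
}

static SortEntry store_byref(const char *val)
{
    SortEntry e;
    size_t    n = strlen(val) + 1;

    e.tuple = malloc(n);                /* the canonical, owned copy */
    memcpy(e.tuple, val, n);
    e.datum1 = (uintptr_t) e.tuple;     /* or an abbreviated form of it */
    e.isnull1 = false;
    return e;
}

int main(void)
{
    SortEntry a = store_byval(42);
    SortEntry b = store_byref("hello");

    printf("byval owns extra memory: %s\n", a.tuple ? "yes" : "no");
    printf("byref copy: %s\n", (char *) b.tuple);
    free(b.tuple);
    return 0;
}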
+ */ + if (state->bounded && + (state->memtupcount > state->bound * 2 || + (state->memtupcount > state->bound && LACKMEM(state)))) + { +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "switching to bounded heapsort at %d tuples: %s", + state->memtupcount, + pg_rusage_show(&state->ru_start)); +#endif + make_bounded_heap(state); + return; + } + + /* + * Done if we still fit in available memory and have array slots. + */ + if (state->memtupcount < state->memtupsize && !LACKMEM(state)) + return; + + /* + * Nope; time to switch to tape-based operation. + */ + inittapes(state, true); + + /* + * Dump all tuples. + */ + dumptuples(state, false); + break; + + case TSS_BOUNDED: + + /* + * We don't want to grow the array here, so check whether the new + * tuple can be discarded before putting it in. This should be a + * good speed optimization, too, since when there are many more + * input tuples than the bound, most input tuples can be discarded + * with just this one comparison. Note that because we currently + * have the sort direction reversed, we must check for <= not >=. + */ + if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0) + { + /* new tuple <= top of the heap, so we can discard it */ + free_sort_tuple(state, tuple); + CHECK_FOR_INTERRUPTS(); + } + else + { + /* discard top of heap, replacing it with the new tuple */ + free_sort_tuple(state, &state->memtuples[0]); + tuplesort_heap_replace_top(state, tuple); + } + break; + + case TSS_BUILDRUNS: + + /* + * Save the tuple into the unsorted array (there must be space) + */ + state->memtuples[state->memtupcount++] = *tuple; + + /* + * If we are over the memory limit, dump all tuples. + */ + dumptuples(state, false); + break; + + default: + elog(ERROR, "invalid tuplesort state"); + break; + } +} + +static bool +consider_abort_common(Tuplesortstate *state) +{ + Assert(state->sortKeys[0].abbrev_converter != NULL); + Assert(state->sortKeys[0].abbrev_abort != NULL); + Assert(state->sortKeys[0].abbrev_full_comparator != NULL); + + /* + * Check effectiveness of abbreviation optimization. Consider aborting + * when still within memory limit. + */ + if (state->status == TSS_INITIAL && + state->memtupcount >= state->abbrevNext) + { + state->abbrevNext *= 2; + + /* + * Check opclass-supplied abbreviation abort routine. It may indicate + * that abbreviation should not proceed. + */ + if (!state->sortKeys->abbrev_abort(state->memtupcount, + state->sortKeys)) + return false; + + /* + * Finally, restore authoritative comparator, and indicate that + * abbreviation is not in play by setting abbrev_converter to NULL + */ + state->sortKeys[0].comparator = state->sortKeys[0].abbrev_full_comparator; + state->sortKeys[0].abbrev_converter = NULL; + /* Not strictly necessary, but be tidy */ + state->sortKeys[0].abbrev_abort = NULL; + state->sortKeys[0].abbrev_full_comparator = NULL; + + /* Give up - expect original pass-by-value representation */ + return true; + } + + return false; +} + +/* + * All tuples have been provided; finish the sort. 
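consider_abort_common() above re-evaluates the abbreviation optimization only at geometrically spaced tuple counts (abbrevNext doubles each time), so the monitoring overhead stays logarithmic in the input size. The same checkpoint pattern in isolation (hypothetical names; the callback stands in for the opclass abbrev_abort hook):

#include <stdbool.h>
#include <stdio.h>

/* Re-evaluate an optimization at geometrically spaced input counts
 * (10, 20, 40, ...) so the monitoring cost is O(log n). */
typedef struct
{
    long next_check;                    /* next row count to re-evaluate at */
    bool enabled;                       /* is the optimization still on? */
    bool (*should_abort)(long nrows);
} Monitor;

static void monitor_row(Monitor *m, long nrows)
{
    if (!m->enabled || nrows < m->next_check)
        return;
    m->next_check *= 2;
    if (m->should_abort(nrows))
        m->enabled = false;             /* fall back to the full comparator */
}

/* Toy policy: give up once we have seen "many" rows. */
static bool abort_after_100(long nrows)
{
    return nrows >= 100;
}

int main(void)
{
    Monitor m = {10, true, abort_after_100};

    for (long n = 1; n <= 200 && m.enabled; n++)
        monitor_row(&m, n);
    printf("abbreviation still enabled: %s\n", m.enabled ? "yes" : "no");
    return 0;
}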
+ */ +void +tuplesort_performsort(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "performsort of worker %d starting: %s", + state->worker, pg_rusage_show(&state->ru_start)); +#endif + + switch (state->status) + { + case TSS_INITIAL: + + /* + * We were able to accumulate all the tuples within the allowed + * amount of memory, or leader to take over worker tapes + */ + if (SERIAL(state)) + { + /* Just qsort 'em and we're done */ + tuplesort_sort_memtuples(state); + state->status = TSS_SORTEDINMEM; + } + else if (WORKER(state)) + { + /* + * Parallel workers must still dump out tuples to tape. No + * merge is required to produce single output run, though. + */ + inittapes(state, false); + dumptuples(state, true); + worker_nomergeruns(state); + state->status = TSS_SORTEDONTAPE; + } + else + { + /* + * Leader will take over worker tapes and merge worker runs. + * Note that mergeruns sets the correct state->status. + */ + leader_takeover_tapes(state); + mergeruns(state); + } + state->current = 0; + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + + case TSS_BOUNDED: + + /* + * We were able to accumulate all the tuples required for output + * in memory, using a heap to eliminate excess tuples. Now we + * have to transform the heap to a properly-sorted array. + */ + sort_bounded_heap(state); + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + state->status = TSS_SORTEDINMEM; + break; + + case TSS_BUILDRUNS: + + /* + * Finish tape-based sort. First, flush all tuples remaining in + * memory out to tape; then merge until we have a single remaining + * run (or, if !randomAccess and !WORKER(), one run per tape). + * Note that mergeruns sets the correct state->status. + */ + dumptuples(state, true); + mergeruns(state); + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + +#ifdef TRACE_SORT + if (trace_sort) + { + if (state->status == TSS_FINALMERGE) + elog(LOG, "performsort of worker %d done (except %d-way final merge): %s", + state->worker, state->activeTapes, + pg_rusage_show(&state->ru_start)); + else + elog(LOG, "performsort of worker %d done: %s", + state->worker, pg_rusage_show(&state->ru_start)); + } +#endif + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Internal routine to fetch the next tuple in either forward or back + * direction into *stup. Returns false if no more tuples. + * Returned tuple belongs to tuplesort memory context, and must not be freed + * by caller. Note that fetched tuple is stored in memory that may be + * recycled by any future fetch. + */ +static bool +tuplesort_gettuple_common(Tuplesortstate *state, bool forward, + SortTuple *stup) +{ + unsigned int tuplen; + size_t nmoved; + + Assert(!WORKER(state)); + + switch (state->status) + { + case TSS_SORTEDINMEM: + Assert(forward || state->randomAccess); + Assert(!state->slabAllocatorUsed); + if (forward) + { + if (state->current < state->memtupcount) + { + *stup = state->memtuples[state->current++]; + return true; + } + state->eof_reached = true; + + /* + * Complain if caller tries to retrieve more tuples than + * originally asked for in a bounded sort. This is because + * returning EOF here might be the wrong thing. 
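As a rough map of tuplesort_performsort() above: a serial sort that never spilled ends up sorted in memory, a parallel worker always leaves a single run on tape for its leader, and a spilled serial sort either merges down to one frozen run (when random access is needed) or stops one pass early and merges on the fly. A deliberately simplified sketch that glosses over the leader's takeover of worker tapes (names are illustrative, not the file's):

#include <stdbool.h>
#include <stdio.h>

/* Simplified view of where the sort ends up after performsort. */
typedef enum
{
    IN_MEMORY_SORTED,       /* cf. TSS_SORTEDINMEM: qsort'd array, random access OK */
    ON_TAPE_SINGLE_RUN,     /* cf. TSS_SORTEDONTAPE: one materialized run on tape */
    FINAL_MERGE_PENDING     /* cf. TSS_FINALMERGE: tuples merged lazily on fetch */
} FinalState;

static FinalState
final_state(bool spilled_to_tape, bool need_random_access, bool parallel_worker)
{
    if (parallel_worker)
        return ON_TAPE_SINGLE_RUN;      /* workers always hand a run to the leader */
    if (!spilled_to_tape)
        return IN_MEMORY_SORTED;        /* covers the bounded-heap case too */
    if (need_random_access)
        return ON_TAPE_SINGLE_RUN;      /* merge down to one frozen run */
    return FINAL_MERGE_PENDING;         /* stop one pass early, merge on the fly */
}

int main(void)
{
    printf("%d %d %d\n",
           final_state(false, false, false),
           final_state(true, true, false),
           final_state(true, false, false));
    return 0;
}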
+ */ + if (state->bounded && state->current >= state->bound) + elog(ERROR, "retrieved too many tuples in a bounded sort"); + + return false; + } + else + { + if (state->current <= 0) + return false; + + /* + * if all tuples are fetched already then we return last + * tuple, else - tuple before last returned. + */ + if (state->eof_reached) + state->eof_reached = false; + else + { + state->current--; /* last returned tuple */ + if (state->current <= 0) + return false; + } + *stup = state->memtuples[state->current - 1]; + return true; + } + break; + + case TSS_SORTEDONTAPE: + Assert(forward || state->randomAccess); + Assert(state->slabAllocatorUsed); + + /* + * The slot that held the tuple that we returned in previous + * gettuple call can now be reused. + */ + if (state->lastReturnedTuple) + { + RELEASE_SLAB_SLOT(state, state->lastReturnedTuple); + state->lastReturnedTuple = NULL; + } + + if (forward) + { + if (state->eof_reached) + return false; + + if ((tuplen = getlen(state, state->result_tape, true)) != 0) + { + READTUP(state, stup, state->result_tape, tuplen); + + /* + * Remember the tuple we return, so that we can recycle + * its memory on next call. (This can be NULL, in the + * !state->tuples case). + */ + state->lastReturnedTuple = stup->tuple; + + return true; + } + else + { + state->eof_reached = true; + return false; + } + } + + /* + * Backward. + * + * if all tuples are fetched already then we return last tuple, + * else - tuple before last returned. + */ + if (state->eof_reached) + { + /* + * Seek position is pointing just past the zero tuplen at the + * end of file; back up to fetch last tuple's ending length + * word. If seek fails we must have a completely empty file. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + 2 * sizeof(unsigned int)); + if (nmoved == 0) + return false; + else if (nmoved != 2 * sizeof(unsigned int)) + elog(ERROR, "unexpected tape position"); + state->eof_reached = false; + } + else + { + /* + * Back up and fetch previously-returned tuple's ending length + * word. If seek fails, assume we are at start of file. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + sizeof(unsigned int)); + if (nmoved == 0) + return false; + else if (nmoved != sizeof(unsigned int)) + elog(ERROR, "unexpected tape position"); + tuplen = getlen(state, state->result_tape, false); + + /* + * Back up to get ending length word of tuple before it. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + tuplen + 2 * sizeof(unsigned int)); + if (nmoved == tuplen + sizeof(unsigned int)) + { + /* + * We backed up over the previous tuple, but there was no + * ending length word before it. That means that the prev + * tuple is the first tuple in the file. It is now the + * next to read in forward direction (not obviously right, + * but that is what in-memory case does). + */ + return false; + } + else if (nmoved != tuplen + 2 * sizeof(unsigned int)) + elog(ERROR, "bogus tuple length in backward scan"); + } + + tuplen = getlen(state, state->result_tape, false); + + /* + * Now we have the length of the prior tuple, back up and read it. + * Note: READTUP expects we are positioned after the initial + * length word of the tuple, so back up to that point. 
+ */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + tuplen); + if (nmoved != tuplen) + elog(ERROR, "bogus tuple length in backward scan"); + READTUP(state, stup, state->result_tape, tuplen); + + /* + * Remember the tuple we return, so that we can recycle its memory + * on next call. (This can be NULL, in the Datum case). + */ + state->lastReturnedTuple = stup->tuple; + + return true; + + case TSS_FINALMERGE: + Assert(forward); + /* We are managing memory ourselves, with the slab allocator. */ + Assert(state->slabAllocatorUsed); + + /* + * The slab slot holding the tuple that we returned in previous + * gettuple call can now be reused. + */ + if (state->lastReturnedTuple) + { + RELEASE_SLAB_SLOT(state, state->lastReturnedTuple); + state->lastReturnedTuple = NULL; + } + + /* + * This code should match the inner loop of mergeonerun(). + */ + if (state->memtupcount > 0) + { + int srcTape = state->memtuples[0].srctape; + SortTuple newtup; + + *stup = state->memtuples[0]; + + /* + * Remember the tuple we return, so that we can recycle its + * memory on next call. (This can be NULL, in the Datum case). + */ + state->lastReturnedTuple = stup->tuple; + + /* + * Pull next tuple from tape, and replace the returned tuple + * at top of the heap with it. + */ + if (!mergereadnext(state, srcTape, &newtup)) + { + /* + * If no more data, we've reached end of run on this tape. + * Remove the top node from the heap. + */ + tuplesort_heap_delete_top(state); + + /* + * Rewind to free the read buffer. It'd go away at the + * end of the sort anyway, but better to release the + * memory early. + */ + LogicalTapeRewindForWrite(state->tapeset, srcTape); + return true; + } + newtup.srctape = srcTape; + tuplesort_heap_replace_top(state, &newtup); + return true; + } + return false; + + default: + elog(ERROR, "invalid tuplesort state"); + return false; /* keep compiler quiet */ + } +} + +/* + * Fetch the next tuple in either forward or back direction. + * If successful, put tuple in slot and return true; else, clear the slot + * and return false. + * + * Caller may optionally be passed back abbreviated value (on true return + * value) when abbreviation was used, which can be used to cheaply avoid + * equality checks that might otherwise be required. Caller can safely make a + * determination of "non-equal tuple" based on simple binary inequality. A + * NULL value in leading attribute will set abbreviated value to zeroed + * representation, which caller may rely on in abbreviated inequality check. + * + * If copy is true, the slot receives a tuple that's been copied into the + * caller's memory context, so that it will stay valid regardless of future + * manipulations of the tuplesort's state (up to and including deleting the + * tuplesort). If copy is false, the slot will just receive a pointer to a + * tuple held within the tuplesort, which is more efficient, but only safe for + * callers that are prepared to have any subsequent manipulation of the + * tuplesort's state invalidate slot contents. 
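The backward scan above works because, when random access is requested, each tuple is written to tape with its length both before and after the payload, so the reader can back up without any auxiliary index. A self-contained sketch of that record layout over an in-memory buffer (illustrative only, not the file's tape code):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Every record is laid out as [4-byte length][payload][4-byte length].
 * The trailing copy of the length is what makes backward scans possible. */
static size_t write_record(unsigned char *tape, size_t pos,
                           const void *data, uint32_t len)
{
    memcpy(tape + pos, &len, sizeof(len));                      /* leading length */
    memcpy(tape + pos + sizeof(len), data, len);                /* payload */
    memcpy(tape + pos + sizeof(len) + len, &len, sizeof(len));  /* trailing length */
    return pos + 2 * sizeof(len) + len;
}

/* Step backward over the record that ends at *pos; return its payload. */
static const unsigned char *read_backward(const unsigned char *tape,
                                          size_t *pos, uint32_t *len)
{
    uint32_t l;

    memcpy(&l, tape + *pos - sizeof(l), sizeof(l));             /* trailing length */
    *pos -= 2 * sizeof(l) + l;                                  /* start of record */
    *len = l;
    return tape + *pos + sizeof(l);
}

int main(void)
{
    unsigned char tape[128];
    size_t end = 0;

    end = write_record(tape, end, "alpha", 5);
    end = write_record(tape, end, "bravo", 5);

    uint32_t len;
    const unsigned char *p = read_backward(tape, &end, &len);
    printf("%.*s\n", (int) len, (const char *) p);   /* prints "bravo" */
    p = read_backward(tape, &end, &len);
    printf("%.*s\n", (int) len, (const char *) p);   /* prints "alpha" */
    return 0;
}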
+ */ +bool +tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy, + TupleTableSlot *slot, Datum *abbrev) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + if (stup.tuple) + { + /* Record abbreviated key for caller */ + if (state->sortKeys->abbrev_converter && abbrev) + *abbrev = stup.datum1; + + if (copy) + stup.tuple = heap_copy_minimal_tuple((MinimalTuple) stup.tuple); + + ExecStoreMinimalTuple((MinimalTuple) stup.tuple, slot, copy); + return true; + } + else + { + ExecClearTuple(slot); + return false; + } +} + +/* + * Fetch the next tuple in either forward or back direction. + * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory + * context, and must not be freed by caller. Caller may not rely on tuple + * remaining valid after any further manipulation of tuplesort. + */ +HeapTuple +tuplesort_getheaptuple(Tuplesortstate *state, bool forward) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + return stup.tuple; +} + +/* + * Fetch the next index tuple in either forward or back direction. + * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory + * context, and must not be freed by caller. Caller may not rely on tuple + * remaining valid after any further manipulation of tuplesort. + */ +IndexTuple +tuplesort_getindextuple(Tuplesortstate *state, bool forward) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + return (IndexTuple) stup.tuple; +} + +/* + * Fetch the next Datum in either forward or back direction. + * Returns false if no more datums. + * + * If the Datum is pass-by-ref type, the returned value is freshly palloc'd + * in caller's context, and is now owned by the caller (this differs from + * similar routines for other types of tuplesorts). + * + * Caller may optionally be passed back abbreviated value (on true return + * value) when abbreviation was used, which can be used to cheaply avoid + * equality checks that might otherwise be required. Caller can safely make a + * determination of "non-equal tuple" based on simple binary inequality. A + * NULL value will have a zeroed abbreviated value representation, which caller + * may rely on in abbreviated inequality check. 
+ */ +bool +tuplesort_getdatum(Tuplesortstate *state, bool forward, + Datum *val, bool *isNull, Datum *abbrev) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + { + MemoryContextSwitchTo(oldcontext); + return false; + } + + /* Ensure we copy into caller's memory context */ + MemoryContextSwitchTo(oldcontext); + + /* Record abbreviated key for caller */ + if (state->sortKeys->abbrev_converter && abbrev) + *abbrev = stup.datum1; + + if (stup.isnull1 || !state->tuples) + { + *val = stup.datum1; + *isNull = stup.isnull1; + } + else + { + /* use stup.tuple because stup.datum1 may be an abbreviation */ + *val = datumCopy(PointerGetDatum(stup.tuple), false, state->datumTypeLen); + *isNull = false; + } + + return true; +} + +/* + * Advance over N tuples in either forward or back direction, + * without returning any data. N==0 is a no-op. + * Returns true if successful, false if ran out of tuples. + */ +bool +tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, bool forward) +{ + MemoryContext oldcontext; + + /* + * We don't actually support backwards skip yet, because no callers need + * it. The API is designed to allow for that later, though. + */ + Assert(forward); + Assert(ntuples >= 0); + Assert(!WORKER(state)); + + switch (state->status) + { + case TSS_SORTEDINMEM: + if (state->memtupcount - state->current >= ntuples) + { + state->current += ntuples; + return true; + } + state->current = state->memtupcount; + state->eof_reached = true; + + /* + * Complain if caller tries to retrieve more tuples than + * originally asked for in a bounded sort. This is because + * returning EOF here might be the wrong thing. + */ + if (state->bounded && state->current >= state->bound) + elog(ERROR, "retrieved too many tuples in a bounded sort"); + + return false; + + case TSS_SORTEDONTAPE: + case TSS_FINALMERGE: + + /* + * We could probably optimize these cases better, but for now it's + * not worth the trouble. + */ + oldcontext = MemoryContextSwitchTo(state->sortcontext); + while (ntuples-- > 0) + { + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + { + MemoryContextSwitchTo(oldcontext); + return false; + } + CHECK_FOR_INTERRUPTS(); + } + MemoryContextSwitchTo(oldcontext); + return true; + + default: + elog(ERROR, "invalid tuplesort state"); + return false; /* keep compiler quiet */ + } +} + +/* + * tuplesort_merge_order - report merge order we'll use for given memory + * (note: "merge order" just means the number of input tapes in the merge). + * + * This is exported for use by the planner. allowedMem is in bytes. + */ +int +tuplesort_merge_order(int64 allowedMem) +{ + int mOrder; + + /* + * We need one tape for each merge input, plus another one for the output, + * and each of these tapes needs buffer space. In addition we want + * MERGE_BUFFER_SIZE workspace per input tape (but the output tape doesn't + * count). + * + * Note: you might be thinking we need to account for the memtuples[] + * array in this calculation, but we effectively treat that as part of the + * MERGE_BUFFER_SIZE workspace. + */ + mOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) / + (MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD); + + /* + * Even in minimum memory, use at least a MINORDER merge. On the other + * hand, even when we have lots of memory, do not use more than a MAXORDER + * merge. Tapes are pretty cheap, but they're not entirely free. 
Each + * additional tape reduces the amount of memory available to build runs, + * which in turn can cause the same sort to need more runs, which makes + * merging slower even if it can still be done in a single pass. Also, + * high order merges are quite slow due to CPU cache effects; it can be + * faster to pay the I/O cost of a polyphase merge than to perform a + * single merge pass across many hundreds of tapes. + */ + mOrder = Max(mOrder, MINORDER); + mOrder = Min(mOrder, MAXORDER); + + return mOrder; +} + +/* + * inittapes - initialize for tape sorting. + * + * This is called only if we have found we won't sort in memory. + */ +static void +inittapes(Tuplesortstate *state, bool mergeruns) +{ + int maxTapes, + j; + + Assert(!LEADER(state)); + + if (mergeruns) + { + /* Compute number of tapes to use: merge order plus 1 */ + maxTapes = tuplesort_merge_order(state->allowedMem) + 1; + } + else + { + /* Workers can sometimes produce single run, output without merge */ + Assert(WORKER(state)); + maxTapes = MINORDER + 1; + } + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d switching to external sort with %d tapes: %s", + state->worker, maxTapes, pg_rusage_show(&state->ru_start)); +#endif + + /* Create the tape set and allocate the per-tape data arrays */ + inittapestate(state, maxTapes); + state->tapeset = + LogicalTapeSetCreate(maxTapes, false, NULL, + state->shared ? &state->shared->fileset : NULL, + state->worker); + + state->currentRun = 0; + + /* + * Initialize variables of Algorithm D (step D1). + */ + for (j = 0; j < maxTapes; j++) + { + state->tp_fib[j] = 1; + state->tp_runs[j] = 0; + state->tp_dummy[j] = 1; + state->tp_tapenum[j] = j; + } + state->tp_fib[state->tapeRange] = 0; + state->tp_dummy[state->tapeRange] = 0; + + state->Level = 1; + state->destTape = 0; + + state->status = TSS_BUILDRUNS; +} + +/* + * inittapestate - initialize generic tape management state + */ +static void +inittapestate(Tuplesortstate *state, int maxTapes) +{ + int64 tapeSpace; + + /* + * Decrease availMem to reflect the space needed for tape buffers; but + * don't decrease it to the point that we have no room for tuples. (That + * case is only likely to occur if sorting pass-by-value Datums; in all + * other scenarios the memtuples[] array is unlikely to occupy more than + * half of allowedMem. In the pass-by-value case it's not important to + * account for tuple space, so we don't care if LACKMEM becomes + * inaccurate.) + */ + tapeSpace = (int64) maxTapes * TAPE_BUFFER_OVERHEAD; + + if (tapeSpace + GetMemoryChunkSpace(state->memtuples) < state->allowedMem) + USEMEM(state, tapeSpace); + + /* + * Make sure that the temp file(s) underlying the tape set are created in + * suitable temp tablespaces. For parallel sorts, this should have been + * called already, but it doesn't matter if it is called a second time. + */ + PrepareTempTablespaces(); + + state->mergeactive = (bool *) palloc0(maxTapes * sizeof(bool)); + state->tp_fib = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_runs = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_dummy = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int)); + + /* Record # of tapes allocated (for duration of sort) */ + state->maxTapes = maxTapes; + /* Record maximum # of tapes usable as inputs when merging */ + state->tapeRange = maxTapes - 1; +} + +/* + * selectnewtape -- select new tape for new initial run. + * + * This is called after finishing a run when we know another run + * must be started. 
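tuplesort_merge_order() above is plain arithmetic: reserve a buffer for the output tape, divide the remaining budget by the per-input-tape cost, and clamp the result. A standalone version with stand-in constants (the real TAPE_BUFFER_OVERHEAD, MERGE_BUFFER_SIZE, MINORDER and MAXORDER are defined elsewhere in this file and may differ from the numbers below):

#include <stdint.h>
#include <stdio.h>

#define TAPE_BUF   (8 * 1024)       /* per-tape buffer, assumed */
#define MERGE_BUF  (256 * 1024)     /* per-input-tape workspace, assumed */
#define MIN_ORDER  6
#define MAX_ORDER  500

static int merge_order(int64_t allowedMem)
{
    int64_t mOrder = (allowedMem - TAPE_BUF) / (MERGE_BUF + TAPE_BUF);

    if (mOrder < MIN_ORDER)
        mOrder = MIN_ORDER;
    if (mOrder > MAX_ORDER)
        mOrder = MAX_ORDER;
    return (int) mOrder;
}

int main(void)
{
    /* 4 MB, 64 MB and 1 GB work_mem-style budgets */
    printf("%d %d %d\n",
           merge_order(4LL * 1024 * 1024),
           merge_order(64LL * 1024 * 1024),
           merge_order(1024LL * 1024 * 1024));
    return 0;
}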
This implements steps D3, D4 of Algorithm D. + */ +static void +selectnewtape(Tuplesortstate *state) +{ + int j; + int a; + + /* Step D3: advance j (destTape) */ + if (state->tp_dummy[state->destTape] < state->tp_dummy[state->destTape + 1]) + { + state->destTape++; + return; + } + if (state->tp_dummy[state->destTape] != 0) + { + state->destTape = 0; + return; + } + + /* Step D4: increase level */ + state->Level++; + a = state->tp_fib[0]; + for (j = 0; j < state->tapeRange; j++) + { + state->tp_dummy[j] = a + state->tp_fib[j + 1] - state->tp_fib[j]; + state->tp_fib[j] = a + state->tp_fib[j + 1]; + } + state->destTape = 0; +} + +/* + * Initialize the slab allocation arena, for the given number of slots. + */ +static void +init_slab_allocator(Tuplesortstate *state, int numSlots) +{ + if (numSlots > 0) + { + char *p; + int i; + + state->slabMemoryBegin = palloc(numSlots * SLAB_SLOT_SIZE); + state->slabMemoryEnd = state->slabMemoryBegin + + numSlots * SLAB_SLOT_SIZE; + state->slabFreeHead = (SlabSlot *) state->slabMemoryBegin; + USEMEM(state, numSlots * SLAB_SLOT_SIZE); + + p = state->slabMemoryBegin; + for (i = 0; i < numSlots - 1; i++) + { + ((SlabSlot *) p)->nextfree = (SlabSlot *) (p + SLAB_SLOT_SIZE); + p += SLAB_SLOT_SIZE; + } + ((SlabSlot *) p)->nextfree = NULL; + } + else + { + state->slabMemoryBegin = state->slabMemoryEnd = NULL; + state->slabFreeHead = NULL; + } + state->slabAllocatorUsed = true; +} + +/* + * mergeruns -- merge all the completed initial runs. + * + * This implements steps D5, D6 of Algorithm D. All input data has + * already been written to initial runs on tape (see dumptuples). + */ +static void +mergeruns(Tuplesortstate *state) +{ + int tapenum, + svTape, + svRuns, + svDummy; + int numTapes; + int numInputTapes; + + Assert(state->status == TSS_BUILDRUNS); + Assert(state->memtupcount == 0); + + if (state->sortKeys != NULL && state->sortKeys->abbrev_converter != NULL) + { + /* + * If there are multiple runs to be merged, when we go to read back + * tuples from disk, abbreviated keys will not have been stored, and + * we don't care to regenerate them. Disable abbreviation from this + * point on. + */ + state->sortKeys->abbrev_converter = NULL; + state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator; + + /* Not strictly necessary, but be tidy */ + state->sortKeys->abbrev_abort = NULL; + state->sortKeys->abbrev_full_comparator = NULL; + } + + /* + * Reset tuple memory. We've freed all the tuples that we previously + * allocated. We will use the slab allocator from now on. + */ + MemoryContextResetOnly(state->tuplecontext); + + /* + * We no longer need a large memtuples array. (We will allocate a smaller + * one for the heap later.) + */ + FREEMEM(state, GetMemoryChunkSpace(state->memtuples)); + pfree(state->memtuples); + state->memtuples = NULL; + + /* + * If we had fewer runs than tapes, refund the memory that we imagined we + * would need for the tape buffers of the unused tapes. + * + * numTapes and numInputTapes reflect the actual number of tapes we will + * use. Note that the output tape's tape number is maxTapes - 1, so the + * tape numbers of the used tapes are not consecutive, and you cannot just + * loop from 0 to numTapes to visit all used tapes! + */ + if (state->Level == 1) + { + numInputTapes = state->currentRun; + numTapes = numInputTapes + 1; + FREEMEM(state, (state->maxTapes - numTapes) * TAPE_BUFFER_OVERHEAD); + } + else + { + numInputTapes = state->tapeRange; + numTapes = state->maxTapes; + } + + /* + * Initialize the slab allocator. 
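init_slab_allocator() above threads every free slot into a single list by reusing the free slot's own bytes as the next-free pointer, so allocation and release are constant-time with no per-slot bookkeeping. The same structure as a standalone toy allocator (slot count, slot size and names are illustrative):

#include <stdio.h>
#include <stdlib.h>

#define SLOT_SIZE  1024
#define NUM_SLOTS  8

typedef union SlabSlot
{
    union SlabSlot *nextfree;
    char            space[SLOT_SIZE];
} SlabSlot;

static char     *arena;
static SlabSlot *freehead;

static void slab_init(void)
{
    arena = malloc((size_t) NUM_SLOTS * SLOT_SIZE);
    freehead = (SlabSlot *) arena;
    for (int i = 0; i < NUM_SLOTS - 1; i++)
    {
        SlabSlot *slot = (SlabSlot *) (arena + (size_t) i * SLOT_SIZE);

        slot->nextfree = (SlabSlot *) (arena + (size_t) (i + 1) * SLOT_SIZE);
    }
    ((SlabSlot *) (arena + (size_t) (NUM_SLOTS - 1) * SLOT_SIZE))->nextfree = NULL;
}

static void *slab_alloc(void)
{
    SlabSlot *slot = freehead;

    if (slot == NULL)
        return NULL;                /* the real code falls back to palloc() here */
    freehead = slot->nextfree;
    return slot;
}

static void slab_release(void *p)
{
    SlabSlot *slot = p;

    slot->nextfree = freehead;
    freehead = slot;
}

int main(void)
{
    slab_init();
    void *a = slab_alloc();
    void *b = slab_alloc();

    slab_release(a);
    void *c = slab_alloc();         /* reuses a's slot */
    printf("reused: %s\n", c == a ? "yes" : "no");
    (void) b;
    free(arena);
    return 0;
}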
We need one slab slot per input tape, + * for the tuples in the heap, plus one to hold the tuple last returned + * from tuplesort_gettuple. (If we're sorting pass-by-val Datums, + * however, we don't need to do allocate anything.) + * + * From this point on, we no longer use the USEMEM()/LACKMEM() mechanism + * to track memory usage of individual tuples. + */ + if (state->tuples) + init_slab_allocator(state, numInputTapes + 1); + else + init_slab_allocator(state, 0); + + /* + * Allocate a new 'memtuples' array, for the heap. It will hold one tuple + * from each input tape. + */ + state->memtupsize = numInputTapes; + state->memtuples = (SortTuple *) MemoryContextAlloc(state->maincontext, + numInputTapes * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + + /* + * Use all the remaining memory we have available for read buffers among + * the input tapes. + * + * We don't try to "rebalance" the memory among tapes, when we start a new + * merge phase, even if some tapes are inactive in the new phase. That + * would be hard, because logtape.c doesn't know where one run ends and + * another begins. When a new merge phase begins, and a tape doesn't + * participate in it, its buffer nevertheless already contains tuples from + * the next run on same tape, so we cannot release the buffer. That's OK + * in practice, merge performance isn't that sensitive to the amount of + * buffers used, and most merge phases use all or almost all tapes, + * anyway. + */ +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d using " INT64_FORMAT " KB of memory for read buffers among %d input tapes", + state->worker, state->availMem / 1024, numInputTapes); +#endif + + state->read_buffer_size = Max(state->availMem / numInputTapes, 0); + USEMEM(state, state->read_buffer_size * numInputTapes); + + /* End of step D2: rewind all output tapes to prepare for merging */ + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + LogicalTapeRewindForRead(state->tapeset, tapenum, state->read_buffer_size); + + for (;;) + { + /* + * At this point we know that tape[T] is empty. If there's just one + * (real or dummy) run left on each input tape, then only one merge + * pass remains. If we don't have to produce a materialized sorted + * tape, we can stop at this point and do the final merge on-the-fly. 
+ */ + if (!state->randomAccess && !WORKER(state)) + { + bool allOneRun = true; + + Assert(state->tp_runs[state->tapeRange] == 0); + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_runs[tapenum] + state->tp_dummy[tapenum] != 1) + { + allOneRun = false; + break; + } + } + if (allOneRun) + { + /* Tell logtape.c we won't be writing anymore */ + LogicalTapeSetForgetFreeSpace(state->tapeset); + /* Initialize for the final merge pass */ + beginmerge(state); + state->status = TSS_FINALMERGE; + return; + } + } + + /* Step D5: merge runs onto tape[T] until tape[P] is empty */ + while (state->tp_runs[state->tapeRange - 1] || + state->tp_dummy[state->tapeRange - 1]) + { + bool allDummy = true; + + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_dummy[tapenum] == 0) + { + allDummy = false; + break; + } + } + + if (allDummy) + { + state->tp_dummy[state->tapeRange]++; + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + state->tp_dummy[tapenum]--; + } + else + mergeonerun(state); + } + + /* Step D6: decrease level */ + if (--state->Level == 0) + break; + /* rewind output tape T to use as new input */ + LogicalTapeRewindForRead(state->tapeset, state->tp_tapenum[state->tapeRange], + state->read_buffer_size); + /* rewind used-up input tape P, and prepare it for write pass */ + LogicalTapeRewindForWrite(state->tapeset, state->tp_tapenum[state->tapeRange - 1]); + state->tp_runs[state->tapeRange - 1] = 0; + + /* + * reassign tape units per step D6; note we no longer care about A[] + */ + svTape = state->tp_tapenum[state->tapeRange]; + svDummy = state->tp_dummy[state->tapeRange]; + svRuns = state->tp_runs[state->tapeRange]; + for (tapenum = state->tapeRange; tapenum > 0; tapenum--) + { + state->tp_tapenum[tapenum] = state->tp_tapenum[tapenum - 1]; + state->tp_dummy[tapenum] = state->tp_dummy[tapenum - 1]; + state->tp_runs[tapenum] = state->tp_runs[tapenum - 1]; + } + state->tp_tapenum[0] = svTape; + state->tp_dummy[0] = svDummy; + state->tp_runs[0] = svRuns; + } + + /* + * Done. Knuth says that the result is on TAPE[1], but since we exited + * the loop without performing the last iteration of step D6, we have not + * rearranged the tape unit assignment, and therefore the result is on + * TAPE[T]. We need to do it this way so that we can freeze the final + * output tape while rewinding it. The last iteration of step D6 would be + * a waste of cycles anyway... + */ + state->result_tape = state->tp_tapenum[state->tapeRange]; + if (!WORKER(state)) + LogicalTapeFreeze(state->tapeset, state->result_tape, NULL); + else + worker_freeze_result_tape(state); + state->status = TSS_SORTEDONTAPE; + + /* Release the read buffers of all the other tapes, by rewinding them. */ + for (tapenum = 0; tapenum < state->maxTapes; tapenum++) + { + if (tapenum != state->result_tape) + LogicalTapeRewindForWrite(state->tapeset, tapenum); + } +} + +/* + * Merge one run from each input tape, except ones with dummy runs. + * + * This is the inner loop of Algorithm D step D5. We know that the + * output tape is TAPE[T]. + */ +static void +mergeonerun(Tuplesortstate *state) +{ + int destTape = state->tp_tapenum[state->tapeRange]; + int srcTape; + + /* + * Start the merge by loading one tuple from each active source tape into + * the heap. We can also decrease the input run/dummy run counts. 
+ */ + beginmerge(state); + + /* + * Execute merge by repeatedly extracting lowest tuple in heap, writing it + * out, and replacing it with next tuple from same tape (if there is + * another one). + */ + while (state->memtupcount > 0) + { + SortTuple stup; + + /* write the tuple to destTape */ + srcTape = state->memtuples[0].srctape; + WRITETUP(state, destTape, &state->memtuples[0]); + + /* recycle the slot of the tuple we just wrote out, for the next read */ + if (state->memtuples[0].tuple) + RELEASE_SLAB_SLOT(state, state->memtuples[0].tuple); + + /* + * pull next tuple from the tape, and replace the written-out tuple in + * the heap with it. + */ + if (mergereadnext(state, srcTape, &stup)) + { + stup.srctape = srcTape; + tuplesort_heap_replace_top(state, &stup); + } + else + tuplesort_heap_delete_top(state); + } + + /* + * When the heap empties, we're done. Write an end-of-run marker on the + * output tape, and increment its count of real runs. + */ + markrunend(state, destTape); + state->tp_runs[state->tapeRange]++; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d finished %d-way merge step: %s", state->worker, + state->activeTapes, pg_rusage_show(&state->ru_start)); +#endif +} + +/* + * beginmerge - initialize for a merge pass + * + * We decrease the counts of real and dummy runs for each tape, and mark + * which tapes contain active input runs in mergeactive[]. Then, fill the + * merge heap with the first tuple from each active tape. + */ +static void +beginmerge(Tuplesortstate *state) +{ + int activeTapes; + int tapenum; + int srcTape; + + /* Heap should be empty here */ + Assert(state->memtupcount == 0); + + /* Adjust run counts and mark the active tapes */ + memset(state->mergeactive, 0, + state->maxTapes * sizeof(*state->mergeactive)); + activeTapes = 0; + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_dummy[tapenum] > 0) + state->tp_dummy[tapenum]--; + else + { + Assert(state->tp_runs[tapenum] > 0); + state->tp_runs[tapenum]--; + srcTape = state->tp_tapenum[tapenum]; + state->mergeactive[srcTape] = true; + activeTapes++; + } + } + Assert(activeTapes > 0); + state->activeTapes = activeTapes; + + /* Load the merge heap with the first tuple from each input tape */ + for (srcTape = 0; srcTape < state->maxTapes; srcTape++) + { + SortTuple tup; + + if (mergereadnext(state, srcTape, &tup)) + { + tup.srctape = srcTape; + tuplesort_heap_insert(state, &tup); + } + } +} + +/* + * mergereadnext - read next tuple from one merge input tape + * + * Returns false on EOF. + */ +static bool +mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup) +{ + unsigned int tuplen; + + if (!state->mergeactive[srcTape]) + return false; /* tape's run is already exhausted */ + + /* read next tuple, if any */ + if ((tuplen = getlen(state, srcTape, true)) == 0) + { + state->mergeactive[srcTape] = false; + return false; + } + READTUP(state, stup, srcTape, tuplen); + + return true; +} + +/* + * dumptuples - remove tuples from memtuples and write initial run to tape + * + * When alltuples = true, dump everything currently in memory. (This case is + * only used at end of input data.) + */ +static void +dumptuples(Tuplesortstate *state, bool alltuples) +{ + int memtupwrite; + int i; + + /* + * Nothing to do if we still fit in available memory and have array slots, + * unless this is the final call during initial run generation. 
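mergeonerun() and beginmerge() above implement a classic k-way merge: a small binary heap holds one tuple per active input tape, the smallest is written out, and it is replaced by the next tuple from the same tape (or the heap shrinks when that tape's run is exhausted). A self-contained sketch over arrays of ints; the heap routines mirror the sift-up and sift-down used later in this file, and all names are illustrative:

#include <stdio.h>

typedef struct
{
    int value;
    int src;                        /* which input run the element came from */
} HeapItem;

/* Insert at the end and sift up (cf. tuplesort_heap_insert). */
static void heap_insert(HeapItem *heap, int *n, HeapItem item)
{
    int j = (*n)++;

    while (j > 0)
    {
        int i = (j - 1) / 2;

        if (item.value >= heap[i].value)
            break;
        heap[j] = heap[i];
        j = i;
    }
    heap[j] = item;
}

/* Replace the top element and sift down (cf. tuplesort_heap_replace_top). */
static void heap_replace_top(HeapItem *heap, int n, HeapItem item)
{
    int i = 0;

    for (;;)
    {
        int j = 2 * i + 1;

        if (j >= n)
            break;
        if (j + 1 < n && heap[j + 1].value < heap[j].value)
            j++;
        if (item.value <= heap[j].value)
            break;
        heap[i] = heap[j];
        i = j;
    }
    heap[i] = item;
}

int main(void)
{
    /* Three pre-sorted "runs"; -1 marks end of run. */
    int runs[3][4] = {{1, 4, 7, -1}, {2, 5, 8, -1}, {3, 6, 9, -1}};
    int next[3] = {0, 0, 0};
    HeapItem heap[3];
    int n = 0;

    for (int r = 0; r < 3; r++)                     /* load one item per run */
        heap_insert(heap, &n, (HeapItem) {runs[r][next[r]++], r});

    while (n > 0)
    {
        HeapItem top = heap[0];
        int nextval = runs[top.src][next[top.src]++];

        printf("%d ", top.value);
        if (nextval >= 0)
            heap_replace_top(heap, n, (HeapItem) {nextval, top.src});
        else
        {
            n--;                                    /* run exhausted: delete top */
            heap_replace_top(heap, n, heap[n]);
        }
    }
    printf("\n");                                   /* prints 1..9 in order */
    return 0;
}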
+ */ + if (state->memtupcount < state->memtupsize && !LACKMEM(state) && + !alltuples) + return; + + /* + * Final call might require no sorting, in rare cases where we just so + * happen to have previously LACKMEM()'d at the point where exactly all + * remaining tuples are loaded into memory, just before input was + * exhausted. + * + * In general, short final runs are quite possible. Rather than allowing + * a special case where there was a superfluous selectnewtape() call (i.e. + * a call with no subsequent run actually written to destTape), we prefer + * to write out a 0 tuple run. + * + * mergereadnext() is prepared for 0 tuple runs, and will reliably mark + * the tape inactive for the merge when called from beginmerge(). This + * case is therefore similar to the case where mergeonerun() finds a dummy + * run for the tape, and so doesn't need to merge a run from the tape (or + * conceptually "merges" the dummy run, if you prefer). According to + * Knuth, Algorithm D "isn't strictly optimal" in its method of + * distribution and dummy run assignment; this edge case seems very + * unlikely to make that appreciably worse. + */ + Assert(state->status == TSS_BUILDRUNS); + + /* + * It seems unlikely that this limit will ever be exceeded, but take no + * chances + */ + if (state->currentRun == INT_MAX) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot have more than %d runs for an external sort", + INT_MAX))); + + state->currentRun++; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d starting quicksort of run %d: %s", + state->worker, state->currentRun, + pg_rusage_show(&state->ru_start)); +#endif + + /* + * Sort all tuples accumulated within the allowed amount of memory for + * this run using quicksort + */ + tuplesort_sort_memtuples(state); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d finished quicksort of run %d: %s", + state->worker, state->currentRun, + pg_rusage_show(&state->ru_start)); +#endif + + memtupwrite = state->memtupcount; + for (i = 0; i < memtupwrite; i++) + { + WRITETUP(state, state->tp_tapenum[state->destTape], + &state->memtuples[i]); + state->memtupcount--; + } + + /* + * Reset tuple memory. We've freed all of the tuples that we previously + * allocated. It's important to avoid fragmentation when there is a stark + * change in the sizes of incoming tuples. Fragmentation due to + * AllocSetFree's bucketing by size class might be particularly bad if + * this step wasn't taken. 
+ */ + MemoryContextReset(state->tuplecontext); + + markrunend(state, state->tp_tapenum[state->destTape]); + state->tp_runs[state->destTape]++; + state->tp_dummy[state->destTape]--; /* per Alg D step D2 */ + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d finished writing run %d to tape %d: %s", + state->worker, state->currentRun, state->destTape, + pg_rusage_show(&state->ru_start)); +#endif + + if (!alltuples) + selectnewtape(state); +} + +/* + * tuplesort_rescan - rewind and replay the scan + */ +void +tuplesort_rescan(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + case TSS_SORTEDONTAPE: + LogicalTapeRewindForRead(state->tapeset, + state->result_tape, + 0); + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_markpos - saves current position in the merged sort file + */ +void +tuplesort_markpos(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->markpos_offset = state->current; + state->markpos_eof = state->eof_reached; + break; + case TSS_SORTEDONTAPE: + LogicalTapeTell(state->tapeset, + state->result_tape, + &state->markpos_block, + &state->markpos_offset); + state->markpos_eof = state->eof_reached; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_restorepos - restores current position in merged sort file to + * last saved position + */ +void +tuplesort_restorepos(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->current = state->markpos_offset; + state->eof_reached = state->markpos_eof; + break; + case TSS_SORTEDONTAPE: + LogicalTapeSeek(state->tapeset, + state->result_tape, + state->markpos_block, + state->markpos_offset); + state->eof_reached = state->markpos_eof; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_get_stats - extract summary statistics + * + * This can be called after tuplesort_performsort() finishes to obtain + * printable summary information about how the sort was performed. + */ +void +tuplesort_get_stats(Tuplesortstate *state, + TuplesortInstrumentation *stats) +{ + /* + * Note: it might seem we should provide both memory and disk usage for a + * disk-based sort. However, the current code doesn't track memory space + * accurately once we have begun to return tuples to the caller (since we + * don't account for pfree's the caller is expected to do), so we cannot + * rely on availMem in a disk sort. This does not seem worth the overhead + * to fix. Is it worth creating an API for the memory context code to + * tell us how much is actually used in sortcontext? 
+ */ + tuplesort_updatemax(state); + + if (state->isMaxSpaceDisk) + stats->spaceType = SORT_SPACE_TYPE_DISK; + else + stats->spaceType = SORT_SPACE_TYPE_MEMORY; + stats->spaceUsed = (state->maxSpace + 1023) / 1024; + + switch (state->maxSpaceStatus) + { + case TSS_SORTEDINMEM: + if (state->boundUsed) + stats->sortMethod = SORT_TYPE_TOP_N_HEAPSORT; + else + stats->sortMethod = SORT_TYPE_QUICKSORT; + break; + case TSS_SORTEDONTAPE: + stats->sortMethod = SORT_TYPE_EXTERNAL_SORT; + break; + case TSS_FINALMERGE: + stats->sortMethod = SORT_TYPE_EXTERNAL_MERGE; + break; + default: + stats->sortMethod = SORT_TYPE_STILL_IN_PROGRESS; + break; + } +} + +/* + * Convert TuplesortMethod to a string. + */ +const char * +tuplesort_method_name(TuplesortMethod m) +{ + switch (m) + { + case SORT_TYPE_STILL_IN_PROGRESS: + return "still in progress"; + case SORT_TYPE_TOP_N_HEAPSORT: + return "top-N heapsort"; + case SORT_TYPE_QUICKSORT: + return "quicksort"; + case SORT_TYPE_EXTERNAL_SORT: + return "external sort"; + case SORT_TYPE_EXTERNAL_MERGE: + return "external merge"; + } + + return "unknown"; +} + +/* + * Convert TuplesortSpaceType to a string. + */ +const char * +tuplesort_space_type_name(TuplesortSpaceType t) +{ + Assert(t == SORT_SPACE_TYPE_DISK || t == SORT_SPACE_TYPE_MEMORY); + return t == SORT_SPACE_TYPE_DISK ? "Disk" : "Memory"; +} + + +/* + * Heap manipulation routines, per Knuth's Algorithm 5.2.3H. + */ + +/* + * Convert the existing unordered array of SortTuples to a bounded heap, + * discarding all but the smallest "state->bound" tuples. + * + * When working with a bounded heap, we want to keep the largest entry + * at the root (array entry zero), instead of the smallest as in the normal + * sort case. This allows us to discard the largest entry cheaply. + * Therefore, we temporarily reverse the sort direction. + */ +static void +make_bounded_heap(Tuplesortstate *state) +{ + int tupcount = state->memtupcount; + int i; + + Assert(state->status == TSS_INITIAL); + Assert(state->bounded); + Assert(tupcount >= state->bound); + Assert(SERIAL(state)); + + /* Reverse sort direction so largest entry will be at root */ + reversedirection(state); + + state->memtupcount = 0; /* make the heap empty */ + for (i = 0; i < tupcount; i++) + { + if (state->memtupcount < state->bound) + { + /* Insert next tuple into heap */ + /* Must copy source tuple to avoid possible overwrite */ + SortTuple stup = state->memtuples[i]; + + tuplesort_heap_insert(state, &stup); + } + else + { + /* + * The heap is full. Replace the largest entry with the new + * tuple, or just discard it, if it's larger than anything already + * in the heap. + */ + if (COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0) + { + free_sort_tuple(state, &state->memtuples[i]); + CHECK_FOR_INTERRUPTS(); + } + else + tuplesort_heap_replace_top(state, &state->memtuples[i]); + } + } + + Assert(state->memtupcount == state->bound); + state->status = TSS_BOUNDED; +} + +/* + * Convert the bounded heap to a properly-sorted array + */ +static void +sort_bounded_heap(Tuplesortstate *state) +{ + int tupcount = state->memtupcount; + + Assert(state->status == TSS_BOUNDED); + Assert(state->bounded); + Assert(tupcount == state->bound); + Assert(SERIAL(state)); + + /* + * We can unheapify in place because each delete-top call will remove the + * largest entry, which we can promptly store in the newly freed slot at + * the end. Once we're down to a single-entry heap, we're done. 
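make_bounded_heap() above keeps only the first "bound" tuples by reversing the sort direction, so the largest survivor sits at the root where it can be compared against and replaced in O(log N) per input tuple. The same top-N technique on plain ints (a standalone illustration, not the file's code):

#include <stdio.h>

#define BOUND 3

static int heap[BOUND];            /* max-heap of the BOUND smallest values */
static int heapsize;

/* Replace the root with v and sift it down (max-heap order). */
static void replace_top(int v)
{
    int i = 0;

    for (;;)
    {
        int j = 2 * i + 1;

        if (j >= heapsize)
            break;
        if (j + 1 < heapsize && heap[j + 1] > heap[j])
            j++;
        if (v >= heap[j])
            break;
        heap[i] = heap[j];
        i = j;
    }
    heap[i] = v;
}

static void consider(int v)
{
    if (heapsize < BOUND)
    {
        /* heap not full yet: sift-up insert */
        int j = heapsize++;

        while (j > 0 && heap[(j - 1) / 2] < v)
        {
            heap[j] = heap[(j - 1) / 2];
            j = (j - 1) / 2;
        }
        heap[j] = v;
    }
    else if (v < heap[0])
        replace_top(v);             /* smaller than the current worst: keep it */
    /* else: discard v cheaply, just like the TSS_BOUNDED path above */
}

int main(void)
{
    int input[] = {9, 1, 8, 2, 7, 3, 6, 4, 5};

    for (int i = 0; i < 9; i++)
        consider(input[i]);
    for (int i = 0; i < BOUND; i++)
        printf("%d ", heap[i]);     /* the 3 smallest; root is the largest of them */
    printf("\n");
    return 0;
}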
+ */ + while (state->memtupcount > 1) + { + SortTuple stup = state->memtuples[0]; + + /* this sifts-up the next-largest entry and decreases memtupcount */ + tuplesort_heap_delete_top(state); + state->memtuples[state->memtupcount] = stup; + } + state->memtupcount = tupcount; + + /* + * Reverse sort direction back to the original state. This is not + * actually necessary but seems like a good idea for tidiness. + */ + reversedirection(state); + + state->status = TSS_SORTEDINMEM; + state->boundUsed = true; +} + +/* + * Sort all memtuples using specialized qsort() routines. + * + * Quicksort is used for small in-memory sorts, and external sort runs. + */ +static void +tuplesort_sort_memtuples(Tuplesortstate *state) +{ + Assert(!LEADER(state)); + + if (state->memtupcount > 1) + { + /* Can we use the single-key sort function? */ + if (state->onlyKey != NULL) + qsort_ssup(state->memtuples, state->memtupcount, + state->onlyKey); + else + qsort_tuple(state->memtuples, + state->memtupcount, + state->comparetup, + state); + } +} + +/* + * Insert a new tuple into an empty or existing heap, maintaining the + * heap invariant. Caller is responsible for ensuring there's room. + * + * Note: For some callers, tuple points to a memtuples[] entry above the + * end of the heap. This is safe as long as it's not immediately adjacent + * to the end of the heap (ie, in the [memtupcount] array entry) --- if it + * is, it might get overwritten before being moved into the heap! + */ +static void +tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple) +{ + SortTuple *memtuples; + int j; + + memtuples = state->memtuples; + Assert(state->memtupcount < state->memtupsize); + + CHECK_FOR_INTERRUPTS(); + + /* + * Sift-up the new entry, per Knuth 5.2.3 exercise 16. Note that Knuth is + * using 1-based array indexes, not 0-based. + */ + j = state->memtupcount++; + while (j > 0) + { + int i = (j - 1) >> 1; + + if (COMPARETUP(state, tuple, &memtuples[i]) >= 0) + break; + memtuples[j] = memtuples[i]; + j = i; + } + memtuples[j] = *tuple; +} + +/* + * Remove the tuple at state->memtuples[0] from the heap. Decrement + * memtupcount, and sift up to maintain the heap invariant. + * + * The caller has already free'd the tuple the top node points to, + * if necessary. + */ +static void +tuplesort_heap_delete_top(Tuplesortstate *state) +{ + SortTuple *memtuples = state->memtuples; + SortTuple *tuple; + + if (--state->memtupcount <= 0) + return; + + /* + * Remove the last tuple in the heap, and re-insert it, by replacing the + * current top node with it. + */ + tuple = &memtuples[state->memtupcount]; + tuplesort_heap_replace_top(state, tuple); +} + +/* + * Replace the tuple at state->memtuples[0] with a new tuple. Sift up to + * maintain the heap invariant. + * + * This corresponds to Knuth's "sift-up" algorithm (Algorithm 5.2.3H, + * Heapsort, steps H3-H8). + */ +static void +tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple) +{ + SortTuple *memtuples = state->memtuples; + unsigned int i, + n; + + Assert(state->memtupcount >= 1); + + CHECK_FOR_INTERRUPTS(); + + /* + * state->memtupcount is "int", but we use "unsigned int" for i, j, n. + * This prevents overflow in the "2 * i + 1" calculation, since at the top + * of the loop we must have i < n <= INT_MAX <= UINT_MAX/2. 
+ */ + n = state->memtupcount; + i = 0; /* i is where the "hole" is */ + for (;;) + { + unsigned int j = 2 * i + 1; + + if (j >= n) + break; + if (j + 1 < n && + COMPARETUP(state, &memtuples[j], &memtuples[j + 1]) > 0) + j++; + if (COMPARETUP(state, tuple, &memtuples[j]) <= 0) + break; + memtuples[i] = memtuples[j]; + i = j; + } + memtuples[i] = *tuple; +} + +/* + * Function to reverse the sort direction from its current state + * + * It is not safe to call this when performing hash tuplesorts + */ +static void +reversedirection(Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + int nkey; + + for (nkey = 0; nkey < state->nKeys; nkey++, sortKey++) + { + sortKey->ssup_reverse = !sortKey->ssup_reverse; + sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first; + } +} + + +/* + * Tape interface routines + */ + +static unsigned int +getlen(Tuplesortstate *state, int tapenum, bool eofOK) +{ + unsigned int len; + + if (LogicalTapeRead(state->tapeset, tapenum, + &len, sizeof(len)) != sizeof(len)) + elog(ERROR, "unexpected end of tape"); + if (len == 0 && !eofOK) + elog(ERROR, "unexpected end of data"); + return len; +} + +static void +markrunend(Tuplesortstate *state, int tapenum) +{ + unsigned int len = 0; + + LogicalTapeWrite(state->tapeset, tapenum, (void *) &len, sizeof(len)); +} + +/* + * Get memory for tuple from within READTUP() routine. + * + * We use next free slot from the slab allocator, or palloc() if the tuple + * is too large for that. + */ +static void * +readtup_alloc(Tuplesortstate *state, Size tuplen) +{ + SlabSlot *buf; + + /* + * We pre-allocate enough slots in the slab arena that we should never run + * out. + */ + Assert(state->slabFreeHead); + + if (tuplen > SLAB_SLOT_SIZE || !state->slabFreeHead) + return MemoryContextAlloc(state->sortcontext, tuplen); + else + { + buf = state->slabFreeHead; + /* Reuse this slot */ + state->slabFreeHead = buf->nextfree; + + return buf; + } +} + + +/* + * Routines specialized for HeapTuple (actually MinimalTuple) case + */ + +static int +comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + HeapTupleData ltup; + HeapTupleData rtup; + TupleDesc tupDesc; + int nkey; + int32 compare; + AttrNumber attno; + Datum datum1, + datum2; + bool isnull1, + isnull2; + + + /* Compare the leading sort key */ + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + /* Compare additional sort keys */ + ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET; + ltup.t_data = (HeapTupleHeader) ((char *) a->tuple - MINIMAL_TUPLE_OFFSET); + rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET; + rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET); + tupDesc = state->tupDesc; + + if (sortKey->abbrev_converter) + { + attno = sortKey->ssup_attno; + + datum1 = heap_getattr(<up, attno, tupDesc, &isnull1); + datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + sortKey++; + for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++) + { + attno = sortKey->ssup_attno; + + datum1 = heap_getattr(<up, attno, tupDesc, &isnull1); + datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + return 0; +} + 
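
Editor's note: the heap routines above (tuplesort_heap_insert and tuplesort_heap_replace_top) implement Knuth's sift-up and sift-down loops directly on the memtuples array. The following is a minimal, self-contained sketch of those two loops on a plain array of ints, not part of the patch; the names heap_insert and heap_replace_top are illustrative only, and the real code compares SortTuples via COMPARETUP and checks for interrupts.

/* Minimal sketch of the sift loops used by the tuplesort heap (ints only). */
#include <stdio.h>

static void
heap_insert(int *heap, int *count, int val)
{
	int			j = (*count)++;

	/* sift-up: walk toward the root while the parent is larger */
	while (j > 0)
	{
		int			i = (j - 1) >> 1;

		if (val >= heap[i])
			break;
		heap[j] = heap[i];
		j = i;
	}
	heap[j] = val;
}

static void
heap_replace_top(int *heap, int count, int val)
{
	unsigned int i = 0;
	unsigned int n = count;

	/* sift-down: push the replacement value toward the leaves */
	for (;;)
	{
		unsigned int j = 2 * i + 1;

		if (j >= n)
			break;
		if (j + 1 < n && heap[j] > heap[j + 1])
			j++;				/* pick the smaller child */
		if (val <= heap[j])
			break;
		heap[i] = heap[j];
		i = j;
	}
	heap[i] = val;
}

int
main(void)
{
	int			heap[8];
	int			count = 0;
	int			input[] = {42, 7, 19, 3, 25};

	for (int k = 0; k < 5; k++)
		heap_insert(heap, &count, input[k]);
	printf("smallest: %d\n", heap[0]);	/* prints 3 */

	/* like replacing the merge head with the next tuple from its tape */
	heap_replace_top(heap, count, 11);
	printf("new smallest: %d\n", heap[0]);	/* prints 7 */
	return 0;
}

During the merge this replace-top operation is preferred over a delete followed by an insert because replacing the root needs only one sift-down pass, which is why mergeonerun-style loops feed the next tuple from the exhausted slot's tape straight into tuplesort_heap_replace_top.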
+static void +copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* + * We expect the passed "tup" to be a TupleTableSlot, and form a + * MinimalTuple using the exported interface for that. + */ + TupleTableSlot *slot = (TupleTableSlot *) tup; + Datum original; + MinimalTuple tuple; + HeapTupleData htup; + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + + /* copy the tuple into sort storage */ + tuple = ExecCopySlotMinimalTuple(slot); + stup->tuple = (void *) tuple; + USEMEM(state, GetMemoryChunkSpace(tuple)); + /* set up first-column key value */ + htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + original = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &stup->isnull1); + + MemoryContextSwitchTo(oldcontext); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + htup.t_len = ((MinimalTuple) mtup->tuple)->t_len + + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) mtup->tuple - + MINIMAL_TUPLE_OFFSET); + + mtup->datum1 = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &mtup->isnull1); + } + } +} + +static void +writetup_heap(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + MinimalTuple tuple = (MinimalTuple) stup->tuple; + + /* the part of the MinimalTuple we'll write: */ + char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; + unsigned int tupbodylen = tuple->t_len - MINIMAL_TUPLE_DATA_OFFSET; + + /* total on-disk footprint: */ + unsigned int tuplen = tupbodylen + sizeof(int); + + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) tupbody, tupbodylen); + if (state->randomAccess) /* need trailing length word? 
*/ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + heap_free_minimal_tuple(tuple); + } +} + +static void +readtup_heap(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tupbodylen = len - sizeof(int); + unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET; + MinimalTuple tuple = (MinimalTuple) readtup_alloc(state, tuplen); + char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; + HeapTupleData htup; + + /* read in the tuple proper */ + tuple->t_len = tuplen; + LogicalTapeReadExact(state->tapeset, tapenum, + tupbody, tupbodylen); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value */ + htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + stup->datum1 = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &stup->isnull1); +} + +/* + * Routines specialized for the CLUSTER case (HeapTuple data, with + * comparisons per a btree index definition) + */ + +static int +comparetup_cluster(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + HeapTuple ltup; + HeapTuple rtup; + TupleDesc tupDesc; + int nkey; + int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + AttrNumber leading = state->indexInfo->ii_IndexAttrNumbers[0]; + + /* Be prepared to compare additional sort keys */ + ltup = (HeapTuple) a->tuple; + rtup = (HeapTuple) b->tuple; + tupDesc = state->tupDesc; + + /* Compare the leading sort key, if it's simple */ + if (leading != 0) + { + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + if (sortKey->abbrev_converter) + { + datum1 = heap_getattr(ltup, leading, tupDesc, &isnull1); + datum2 = heap_getattr(rtup, leading, tupDesc, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + } + if (compare != 0 || state->nKeys == 1) + return compare; + /* Compare additional columns the hard way */ + sortKey++; + nkey = 1; + } + else + { + /* Must compare all keys the hard way */ + nkey = 0; + } + + if (state->indexInfo->ii_Expressions == NULL) + { + /* If not expression index, just compare the proper heap attrs */ + + for (; nkey < state->nKeys; nkey++, sortKey++) + { + AttrNumber attno = state->indexInfo->ii_IndexAttrNumbers[nkey]; + + datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1); + datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + } + else + { + /* + * In the expression index case, compute the whole index tuple and + * then compare values. It would perhaps be faster to compute only as + * many columns as we need to compare, but that would require + * duplicating all the logic in FormIndexDatum. 
+ */ + Datum l_index_values[INDEX_MAX_KEYS]; + bool l_index_isnull[INDEX_MAX_KEYS]; + Datum r_index_values[INDEX_MAX_KEYS]; + bool r_index_isnull[INDEX_MAX_KEYS]; + TupleTableSlot *ecxt_scantuple; + + /* Reset context each time to prevent memory leakage */ + ResetPerTupleExprContext(state->estate); + + ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple; + + ExecStoreHeapTuple(ltup, ecxt_scantuple, false); + FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, + l_index_values, l_index_isnull); + + ExecStoreHeapTuple(rtup, ecxt_scantuple, false); + FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, + r_index_values, r_index_isnull); + + for (; nkey < state->nKeys; nkey++, sortKey++) + { + compare = ApplySortComparator(l_index_values[nkey], + l_index_isnull[nkey], + r_index_values[nkey], + r_index_isnull[nkey], + sortKey); + if (compare != 0) + return compare; + } + } + + return 0; +} + +static void +copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + HeapTuple tuple = (HeapTuple) tup; + Datum original; + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + + /* copy the tuple into sort storage */ + tuple = heap_copytuple(tuple); + stup->tuple = (void *) tuple; + USEMEM(state, GetMemoryChunkSpace(tuple)); + + MemoryContextSwitchTo(oldcontext); + + /* + * set up first-column key value, and potentially abbreviate, if it's a + * simple column + */ + if (state->indexInfo->ii_IndexAttrNumbers[0] == 0) + return; + + original = heap_getattr(tuple, + state->indexInfo->ii_IndexAttrNumbers[0], + state->tupDesc, + &stup->isnull1); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). 
+ */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = (HeapTuple) mtup->tuple; + mtup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_IndexAttrNumbers[0], + state->tupDesc, + &mtup->isnull1); + } + } +} + +static void +writetup_cluster(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + HeapTuple tuple = (HeapTuple) stup->tuple; + unsigned int tuplen = tuple->t_len + sizeof(ItemPointerData) + sizeof(int); + + /* We need to store t_self, but not other fields of HeapTupleData */ + LogicalTapeWrite(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + &tuple->t_self, sizeof(ItemPointerData)); + LogicalTapeWrite(state->tapeset, tapenum, + tuple->t_data, tuple->t_len); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + heap_freetuple(tuple); + } +} + +static void +readtup_cluster(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int tuplen) +{ + unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int); + HeapTuple tuple = (HeapTuple) readtup_alloc(state, + t_len + HEAPTUPLESIZE); + + /* Reconstruct the HeapTupleData header */ + tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE); + tuple->t_len = t_len; + LogicalTapeReadExact(state->tapeset, tapenum, + &tuple->t_self, sizeof(ItemPointerData)); + /* We don't currently bother to reconstruct t_tableOid */ + tuple->t_tableOid = InvalidOid; + /* Read in the tuple body */ + LogicalTapeReadExact(state->tapeset, tapenum, + tuple->t_data, tuple->t_len); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value, if it's a simple column */ + if (state->indexInfo->ii_IndexAttrNumbers[0] != 0) + stup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_IndexAttrNumbers[0], + state->tupDesc, + &stup->isnull1); +} + +/* + * Routines specialized for IndexTuple case + * + * The btree and hash cases require separate comparison functions, but the + * IndexTuple representation is the same so the copy/write/read support + * functions can be shared. + */ + +static int +comparetup_index_btree(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + /* + * This is similar to comparetup_heap(), but expects index tuples. There + * is also special handling for enforcing uniqueness, and special + * treatment for equal keys at the end. 
+ */ + SortSupport sortKey = state->sortKeys; + IndexTuple tuple1; + IndexTuple tuple2; + int keysz; + TupleDesc tupDes; + bool equal_hasnull = false; + int nkey; + int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + + + /* Compare the leading sort key */ + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + /* Compare additional sort keys */ + tuple1 = (IndexTuple) a->tuple; + tuple2 = (IndexTuple) b->tuple; + keysz = state->nKeys; + tupDes = RelationGetDescr(state->indexRel); + + if (sortKey->abbrev_converter) + { + datum1 = index_getattr(tuple1, 1, tupDes, &isnull1); + datum2 = index_getattr(tuple2, 1, tupDes, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + /* they are equal, so we only need to examine one null flag */ + if (a->isnull1) + equal_hasnull = true; + + sortKey++; + for (nkey = 2; nkey <= keysz; nkey++, sortKey++) + { + datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1); + datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; /* done when we find unequal attributes */ + + /* they are equal, so we only need to examine one null flag */ + if (isnull1) + equal_hasnull = true; + } + + /* + * If btree has asked us to enforce uniqueness, complain if two equal + * tuples are detected (unless there was at least one NULL field). + * + * It is sufficient to make the test here, because if two tuples are equal + * they *must* get compared at some stage of the sort --- otherwise the + * sort algorithm wouldn't have checked whether one must appear before the + * other. + */ + if (state->enforceUnique && !equal_hasnull) + { + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + char *key_desc; + + /* + * Some rather brain-dead implementations of qsort (such as the one in + * QNX 4) will sometimes call the comparison routine to compare a + * value to itself, but we always use our own implementation, which + * does not. + */ + Assert(tuple1 != tuple2); + + index_deform_tuple(tuple1, tupDes, values, isnull); + + key_desc = BuildIndexValueDescription(state->indexRel, values, isnull); + + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("could not create unique index \"%s\"", + RelationGetRelationName(state->indexRel)), + key_desc ? errdetail("Key %s is duplicated.", key_desc) : + errdetail("Duplicate keys exist."), + errtableconstraint(state->heapRel, + RelationGetRelationName(state->indexRel)))); + } + + /* + * If key values are equal, we sort on ItemPointer. This is required for + * btree indexes, since heap TID is treated as an implicit last key + * attribute in order to ensure that all keys in the index are physically + * unique. + */ + { + BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); + BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); + + if (pos1 != pos2) + return (pos1 < pos2) ? 
-1 : 1; + } + + /* ItemPointer values should never be equal */ + Assert(false); + + return 0; +} + +static int +comparetup_index_hash(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + Bucket bucket1; + Bucket bucket2; + IndexTuple tuple1; + IndexTuple tuple2; + + /* + * Fetch hash keys and mask off bits we don't want to sort by. We know + * that the first column of the index tuple is the hash key. + */ + Assert(!a->isnull1); + bucket1 = _hash_hashkey2bucket(DatumGetUInt32(a->datum1), + state->max_buckets, state->high_mask, + state->low_mask); + Assert(!b->isnull1); + bucket2 = _hash_hashkey2bucket(DatumGetUInt32(b->datum1), + state->max_buckets, state->high_mask, + state->low_mask); + if (bucket1 > bucket2) + return 1; + else if (bucket1 < bucket2) + return -1; + + /* + * If hash values are equal, we sort on ItemPointer. This does not affect + * validity of the finished index, but it may be useful to have index + * scans in physical order. + */ + tuple1 = (IndexTuple) a->tuple; + tuple2 = (IndexTuple) b->tuple; + + { + BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); + BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); + + if (pos1 != pos2) + return (pos1 < pos2) ? -1 : 1; + } + + /* ItemPointer values should never be equal */ + Assert(false); + + return 0; +} + +static void +copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* Not currently needed */ + elog(ERROR, "copytup_index() should not be called"); +} + +static void +writetup_index(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + IndexTuple tuple = (IndexTuple) stup->tuple; + unsigned int tuplen; + + tuplen = IndexTupleSize(tuple) + sizeof(tuplen); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) tuple, IndexTupleSize(tuple)); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + pfree(tuple); + } +} + +static void +readtup_index(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tuplen = len - sizeof(unsigned int); + IndexTuple tuple = (IndexTuple) readtup_alloc(state, tuplen); + + LogicalTapeReadExact(state->tapeset, tapenum, + tuple, tuplen); + if (state->randomAccess) /* need trailing length word? 
*/ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value */ + stup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup->isnull1); +} + +/* + * Routines specialized for DatumTuple case + */ + +static int +comparetup_datum(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) +{ + int compare; + + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + state->sortKeys); + if (compare != 0) + return compare; + + /* if we have abbreviations, then "tuple" has the original value */ + + if (state->sortKeys->abbrev_converter) + compare = ApplySortAbbrevFullComparator(PointerGetDatum(a->tuple), a->isnull1, + PointerGetDatum(b->tuple), b->isnull1, + state->sortKeys); + + return compare; +} + +static void +copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* Not currently needed */ + elog(ERROR, "copytup_datum() should not be called"); +} + +static void +writetup_datum(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + void *waddr; + unsigned int tuplen; + unsigned int writtenlen; + + if (stup->isnull1) + { + waddr = NULL; + tuplen = 0; + } + else if (!state->tuples) + { + waddr = &stup->datum1; + tuplen = sizeof(Datum); + } + else + { + waddr = stup->tuple; + tuplen = datumGetSize(PointerGetDatum(stup->tuple), false, state->datumTypeLen); + Assert(tuplen != 0); + } + + writtenlen = tuplen + sizeof(unsigned int); + + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &writtenlen, sizeof(writtenlen)); + LogicalTapeWrite(state->tapeset, tapenum, + waddr, tuplen); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &writtenlen, sizeof(writtenlen)); + + if (!state->slabAllocatorUsed && stup->tuple) + { + FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); + pfree(stup->tuple); + } +} + +static void +readtup_datum(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tuplen = len - sizeof(unsigned int); + + if (tuplen == 0) + { + /* it's NULL */ + stup->datum1 = (Datum) 0; + stup->isnull1 = true; + stup->tuple = NULL; + } + else if (!state->tuples) + { + Assert(tuplen == sizeof(Datum)); + LogicalTapeReadExact(state->tapeset, tapenum, + &stup->datum1, tuplen); + stup->isnull1 = false; + stup->tuple = NULL; + } + else + { + void *raddr = readtup_alloc(state, tuplen); + + LogicalTapeReadExact(state->tapeset, tapenum, + raddr, tuplen); + stup->datum1 = PointerGetDatum(raddr); + stup->isnull1 = false; + stup->tuple = raddr; + } + + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); +} + +/* + * Parallel sort routines + */ + +/* + * tuplesort_estimate_shared - estimate required shared memory allocation + * + * nWorkers is an estimate of the number of workers (it's the number that + * will be requested). + */ +Size +tuplesort_estimate_shared(int nWorkers) +{ + Size tapesSize; + + Assert(nWorkers > 0); + + /* Make sure that BufFile shared state is MAXALIGN'd */ + tapesSize = mul_size(sizeof(TapeShare), nWorkers); + tapesSize = MAXALIGN(add_size(tapesSize, offsetof(Sharedsort, tapes))); + + return tapesSize; +} + +/* + * tuplesort_initialize_shared - initialize shared tuplesort state + * + * Must be called from leader process before workers are launched, to + * establish state needed up-front for worker tuplesortstates. 
nWorkers + * should match the argument passed to tuplesort_estimate_shared(). + */ +void +tuplesort_initialize_shared(Sharedsort *shared, int nWorkers, dsm_segment *seg) +{ + int i; + + Assert(nWorkers > 0); + + SpinLockInit(&shared->mutex); + shared->currentWorker = 0; + shared->workersFinished = 0; + SharedFileSetInit(&shared->fileset, seg); + shared->nTapes = nWorkers; + for (i = 0; i < nWorkers; i++) + { + shared->tapes[i].firstblocknumber = 0L; + } +} + +/* + * tuplesort_attach_shared - attach to shared tuplesort state + * + * Must be called by all worker processes. + */ +void +tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg) +{ + /* Attach to SharedFileSet */ + SharedFileSetAttach(&shared->fileset, seg); +} + +/* + * worker_get_identifier - Assign and return ordinal identifier for worker + * + * The order in which these are assigned is not well defined, and should not + * matter; worker numbers across parallel sort participants need only be + * distinct and gapless. logtape.c requires this. + * + * Note that the identifiers assigned from here have no relation to + * ParallelWorkerNumber number, to avoid making any assumption about + * caller's requirements. However, we do follow the ParallelWorkerNumber + * convention of representing a non-worker with worker number -1. This + * includes the leader, as well as serial Tuplesort processes. + */ +static int +worker_get_identifier(Tuplesortstate *state) +{ + Sharedsort *shared = state->shared; + int worker; + + Assert(WORKER(state)); + + SpinLockAcquire(&shared->mutex); + worker = shared->currentWorker++; + SpinLockRelease(&shared->mutex); + + return worker; +} + +/* + * worker_freeze_result_tape - freeze worker's result tape for leader + * + * This is called by workers just after the result tape has been determined, + * instead of calling LogicalTapeFreeze() directly. They do so because + * workers require a few additional steps over similar serial + * TSS_SORTEDONTAPE external sort cases, which also happen here. The extra + * steps are around freeing now unneeded resources, and representing to + * leader that worker's input run is available for its merge. + * + * There should only be one final output run for each worker, which consists + * of all tuples that were originally input into worker. + */ +static void +worker_freeze_result_tape(Tuplesortstate *state) +{ + Sharedsort *shared = state->shared; + TapeShare output; + + Assert(WORKER(state)); + Assert(state->result_tape != -1); + Assert(state->memtupcount == 0); + + /* + * Free most remaining memory, in case caller is sensitive to our holding + * on to it. memtuples may not be a tiny merge heap at this point. + */ + pfree(state->memtuples); + /* Be tidy */ + state->memtuples = NULL; + state->memtupsize = 0; + + /* + * Parallel worker requires result tape metadata, which is to be stored in + * shared memory for leader + */ + LogicalTapeFreeze(state->tapeset, state->result_tape, &output); + + /* Store properties of output tape, and update finished worker count */ + SpinLockAcquire(&shared->mutex); + shared->tapes[state->worker] = output; + shared->workersFinished++; + SpinLockRelease(&shared->mutex); +} + +/* + * worker_nomergeruns - dump memtuples in worker, without merging + * + * This called as an alternative to mergeruns() with a worker when no + * merging is required. 
+ */ +static void +worker_nomergeruns(Tuplesortstate *state) +{ + Assert(WORKER(state)); + Assert(state->result_tape == -1); + + state->result_tape = state->tp_tapenum[state->destTape]; + worker_freeze_result_tape(state); +} + +/* + * leader_takeover_tapes - create tapeset for leader from worker tapes + * + * So far, leader Tuplesortstate has performed no actual sorting. By now, all + * sorting has occurred in workers, all of which must have already returned + * from tuplesort_performsort(). + * + * When this returns, leader process is left in a state that is virtually + * indistinguishable from it having generated runs as a serial external sort + * might have. + */ +static void +leader_takeover_tapes(Tuplesortstate *state) +{ + Sharedsort *shared = state->shared; + int nParticipants = state->nParticipants; + int workersFinished; + int j; + + Assert(LEADER(state)); + Assert(nParticipants >= 1); + + SpinLockAcquire(&shared->mutex); + workersFinished = shared->workersFinished; + SpinLockRelease(&shared->mutex); + + if (nParticipants != workersFinished) + elog(ERROR, "cannot take over tapes before all workers finish"); + + /* + * Create the tapeset from worker tapes, including a leader-owned tape at + * the end. Parallel workers are far more expensive than logical tapes, + * so the number of tapes allocated here should never be excessive. + * + * We still have a leader tape, though it's not possible to write to it + * due to restrictions in the shared fileset infrastructure used by + * logtape.c. It will never be written to in practice because + * randomAccess is disallowed for parallel sorts. + */ + inittapestate(state, nParticipants + 1); + state->tapeset = LogicalTapeSetCreate(nParticipants + 1, false, + shared->tapes, &shared->fileset, + state->worker); + + /* mergeruns() relies on currentRun for # of runs (in one-pass cases) */ + state->currentRun = nParticipants; + + /* + * Initialize variables of Algorithm D to be consistent with runs from + * workers having been generated in the leader. + * + * There will always be exactly 1 run per worker, and exactly one input + * tape per run, because workers always output exactly 1 run, even when + * there were no input tuples for workers to sort. + */ + for (j = 0; j < state->maxTapes; j++) + { + /* One real run; no dummy runs for worker tapes */ + state->tp_fib[j] = 1; + state->tp_runs[j] = 1; + state->tp_dummy[j] = 0; + state->tp_tapenum[j] = j; + } + /* Leader tape gets one dummy run, and no real runs */ + state->tp_fib[state->tapeRange] = 0; + state->tp_runs[state->tapeRange] = 0; + state->tp_dummy[state->tapeRange] = 1; + + state->Level = 1; + state->destTape = 0; + + state->status = TSS_BUILDRUNS; +} + +/* + * Convenience routine to free a tuple previously loaded into sort memory + */ +static void +free_sort_tuple(Tuplesortstate *state, SortTuple *stup) +{ + if (stup->tuple) + { + FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); + pfree(stup->tuple); + stup->tuple = NULL; + } +} diff --git a/src/tuplesort14.c b/src/tuplesort14.c new file mode 100644 index 0000000000..85c8b10415 --- /dev/null +++ b/src/tuplesort14.c @@ -0,0 +1,4784 @@ +/*------------------------------------------------------------------------- + * + * tuplesort.c + * Generalized tuple sorting routines. + * + * This module handles sorting of heap tuples, index tuples, or single + * Datums (and could easily support other kinds of sortable objects, + * if necessary). It works efficiently for both small and large amounts + * of data. 
Small amounts are sorted in-memory using qsort(). Large + * amounts are sorted using temporary files and a standard external sort + * algorithm. + * + * See Knuth, volume 3, for more than you want to know about the external + * sorting algorithm. Historically, we divided the input into sorted runs + * using replacement selection, in the form of a priority tree implemented + * as a heap (essentially his Algorithm 5.2.3H), but now we always use + * quicksort for run generation. We merge the runs using polyphase merge, + * Knuth's Algorithm 5.4.2D. The logical "tapes" used by Algorithm D are + * implemented by logtape.c, which avoids space wastage by recycling disk + * space as soon as each block is read from its "tape". + * + * The approximate amount of memory allowed for any one sort operation + * is specified in kilobytes by the caller (most pass work_mem). Initially, + * we absorb tuples and simply store them in an unsorted array as long as + * we haven't exceeded workMem. If we reach the end of the input without + * exceeding workMem, we sort the array using qsort() and subsequently return + * tuples just by scanning the tuple array sequentially. If we do exceed + * workMem, we begin to emit tuples into sorted runs in temporary tapes. + * When tuples are dumped in batch after quicksorting, we begin a new run + * with a new output tape (selected per Algorithm D). After the end of the + * input is reached, we dump out remaining tuples in memory into a final run, + * then merge the runs using Algorithm D. + * + * When merging runs, we use a heap containing just the frontmost tuple from + * each source run; we repeatedly output the smallest tuple and replace it + * with the next tuple from its source tape (if any). When the heap empties, + * the merge is complete. The basic merge algorithm thus needs very little + * memory --- only M tuples for an M-way merge, and M is constrained to a + * small number. However, we can still make good use of our full workMem + * allocation by pre-reading additional blocks from each source tape. Without + * prereading, our access pattern to the temporary file would be very erratic; + * on average we'd read one block from each of M source tapes during the same + * time that we're writing M blocks to the output tape, so there is no + * sequentiality of access at all, defeating the read-ahead methods used by + * most Unix kernels. Worse, the output tape gets written into a very random + * sequence of blocks of the temp file, ensuring that things will be even + * worse when it comes time to read that tape. A straightforward merge pass + * thus ends up doing a lot of waiting for disk seeks. We can improve matters + * by prereading from each source tape sequentially, loading about workMem/M + * bytes from each tape in turn, and making the sequential blocks immediately + * available for reuse. This approach helps to localize both read and write + * accesses. The pre-reading is handled by logtape.c, we just tell it how + * much memory to use for the buffers. + * + * When the caller requests random access to the sort result, we form + * the final sorted run on a logical tape which is then "frozen", so + * that we can access it randomly. When the caller does not need random + * access, we return from tuplesort_performsort() as soon as we are down + * to one run per logical tape. The final merge is then performed + * on-the-fly as the caller repeatedly calls tuplesort_getXXX; this + * saves one cycle of writing all the data out to disk and reading it in. 
+ * + * Before Postgres 8.2, we always used a seven-tape polyphase merge, on the + * grounds that 7 is the "sweet spot" on the tapes-to-passes curve according + * to Knuth's figure 70 (section 5.4.2). However, Knuth is assuming that + * tape drives are expensive beasts, and in particular that there will always + * be many more runs than tape drives. In our implementation a "tape drive" + * doesn't cost much more than a few Kb of memory buffers, so we can afford + * to have lots of them. In particular, if we can have as many tape drives + * as sorted runs, we can eliminate any repeated I/O at all. In the current + * code we determine the number of tapes M on the basis of workMem: we want + * workMem/M to be large enough that we read a fair amount of data each time + * we preread from a tape, so as to maintain the locality of access described + * above. Nonetheless, with large workMem we can have many tapes (but not + * too many -- see the comments in tuplesort_merge_order). + * + * This module supports parallel sorting. Parallel sorts involve coordination + * among one or more worker processes, and a leader process, each with its own + * tuplesort state. The leader process (or, more accurately, the + * Tuplesortstate associated with a leader process) creates a full tapeset + * consisting of worker tapes with one run to merge; a run for every + * worker process. This is then merged. Worker processes are guaranteed to + * produce exactly one output run from their partial input. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/utils/sort/tuplesort.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include + +#include "access/hash.h" +#include "access/htup_details.h" +#include "access/nbtree.h" +#include "catalog/index.h" +#include "catalog/pg_am.h" +#include "commands/tablespace.h" +#include "executor/executor.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "utils/datum.h" +#include "utils/logtape.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/pg_rusage.h" +#include "utils/rel.h" +#include "utils/sortsupport.h" +#include "utils/tuplesort.h" + +/* Should be the last include */ +#include "disable_core_macro.h" + +/* sort-type codes for sort__start probes */ +#define HEAP_SORT 0 +#define INDEX_SORT 1 +#define DATUM_SORT 2 +#define CLUSTER_SORT 3 + +/* Sort parallel code from state for sort__start probes */ +#define PARALLEL_SORT(state) ((state)->shared == NULL ? 0 : \ + (state)->worker >= 0 ? 1 : 2) + +/* + * Initial size of memtuples array. We're trying to select this size so that + * array doesn't exceed ALLOCSET_SEPARATE_THRESHOLD and so that the overhead of + * allocation might possibly be lowered. However, we don't consider array sizes + * less than 1024. + * + */ +#define INITIAL_MEMTUPSIZE Max(1024, \ + ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1) + +/* GUC variables */ +#ifdef TRACE_SORT +bool trace_sort = false; +#endif + +#ifdef DEBUG_BOUNDED_SORT +bool optimize_bounded_sort = true; +#endif + + +/* + * The objects we actually sort are SortTuple structs. These contain + * a pointer to the tuple proper (might be a MinimalTuple or IndexTuple), + * which is a separate palloc chunk --- we assume it is just one chunk and + * can be freed by a simple pfree() (except during merge, when we use a + * simple slab allocator). 
SortTuples also contain the tuple's first key + * column in Datum/nullflag format, and a source/input tape number that + * tracks which tape each heap element/slot belongs to during merging. + * + * Storing the first key column lets us save heap_getattr or index_getattr + * calls during tuple comparisons. We could extract and save all the key + * columns not just the first, but this would increase code complexity and + * overhead, and wouldn't actually save any comparison cycles in the common + * case where the first key determines the comparison result. Note that + * for a pass-by-reference datatype, datum1 points into the "tuple" storage. + * + * There is one special case: when the sort support infrastructure provides an + * "abbreviated key" representation, where the key is (typically) a pass by + * value proxy for a pass by reference type. In this case, the abbreviated key + * is stored in datum1 in place of the actual first key column. + * + * When sorting single Datums, the data value is represented directly by + * datum1/isnull1 for pass by value types (or null values). If the datatype is + * pass-by-reference and isnull1 is false, then "tuple" points to a separately + * palloc'd data value, otherwise "tuple" is NULL. The value of datum1 is then + * either the same pointer as "tuple", or is an abbreviated key value as + * described above. Accordingly, "tuple" is always used in preference to + * datum1 as the authoritative value for pass-by-reference cases. + */ +typedef struct +{ + void *tuple; /* the tuple itself */ + Datum datum1; /* value of first key column */ + bool isnull1; /* is first key column NULL? */ + int srctape; /* source tape number */ +} SortTuple; + +/* + * During merge, we use a pre-allocated set of fixed-size slots to hold + * tuples. To avoid palloc/pfree overhead. + * + * Merge doesn't require a lot of memory, so we can afford to waste some, + * by using gratuitously-sized slots. If a tuple is larger than 1 kB, the + * palloc() overhead is not significant anymore. + * + * 'nextfree' is valid when this chunk is in the free list. When in use, the + * slot holds a tuple. + */ +#define SLAB_SLOT_SIZE 1024 + +typedef union SlabSlot +{ + union SlabSlot *nextfree; + char buffer[SLAB_SLOT_SIZE]; +} SlabSlot; + +/* + * Possible states of a Tuplesort object. These denote the states that + * persist between calls of Tuplesort routines. + */ +typedef enum +{ + TSS_INITIAL, /* Loading tuples; still within memory limit */ + TSS_BOUNDED, /* Loading tuples into bounded-size heap */ + TSS_BUILDRUNS, /* Loading tuples; writing to tape */ + TSS_SORTEDINMEM, /* Sort completed entirely in memory */ + TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ + TSS_FINALMERGE /* Performing final merge on-the-fly */ +} TupSortStatus; + +/* + * Parameters for calculation of number of tapes to use --- see inittapes() + * and tuplesort_merge_order(). + * + * In this calculation we assume that each tape will cost us about 1 blocks + * worth of buffer space. This ignores the overhead of all the other data + * structures needed for each tape, but it's probably close enough. + * + * MERGE_BUFFER_SIZE is how much data we'd like to read from each input + * tape during a preread cycle (see discussion at top of file). 
+ */ +#define MINORDER 6 /* minimum merge order */ +#define MAXORDER 500 /* maximum merge order */ +#define TAPE_BUFFER_OVERHEAD BLCKSZ +#define MERGE_BUFFER_SIZE (BLCKSZ * 32) + +typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); + +/* + * Private state of a Tuplesort operation. + */ +struct Tuplesortstate +{ + TupSortStatus status; /* enumerated value as shown above */ + int nKeys; /* number of columns in sort key */ + bool randomAccess; /* did caller request random access? */ + bool bounded; /* did caller specify a maximum number of + * tuples to return? */ + bool boundUsed; /* true if we made use of a bounded heap */ + int bound; /* if bounded, the maximum number of tuples */ + bool tuples; /* Can SortTuple.tuple ever be set? */ + int64 availMem; /* remaining memory available, in bytes */ + int64 allowedMem; /* total memory allowed, in bytes */ + int maxTapes; /* number of tapes (Knuth's T) */ + int tapeRange; /* maxTapes-1 (Knuth's P) */ + int64 maxSpace; /* maximum amount of space occupied among sort + * of groups, either in-memory or on-disk */ + bool isMaxSpaceDisk; /* true when maxSpace is value for on-disk + * space, false when it's value for in-memory + * space */ + TupSortStatus maxSpaceStatus; /* sort status when maxSpace was reached */ + MemoryContext maincontext; /* memory context for tuple sort metadata that + * persists across multiple batches */ + MemoryContext sortcontext; /* memory context holding most sort data */ + MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */ + LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ + + /* + * These function pointers decouple the routines that must know what kind + * of tuple we are sorting from the routines that don't need to know it. + * They are set up by the tuplesort_begin_xxx routines. + * + * Function to compare two tuples; result is per qsort() convention, ie: + * <0, 0, >0 according as ab. The API must match + * qsort_arg_comparator. + */ + SortTupleComparator comparetup; + + /* + * Function to copy a supplied input tuple into palloc'd space and set up + * its SortTuple representation (ie, set tuple/datum1/isnull1). Also, + * state->availMem must be decreased by the amount of space used for the + * tuple copy (note the SortTuple struct itself is not counted). + */ + void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup); + + /* + * Function to write a stored tuple onto tape. The representation of the + * tuple on tape need not be the same as it is in memory; requirements on + * the tape representation are given below. Unless the slab allocator is + * used, after writing the tuple, pfree() the out-of-line data (not the + * SortTuple struct!), and increase state->availMem by the amount of + * memory space thereby released. + */ + void (*writetup) (Tuplesortstate *state, int tapenum, + SortTuple *stup); + + /* + * Function to read a stored tuple from tape back into memory. 'len' is + * the already-read length of the stored tuple. The tuple is allocated + * from the slab memory arena, or is palloc'd, see readtup_alloc(). + */ + void (*readtup) (Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); + + /* + * This array holds the tuples now in sort memory. If we are in state + * INITIAL, the tuples are in no particular order; if we are in state + * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS + * and FINALMERGE, the tuples are organized in "heap" order per Algorithm + * H. 
In state SORTEDONTAPE, the array is not used. + */ + SortTuple *memtuples; /* array of SortTuple structs */ + int memtupcount; /* number of tuples currently present */ + int memtupsize; /* allocated length of memtuples array */ + bool growmemtuples; /* memtuples' growth still underway? */ + + /* + * Memory for tuples is sometimes allocated using a simple slab allocator, + * rather than with palloc(). Currently, we switch to slab allocation + * when we start merging. Merging only needs to keep a small, fixed + * number of tuples in memory at any time, so we can avoid the + * palloc/pfree overhead by recycling a fixed number of fixed-size slots + * to hold the tuples. + * + * For the slab, we use one large allocation, divided into SLAB_SLOT_SIZE + * slots. The allocation is sized to have one slot per tape, plus one + * additional slot. We need that many slots to hold all the tuples kept + * in the heap during merge, plus the one we have last returned from the + * sort, with tuplesort_gettuple. + * + * Initially, all the slots are kept in a linked list of free slots. When + * a tuple is read from a tape, it is put to the next available slot, if + * it fits. If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd + * instead. + * + * When we're done processing a tuple, we return the slot back to the free + * list, or pfree() if it was palloc'd. We know that a tuple was + * allocated from the slab, if its pointer value is between + * slabMemoryBegin and -End. + * + * When the slab allocator is used, the USEMEM/LACKMEM mechanism of + * tracking memory usage is not used. + */ + bool slabAllocatorUsed; + + char *slabMemoryBegin; /* beginning of slab memory arena */ + char *slabMemoryEnd; /* end of slab memory arena */ + SlabSlot *slabFreeHead; /* head of free list */ + + /* Buffer size to use for reading input tapes, during merge. */ + size_t read_buffer_size; + + /* + * When we return a tuple to the caller in tuplesort_gettuple_XXX, that + * came from a tape (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE + * modes), we remember the tuple in 'lastReturnedTuple', so that we can + * recycle the memory on next gettuple call. + */ + void *lastReturnedTuple; + + /* + * While building initial runs, this is the current output run number. + * Afterwards, it is the number of initial runs we made. + */ + int currentRun; + + /* + * Unless otherwise noted, all pointer variables below are pointers to + * arrays of length maxTapes, holding per-tape data. + */ + + /* + * This variable is only used during merge passes. mergeactive[i] is true + * if we are reading an input run from (actual) tape number i and have not + * yet exhausted that run. + */ + bool *mergeactive; /* active input run source? */ + + /* + * Variables for Algorithm D. Note that destTape is a "logical" tape + * number, ie, an index into the tp_xxx[] arrays. Be careful to keep + * "logical" and "actual" tape numbers straight! + */ + int Level; /* Knuth's l */ + int destTape; /* current output tape (Knuth's j, less 1) */ + int *tp_fib; /* Target Fibonacci run counts (A[]) */ + int *tp_runs; /* # of real runs on each tape */ + int *tp_dummy; /* # of dummy runs for each tape (D[]) */ + int *tp_tapenum; /* Actual tape numbers (TAPE[]) */ + int activeTapes; /* # of active input tapes in merge pass */ + + /* + * These variables are used after completion of sorting to keep track of + * the next tuple to return. (In the tape case, the tape's current read + * position is also critical state.) 
+ */ + int result_tape; /* actual tape number of finished output */ + int current; /* array index (only used if SORTEDINMEM) */ + bool eof_reached; /* reached EOF (needed for cursors) */ + + /* markpos_xxx holds marked position for mark and restore */ + long markpos_block; /* tape block# (only used if SORTEDONTAPE) */ + int markpos_offset; /* saved "current", or offset in tape block */ + bool markpos_eof; /* saved "eof_reached" */ + + /* + * These variables are used during parallel sorting. + * + * worker is our worker identifier. Follows the general convention that + * -1 value relates to a leader tuplesort, and values >= 0 worker + * tuplesorts. (-1 can also be a serial tuplesort.) + * + * shared is mutable shared memory state, which is used to coordinate + * parallel sorts. + * + * nParticipants is the number of worker Tuplesortstates known by the + * leader to have actually been launched, which implies that they must + * finish a run leader can merge. Typically includes a worker state held + * by the leader process itself. Set in the leader Tuplesortstate only. + */ + int worker; + Sharedsort *shared; + int nParticipants; + + /* + * The sortKeys variable is used by every case other than the hash index + * case; it is set by tuplesort_begin_xxx. tupDesc is only used by the + * MinimalTuple and CLUSTER routines, though. + */ + TupleDesc tupDesc; + SortSupport sortKeys; /* array of length nKeys */ + + /* + * This variable is shared by the single-key MinimalTuple case and the + * Datum case (which both use qsort_ssup()). Otherwise it's NULL. + */ + SortSupport onlyKey; + + /* + * Additional state for managing "abbreviated key" sortsupport routines + * (which currently may be used by all cases except the hash index case). + * Tracks the intervals at which the optimization's effectiveness is + * tested. + */ + int64 abbrevNext; /* Tuple # at which to next check + * applicability */ + + /* + * These variables are specific to the CLUSTER case; they are set by + * tuplesort_begin_cluster. + */ + IndexInfo *indexInfo; /* info about index being used for reference */ + EState *estate; /* for evaluating index expressions */ + + /* + * These variables are specific to the IndexTuple case; they are set by + * tuplesort_begin_index_xxx and used only by the IndexTuple routines. + */ + Relation heapRel; /* table the index is being built on */ + Relation indexRel; /* index being built */ + + /* These are specific to the index_btree subcase: */ + bool enforceUnique; /* complain if we find duplicate tuples */ + + /* These are specific to the index_hash subcase: */ + uint32 high_mask; /* masks for sortable part of hash code */ + uint32 low_mask; + uint32 max_buckets; + + /* + * These variables are specific to the Datum case; they are set by + * tuplesort_begin_datum and used only by the DatumTuple routines. + */ + Oid datumType; + /* we need typelen in order to know how to copy the Datums. */ + int datumTypeLen; + + /* + * Resource snapshot for time of sort start. + */ +#ifdef TRACE_SORT + PGRUsage ru_start; +#endif +}; + +/* + * Private mutable state of tuplesort-parallel-operation. This is allocated + * in shared memory. + */ +struct Sharedsort +{ + /* mutex protects all fields prior to tapes */ + slock_t mutex; + + /* + * currentWorker generates ordinal identifier numbers for parallel sort + * workers. These start from 0, and are always gapless. + * + * Workers increment workersFinished to indicate having finished. 
If this + * is equal to state.nParticipants within the leader, leader is ready to + * merge worker runs. + */ + int currentWorker; + int workersFinished; + + /* Temporary file space */ + SharedFileSet fileset; + + /* Size of tapes flexible array */ + int nTapes; + + /* + * Tapes array used by workers to report back information needed by the + * leader to concatenate all worker tapes into one for merging + */ + TapeShare tapes[FLEXIBLE_ARRAY_MEMBER]; +}; + +/* + * Is the given tuple allocated from the slab memory arena? + */ +#define IS_SLAB_SLOT(state, tuple) \ + ((char *) (tuple) >= (state)->slabMemoryBegin && \ + (char *) (tuple) < (state)->slabMemoryEnd) + +/* + * Return the given tuple to the slab memory free list, or free it + * if it was palloc'd. + */ +#define RELEASE_SLAB_SLOT(state, tuple) \ + do { \ + SlabSlot *buf = (SlabSlot *) tuple; \ + \ + if (IS_SLAB_SLOT((state), buf)) \ + { \ + buf->nextfree = (state)->slabFreeHead; \ + (state)->slabFreeHead = buf; \ + } else \ + pfree(buf); \ + } while(0) + +#define COMPARETUP(state,a,b) ((*(state)->comparetup) (a, b, state)) +#define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup)) +#define WRITETUP(state,tape,stup) ((*(state)->writetup) (state, tape, stup)) +#define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len)) +#define LACKMEM(state) ((state)->availMem < 0 && !(state)->slabAllocatorUsed) +#define USEMEM(state,amt) ((state)->availMem -= (amt)) +#define FREEMEM(state,amt) ((state)->availMem += (amt)) +#define SERIAL(state) ((state)->shared == NULL) +#define WORKER(state) ((state)->shared && (state)->worker != -1) +#define LEADER(state) ((state)->shared && (state)->worker == -1) + +/* + * NOTES about on-tape representation of tuples: + * + * We require the first "unsigned int" of a stored tuple to be the total size + * on-tape of the tuple, including itself (so it is never zero; an all-zero + * unsigned int is used to delimit runs). The remainder of the stored tuple + * may or may not match the in-memory representation of the tuple --- + * any conversion needed is the job of the writetup and readtup routines. + * + * If state->randomAccess is true, then the stored representation of the + * tuple must be followed by another "unsigned int" that is a copy of the + * length --- so the total tape space used is actually sizeof(unsigned int) + * more than the stored length value. This allows read-backwards. When + * randomAccess is not true, the write/read routines may omit the extra + * length word. + * + * writetup is expected to write both length words as well as the tuple + * data. When readtup is called, the tape is positioned just after the + * front length word; readtup must read the tuple data and advance past + * the back length word (if present). + * + * The write/read routines can make use of the tuple description data + * stored in the Tuplesortstate record, if needed. They are also expected + * to adjust state->availMem by the amount of memory space (not tape space!) + * released or consumed. There is no error return from either writetup + * or readtup; they should ereport() on failure. + * + * + * NOTES about memory consumption calculations: + * + * We count space allocated for tuples against the workMem limit, plus + * the space used by the variable-size memtuples array. Fixed-size space + * is not counted; it's small enough to not be interesting. + * + * Note that we count actual space used (as shown by GetMemoryChunkSpace) + * rather than the originally-requested size. 
This is important since + * palloc can add substantial overhead. It's not a complete answer since + * we won't count any wasted space in palloc allocation blocks, but it's + * a lot better than what we were doing before 7.3. As of 9.6, a + * separate memory context is used for caller passed tuples. Resetting + * it at certain key increments significantly ameliorates fragmentation. + * Note that this places a responsibility on copytup routines to use the + * correct memory context for these tuples (and to not use the reset + * context for anything whose lifetime needs to span multiple external + * sort runs). readtup routines use the slab allocator (they cannot use + * the reset context because it gets deleted at the point that merging + * begins). + */ + +/* When using this macro, beware of double evaluation of len */ +#define LogicalTapeReadExact(tapeset, tapenum, ptr, len) \ + do { \ + if (LogicalTapeRead(tapeset, tapenum, ptr, len) != (size_t) (len)) \ + elog(ERROR, "unexpected end of data"); \ + } while(0) + + +static Tuplesortstate *tuplesort_begin_common(int workMem, + SortCoordinate coordinate, + bool randomAccess); +static void tuplesort_begin_batch(Tuplesortstate *state); +static void puttuple_common(Tuplesortstate *state, SortTuple *tuple); +static bool consider_abort_common(Tuplesortstate *state); +static void inittapes(Tuplesortstate *state, bool mergeruns); +static void inittapestate(Tuplesortstate *state, int maxTapes); +static void selectnewtape(Tuplesortstate *state); +static void init_slab_allocator(Tuplesortstate *state, int numSlots); +static void mergeruns(Tuplesortstate *state); +static void mergeonerun(Tuplesortstate *state); +static void beginmerge(Tuplesortstate *state); +static bool mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup); +static void dumptuples(Tuplesortstate *state, bool alltuples); +static void make_bounded_heap(Tuplesortstate *state); +static void sort_bounded_heap(Tuplesortstate *state); +static void tuplesort_sort_memtuples(Tuplesortstate *state); +static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple); +static void tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple); +static void tuplesort_heap_delete_top(Tuplesortstate *state); +static void reversedirection(Tuplesortstate *state); +static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK); +static void markrunend(Tuplesortstate *state, int tapenum); +static void *readtup_alloc(Tuplesortstate *state, Size tuplen); +static int comparetup_heap(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_heap(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_heap(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int comparetup_cluster(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_cluster(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_cluster(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int comparetup_index_btree(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static int comparetup_index_hash(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup); +static void 
writetup_index(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_index(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int comparetup_datum(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_datum(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_datum(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static int worker_get_identifier(Tuplesortstate *state); +static void worker_freeze_result_tape(Tuplesortstate *state); +static void worker_nomergeruns(Tuplesortstate *state); +static void leader_takeover_tapes(Tuplesortstate *state); +static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup); +static void tuplesort_free(Tuplesortstate *state); +static void tuplesort_updatemax(Tuplesortstate *state); + +/* + * Special versions of qsort just for SortTuple objects. qsort_tuple() sorts + * any variant of SortTuples, using the appropriate comparetup function. + * qsort_ssup() is specialized for the case where the comparetup function + * reduces to ApplySortComparator(), that is single-key MinimalTuple sorts + * and Datum sorts. + */ + +#define ST_SORT qsort_tuple +#define ST_ELEMENT_TYPE SortTuple +#define ST_COMPARE_RUNTIME_POINTER +#define ST_COMPARE_ARG_TYPE Tuplesortstate +#define ST_CHECK_FOR_INTERRUPTS +#define ST_SCOPE static +#define ST_DECLARE +#define ST_DEFINE +#include "lib/sort_template.h" + +#define ST_SORT qsort_ssup +#define ST_ELEMENT_TYPE SortTuple +#define ST_COMPARE(a, b, ssup) \ + ApplySortComparator((a)->datum1, (a)->isnull1, \ + (b)->datum1, (b)->isnull1, (ssup)) +#define ST_COMPARE_ARG_TYPE SortSupportData +#define ST_CHECK_FOR_INTERRUPTS +#define ST_SCOPE static +#define ST_DEFINE +#include "lib/sort_template.h" + +/* + * tuplesort_begin_xxx + * + * Initialize for a tuple sort operation. + * + * After calling tuplesort_begin, the caller should call tuplesort_putXXX + * zero or more times, then call tuplesort_performsort when all the tuples + * have been supplied. After performsort, retrieve the tuples in sorted + * order by calling tuplesort_getXXX until it returns false/NULL. (If random + * access was requested, rescan, markpos, and restorepos can also be called.) + * Call tuplesort_end to terminate the operation and release memory/disk space. + * + * Each variant of tuplesort_begin has a workMem parameter specifying the + * maximum number of kilobytes of RAM to use before spilling data to disk. + * (The normal value of this parameter is work_mem, but some callers use + * other values.) Each variant also has a randomAccess parameter specifying + * whether the caller needs non-sequential access to the sort result. + */ + +static Tuplesortstate * +tuplesort_begin_common(int workMem, SortCoordinate coordinate, + bool randomAccess) +{ + Tuplesortstate *state; + MemoryContext maincontext; + MemoryContext sortcontext; + MemoryContext oldcontext; + + /* See leader_takeover_tapes() remarks on randomAccess support */ + if (coordinate && randomAccess) + elog(ERROR, "random access disallowed under parallel sort"); + + /* + * Memory context surviving tuplesort_reset. This memory context holds + * data which is useful to keep while sorting multiple similar batches. + */ + maincontext = AllocSetContextCreate(CurrentMemoryContext, + "TupleSort main", + ALLOCSET_DEFAULT_SIZES); + + /* + * Create a working memory context for one sort operation. 
The content of + * this context is deleted by tuplesort_reset. + */ + sortcontext = AllocSetContextCreate(maincontext, + "TupleSort sort", + ALLOCSET_DEFAULT_SIZES); + + /* + * Additionally a working memory context for tuples is setup in + * tuplesort_begin_batch. + */ + + /* + * Make the Tuplesortstate within the per-sortstate context. This way, we + * don't need a separate pfree() operation for it at shutdown. + */ + oldcontext = MemoryContextSwitchTo(maincontext); + + state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate)); + +#ifdef TRACE_SORT + if (trace_sort) + pg_rusage_init(&state->ru_start); +#endif + + state->randomAccess = randomAccess; + state->tuples = true; + + /* + * workMem is forced to be at least 64KB, the current minimum valid value + * for the work_mem GUC. This is a defense against parallel sort callers + * that divide out memory among many workers in a way that leaves each + * with very little memory. + */ + state->allowedMem = Max(workMem, 64) * (int64) 1024; + state->sortcontext = sortcontext; + state->maincontext = maincontext; + + /* + * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD; + * see comments in grow_memtuples(). + */ + state->memtupsize = INITIAL_MEMTUPSIZE; + state->memtuples = NULL; + + /* + * After all of the other non-parallel-related state, we setup all of the + * state needed for each batch. + */ + tuplesort_begin_batch(state); + + /* + * Initialize parallel-related state based on coordination information + * from caller + */ + if (!coordinate) + { + /* Serial sort */ + state->shared = NULL; + state->worker = -1; + state->nParticipants = -1; + } + else if (coordinate->isWorker) + { + /* Parallel worker produces exactly one final run from all input */ + state->shared = coordinate->sharedsort; + state->worker = worker_get_identifier(state); + state->nParticipants = -1; + } + else + { + /* Parallel leader state only used for final merge */ + state->shared = coordinate->sharedsort; + state->worker = -1; + state->nParticipants = coordinate->nParticipants; + Assert(state->nParticipants >= 1); + } + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +/* + * tuplesort_begin_batch + * + * Setup, or reset, all state need for processing a new set of tuples with this + * sort state. Called both from tuplesort_begin_common (the first time sorting + * with this sort state) and tuplesort_reset (for subsequent usages). + */ +static void +tuplesort_begin_batch(Tuplesortstate *state) +{ + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(state->maincontext); + + /* + * Caller tuple (e.g. IndexTuple) memory context. + * + * A dedicated child context used exclusively for caller passed tuples + * eases memory management. Resetting at key points reduces + * fragmentation. Note that the memtuples array of SortTuples is allocated + * in the parent context, not this context, because there is no need to + * free memtuples early. + */ + state->tuplecontext = AllocSetContextCreate(state->sortcontext, + "Caller tuples", + ALLOCSET_DEFAULT_SIZES); + + state->status = TSS_INITIAL; + state->bounded = false; + state->boundUsed = false; + + state->availMem = state->allowedMem; + + state->tapeset = NULL; + + state->memtupcount = 0; + + /* + * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD; + * see comments in grow_memtuples(). 
+ */ + state->growmemtuples = true; + state->slabAllocatorUsed = false; + if (state->memtuples != NULL && state->memtupsize != INITIAL_MEMTUPSIZE) + { + pfree(state->memtuples); + state->memtuples = NULL; + state->memtupsize = INITIAL_MEMTUPSIZE; + } + if (state->memtuples == NULL) + { + state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + } + + /* workMem must be large enough for the minimal memtuples array */ + if (LACKMEM(state)) + elog(ERROR, "insufficient memory allowed for sort"); + + state->currentRun = 0; + + /* + * maxTapes, tapeRange, and Algorithm D variables will be initialized by + * inittapes(), if needed + */ + + state->result_tape = -1; /* flag that result tape has not been formed */ + + MemoryContextSwitchTo(oldcontext); +} + +Tuplesortstate * +tuplesort_begin_heap(TupleDesc tupDesc, + int nkeys, AttrNumber *attNums, + Oid *sortOperators, Oid *sortCollations, + bool *nullsFirstFlags, + int workMem, SortCoordinate coordinate, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->maincontext); + + AssertArg(nkeys > 0); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", + nkeys, workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = nkeys; + + TRACE_POSTGRESQL_SORT_START(HEAP_SORT, + false, /* no unique check */ + nkeys, + workMem, + randomAccess, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_heap; + state->copytup = copytup_heap; + state->writetup = writetup_heap; + state->readtup = readtup_heap; + + state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + state->abbrevNext = 10; + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData)); + + for (i = 0; i < nkeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + + AssertArg(attNums[i] != 0); + AssertArg(sortOperators[i] != 0); + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = sortCollations[i]; + sortKey->ssup_nulls_first = nullsFirstFlags[i]; + sortKey->ssup_attno = attNums[i]; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey); + } + + /* + * The "onlyKey" optimization cannot be used with abbreviated keys, since + * tie-breaker comparisons may be required. Typically, the optimization + * is only of value to pass-by-value types anyway, whereas abbreviated + * keys are typically only of value to pass-by-reference types. + */ + if (nkeys == 1 && !state->sortKeys->abbrev_converter) + state->onlyKey = state->sortKeys; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_cluster(TupleDesc tupDesc, + Relation indexRel, + int workMem, + SortCoordinate coordinate, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + BTScanInsert indexScanKey; + MemoryContext oldcontext; + int i; + + Assert(indexRel->rd_rel->relam == BTREE_AM_OID); + + oldcontext = MemoryContextSwitchTo(state->maincontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", + RelationGetNumberOfAttributes(indexRel), + workMem, randomAccess ? 
't' : 'f'); +#endif + + state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel); + + TRACE_POSTGRESQL_SORT_START(CLUSTER_SORT, + false, /* no unique check */ + state->nKeys, + workMem, + randomAccess, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_cluster; + state->copytup = copytup_cluster; + state->writetup = writetup_cluster; + state->readtup = readtup_cluster; + state->abbrevNext = 10; + + state->indexInfo = BuildIndexInfo(indexRel); + + state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + + indexScanKey = _bt_mkscankey(indexRel, NULL); + + if (state->indexInfo->ii_Expressions != NULL) + { + TupleTableSlot *slot; + ExprContext *econtext; + + /* + * We will need to use FormIndexDatum to evaluate the index + * expressions. To do that, we need an EState, as well as a + * TupleTableSlot to put the table tuples into. The econtext's + * scantuple has to point to that slot, too. + */ + state->estate = CreateExecutorState(); + slot = MakeSingleTupleTableSlot(tupDesc, &TTSOpsHeapTuple); + econtext = GetPerTupleExprContext(state->estate); + econtext->ecxt_scantuple = slot; + } + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + ScanKey scanKey = indexScanKey->scankeys + i; + int16 strategy; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = scanKey->sk_collation; + sortKey->ssup_nulls_first = + (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; + sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + AssertState(sortKey->ssup_attno != 0); + + strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? + BTGreaterStrategyNumber : BTLessStrategyNumber; + + PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); + } + + pfree(indexScanKey); + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_btree(Relation heapRel, + Relation indexRel, + bool enforceUnique, + int workMem, + SortCoordinate coordinate, + bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + BTScanInsert indexScanKey; + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->maincontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: unique = %c, workMem = %d, randomAccess = %c", + enforceUnique ? 't' : 'f', + workMem, randomAccess ? 
't' : 'f'); +#endif + + state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel); + + TRACE_POSTGRESQL_SORT_START(INDEX_SORT, + enforceUnique, + state->nKeys, + workMem, + randomAccess, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_index_btree; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + state->abbrevNext = 10; + + state->heapRel = heapRel; + state->indexRel = indexRel; + state->enforceUnique = enforceUnique; + + indexScanKey = _bt_mkscankey(indexRel, NULL); + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + ScanKey scanKey = indexScanKey->scankeys + i; + int16 strategy; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = scanKey->sk_collation; + sortKey->ssup_nulls_first = + (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; + sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + AssertState(sortKey->ssup_attno != 0); + + strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? + BTGreaterStrategyNumber : BTLessStrategyNumber; + + PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); + } + + pfree(indexScanKey); + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_hash(Relation heapRel, + Relation indexRel, + uint32 high_mask, + uint32 low_mask, + uint32 max_buckets, + int workMem, + SortCoordinate coordinate, + bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(state->maincontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: high_mask = 0x%x, low_mask = 0x%x, " + "max_buckets = 0x%x, workMem = %d, randomAccess = %c", + high_mask, + low_mask, + max_buckets, + workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = 1; /* Only one sort column, the hash code */ + + state->comparetup = comparetup_index_hash; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + + state->heapRel = heapRel; + state->indexRel = indexRel; + + state->high_mask = high_mask; + state->low_mask = low_mask; + state->max_buckets = max_buckets; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_gist(Relation heapRel, + Relation indexRel, + int workMem, + SortCoordinate coordinate, + bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: workMem = %d, randomAccess = %c", + workMem, randomAccess ? 
't' : 'f'); +#endif + + state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel); + + state->comparetup = comparetup_index_btree; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + + state->heapRel = heapRel; + state->indexRel = indexRel; + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = indexRel->rd_indcollation[i]; + sortKey->ssup_nulls_first = false; + sortKey->ssup_attno = i + 1; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + AssertState(sortKey->ssup_attno != 0); + + /* Look for a sort support function */ + PrepareSortSupportFromGistIndexRel(indexRel, sortKey); + } + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, + bool nullsFirstFlag, int workMem, + SortCoordinate coordinate, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + randomAccess); + MemoryContext oldcontext; + int16 typlen; + bool typbyval; + + oldcontext = MemoryContextSwitchTo(state->maincontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin datum sort: workMem = %d, randomAccess = %c", + workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = 1; /* always a one-column sort */ + + TRACE_POSTGRESQL_SORT_START(DATUM_SORT, + false, /* no unique check */ + 1, + workMem, + randomAccess, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_datum; + state->copytup = copytup_datum; + state->writetup = writetup_datum; + state->readtup = readtup_datum; + state->abbrevNext = 10; + + state->datumType = datumType; + + /* lookup necessary attributes of the datum type */ + get_typlenbyval(datumType, &typlen, &typbyval); + state->datumTypeLen = typlen; + state->tuples = !typbyval; + + /* Prepare SortSupport data */ + state->sortKeys = (SortSupport) palloc0(sizeof(SortSupportData)); + + state->sortKeys->ssup_cxt = CurrentMemoryContext; + state->sortKeys->ssup_collation = sortCollation; + state->sortKeys->ssup_nulls_first = nullsFirstFlag; + + /* + * Abbreviation is possible here only for by-reference types. In theory, + * a pass-by-value datatype could have an abbreviated form that is cheaper + * to compare. In a tuple sort, we could support that, because we can + * always extract the original datum from the tuple as needed. Here, we + * can't, because a datum sort only stores a single copy of the datum; the + * "tuple" field of each SortTuple is NULL. + */ + state->sortKeys->abbreviate = !typbyval; + + PrepareSortSupportFromOrderingOp(sortOperator, state->sortKeys); + + /* + * The "onlyKey" optimization cannot be used with abbreviated keys, since + * tie-breaker comparisons may be required. Typically, the optimization + * is only of value to pass-by-value types anyway, whereas abbreviated + * keys are typically only of value to pass-by-reference types. + */ + if (!state->sortKeys->abbrev_converter) + state->onlyKey = state->sortKeys; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +/* + * tuplesort_set_bound + * + * Advise tuplesort that at most the first N result tuples are required. + * + * Must be called before inserting any tuples. 
(Actually, we could allow it + * as long as the sort hasn't spilled to disk, but there seems no need for + * delayed calls at the moment.) + * + * This is a hint only. The tuplesort may still return more tuples than + * requested. Parallel leader tuplesorts will always ignore the hint. + */ +void +tuplesort_set_bound(Tuplesortstate *state, int64 bound) +{ + /* Assert we're called before loading any tuples */ + Assert(state->status == TSS_INITIAL && state->memtupcount == 0); + /* Can't set the bound twice, either */ + Assert(!state->bounded); + /* Also, this shouldn't be called in a parallel worker */ + Assert(!WORKER(state)); + + /* Parallel leader allows but ignores hint */ + if (LEADER(state)) + return; + +#ifdef DEBUG_BOUNDED_SORT + /* Honor GUC setting that disables the feature (for easy testing) */ + if (!optimize_bounded_sort) + return; +#endif + + /* We want to be able to compute bound * 2, so limit the setting */ + if (bound > (int64) (INT_MAX / 2)) + return; + + state->bounded = true; + state->bound = (int) bound; + + /* + * Bounded sorts are not an effective target for abbreviated key + * optimization. Disable by setting state to be consistent with no + * abbreviation support. + */ + state->sortKeys->abbrev_converter = NULL; + if (state->sortKeys->abbrev_full_comparator) + state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator; + + /* Not strictly necessary, but be tidy */ + state->sortKeys->abbrev_abort = NULL; + state->sortKeys->abbrev_full_comparator = NULL; +} + +/* + * tuplesort_used_bound + * + * Allow callers to find out if the sort state was able to use a bound. + */ +bool +tuplesort_used_bound(Tuplesortstate *state) +{ + return state->boundUsed; +} + +/* + * tuplesort_free + * + * Internal routine for freeing resources of tuplesort. + */ +static void +tuplesort_free(Tuplesortstate *state) +{ + /* context swap probably not needed, but let's be safe */ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + long spaceUsed; + + if (state->tapeset) + spaceUsed = LogicalTapeSetBlocks(state->tapeset); + else + spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; +#endif + + /* + * Delete temporary "tape" files, if any. + * + * Note: want to include this in reported total cost of sort, hence need + * for two #ifdef TRACE_SORT sections. + */ + if (state->tapeset) + LogicalTapeSetClose(state->tapeset); + +#ifdef TRACE_SORT + if (trace_sort) + { + if (state->tapeset) + elog(LOG, "%s of worker %d ended, %ld disk blocks used: %s", + SERIAL(state) ? "external sort" : "parallel external sort", + state->worker, spaceUsed, pg_rusage_show(&state->ru_start)); + else + elog(LOG, "%s of worker %d ended, %ld KB used: %s", + SERIAL(state) ? "internal sort" : "unperformed parallel sort", + state->worker, spaceUsed, pg_rusage_show(&state->ru_start)); + } + + TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, spaceUsed); +#else + + /* + * If you disabled TRACE_SORT, you can still probe sort__done, but you + * ain't getting space-used stats. + */ + TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, 0L); +#endif + + /* Free any execution state created for CLUSTER case */ + if (state->estate != NULL) + { + ExprContext *econtext = GetPerTupleExprContext(state->estate); + + ExecDropSingleTupleTableSlot(econtext->ecxt_scantuple); + FreeExecutorState(state->estate); + } + + MemoryContextSwitchTo(oldcontext); + + /* + * Free the per-sort memory context, thereby releasing all working memory. 
+ */ + MemoryContextReset(state->sortcontext); +} + +/* + * tuplesort_end + * + * Release resources and clean up. + * + * NOTE: after calling this, any pointers returned by tuplesort_getXXX are + * pointing to garbage. Be careful not to attempt to use or free such + * pointers afterwards! + */ +void +tuplesort_end(Tuplesortstate *state) +{ + tuplesort_free(state); + + /* + * Free the main memory context, including the Tuplesortstate struct + * itself. + */ + MemoryContextDelete(state->maincontext); +} + +/* + * tuplesort_updatemax + * + * Update maximum resource usage statistics. + */ +static void +tuplesort_updatemax(Tuplesortstate *state) +{ + int64 spaceUsed; + bool isSpaceDisk; + + /* + * Note: it might seem we should provide both memory and disk usage for a + * disk-based sort. However, the current code doesn't track memory space + * accurately once we have begun to return tuples to the caller (since we + * don't account for pfree's the caller is expected to do), so we cannot + * rely on availMem in a disk sort. This does not seem worth the overhead + * to fix. Is it worth creating an API for the memory context code to + * tell us how much is actually used in sortcontext? + */ + if (state->tapeset) + { + isSpaceDisk = true; + spaceUsed = LogicalTapeSetBlocks(state->tapeset) * BLCKSZ; + } + else + { + isSpaceDisk = false; + spaceUsed = state->allowedMem - state->availMem; + } + + /* + * Sort evicts data to the disk when it wasn't able to fit that data into + * main memory. This is why we assume space used on the disk to be more + * important for tracking resource usage than space used in memory. Note + * that the amount of space occupied by some tupleset on the disk might be + * less than amount of space occupied by the same tupleset in memory due + * to more compact representation. + */ + if ((isSpaceDisk && !state->isMaxSpaceDisk) || + (isSpaceDisk == state->isMaxSpaceDisk && spaceUsed > state->maxSpace)) + { + state->maxSpace = spaceUsed; + state->isMaxSpaceDisk = isSpaceDisk; + state->maxSpaceStatus = state->status; + } +} + +/* + * tuplesort_reset + * + * Reset the tuplesort. Reset all the data in the tuplesort, but leave the + * meta-information in. After tuplesort_reset, tuplesort is ready to start + * a new sort. This allows avoiding recreation of tuple sort states (and + * save resources) when sorting multiple small batches. + */ +void +tuplesort_reset(Tuplesortstate *state) +{ + tuplesort_updatemax(state); + tuplesort_free(state); + + /* + * After we've freed up per-batch memory, re-setup all of the state common + * to both the first batch and any subsequent batch. + */ + tuplesort_begin_batch(state); + + state->lastReturnedTuple = NULL; + state->slabMemoryBegin = NULL; + state->slabMemoryEnd = NULL; + state->slabFreeHead = NULL; +} + +/* + * Grow the memtuples[] array, if possible within our memory constraint. We + * must not exceed INT_MAX tuples in memory or the caller-provided memory + * limit. Return true if we were able to enlarge the array, false if not. + * + * Normally, at each increment we double the size of the array. When doing + * that would exceed a limit, we attempt one last, smaller increase (and then + * clear the growmemtuples flag so we don't try any more). That allows us to + * use memory as fully as permitted; sticking to the pure doubling rule could + * result in almost half going unused. 
Because availMem moves around with + * tuple addition/removal, we need some rule to prevent making repeated small + * increases in memtupsize, which would just be useless thrashing. The + * growmemtuples flag accomplishes that and also prevents useless + * recalculations in this function. + */ +static bool +grow_memtuples(Tuplesortstate *state) +{ + int newmemtupsize; + int memtupsize = state->memtupsize; + int64 memNowUsed = state->allowedMem - state->availMem; + + /* Forget it if we've already maxed out memtuples, per comment above */ + if (!state->growmemtuples) + return false; + + /* Select new value of memtupsize */ + if (memNowUsed <= state->availMem) + { + /* + * We've used no more than half of allowedMem; double our usage, + * clamping at INT_MAX tuples. + */ + if (memtupsize < INT_MAX / 2) + newmemtupsize = memtupsize * 2; + else + { + newmemtupsize = INT_MAX; + state->growmemtuples = false; + } + } + else + { + /* + * This will be the last increment of memtupsize. Abandon doubling + * strategy and instead increase as much as we safely can. + * + * To stay within allowedMem, we can't increase memtupsize by more + * than availMem / sizeof(SortTuple) elements. In practice, we want + * to increase it by considerably less, because we need to leave some + * space for the tuples to which the new array slots will refer. We + * assume the new tuples will be about the same size as the tuples + * we've already seen, and thus we can extrapolate from the space + * consumption so far to estimate an appropriate new size for the + * memtuples array. The optimal value might be higher or lower than + * this estimate, but it's hard to know that in advance. We again + * clamp at INT_MAX tuples. + * + * This calculation is safe against enlarging the array so much that + * LACKMEM becomes true, because the memory currently used includes + * the present array; thus, there would be enough allowedMem for the + * new array elements even if no other memory were currently used. + * + * We do the arithmetic in float8, because otherwise the product of + * memtupsize and allowedMem could overflow. Any inaccuracy in the + * result should be insignificant; but even if we computed a + * completely insane result, the checks below will prevent anything + * really bad from happening. + */ + double grow_ratio; + + grow_ratio = (double) state->allowedMem / (double) memNowUsed; + if (memtupsize * grow_ratio < INT_MAX) + newmemtupsize = (int) (memtupsize * grow_ratio); + else + newmemtupsize = INT_MAX; + + /* We won't make any further enlargement attempts */ + state->growmemtuples = false; + } + + /* Must enlarge array by at least one element, else report failure */ + if (newmemtupsize <= memtupsize) + goto noalloc; + + /* + * On a 32-bit machine, allowedMem could exceed MaxAllocHugeSize. Clamp + * to ensure our request won't be rejected. Note that we can easily + * exhaust address space before facing this outcome. (This is presently + * impossible due to guc.c's MAX_KILOBYTES limitation on work_mem, but + * don't rely on that at this distance.) + */ + if ((Size) newmemtupsize >= MaxAllocHugeSize / sizeof(SortTuple)) + { + newmemtupsize = (int) (MaxAllocHugeSize / sizeof(SortTuple)); + state->growmemtuples = false; /* can't grow any more */ + } + + /* + * We need to be sure that we do not cause LACKMEM to become true, else + * the space management algorithm will go nuts. The code above should + * never generate a dangerous request, but to be safe, check explicitly + * that the array growth fits within availMem. 
(We could still cause + * LACKMEM if the memory chunk overhead associated with the memtuples + * array were to increase. That shouldn't happen because we chose the + * initial array size large enough to ensure that palloc will be treating + * both old and new arrays as separate chunks. But we'll check LACKMEM + * explicitly below just in case.) + */ + if (state->availMem < (int64) ((newmemtupsize - memtupsize) * sizeof(SortTuple))) + goto noalloc; + + /* OK, do it */ + FREEMEM(state, GetMemoryChunkSpace(state->memtuples)); + state->memtupsize = newmemtupsize; + state->memtuples = (SortTuple *) + repalloc_huge(state->memtuples, + state->memtupsize * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + if (LACKMEM(state)) + elog(ERROR, "unexpected out-of-memory situation in tuplesort"); + return true; + +noalloc: + /* If for any reason we didn't realloc, shut off future attempts */ + state->growmemtuples = false; + return false; +} + +/* + * Accept one tuple while collecting input data for sort. + * + * Note that the input data is always copied; the caller need not save it. + */ +void +tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + /* + * Copy the given tuple into memory we control, and decrease availMem. + * Then call the common code. + */ + COPYTUP(state, &stup, (void *) slot); + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Accept one tuple while collecting input data for sort. + * + * Note that the input data is always copied; the caller need not save it. + */ +void +tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + /* + * Copy the given tuple into memory we control, and decrease availMem. + * Then call the common code. + */ + COPYTUP(state, &stup, (void *) tup); + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Collect one index tuple while collecting input data for sort, building + * it from caller-supplied values. + */ +void +tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel, + ItemPointer self, Datum *values, + bool *isnull) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + SortTuple stup; + Datum original; + IndexTuple tuple; + + stup.tuple = index_form_tuple(RelationGetDescr(rel), values, isnull); + tuple = ((IndexTuple) stup.tuple); + tuple->t_tid = *self; + USEMEM(state, GetMemoryChunkSpace(stup.tuple)); + /* set up first-column key value */ + original = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup.isnull1); + + MemoryContextSwitchTo(state->sortcontext); + + if (!state->sortKeys || !state->sortKeys->abbrev_converter || stup.isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). 
+ */ + stup.datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup.datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup.datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = mtup->tuple; + mtup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &mtup->isnull1); + } + } + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Accept one Datum while collecting input data for sort. + * + * If the Datum is pass-by-ref type, the value will be copied. + */ +void +tuplesort_putdatum(Tuplesortstate *state, Datum val, bool isNull) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + SortTuple stup; + + /* + * Pass-by-value types or null values are just stored directly in + * stup.datum1 (and stup.tuple is not used and set to NULL). + * + * Non-null pass-by-reference values need to be copied into memory we + * control, and possibly abbreviated. The copied value is pointed to by + * stup.tuple and is treated as the canonical copy (e.g. to return via + * tuplesort_getdatum or when writing to tape); stup.datum1 gets the + * abbreviated value if abbreviation is happening, otherwise it's + * identical to stup.tuple. + */ + + if (isNull || !state->tuples) + { + /* + * Set datum1 to zeroed representation for NULLs (to be consistent, + * and to support cheap inequality tests for NULL abbreviated keys). + */ + stup.datum1 = !isNull ? val : (Datum) 0; + stup.isnull1 = isNull; + stup.tuple = NULL; /* no separate storage */ + MemoryContextSwitchTo(state->sortcontext); + } + else + { + Datum original = datumCopy(val, false, state->datumTypeLen); + + stup.isnull1 = false; + stup.tuple = DatumGetPointer(original); + USEMEM(state, GetMemoryChunkSpace(stup.tuple)); + MemoryContextSwitchTo(state->sortcontext); + + if (!state->sortKeys->abbrev_converter) + { + stup.datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup.datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup.datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any + * case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + mtup->datum1 = PointerGetDatum(mtup->tuple); + } + } + } + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Shared code for tuple and datum cases. 
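+ *
+ * As a rough illustration of the state transitions below (the numbers are
+ * chosen purely for exposition): in a bounded sort with bound = 100, tuples
+ * accumulate in memory until either more than 200 have been seen, or
+ * workMem fills up with more than 100 tuples in memory; at that point
+ * make_bounded_heap() switches us to TSS_BOUNDED and only the best 100
+ * tuples seen so far are retained.  An unbounded sort that overflows
+ * workMem instead calls inittapes() and dumptuples(), moving to
+ * TSS_BUILDRUNS.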
+ */ +static void +puttuple_common(Tuplesortstate *state, SortTuple *tuple) +{ + Assert(!LEADER(state)); + + switch (state->status) + { + case TSS_INITIAL: + + /* + * Save the tuple into the unsorted array. First, grow the array + * as needed. Note that we try to grow the array when there is + * still one free slot remaining --- if we fail, there'll still be + * room to store the incoming tuple, and then we'll switch to + * tape-based operation. + */ + if (state->memtupcount >= state->memtupsize - 1) + { + (void) grow_memtuples(state); + Assert(state->memtupcount < state->memtupsize); + } + state->memtuples[state->memtupcount++] = *tuple; + + /* + * Check if it's time to switch over to a bounded heapsort. We do + * so if the input tuple count exceeds twice the desired tuple + * count (this is a heuristic for where heapsort becomes cheaper + * than a quicksort), or if we've just filled workMem and have + * enough tuples to meet the bound. + * + * Note that once we enter TSS_BOUNDED state we will always try to + * complete the sort that way. In the worst case, if later input + * tuples are larger than earlier ones, this might cause us to + * exceed workMem significantly. + */ + if (state->bounded && + (state->memtupcount > state->bound * 2 || + (state->memtupcount > state->bound && LACKMEM(state)))) + { +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "switching to bounded heapsort at %d tuples: %s", + state->memtupcount, + pg_rusage_show(&state->ru_start)); +#endif + make_bounded_heap(state); + return; + } + + /* + * Done if we still fit in available memory and have array slots. + */ + if (state->memtupcount < state->memtupsize && !LACKMEM(state)) + return; + + /* + * Nope; time to switch to tape-based operation. + */ + inittapes(state, true); + + /* + * Dump all tuples. + */ + dumptuples(state, false); + break; + + case TSS_BOUNDED: + + /* + * We don't want to grow the array here, so check whether the new + * tuple can be discarded before putting it in. This should be a + * good speed optimization, too, since when there are many more + * input tuples than the bound, most input tuples can be discarded + * with just this one comparison. Note that because we currently + * have the sort direction reversed, we must check for <= not >=. + */ + if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0) + { + /* new tuple <= top of the heap, so we can discard it */ + free_sort_tuple(state, tuple); + CHECK_FOR_INTERRUPTS(); + } + else + { + /* discard top of heap, replacing it with the new tuple */ + free_sort_tuple(state, &state->memtuples[0]); + tuplesort_heap_replace_top(state, tuple); + } + break; + + case TSS_BUILDRUNS: + + /* + * Save the tuple into the unsorted array (there must be space) + */ + state->memtuples[state->memtupcount++] = *tuple; + + /* + * If we are over the memory limit, dump all tuples. + */ + dumptuples(state, false); + break; + + default: + elog(ERROR, "invalid tuplesort state"); + break; + } +} + +static bool +consider_abort_common(Tuplesortstate *state) +{ + Assert(state->sortKeys[0].abbrev_converter != NULL); + Assert(state->sortKeys[0].abbrev_abort != NULL); + Assert(state->sortKeys[0].abbrev_full_comparator != NULL); + + /* + * Check effectiveness of abbreviation optimization. Consider aborting + * when still within memory limit. + */ + if (state->status == TSS_INITIAL && + state->memtupcount >= state->abbrevNext) + { + state->abbrevNext *= 2; + + /* + * Check opclass-supplied abbreviation abort routine. It may indicate + * that abbreviation should not proceed. 
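+ * (Because abbrevNext starts at 10 and doubles after each check, the
+ * abort routine is consulted after roughly 10, 20, 40, ... tuples, so an
+ * ineffective abbreviation strategy is abandoned early, at little cost.)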
+ */ + if (!state->sortKeys->abbrev_abort(state->memtupcount, + state->sortKeys)) + return false; + + /* + * Finally, restore authoritative comparator, and indicate that + * abbreviation is not in play by setting abbrev_converter to NULL + */ + state->sortKeys[0].comparator = state->sortKeys[0].abbrev_full_comparator; + state->sortKeys[0].abbrev_converter = NULL; + /* Not strictly necessary, but be tidy */ + state->sortKeys[0].abbrev_abort = NULL; + state->sortKeys[0].abbrev_full_comparator = NULL; + + /* Give up - expect original pass-by-value representation */ + return true; + } + + return false; +} + +/* + * All tuples have been provided; finish the sort. + */ +void +tuplesort_performsort(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "performsort of worker %d starting: %s", + state->worker, pg_rusage_show(&state->ru_start)); +#endif + + switch (state->status) + { + case TSS_INITIAL: + + /* + * We were able to accumulate all the tuples within the allowed + * amount of memory, or leader to take over worker tapes + */ + if (SERIAL(state)) + { + /* Just qsort 'em and we're done */ + tuplesort_sort_memtuples(state); + state->status = TSS_SORTEDINMEM; + } + else if (WORKER(state)) + { + /* + * Parallel workers must still dump out tuples to tape. No + * merge is required to produce single output run, though. + */ + inittapes(state, false); + dumptuples(state, true); + worker_nomergeruns(state); + state->status = TSS_SORTEDONTAPE; + } + else + { + /* + * Leader will take over worker tapes and merge worker runs. + * Note that mergeruns sets the correct state->status. + */ + leader_takeover_tapes(state); + mergeruns(state); + } + state->current = 0; + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + + case TSS_BOUNDED: + + /* + * We were able to accumulate all the tuples required for output + * in memory, using a heap to eliminate excess tuples. Now we + * have to transform the heap to a properly-sorted array. + */ + sort_bounded_heap(state); + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + state->status = TSS_SORTEDINMEM; + break; + + case TSS_BUILDRUNS: + + /* + * Finish tape-based sort. First, flush all tuples remaining in + * memory out to tape; then merge until we have a single remaining + * run (or, if !randomAccess and !WORKER(), one run per tape). + * Note that mergeruns sets the correct state->status. + */ + dumptuples(state, true); + mergeruns(state); + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + +#ifdef TRACE_SORT + if (trace_sort) + { + if (state->status == TSS_FINALMERGE) + elog(LOG, "performsort of worker %d done (except %d-way final merge): %s", + state->worker, state->activeTapes, + pg_rusage_show(&state->ru_start)); + else + elog(LOG, "performsort of worker %d done: %s", + state->worker, pg_rusage_show(&state->ru_start)); + } +#endif + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Internal routine to fetch the next tuple in either forward or back + * direction into *stup. Returns false if no more tuples. + * Returned tuple belongs to tuplesort memory context, and must not be freed + * by caller. Note that fetched tuple is stored in memory that may be + * recycled by any future fetch. 
+ */ +static bool +tuplesort_gettuple_common(Tuplesortstate *state, bool forward, + SortTuple *stup) +{ + unsigned int tuplen; + size_t nmoved; + + Assert(!WORKER(state)); + + switch (state->status) + { + case TSS_SORTEDINMEM: + Assert(forward || state->randomAccess); + Assert(!state->slabAllocatorUsed); + if (forward) + { + if (state->current < state->memtupcount) + { + *stup = state->memtuples[state->current++]; + return true; + } + state->eof_reached = true; + + /* + * Complain if caller tries to retrieve more tuples than + * originally asked for in a bounded sort. This is because + * returning EOF here might be the wrong thing. + */ + if (state->bounded && state->current >= state->bound) + elog(ERROR, "retrieved too many tuples in a bounded sort"); + + return false; + } + else + { + if (state->current <= 0) + return false; + + /* + * if all tuples are fetched already then we return last + * tuple, else - tuple before last returned. + */ + if (state->eof_reached) + state->eof_reached = false; + else + { + state->current--; /* last returned tuple */ + if (state->current <= 0) + return false; + } + *stup = state->memtuples[state->current - 1]; + return true; + } + break; + + case TSS_SORTEDONTAPE: + Assert(forward || state->randomAccess); + Assert(state->slabAllocatorUsed); + + /* + * The slot that held the tuple that we returned in previous + * gettuple call can now be reused. + */ + if (state->lastReturnedTuple) + { + RELEASE_SLAB_SLOT(state, state->lastReturnedTuple); + state->lastReturnedTuple = NULL; + } + + if (forward) + { + if (state->eof_reached) + return false; + + if ((tuplen = getlen(state, state->result_tape, true)) != 0) + { + READTUP(state, stup, state->result_tape, tuplen); + + /* + * Remember the tuple we return, so that we can recycle + * its memory on next call. (This can be NULL, in the + * !state->tuples case). + */ + state->lastReturnedTuple = stup->tuple; + + return true; + } + else + { + state->eof_reached = true; + return false; + } + } + + /* + * Backward. + * + * if all tuples are fetched already then we return last tuple, + * else - tuple before last returned. + */ + if (state->eof_reached) + { + /* + * Seek position is pointing just past the zero tuplen at the + * end of file; back up to fetch last tuple's ending length + * word. If seek fails we must have a completely empty file. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + 2 * sizeof(unsigned int)); + if (nmoved == 0) + return false; + else if (nmoved != 2 * sizeof(unsigned int)) + elog(ERROR, "unexpected tape position"); + state->eof_reached = false; + } + else + { + /* + * Back up and fetch previously-returned tuple's ending length + * word. If seek fails, assume we are at start of file. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + sizeof(unsigned int)); + if (nmoved == 0) + return false; + else if (nmoved != sizeof(unsigned int)) + elog(ERROR, "unexpected tape position"); + tuplen = getlen(state, state->result_tape, false); + + /* + * Back up to get ending length word of tuple before it. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + tuplen + 2 * sizeof(unsigned int)); + if (nmoved == tuplen + sizeof(unsigned int)) + { + /* + * We backed up over the previous tuple, but there was no + * ending length word before it. That means that the prev + * tuple is the first tuple in the file. It is now the + * next to read in forward direction (not obviously right, + * but that is what in-memory case does). 
+ */ + return false; + } + else if (nmoved != tuplen + 2 * sizeof(unsigned int)) + elog(ERROR, "bogus tuple length in backward scan"); + } + + tuplen = getlen(state, state->result_tape, false); + + /* + * Now we have the length of the prior tuple, back up and read it. + * Note: READTUP expects we are positioned after the initial + * length word of the tuple, so back up to that point. + */ + nmoved = LogicalTapeBackspace(state->tapeset, + state->result_tape, + tuplen); + if (nmoved != tuplen) + elog(ERROR, "bogus tuple length in backward scan"); + READTUP(state, stup, state->result_tape, tuplen); + + /* + * Remember the tuple we return, so that we can recycle its memory + * on next call. (This can be NULL, in the Datum case). + */ + state->lastReturnedTuple = stup->tuple; + + return true; + + case TSS_FINALMERGE: + Assert(forward); + /* We are managing memory ourselves, with the slab allocator. */ + Assert(state->slabAllocatorUsed); + + /* + * The slab slot holding the tuple that we returned in previous + * gettuple call can now be reused. + */ + if (state->lastReturnedTuple) + { + RELEASE_SLAB_SLOT(state, state->lastReturnedTuple); + state->lastReturnedTuple = NULL; + } + + /* + * This code should match the inner loop of mergeonerun(). + */ + if (state->memtupcount > 0) + { + int srcTape = state->memtuples[0].srctape; + SortTuple newtup; + + *stup = state->memtuples[0]; + + /* + * Remember the tuple we return, so that we can recycle its + * memory on next call. (This can be NULL, in the Datum case). + */ + state->lastReturnedTuple = stup->tuple; + + /* + * Pull next tuple from tape, and replace the returned tuple + * at top of the heap with it. + */ + if (!mergereadnext(state, srcTape, &newtup)) + { + /* + * If no more data, we've reached end of run on this tape. + * Remove the top node from the heap. + */ + tuplesort_heap_delete_top(state); + + /* + * Rewind to free the read buffer. It'd go away at the + * end of the sort anyway, but better to release the + * memory early. + */ + LogicalTapeRewindForWrite(state->tapeset, srcTape); + return true; + } + newtup.srctape = srcTape; + tuplesort_heap_replace_top(state, &newtup); + return true; + } + return false; + + default: + elog(ERROR, "invalid tuplesort state"); + return false; /* keep compiler quiet */ + } +} + +/* + * Fetch the next tuple in either forward or back direction. + * If successful, put tuple in slot and return true; else, clear the slot + * and return false. + * + * Caller may optionally be passed back abbreviated value (on true return + * value) when abbreviation was used, which can be used to cheaply avoid + * equality checks that might otherwise be required. Caller can safely make a + * determination of "non-equal tuple" based on simple binary inequality. A + * NULL value in leading attribute will set abbreviated value to zeroed + * representation, which caller may rely on in abbreviated inequality check. + * + * If copy is true, the slot receives a tuple that's been copied into the + * caller's memory context, so that it will stay valid regardless of future + * manipulations of the tuplesort's state (up to and including deleting the + * tuplesort). If copy is false, the slot will just receive a pointer to a + * tuple held within the tuplesort, which is more efficient, but only safe for + * callers that are prepared to have any subsequent manipulation of the + * tuplesort's state invalidate slot contents. 
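+ *
+ * A minimal usage sketch for the serial heap-tuple case (the input loop is
+ * only indicated; see the tuplesort_begin_xxx comments for the full
+ * contract):
+ *
+ *		state = tuplesort_begin_heap(tupDesc, nkeys, attNums,
+ *									 sortOperators, sortCollations,
+ *									 nullsFirstFlags, work_mem,
+ *									 NULL, false);
+ *		while (... more input ...)
+ *			tuplesort_puttupleslot(state, slot);
+ *		tuplesort_performsort(state);
+ *		while (tuplesort_gettupleslot(state, true, false, slot, NULL))
+ *			... consume tuple now stored in slot ...;
+ *		tuplesort_end(state);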
+ */ +bool +tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy, + TupleTableSlot *slot, Datum *abbrev) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + if (stup.tuple) + { + /* Record abbreviated key for caller */ + if (state->sortKeys->abbrev_converter && abbrev) + *abbrev = stup.datum1; + + if (copy) + stup.tuple = heap_copy_minimal_tuple((MinimalTuple) stup.tuple); + + ExecStoreMinimalTuple((MinimalTuple) stup.tuple, slot, copy); + return true; + } + else + { + ExecClearTuple(slot); + return false; + } +} + +/* + * Fetch the next tuple in either forward or back direction. + * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory + * context, and must not be freed by caller. Caller may not rely on tuple + * remaining valid after any further manipulation of tuplesort. + */ +HeapTuple +tuplesort_getheaptuple(Tuplesortstate *state, bool forward) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + return stup.tuple; +} + +/* + * Fetch the next index tuple in either forward or back direction. + * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory + * context, and must not be freed by caller. Caller may not rely on tuple + * remaining valid after any further manipulation of tuplesort. + */ +IndexTuple +tuplesort_getindextuple(Tuplesortstate *state, bool forward) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + return (IndexTuple) stup.tuple; +} + +/* + * Fetch the next Datum in either forward or back direction. + * Returns false if no more datums. + * + * If the Datum is pass-by-ref type, the returned value is freshly palloc'd + * in caller's context, and is now owned by the caller (this differs from + * similar routines for other types of tuplesorts). + * + * Caller may optionally be passed back abbreviated value (on true return + * value) when abbreviation was used, which can be used to cheaply avoid + * equality checks that might otherwise be required. Caller can safely make a + * determination of "non-equal tuple" based on simple binary inequality. A + * NULL value will have a zeroed abbreviated value representation, which caller + * may rely on in abbreviated inequality check. 
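+ *
+ * A minimal usage sketch for a serial Datum sort (the type and operator
+ * OIDs are placeholders; any ordering operator valid for the sorted type
+ * may be passed to tuplesort_begin_datum):
+ *
+ *		state = tuplesort_begin_datum(INT4OID, Int4LessOperator, InvalidOid,
+ *									  false, work_mem, NULL, false);
+ *		tuplesort_putdatum(state, Int32GetDatum(x), false);
+ *		... more tuplesort_putdatum() calls ...
+ *		tuplesort_performsort(state);
+ *		while (tuplesort_getdatum(state, true, &val, &isNull, NULL))
+ *			... use val ...;
+ *		tuplesort_end(state);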
+ */ +bool +tuplesort_getdatum(Tuplesortstate *state, bool forward, + Datum *val, bool *isNull, Datum *abbrev) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + { + MemoryContextSwitchTo(oldcontext); + return false; + } + + /* Ensure we copy into caller's memory context */ + MemoryContextSwitchTo(oldcontext); + + /* Record abbreviated key for caller */ + if (state->sortKeys->abbrev_converter && abbrev) + *abbrev = stup.datum1; + + if (stup.isnull1 || !state->tuples) + { + *val = stup.datum1; + *isNull = stup.isnull1; + } + else + { + /* use stup.tuple because stup.datum1 may be an abbreviation */ + *val = datumCopy(PointerGetDatum(stup.tuple), false, state->datumTypeLen); + *isNull = false; + } + + return true; +} + +/* + * Advance over N tuples in either forward or back direction, + * without returning any data. N==0 is a no-op. + * Returns true if successful, false if ran out of tuples. + */ +bool +tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, bool forward) +{ + MemoryContext oldcontext; + + /* + * We don't actually support backwards skip yet, because no callers need + * it. The API is designed to allow for that later, though. + */ + Assert(forward); + Assert(ntuples >= 0); + Assert(!WORKER(state)); + + switch (state->status) + { + case TSS_SORTEDINMEM: + if (state->memtupcount - state->current >= ntuples) + { + state->current += ntuples; + return true; + } + state->current = state->memtupcount; + state->eof_reached = true; + + /* + * Complain if caller tries to retrieve more tuples than + * originally asked for in a bounded sort. This is because + * returning EOF here might be the wrong thing. + */ + if (state->bounded && state->current >= state->bound) + elog(ERROR, "retrieved too many tuples in a bounded sort"); + + return false; + + case TSS_SORTEDONTAPE: + case TSS_FINALMERGE: + + /* + * We could probably optimize these cases better, but for now it's + * not worth the trouble. + */ + oldcontext = MemoryContextSwitchTo(state->sortcontext); + while (ntuples-- > 0) + { + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + { + MemoryContextSwitchTo(oldcontext); + return false; + } + CHECK_FOR_INTERRUPTS(); + } + MemoryContextSwitchTo(oldcontext); + return true; + + default: + elog(ERROR, "invalid tuplesort state"); + return false; /* keep compiler quiet */ + } +} + +/* + * tuplesort_merge_order - report merge order we'll use for given memory + * (note: "merge order" just means the number of input tapes in the merge). + * + * This is exported for use by the planner. allowedMem is in bytes. + */ +int +tuplesort_merge_order(int64 allowedMem) +{ + int mOrder; + + /* + * We need one tape for each merge input, plus another one for the output, + * and each of these tapes needs buffer space. In addition we want + * MERGE_BUFFER_SIZE workspace per input tape (but the output tape doesn't + * count). + * + * Note: you might be thinking we need to account for the memtuples[] + * array in this calculation, but we effectively treat that as part of the + * MERGE_BUFFER_SIZE workspace. + */ + mOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) / + (MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD); + + /* + * Even in minimum memory, use at least a MINORDER merge. On the other + * hand, even when we have lots of memory, do not use more than a MAXORDER + * merge. Tapes are pretty cheap, but they're not entirely free. 
Each + * additional tape reduces the amount of memory available to build runs, + * which in turn can cause the same sort to need more runs, which makes + * merging slower even if it can still be done in a single pass. Also, + * high order merges are quite slow due to CPU cache effects; it can be + * faster to pay the I/O cost of a polyphase merge than to perform a + * single merge pass across many hundreds of tapes. + */ + mOrder = Max(mOrder, MINORDER); + mOrder = Min(mOrder, MAXORDER); + + return mOrder; +} + +/* + * inittapes - initialize for tape sorting. + * + * This is called only if we have found we won't sort in memory. + */ +static void +inittapes(Tuplesortstate *state, bool mergeruns) +{ + int maxTapes, + j; + + Assert(!LEADER(state)); + + if (mergeruns) + { + /* Compute number of tapes to use: merge order plus 1 */ + maxTapes = tuplesort_merge_order(state->allowedMem) + 1; + } + else + { + /* Workers can sometimes produce single run, output without merge */ + Assert(WORKER(state)); + maxTapes = MINORDER + 1; + } + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d switching to external sort with %d tapes: %s", + state->worker, maxTapes, pg_rusage_show(&state->ru_start)); +#endif + + /* Create the tape set and allocate the per-tape data arrays */ + inittapestate(state, maxTapes); + state->tapeset = + LogicalTapeSetCreate(maxTapes, false, NULL, + state->shared ? &state->shared->fileset : NULL, + state->worker); + + state->currentRun = 0; + + /* + * Initialize variables of Algorithm D (step D1). + */ + for (j = 0; j < maxTapes; j++) + { + state->tp_fib[j] = 1; + state->tp_runs[j] = 0; + state->tp_dummy[j] = 1; + state->tp_tapenum[j] = j; + } + state->tp_fib[state->tapeRange] = 0; + state->tp_dummy[state->tapeRange] = 0; + + state->Level = 1; + state->destTape = 0; + + state->status = TSS_BUILDRUNS; +} + +/* + * inittapestate - initialize generic tape management state + */ +static void +inittapestate(Tuplesortstate *state, int maxTapes) +{ + int64 tapeSpace; + + /* + * Decrease availMem to reflect the space needed for tape buffers; but + * don't decrease it to the point that we have no room for tuples. (That + * case is only likely to occur if sorting pass-by-value Datums; in all + * other scenarios the memtuples[] array is unlikely to occupy more than + * half of allowedMem. In the pass-by-value case it's not important to + * account for tuple space, so we don't care if LACKMEM becomes + * inaccurate.) + */ + tapeSpace = (int64) maxTapes * TAPE_BUFFER_OVERHEAD; + + if (tapeSpace + GetMemoryChunkSpace(state->memtuples) < state->allowedMem) + USEMEM(state, tapeSpace); + + /* + * Make sure that the temp file(s) underlying the tape set are created in + * suitable temp tablespaces. For parallel sorts, this should have been + * called already, but it doesn't matter if it is called a second time. + */ + PrepareTempTablespaces(); + + state->mergeactive = (bool *) palloc0(maxTapes * sizeof(bool)); + state->tp_fib = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_runs = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_dummy = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int)); + + /* Record # of tapes allocated (for duration of sort) */ + state->maxTapes = maxTapes; + /* Record maximum # of tapes usable as inputs when merging */ + state->tapeRange = maxTapes - 1; +} + +/* + * selectnewtape -- select new tape for new initial run. + * + * This is called after finishing a run when we know another run + * must be started. 
This implements steps D3, D4 of Algorithm D. + */ +static void +selectnewtape(Tuplesortstate *state) +{ + int j; + int a; + + /* Step D3: advance j (destTape) */ + if (state->tp_dummy[state->destTape] < state->tp_dummy[state->destTape + 1]) + { + state->destTape++; + return; + } + if (state->tp_dummy[state->destTape] != 0) + { + state->destTape = 0; + return; + } + + /* Step D4: increase level */ + state->Level++; + a = state->tp_fib[0]; + for (j = 0; j < state->tapeRange; j++) + { + state->tp_dummy[j] = a + state->tp_fib[j + 1] - state->tp_fib[j]; + state->tp_fib[j] = a + state->tp_fib[j + 1]; + } + state->destTape = 0; +} + +/* + * Initialize the slab allocation arena, for the given number of slots. + */ +static void +init_slab_allocator(Tuplesortstate *state, int numSlots) +{ + if (numSlots > 0) + { + char *p; + int i; + + state->slabMemoryBegin = palloc(numSlots * SLAB_SLOT_SIZE); + state->slabMemoryEnd = state->slabMemoryBegin + + numSlots * SLAB_SLOT_SIZE; + state->slabFreeHead = (SlabSlot *) state->slabMemoryBegin; + USEMEM(state, numSlots * SLAB_SLOT_SIZE); + + p = state->slabMemoryBegin; + for (i = 0; i < numSlots - 1; i++) + { + ((SlabSlot *) p)->nextfree = (SlabSlot *) (p + SLAB_SLOT_SIZE); + p += SLAB_SLOT_SIZE; + } + ((SlabSlot *) p)->nextfree = NULL; + } + else + { + state->slabMemoryBegin = state->slabMemoryEnd = NULL; + state->slabFreeHead = NULL; + } + state->slabAllocatorUsed = true; +} + +/* + * mergeruns -- merge all the completed initial runs. + * + * This implements steps D5, D6 of Algorithm D. All input data has + * already been written to initial runs on tape (see dumptuples). + */ +static void +mergeruns(Tuplesortstate *state) +{ + int tapenum, + svTape, + svRuns, + svDummy; + int numTapes; + int numInputTapes; + + Assert(state->status == TSS_BUILDRUNS); + Assert(state->memtupcount == 0); + + if (state->sortKeys != NULL && state->sortKeys->abbrev_converter != NULL) + { + /* + * If there are multiple runs to be merged, when we go to read back + * tuples from disk, abbreviated keys will not have been stored, and + * we don't care to regenerate them. Disable abbreviation from this + * point on. + */ + state->sortKeys->abbrev_converter = NULL; + state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator; + + /* Not strictly necessary, but be tidy */ + state->sortKeys->abbrev_abort = NULL; + state->sortKeys->abbrev_full_comparator = NULL; + } + + /* + * Reset tuple memory. We've freed all the tuples that we previously + * allocated. We will use the slab allocator from now on. + */ + MemoryContextResetOnly(state->tuplecontext); + + /* + * We no longer need a large memtuples array. (We will allocate a smaller + * one for the heap later.) + */ + FREEMEM(state, GetMemoryChunkSpace(state->memtuples)); + pfree(state->memtuples); + state->memtuples = NULL; + + /* + * If we had fewer runs than tapes, refund the memory that we imagined we + * would need for the tape buffers of the unused tapes. + * + * numTapes and numInputTapes reflect the actual number of tapes we will + * use. Note that the output tape's tape number is maxTapes - 1, so the + * tape numbers of the used tapes are not consecutive, and you cannot just + * loop from 0 to numTapes to visit all used tapes! + */ + if (state->Level == 1) + { + numInputTapes = state->currentRun; + numTapes = numInputTapes + 1; + FREEMEM(state, (state->maxTapes - numTapes) * TAPE_BUFFER_OVERHEAD); + } + else + { + numInputTapes = state->tapeRange; + numTapes = state->maxTapes; + } + + /* + * Initialize the slab allocator. 
We need one slab slot per input tape, + * for the tuples in the heap, plus one to hold the tuple last returned + * from tuplesort_gettuple. (If we're sorting pass-by-val Datums, + * however, we don't need to do allocate anything.) + * + * From this point on, we no longer use the USEMEM()/LACKMEM() mechanism + * to track memory usage of individual tuples. + */ + if (state->tuples) + init_slab_allocator(state, numInputTapes + 1); + else + init_slab_allocator(state, 0); + + /* + * Allocate a new 'memtuples' array, for the heap. It will hold one tuple + * from each input tape. + */ + state->memtupsize = numInputTapes; + state->memtuples = (SortTuple *) MemoryContextAlloc(state->maincontext, + numInputTapes * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + + /* + * Use all the remaining memory we have available for read buffers among + * the input tapes. + * + * We don't try to "rebalance" the memory among tapes, when we start a new + * merge phase, even if some tapes are inactive in the new phase. That + * would be hard, because logtape.c doesn't know where one run ends and + * another begins. When a new merge phase begins, and a tape doesn't + * participate in it, its buffer nevertheless already contains tuples from + * the next run on same tape, so we cannot release the buffer. That's OK + * in practice, merge performance isn't that sensitive to the amount of + * buffers used, and most merge phases use all or almost all tapes, + * anyway. + */ +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d using " INT64_FORMAT " KB of memory for read buffers among %d input tapes", + state->worker, state->availMem / 1024, numInputTapes); +#endif + + state->read_buffer_size = Max(state->availMem / numInputTapes, 0); + USEMEM(state, state->read_buffer_size * numInputTapes); + + /* End of step D2: rewind all output tapes to prepare for merging */ + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + LogicalTapeRewindForRead(state->tapeset, tapenum, state->read_buffer_size); + + for (;;) + { + /* + * At this point we know that tape[T] is empty. If there's just one + * (real or dummy) run left on each input tape, then only one merge + * pass remains. If we don't have to produce a materialized sorted + * tape, we can stop at this point and do the final merge on-the-fly. 
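+		 *
+		 * For example, a sort feeding CREATE INDEX only needs one ordered
+		 * pass over the result, so it stops here and finishes in
+		 * TSS_FINALMERGE; a randomAccess sort, which must support
+		 * tuplesort_rescan() and mark/restore, keeps merging until a single
+		 * materialized run is left on tape.  Workers likewise always
+		 * materialize their single run.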
+ */ + if (!state->randomAccess && !WORKER(state)) + { + bool allOneRun = true; + + Assert(state->tp_runs[state->tapeRange] == 0); + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_runs[tapenum] + state->tp_dummy[tapenum] != 1) + { + allOneRun = false; + break; + } + } + if (allOneRun) + { + /* Tell logtape.c we won't be writing anymore */ + LogicalTapeSetForgetFreeSpace(state->tapeset); + /* Initialize for the final merge pass */ + beginmerge(state); + state->status = TSS_FINALMERGE; + return; + } + } + + /* Step D5: merge runs onto tape[T] until tape[P] is empty */ + while (state->tp_runs[state->tapeRange - 1] || + state->tp_dummy[state->tapeRange - 1]) + { + bool allDummy = true; + + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_dummy[tapenum] == 0) + { + allDummy = false; + break; + } + } + + if (allDummy) + { + state->tp_dummy[state->tapeRange]++; + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + state->tp_dummy[tapenum]--; + } + else + mergeonerun(state); + } + + /* Step D6: decrease level */ + if (--state->Level == 0) + break; + /* rewind output tape T to use as new input */ + LogicalTapeRewindForRead(state->tapeset, state->tp_tapenum[state->tapeRange], + state->read_buffer_size); + /* rewind used-up input tape P, and prepare it for write pass */ + LogicalTapeRewindForWrite(state->tapeset, state->tp_tapenum[state->tapeRange - 1]); + state->tp_runs[state->tapeRange - 1] = 0; + + /* + * reassign tape units per step D6; note we no longer care about A[] + */ + svTape = state->tp_tapenum[state->tapeRange]; + svDummy = state->tp_dummy[state->tapeRange]; + svRuns = state->tp_runs[state->tapeRange]; + for (tapenum = state->tapeRange; tapenum > 0; tapenum--) + { + state->tp_tapenum[tapenum] = state->tp_tapenum[tapenum - 1]; + state->tp_dummy[tapenum] = state->tp_dummy[tapenum - 1]; + state->tp_runs[tapenum] = state->tp_runs[tapenum - 1]; + } + state->tp_tapenum[0] = svTape; + state->tp_dummy[0] = svDummy; + state->tp_runs[0] = svRuns; + } + + /* + * Done. Knuth says that the result is on TAPE[1], but since we exited + * the loop without performing the last iteration of step D6, we have not + * rearranged the tape unit assignment, and therefore the result is on + * TAPE[T]. We need to do it this way so that we can freeze the final + * output tape while rewinding it. The last iteration of step D6 would be + * a waste of cycles anyway... + */ + state->result_tape = state->tp_tapenum[state->tapeRange]; + if (!WORKER(state)) + LogicalTapeFreeze(state->tapeset, state->result_tape, NULL); + else + worker_freeze_result_tape(state); + state->status = TSS_SORTEDONTAPE; + + /* Release the read buffers of all the other tapes, by rewinding them. */ + for (tapenum = 0; tapenum < state->maxTapes; tapenum++) + { + if (tapenum != state->result_tape) + LogicalTapeRewindForWrite(state->tapeset, tapenum); + } +} + +/* + * Merge one run from each input tape, except ones with dummy runs. + * + * This is the inner loop of Algorithm D step D5. We know that the + * output tape is TAPE[T]. + */ +static void +mergeonerun(Tuplesortstate *state) +{ + int destTape = state->tp_tapenum[state->tapeRange]; + int srcTape; + + /* + * Start the merge by loading one tuple from each active source tape into + * the heap. We can also decrease the input run/dummy run counts. 
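+	 *
+	 * For example, if three input runs are active and their next tuples
+	 * sort as 42, 17 and 30, beginmerge() seeds the heap with those three
+	 * tuples; the loop below then writes 17 to destTape, refills the heap
+	 * from the tape that supplied 17, and repeats until every input run
+	 * reaches its end-of-run marker.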
+ */ + beginmerge(state); + + /* + * Execute merge by repeatedly extracting lowest tuple in heap, writing it + * out, and replacing it with next tuple from same tape (if there is + * another one). + */ + while (state->memtupcount > 0) + { + SortTuple stup; + + /* write the tuple to destTape */ + srcTape = state->memtuples[0].srctape; + WRITETUP(state, destTape, &state->memtuples[0]); + + /* recycle the slot of the tuple we just wrote out, for the next read */ + if (state->memtuples[0].tuple) + RELEASE_SLAB_SLOT(state, state->memtuples[0].tuple); + + /* + * pull next tuple from the tape, and replace the written-out tuple in + * the heap with it. + */ + if (mergereadnext(state, srcTape, &stup)) + { + stup.srctape = srcTape; + tuplesort_heap_replace_top(state, &stup); + } + else + tuplesort_heap_delete_top(state); + } + + /* + * When the heap empties, we're done. Write an end-of-run marker on the + * output tape, and increment its count of real runs. + */ + markrunend(state, destTape); + state->tp_runs[state->tapeRange]++; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d finished %d-way merge step: %s", state->worker, + state->activeTapes, pg_rusage_show(&state->ru_start)); +#endif +} + +/* + * beginmerge - initialize for a merge pass + * + * We decrease the counts of real and dummy runs for each tape, and mark + * which tapes contain active input runs in mergeactive[]. Then, fill the + * merge heap with the first tuple from each active tape. + */ +static void +beginmerge(Tuplesortstate *state) +{ + int activeTapes; + int tapenum; + int srcTape; + + /* Heap should be empty here */ + Assert(state->memtupcount == 0); + + /* Adjust run counts and mark the active tapes */ + memset(state->mergeactive, 0, + state->maxTapes * sizeof(*state->mergeactive)); + activeTapes = 0; + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_dummy[tapenum] > 0) + state->tp_dummy[tapenum]--; + else + { + Assert(state->tp_runs[tapenum] > 0); + state->tp_runs[tapenum]--; + srcTape = state->tp_tapenum[tapenum]; + state->mergeactive[srcTape] = true; + activeTapes++; + } + } + Assert(activeTapes > 0); + state->activeTapes = activeTapes; + + /* Load the merge heap with the first tuple from each input tape */ + for (srcTape = 0; srcTape < state->maxTapes; srcTape++) + { + SortTuple tup; + + if (mergereadnext(state, srcTape, &tup)) + { + tup.srctape = srcTape; + tuplesort_heap_insert(state, &tup); + } + } +} + +/* + * mergereadnext - read next tuple from one merge input tape + * + * Returns false on EOF. + */ +static bool +mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup) +{ + unsigned int tuplen; + + if (!state->mergeactive[srcTape]) + return false; /* tape's run is already exhausted */ + + /* read next tuple, if any */ + if ((tuplen = getlen(state, srcTape, true)) == 0) + { + state->mergeactive[srcTape] = false; + return false; + } + READTUP(state, stup, srcTape, tuplen); + + return true; +} + +/* + * dumptuples - remove tuples from memtuples and write initial run to tape + * + * When alltuples = true, dump everything currently in memory. (This case is + * only used at end of input data.) + */ +static void +dumptuples(Tuplesortstate *state, bool alltuples) +{ + int memtupwrite; + int i; + + /* + * Nothing to do if we still fit in available memory and have array slots, + * unless this is the final call during initial run generation. 
+ */ + if (state->memtupcount < state->memtupsize && !LACKMEM(state) && + !alltuples) + return; + + /* + * Final call might require no sorting, in rare cases where we just so + * happen to have previously LACKMEM()'d at the point where exactly all + * remaining tuples are loaded into memory, just before input was + * exhausted. + * + * In general, short final runs are quite possible. Rather than allowing + * a special case where there was a superfluous selectnewtape() call (i.e. + * a call with no subsequent run actually written to destTape), we prefer + * to write out a 0 tuple run. + * + * mergereadnext() is prepared for 0 tuple runs, and will reliably mark + * the tape inactive for the merge when called from beginmerge(). This + * case is therefore similar to the case where mergeonerun() finds a dummy + * run for the tape, and so doesn't need to merge a run from the tape (or + * conceptually "merges" the dummy run, if you prefer). According to + * Knuth, Algorithm D "isn't strictly optimal" in its method of + * distribution and dummy run assignment; this edge case seems very + * unlikely to make that appreciably worse. + */ + Assert(state->status == TSS_BUILDRUNS); + + /* + * It seems unlikely that this limit will ever be exceeded, but take no + * chances + */ + if (state->currentRun == INT_MAX) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot have more than %d runs for an external sort", + INT_MAX))); + + state->currentRun++; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d starting quicksort of run %d: %s", + state->worker, state->currentRun, + pg_rusage_show(&state->ru_start)); +#endif + + /* + * Sort all tuples accumulated within the allowed amount of memory for + * this run using quicksort + */ + tuplesort_sort_memtuples(state); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d finished quicksort of run %d: %s", + state->worker, state->currentRun, + pg_rusage_show(&state->ru_start)); +#endif + + memtupwrite = state->memtupcount; + for (i = 0; i < memtupwrite; i++) + { + WRITETUP(state, state->tp_tapenum[state->destTape], + &state->memtuples[i]); + state->memtupcount--; + } + + /* + * Reset tuple memory. We've freed all of the tuples that we previously + * allocated. It's important to avoid fragmentation when there is a stark + * change in the sizes of incoming tuples. Fragmentation due to + * AllocSetFree's bucketing by size class might be particularly bad if + * this step wasn't taken. 
+ */ + MemoryContextReset(state->tuplecontext); + + markrunend(state, state->tp_tapenum[state->destTape]); + state->tp_runs[state->destTape]++; + state->tp_dummy[state->destTape]--; /* per Alg D step D2 */ + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d finished writing run %d to tape %d: %s", + state->worker, state->currentRun, state->destTape, + pg_rusage_show(&state->ru_start)); +#endif + + if (!alltuples) + selectnewtape(state); +} + +/* + * tuplesort_rescan - rewind and replay the scan + */ +void +tuplesort_rescan(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + case TSS_SORTEDONTAPE: + LogicalTapeRewindForRead(state->tapeset, + state->result_tape, + 0); + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_markpos - saves current position in the merged sort file + */ +void +tuplesort_markpos(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->markpos_offset = state->current; + state->markpos_eof = state->eof_reached; + break; + case TSS_SORTEDONTAPE: + LogicalTapeTell(state->tapeset, + state->result_tape, + &state->markpos_block, + &state->markpos_offset); + state->markpos_eof = state->eof_reached; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_restorepos - restores current position in merged sort file to + * last saved position + */ +void +tuplesort_restorepos(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->current = state->markpos_offset; + state->eof_reached = state->markpos_eof; + break; + case TSS_SORTEDONTAPE: + LogicalTapeSeek(state->tapeset, + state->result_tape, + state->markpos_block, + state->markpos_offset); + state->eof_reached = state->markpos_eof; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_get_stats - extract summary statistics + * + * This can be called after tuplesort_performsort() finishes to obtain + * printable summary information about how the sort was performed. + */ +void +tuplesort_get_stats(Tuplesortstate *state, + TuplesortInstrumentation *stats) +{ + /* + * Note: it might seem we should provide both memory and disk usage for a + * disk-based sort. However, the current code doesn't track memory space + * accurately once we have begun to return tuples to the caller (since we + * don't account for pfree's the caller is expected to do), so we cannot + * rely on availMem in a disk sort. This does not seem worth the overhead + * to fix. Is it worth creating an API for the memory context code to + * tell us how much is actually used in sortcontext? 
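+	 *
+	 * These are the figures that EXPLAIN ANALYZE ultimately reports, e.g.
+	 * "Sort Method: external merge  Disk: 2048kB" (the 2048 is only an
+	 * illustrative number); spaceUsed is always expressed in kilobytes.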
+ */ + tuplesort_updatemax(state); + + if (state->isMaxSpaceDisk) + stats->spaceType = SORT_SPACE_TYPE_DISK; + else + stats->spaceType = SORT_SPACE_TYPE_MEMORY; + stats->spaceUsed = (state->maxSpace + 1023) / 1024; + + switch (state->maxSpaceStatus) + { + case TSS_SORTEDINMEM: + if (state->boundUsed) + stats->sortMethod = SORT_TYPE_TOP_N_HEAPSORT; + else + stats->sortMethod = SORT_TYPE_QUICKSORT; + break; + case TSS_SORTEDONTAPE: + stats->sortMethod = SORT_TYPE_EXTERNAL_SORT; + break; + case TSS_FINALMERGE: + stats->sortMethod = SORT_TYPE_EXTERNAL_MERGE; + break; + default: + stats->sortMethod = SORT_TYPE_STILL_IN_PROGRESS; + break; + } +} + +/* + * Convert TuplesortMethod to a string. + */ +const char * +tuplesort_method_name(TuplesortMethod m) +{ + switch (m) + { + case SORT_TYPE_STILL_IN_PROGRESS: + return "still in progress"; + case SORT_TYPE_TOP_N_HEAPSORT: + return "top-N heapsort"; + case SORT_TYPE_QUICKSORT: + return "quicksort"; + case SORT_TYPE_EXTERNAL_SORT: + return "external sort"; + case SORT_TYPE_EXTERNAL_MERGE: + return "external merge"; + } + + return "unknown"; +} + +/* + * Convert TuplesortSpaceType to a string. + */ +const char * +tuplesort_space_type_name(TuplesortSpaceType t) +{ + Assert(t == SORT_SPACE_TYPE_DISK || t == SORT_SPACE_TYPE_MEMORY); + return t == SORT_SPACE_TYPE_DISK ? "Disk" : "Memory"; +} + + +/* + * Heap manipulation routines, per Knuth's Algorithm 5.2.3H. + */ + +/* + * Convert the existing unordered array of SortTuples to a bounded heap, + * discarding all but the smallest "state->bound" tuples. + * + * When working with a bounded heap, we want to keep the largest entry + * at the root (array entry zero), instead of the smallest as in the normal + * sort case. This allows us to discard the largest entry cheaply. + * Therefore, we temporarily reverse the sort direction. + */ +static void +make_bounded_heap(Tuplesortstate *state) +{ + int tupcount = state->memtupcount; + int i; + + Assert(state->status == TSS_INITIAL); + Assert(state->bounded); + Assert(tupcount >= state->bound); + Assert(SERIAL(state)); + + /* Reverse sort direction so largest entry will be at root */ + reversedirection(state); + + state->memtupcount = 0; /* make the heap empty */ + for (i = 0; i < tupcount; i++) + { + if (state->memtupcount < state->bound) + { + /* Insert next tuple into heap */ + /* Must copy source tuple to avoid possible overwrite */ + SortTuple stup = state->memtuples[i]; + + tuplesort_heap_insert(state, &stup); + } + else + { + /* + * The heap is full. Replace the largest entry with the new + * tuple, or just discard it, if it's larger than anything already + * in the heap. + */ + if (COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0) + { + free_sort_tuple(state, &state->memtuples[i]); + CHECK_FOR_INTERRUPTS(); + } + else + tuplesort_heap_replace_top(state, &state->memtuples[i]); + } + } + + Assert(state->memtupcount == state->bound); + state->status = TSS_BOUNDED; +} + +/* + * Convert the bounded heap to a properly-sorted array + */ +static void +sort_bounded_heap(Tuplesortstate *state) +{ + int tupcount = state->memtupcount; + + Assert(state->status == TSS_BOUNDED); + Assert(state->bounded); + Assert(tupcount == state->bound); + Assert(SERIAL(state)); + + /* + * We can unheapify in place because each delete-top call will remove the + * largest entry, which we can promptly store in the newly freed slot at + * the end. Once we're down to a single-entry heap, we're done. 
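+	 *
+	 * For example, with bound = 3 and input values 5 1 4 8 2 7, the bounded
+	 * heap ends up holding {4, 1, 2} with 4 at the root (the sort direction
+	 * is still reversed here); the loop below moves 4, then 2, then 1 into
+	 * the slots vacated at the end of the array, leaving 1 2 4 in ascending
+	 * order.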
+ */ + while (state->memtupcount > 1) + { + SortTuple stup = state->memtuples[0]; + + /* this sifts-up the next-largest entry and decreases memtupcount */ + tuplesort_heap_delete_top(state); + state->memtuples[state->memtupcount] = stup; + } + state->memtupcount = tupcount; + + /* + * Reverse sort direction back to the original state. This is not + * actually necessary but seems like a good idea for tidiness. + */ + reversedirection(state); + + state->status = TSS_SORTEDINMEM; + state->boundUsed = true; +} + +/* + * Sort all memtuples using specialized qsort() routines. + * + * Quicksort is used for small in-memory sorts, and external sort runs. + */ +static void +tuplesort_sort_memtuples(Tuplesortstate *state) +{ + Assert(!LEADER(state)); + + if (state->memtupcount > 1) + { + /* Can we use the single-key sort function? */ + if (state->onlyKey != NULL) + qsort_ssup(state->memtuples, state->memtupcount, + state->onlyKey); + else + qsort_tuple(state->memtuples, + state->memtupcount, + state->comparetup, + state); + } +} + +/* + * Insert a new tuple into an empty or existing heap, maintaining the + * heap invariant. Caller is responsible for ensuring there's room. + * + * Note: For some callers, tuple points to a memtuples[] entry above the + * end of the heap. This is safe as long as it's not immediately adjacent + * to the end of the heap (ie, in the [memtupcount] array entry) --- if it + * is, it might get overwritten before being moved into the heap! + */ +static void +tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple) +{ + SortTuple *memtuples; + int j; + + memtuples = state->memtuples; + Assert(state->memtupcount < state->memtupsize); + + CHECK_FOR_INTERRUPTS(); + + /* + * Sift-up the new entry, per Knuth 5.2.3 exercise 16. Note that Knuth is + * using 1-based array indexes, not 0-based. + */ + j = state->memtupcount++; + while (j > 0) + { + int i = (j - 1) >> 1; + + if (COMPARETUP(state, tuple, &memtuples[i]) >= 0) + break; + memtuples[j] = memtuples[i]; + j = i; + } + memtuples[j] = *tuple; +} + +/* + * Remove the tuple at state->memtuples[0] from the heap. Decrement + * memtupcount, and sift up to maintain the heap invariant. + * + * The caller has already free'd the tuple the top node points to, + * if necessary. + */ +static void +tuplesort_heap_delete_top(Tuplesortstate *state) +{ + SortTuple *memtuples = state->memtuples; + SortTuple *tuple; + + if (--state->memtupcount <= 0) + return; + + /* + * Remove the last tuple in the heap, and re-insert it, by replacing the + * current top node with it. + */ + tuple = &memtuples[state->memtupcount]; + tuplesort_heap_replace_top(state, tuple); +} + +/* + * Replace the tuple at state->memtuples[0] with a new tuple. Sift up to + * maintain the heap invariant. + * + * This corresponds to Knuth's "sift-up" algorithm (Algorithm 5.2.3H, + * Heapsort, steps H3-H8). + */ +static void +tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple) +{ + SortTuple *memtuples = state->memtuples; + unsigned int i, + n; + + Assert(state->memtupcount >= 1); + + CHECK_FOR_INTERRUPTS(); + + /* + * state->memtupcount is "int", but we use "unsigned int" for i, j, n. + * This prevents overflow in the "2 * i + 1" calculation, since at the top + * of the loop we must have i < n <= INT_MAX <= UINT_MAX/2. 
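+	 *
+	 * For example, replacing the root of the min-heap {1, 5, 3, 8, 6} with
+	 * 7: the smaller child of the root is 3 (index 2), which sorts before
+	 * 7, so it moves up into the hole; the hole moves to index 2, which has
+	 * no children when n = 5, so 7 is stored there, giving {3, 5, 7, 8, 6}.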
+	 */
+	n = state->memtupcount;
+	i = 0;						/* i is where the "hole" is */
+	for (;;)
+	{
+		unsigned int j = 2 * i + 1;
+
+		if (j >= n)
+			break;
+		if (j + 1 < n &&
+			COMPARETUP(state, &memtuples[j], &memtuples[j + 1]) > 0)
+			j++;
+		if (COMPARETUP(state, tuple, &memtuples[j]) <= 0)
+			break;
+		memtuples[i] = memtuples[j];
+		i = j;
+	}
+	memtuples[i] = *tuple;
+}
+
+/*
+ * Function to reverse the sort direction from its current state
+ *
+ * It is not safe to call this when performing hash tuplesorts
+ */
+static void
+reversedirection(Tuplesortstate *state)
+{
+	SortSupport sortKey = state->sortKeys;
+	int			nkey;
+
+	for (nkey = 0; nkey < state->nKeys; nkey++, sortKey++)
+	{
+		sortKey->ssup_reverse = !sortKey->ssup_reverse;
+		sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first;
+	}
+}
+
+
+/*
+ * Tape interface routines
+ */
+
+static unsigned int
+getlen(Tuplesortstate *state, int tapenum, bool eofOK)
+{
+	unsigned int len;
+
+	if (LogicalTapeRead(state->tapeset, tapenum,
+						&len, sizeof(len)) != sizeof(len))
+		elog(ERROR, "unexpected end of tape");
+	if (len == 0 && !eofOK)
+		elog(ERROR, "unexpected end of data");
+	return len;
+}
+
+static void
+markrunend(Tuplesortstate *state, int tapenum)
+{
+	unsigned int len = 0;
+
+	LogicalTapeWrite(state->tapeset, tapenum, (void *) &len, sizeof(len));
+}
+
+/*
+ * Get memory for tuple from within READTUP() routine.
+ *
+ * We use next free slot from the slab allocator, or palloc() if the tuple
+ * is too large for that.
+ */
+static void *
+readtup_alloc(Tuplesortstate *state, Size tuplen)
+{
+	SlabSlot   *buf;
+
+	/*
+	 * We pre-allocate enough slots in the slab arena that we should never
+	 * run out.
+	 */
+	Assert(state->slabFreeHead);
+
+	if (tuplen > SLAB_SLOT_SIZE || !state->slabFreeHead)
+		return MemoryContextAlloc(state->sortcontext, tuplen);
+	else
+	{
+		buf = state->slabFreeHead;
+		/* Reuse this slot */
+		state->slabFreeHead = buf->nextfree;
+
+		return buf;
+	}
+}
+
+
+/*
+ * Routines specialized for HeapTuple (actually MinimalTuple) case
+ */
+
+static int
+comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
+{
+	SortSupport sortKey = state->sortKeys;
+	HeapTupleData ltup;
+	HeapTupleData rtup;
+	TupleDesc	tupDesc;
+	int			nkey;
+	int32		compare;
+	AttrNumber	attno;
+	Datum		datum1,
+				datum2;
+	bool		isnull1,
+				isnull2;
+
+
+	/* Compare the leading sort key */
+	compare = ApplySortComparator(a->datum1, a->isnull1,
+								  b->datum1, b->isnull1,
+								  sortKey);
+	if (compare != 0)
+		return compare;
+
+	/* Compare additional sort keys */
+	ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
+	ltup.t_data = (HeapTupleHeader) ((char *) a->tuple - MINIMAL_TUPLE_OFFSET);
+	rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
+	rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET);
+	tupDesc = state->tupDesc;
+
+	if (sortKey->abbrev_converter)
+	{
+		attno = sortKey->ssup_attno;
+
+		datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
+		datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
+
+		compare = ApplySortAbbrevFullComparator(datum1, isnull1,
+												datum2, isnull2,
+												sortKey);
+		if (compare != 0)
+			return compare;
+	}
+
+	sortKey++;
+	for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++)
+	{
+		attno = sortKey->ssup_attno;
+
+		datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
+		datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
+
+		compare = ApplySortComparator(datum1, isnull1,
+									  datum2, isnull2,
+									  sortKey);
+		if (compare != 0)
+			return compare;
+	}
+
+	return 0;
+}
+
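+/*
+ * Illustrative sketch (added for exposition; not part of the tuplesort code
+ * itself): the two-tier comparison pattern that comparetup_heap() above
+ * relies on, reduced to a self-contained qsort() comparator.  A toy sort
+ * tuple carries an abbreviated key (here simply the first four bytes of a
+ * string, packed big-endian) next to the full value; the cheap integer
+ * comparison settles most calls, and only ties fall back to the
+ * authoritative strcmp().  The guard macro and all identifiers below are
+ * hypothetical.
+ */
+#ifdef RUM_TUPLESORT_ABBREV_EXAMPLE
+#include <stdint.h>
+#include <string.h>
+
+typedef struct DemoSortTuple
+{
+	uint32_t	abbrev;			/* abbreviated key, cheap to compare */
+	const char *full;			/* authoritative value */
+} DemoSortTuple;
+
+static uint32_t
+demo_abbreviate(const char *s)
+{
+	unsigned char buf[4] = {0, 0, 0, 0};
+	int			i;
+
+	/* copy at most four leading bytes; shorter strings are zero-padded */
+	for (i = 0; i < 4 && s[i] != '\0'; i++)
+		buf[i] = (unsigned char) s[i];
+	return ((uint32_t) buf[0] << 24) | ((uint32_t) buf[1] << 16) |
+		((uint32_t) buf[2] << 8) | (uint32_t) buf[3];
+}
+
+static int
+demo_comparetup(const void *a, const void *b)
+{
+	const DemoSortTuple *ta = (const DemoSortTuple *) a;
+	const DemoSortTuple *tb = (const DemoSortTuple *) b;
+
+	/* first tier: abbreviated keys, like the datum1 comparison above */
+	if (ta->abbrev != tb->abbrev)
+		return (ta->abbrev < tb->abbrev) ? -1 : 1;
+
+	/*
+	 * second tier: equal abbreviations prove nothing, so compare the full
+	 * values, as ApplySortAbbrevFullComparator() does for real tuples
+	 */
+	return strcmp(ta->full, tb->full);
+}
+
+/*
+ * Usage: fill each tuple's abbrev with demo_abbreviate(full), then
+ * qsort(tuples, ntuples, sizeof(DemoSortTuple), demo_comparetup).
+ */
+#endif							/* RUM_TUPLESORT_ABBREV_EXAMPLE */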
+static void +copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* + * We expect the passed "tup" to be a TupleTableSlot, and form a + * MinimalTuple using the exported interface for that. + */ + TupleTableSlot *slot = (TupleTableSlot *) tup; + Datum original; + MinimalTuple tuple; + HeapTupleData htup; + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + + /* copy the tuple into sort storage */ + tuple = ExecCopySlotMinimalTuple(slot); + stup->tuple = (void *) tuple; + USEMEM(state, GetMemoryChunkSpace(tuple)); + /* set up first-column key value */ + htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + original = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &stup->isnull1); + + MemoryContextSwitchTo(oldcontext); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + htup.t_len = ((MinimalTuple) mtup->tuple)->t_len + + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) mtup->tuple - + MINIMAL_TUPLE_OFFSET); + + mtup->datum1 = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &mtup->isnull1); + } + } +} + +static void +writetup_heap(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + MinimalTuple tuple = (MinimalTuple) stup->tuple; + + /* the part of the MinimalTuple we'll write: */ + char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; + unsigned int tupbodylen = tuple->t_len - MINIMAL_TUPLE_DATA_OFFSET; + + /* total on-disk footprint: */ + unsigned int tuplen = tupbodylen + sizeof(int); + + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) tupbody, tupbodylen); + if (state->randomAccess) /* need trailing length word? 
*/ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + heap_free_minimal_tuple(tuple); + } +} + +static void +readtup_heap(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tupbodylen = len - sizeof(int); + unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET; + MinimalTuple tuple = (MinimalTuple) readtup_alloc(state, tuplen); + char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; + HeapTupleData htup; + + /* read in the tuple proper */ + tuple->t_len = tuplen; + LogicalTapeReadExact(state->tapeset, tapenum, + tupbody, tupbodylen); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value */ + htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + stup->datum1 = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &stup->isnull1); +} + +/* + * Routines specialized for the CLUSTER case (HeapTuple data, with + * comparisons per a btree index definition) + */ + +static int +comparetup_cluster(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + HeapTuple ltup; + HeapTuple rtup; + TupleDesc tupDesc; + int nkey; + int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + AttrNumber leading = state->indexInfo->ii_IndexAttrNumbers[0]; + + /* Be prepared to compare additional sort keys */ + ltup = (HeapTuple) a->tuple; + rtup = (HeapTuple) b->tuple; + tupDesc = state->tupDesc; + + /* Compare the leading sort key, if it's simple */ + if (leading != 0) + { + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + if (sortKey->abbrev_converter) + { + datum1 = heap_getattr(ltup, leading, tupDesc, &isnull1); + datum2 = heap_getattr(rtup, leading, tupDesc, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + } + if (compare != 0 || state->nKeys == 1) + return compare; + /* Compare additional columns the hard way */ + sortKey++; + nkey = 1; + } + else + { + /* Must compare all keys the hard way */ + nkey = 0; + } + + if (state->indexInfo->ii_Expressions == NULL) + { + /* If not expression index, just compare the proper heap attrs */ + + for (; nkey < state->nKeys; nkey++, sortKey++) + { + AttrNumber attno = state->indexInfo->ii_IndexAttrNumbers[nkey]; + + datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1); + datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + } + else + { + /* + * In the expression index case, compute the whole index tuple and + * then compare values. It would perhaps be faster to compute only as + * many columns as we need to compare, but that would require + * duplicating all the logic in FormIndexDatum. 
+ */ + Datum l_index_values[INDEX_MAX_KEYS]; + bool l_index_isnull[INDEX_MAX_KEYS]; + Datum r_index_values[INDEX_MAX_KEYS]; + bool r_index_isnull[INDEX_MAX_KEYS]; + TupleTableSlot *ecxt_scantuple; + + /* Reset context each time to prevent memory leakage */ + ResetPerTupleExprContext(state->estate); + + ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple; + + ExecStoreHeapTuple(ltup, ecxt_scantuple, false); + FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, + l_index_values, l_index_isnull); + + ExecStoreHeapTuple(rtup, ecxt_scantuple, false); + FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, + r_index_values, r_index_isnull); + + for (; nkey < state->nKeys; nkey++, sortKey++) + { + compare = ApplySortComparator(l_index_values[nkey], + l_index_isnull[nkey], + r_index_values[nkey], + r_index_isnull[nkey], + sortKey); + if (compare != 0) + return compare; + } + } + + return 0; +} + +static void +copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + HeapTuple tuple = (HeapTuple) tup; + Datum original; + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + + /* copy the tuple into sort storage */ + tuple = heap_copytuple(tuple); + stup->tuple = (void *) tuple; + USEMEM(state, GetMemoryChunkSpace(tuple)); + + MemoryContextSwitchTo(oldcontext); + + /* + * set up first-column key value, and potentially abbreviate, if it's a + * simple column + */ + if (state->indexInfo->ii_IndexAttrNumbers[0] == 0) + return; + + original = heap_getattr(tuple, + state->indexInfo->ii_IndexAttrNumbers[0], + state->tupDesc, + &stup->isnull1); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). 
+ */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = (HeapTuple) mtup->tuple; + mtup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_IndexAttrNumbers[0], + state->tupDesc, + &mtup->isnull1); + } + } +} + +static void +writetup_cluster(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + HeapTuple tuple = (HeapTuple) stup->tuple; + unsigned int tuplen = tuple->t_len + sizeof(ItemPointerData) + sizeof(int); + + /* We need to store t_self, but not other fields of HeapTupleData */ + LogicalTapeWrite(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + &tuple->t_self, sizeof(ItemPointerData)); + LogicalTapeWrite(state->tapeset, tapenum, + tuple->t_data, tuple->t_len); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + heap_freetuple(tuple); + } +} + +static void +readtup_cluster(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int tuplen) +{ + unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int); + HeapTuple tuple = (HeapTuple) readtup_alloc(state, + t_len + HEAPTUPLESIZE); + + /* Reconstruct the HeapTupleData header */ + tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE); + tuple->t_len = t_len; + LogicalTapeReadExact(state->tapeset, tapenum, + &tuple->t_self, sizeof(ItemPointerData)); + /* We don't currently bother to reconstruct t_tableOid */ + tuple->t_tableOid = InvalidOid; + /* Read in the tuple body */ + LogicalTapeReadExact(state->tapeset, tapenum, + tuple->t_data, tuple->t_len); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value, if it's a simple column */ + if (state->indexInfo->ii_IndexAttrNumbers[0] != 0) + stup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_IndexAttrNumbers[0], + state->tupDesc, + &stup->isnull1); +} + +/* + * Routines specialized for IndexTuple case + * + * The btree and hash cases require separate comparison functions, but the + * IndexTuple representation is the same so the copy/write/read support + * functions can be shared. + */ + +static int +comparetup_index_btree(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + /* + * This is similar to comparetup_heap(), but expects index tuples. There + * is also special handling for enforcing uniqueness, and special + * treatment for equal keys at the end. 
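+	 *
+	 * For example, building a unique index over a column in which two live
+	 * rows both contain 42 fails below with "could not create unique
+	 * index", whereas any number of rows with NULL in that column are
+	 * accepted, because a tuple containing a NULL never triggers the check.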
+ */ + SortSupport sortKey = state->sortKeys; + IndexTuple tuple1; + IndexTuple tuple2; + int keysz; + TupleDesc tupDes; + bool equal_hasnull = false; + int nkey; + int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + + + /* Compare the leading sort key */ + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + /* Compare additional sort keys */ + tuple1 = (IndexTuple) a->tuple; + tuple2 = (IndexTuple) b->tuple; + keysz = state->nKeys; + tupDes = RelationGetDescr(state->indexRel); + + if (sortKey->abbrev_converter) + { + datum1 = index_getattr(tuple1, 1, tupDes, &isnull1); + datum2 = index_getattr(tuple2, 1, tupDes, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + /* they are equal, so we only need to examine one null flag */ + if (a->isnull1) + equal_hasnull = true; + + sortKey++; + for (nkey = 2; nkey <= keysz; nkey++, sortKey++) + { + datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1); + datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; /* done when we find unequal attributes */ + + /* they are equal, so we only need to examine one null flag */ + if (isnull1) + equal_hasnull = true; + } + + /* + * If btree has asked us to enforce uniqueness, complain if two equal + * tuples are detected (unless there was at least one NULL field). + * + * It is sufficient to make the test here, because if two tuples are equal + * they *must* get compared at some stage of the sort --- otherwise the + * sort algorithm wouldn't have checked whether one must appear before the + * other. + */ + if (state->enforceUnique && !equal_hasnull) + { + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + char *key_desc; + + /* + * Some rather brain-dead implementations of qsort (such as the one in + * QNX 4) will sometimes call the comparison routine to compare a + * value to itself, but we always use our own implementation, which + * does not. + */ + Assert(tuple1 != tuple2); + + index_deform_tuple(tuple1, tupDes, values, isnull); + + key_desc = BuildIndexValueDescription(state->indexRel, values, isnull); + + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("could not create unique index \"%s\"", + RelationGetRelationName(state->indexRel)), + key_desc ? errdetail("Key %s is duplicated.", key_desc) : + errdetail("Duplicate keys exist."), + errtableconstraint(state->heapRel, + RelationGetRelationName(state->indexRel)))); + } + + /* + * If key values are equal, we sort on ItemPointer. This is required for + * btree indexes, since heap TID is treated as an implicit last key + * attribute in order to ensure that all keys in the index are physically + * unique. + */ + { + BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); + BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); + + if (pos1 != pos2) + return (pos1 < pos2) ? 
-1 : 1; + } + + /* ItemPointer values should never be equal */ + Assert(false); + + return 0; +} + +static int +comparetup_index_hash(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + Bucket bucket1; + Bucket bucket2; + IndexTuple tuple1; + IndexTuple tuple2; + + /* + * Fetch hash keys and mask off bits we don't want to sort by. We know + * that the first column of the index tuple is the hash key. + */ + Assert(!a->isnull1); + bucket1 = _hash_hashkey2bucket(DatumGetUInt32(a->datum1), + state->max_buckets, state->high_mask, + state->low_mask); + Assert(!b->isnull1); + bucket2 = _hash_hashkey2bucket(DatumGetUInt32(b->datum1), + state->max_buckets, state->high_mask, + state->low_mask); + if (bucket1 > bucket2) + return 1; + else if (bucket1 < bucket2) + return -1; + + /* + * If hash values are equal, we sort on ItemPointer. This does not affect + * validity of the finished index, but it may be useful to have index + * scans in physical order. + */ + tuple1 = (IndexTuple) a->tuple; + tuple2 = (IndexTuple) b->tuple; + + { + BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); + BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); + + if (pos1 != pos2) + return (pos1 < pos2) ? -1 : 1; + } + + /* ItemPointer values should never be equal */ + Assert(false); + + return 0; +} + +static void +copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* Not currently needed */ + elog(ERROR, "copytup_index() should not be called"); +} + +static void +writetup_index(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + IndexTuple tuple = (IndexTuple) stup->tuple; + unsigned int tuplen; + + tuplen = IndexTupleSize(tuple) + sizeof(tuplen); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) tuple, IndexTupleSize(tuple)); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + pfree(tuple); + } +} + +static void +readtup_index(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tuplen = len - sizeof(unsigned int); + IndexTuple tuple = (IndexTuple) readtup_alloc(state, tuplen); + + LogicalTapeReadExact(state->tapeset, tapenum, + tuple, tuplen); + if (state->randomAccess) /* need trailing length word? 
*/ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value */ + stup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup->isnull1); +} + +/* + * Routines specialized for DatumTuple case + */ + +static int +comparetup_datum(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) +{ + int compare; + + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + state->sortKeys); + if (compare != 0) + return compare; + + /* if we have abbreviations, then "tuple" has the original value */ + + if (state->sortKeys->abbrev_converter) + compare = ApplySortAbbrevFullComparator(PointerGetDatum(a->tuple), a->isnull1, + PointerGetDatum(b->tuple), b->isnull1, + state->sortKeys); + + return compare; +} + +static void +copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* Not currently needed */ + elog(ERROR, "copytup_datum() should not be called"); +} + +static void +writetup_datum(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + void *waddr; + unsigned int tuplen; + unsigned int writtenlen; + + if (stup->isnull1) + { + waddr = NULL; + tuplen = 0; + } + else if (!state->tuples) + { + waddr = &stup->datum1; + tuplen = sizeof(Datum); + } + else + { + waddr = stup->tuple; + tuplen = datumGetSize(PointerGetDatum(stup->tuple), false, state->datumTypeLen); + Assert(tuplen != 0); + } + + writtenlen = tuplen + sizeof(unsigned int); + + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &writtenlen, sizeof(writtenlen)); + LogicalTapeWrite(state->tapeset, tapenum, + waddr, tuplen); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &writtenlen, sizeof(writtenlen)); + + if (!state->slabAllocatorUsed && stup->tuple) + { + FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); + pfree(stup->tuple); + } +} + +static void +readtup_datum(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tuplen = len - sizeof(unsigned int); + + if (tuplen == 0) + { + /* it's NULL */ + stup->datum1 = (Datum) 0; + stup->isnull1 = true; + stup->tuple = NULL; + } + else if (!state->tuples) + { + Assert(tuplen == sizeof(Datum)); + LogicalTapeReadExact(state->tapeset, tapenum, + &stup->datum1, tuplen); + stup->isnull1 = false; + stup->tuple = NULL; + } + else + { + void *raddr = readtup_alloc(state, tuplen); + + LogicalTapeReadExact(state->tapeset, tapenum, + raddr, tuplen); + stup->datum1 = PointerGetDatum(raddr); + stup->isnull1 = false; + stup->tuple = raddr; + } + + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); +} + +/* + * Parallel sort routines + */ + +/* + * tuplesort_estimate_shared - estimate required shared memory allocation + * + * nWorkers is an estimate of the number of workers (it's the number that + * will be requested). + */ +Size +tuplesort_estimate_shared(int nWorkers) +{ + Size tapesSize; + + Assert(nWorkers > 0); + + /* Make sure that BufFile shared state is MAXALIGN'd */ + tapesSize = mul_size(sizeof(TapeShare), nWorkers); + tapesSize = MAXALIGN(add_size(tapesSize, offsetof(Sharedsort, tapes))); + + return tapesSize; +} + +/* + * tuplesort_initialize_shared - initialize shared tuplesort state + * + * Must be called from leader process before workers are launched, to + * establish state needed up-front for worker tuplesortstates. 
nWorkers + * should match the argument passed to tuplesort_estimate_shared(). + */ +void +tuplesort_initialize_shared(Sharedsort *shared, int nWorkers, dsm_segment *seg) +{ + int i; + + Assert(nWorkers > 0); + + SpinLockInit(&shared->mutex); + shared->currentWorker = 0; + shared->workersFinished = 0; + SharedFileSetInit(&shared->fileset, seg); + shared->nTapes = nWorkers; + for (i = 0; i < nWorkers; i++) + { + shared->tapes[i].firstblocknumber = 0L; + } +} + +/* + * tuplesort_attach_shared - attach to shared tuplesort state + * + * Must be called by all worker processes. + */ +void +tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg) +{ + /* Attach to SharedFileSet */ + SharedFileSetAttach(&shared->fileset, seg); +} + +/* + * worker_get_identifier - Assign and return ordinal identifier for worker + * + * The order in which these are assigned is not well defined, and should not + * matter; worker numbers across parallel sort participants need only be + * distinct and gapless. logtape.c requires this. + * + * Note that the identifiers assigned from here have no relation to + * ParallelWorkerNumber number, to avoid making any assumption about + * caller's requirements. However, we do follow the ParallelWorkerNumber + * convention of representing a non-worker with worker number -1. This + * includes the leader, as well as serial Tuplesort processes. + */ +static int +worker_get_identifier(Tuplesortstate *state) +{ + Sharedsort *shared = state->shared; + int worker; + + Assert(WORKER(state)); + + SpinLockAcquire(&shared->mutex); + worker = shared->currentWorker++; + SpinLockRelease(&shared->mutex); + + return worker; +} + +/* + * worker_freeze_result_tape - freeze worker's result tape for leader + * + * This is called by workers just after the result tape has been determined, + * instead of calling LogicalTapeFreeze() directly. They do so because + * workers require a few additional steps over similar serial + * TSS_SORTEDONTAPE external sort cases, which also happen here. The extra + * steps are around freeing now unneeded resources, and representing to + * leader that worker's input run is available for its merge. + * + * There should only be one final output run for each worker, which consists + * of all tuples that were originally input into worker. + */ +static void +worker_freeze_result_tape(Tuplesortstate *state) +{ + Sharedsort *shared = state->shared; + TapeShare output; + + Assert(WORKER(state)); + Assert(state->result_tape != -1); + Assert(state->memtupcount == 0); + + /* + * Free most remaining memory, in case caller is sensitive to our holding + * on to it. memtuples may not be a tiny merge heap at this point. + */ + pfree(state->memtuples); + /* Be tidy */ + state->memtuples = NULL; + state->memtupsize = 0; + + /* + * Parallel worker requires result tape metadata, which is to be stored in + * shared memory for leader + */ + LogicalTapeFreeze(state->tapeset, state->result_tape, &output); + + /* Store properties of output tape, and update finished worker count */ + SpinLockAcquire(&shared->mutex); + shared->tapes[state->worker] = output; + shared->workersFinished++; + SpinLockRelease(&shared->mutex); +} + +/* + * worker_nomergeruns - dump memtuples in worker, without merging + * + * This called as an alternative to mergeruns() with a worker when no + * merging is required. 
+ */ +static void +worker_nomergeruns(Tuplesortstate *state) +{ + Assert(WORKER(state)); + Assert(state->result_tape == -1); + + state->result_tape = state->tp_tapenum[state->destTape]; + worker_freeze_result_tape(state); +} + +/* + * leader_takeover_tapes - create tapeset for leader from worker tapes + * + * So far, leader Tuplesortstate has performed no actual sorting. By now, all + * sorting has occurred in workers, all of which must have already returned + * from tuplesort_performsort(). + * + * When this returns, leader process is left in a state that is virtually + * indistinguishable from it having generated runs as a serial external sort + * might have. + */ +static void +leader_takeover_tapes(Tuplesortstate *state) +{ + Sharedsort *shared = state->shared; + int nParticipants = state->nParticipants; + int workersFinished; + int j; + + Assert(LEADER(state)); + Assert(nParticipants >= 1); + + SpinLockAcquire(&shared->mutex); + workersFinished = shared->workersFinished; + SpinLockRelease(&shared->mutex); + + if (nParticipants != workersFinished) + elog(ERROR, "cannot take over tapes before all workers finish"); + + /* + * Create the tapeset from worker tapes, including a leader-owned tape at + * the end. Parallel workers are far more expensive than logical tapes, + * so the number of tapes allocated here should never be excessive. + * + * We still have a leader tape, though it's not possible to write to it + * due to restrictions in the shared fileset infrastructure used by + * logtape.c. It will never be written to in practice because + * randomAccess is disallowed for parallel sorts. + */ + inittapestate(state, nParticipants + 1); + state->tapeset = LogicalTapeSetCreate(nParticipants + 1, false, + shared->tapes, &shared->fileset, + state->worker); + + /* mergeruns() relies on currentRun for # of runs (in one-pass cases) */ + state->currentRun = nParticipants; + + /* + * Initialize variables of Algorithm D to be consistent with runs from + * workers having been generated in the leader. + * + * There will always be exactly 1 run per worker, and exactly one input + * tape per run, because workers always output exactly 1 run, even when + * there were no input tuples for workers to sort. + */ + for (j = 0; j < state->maxTapes; j++) + { + /* One real run; no dummy runs for worker tapes */ + state->tp_fib[j] = 1; + state->tp_runs[j] = 1; + state->tp_dummy[j] = 0; + state->tp_tapenum[j] = j; + } + /* Leader tape gets one dummy run, and no real runs */ + state->tp_fib[state->tapeRange] = 0; + state->tp_runs[state->tapeRange] = 0; + state->tp_dummy[state->tapeRange] = 1; + + state->Level = 1; + state->destTape = 0; + + state->status = TSS_BUILDRUNS; +} + +/* + * Convenience routine to free a tuple previously loaded into sort memory + */ +static void +free_sort_tuple(Tuplesortstate *state, SortTuple *stup) +{ + if (stup->tuple) + { + FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); + pfree(stup->tuple); + stup->tuple = NULL; + } +} diff --git a/src/tuplesort15.c b/src/tuplesort15.c new file mode 100644 index 0000000000..37184cdcac --- /dev/null +++ b/src/tuplesort15.c @@ -0,0 +1,4939 @@ +/*------------------------------------------------------------------------- + * + * tuplesort.c + * Generalized tuple sorting routines. + * + * This module handles sorting of heap tuples, index tuples, or single + * Datums (and could easily support other kinds of sortable objects, + * if necessary). It works efficiently for both small and large amounts + * of data. 
Small amounts are sorted in-memory using qsort(). Large + * amounts are sorted using temporary files and a standard external sort + * algorithm. + * + * See Knuth, volume 3, for more than you want to know about external + * sorting algorithms. The algorithm we use is a balanced k-way merge. + * Before PostgreSQL 15, we used the polyphase merge algorithm (Knuth's + * Algorithm 5.4.2D), but with modern hardware, a straightforward balanced + * merge is better. Knuth is assuming that tape drives are expensive + * beasts, and in particular that there will always be many more runs than + * tape drives. The polyphase merge algorithm was good at keeping all the + * tape drives busy, but in our implementation a "tape drive" doesn't cost + * much more than a few Kb of memory buffers, so we can afford to have + * lots of them. In particular, if we can have as many tape drives as + * sorted runs, we can eliminate any repeated I/O at all. + * + * Historically, we divided the input into sorted runs using replacement + * selection, in the form of a priority tree implemented as a heap + * (essentially Knuth's Algorithm 5.2.3H), but now we always use quicksort + * for run generation. + * + * The approximate amount of memory allowed for any one sort operation + * is specified in kilobytes by the caller (most pass work_mem). Initially, + * we absorb tuples and simply store them in an unsorted array as long as + * we haven't exceeded workMem. If we reach the end of the input without + * exceeding workMem, we sort the array using qsort() and subsequently return + * tuples just by scanning the tuple array sequentially. If we do exceed + * workMem, we begin to emit tuples into sorted runs in temporary tapes. + * When tuples are dumped in batch after quicksorting, we begin a new run + * with a new output tape. If we reach the max number of tapes, we write + * subsequent runs on the existing tapes in a round-robin fashion. We will + * need multiple merge passes to finish the merge in that case. After the + * end of the input is reached, we dump out remaining tuples in memory into + * a final run, then merge the runs. + * + * When merging runs, we use a heap containing just the frontmost tuple from + * each source run; we repeatedly output the smallest tuple and replace it + * with the next tuple from its source tape (if any). When the heap empties, + * the merge is complete. The basic merge algorithm thus needs very little + * memory --- only M tuples for an M-way merge, and M is constrained to a + * small number. However, we can still make good use of our full workMem + * allocation by pre-reading additional blocks from each source tape. Without + * prereading, our access pattern to the temporary file would be very erratic; + * on average we'd read one block from each of M source tapes during the same + * time that we're writing M blocks to the output tape, so there is no + * sequentiality of access at all, defeating the read-ahead methods used by + * most Unix kernels. Worse, the output tape gets written into a very random + * sequence of blocks of the temp file, ensuring that things will be even + * worse when it comes time to read that tape. A straightforward merge pass + * thus ends up doing a lot of waiting for disk seeks. We can improve matters + * by prereading from each source tape sequentially, loading about workMem/M + * bytes from each tape in turn, and making the sequential blocks immediately + * available for reuse. This approach helps to localize both read and write + * accesses. 
The pre-reading is handled by logtape.c, we just tell it how + * much memory to use for the buffers. + * + * In the current code we determine the number of input tapes M on the basis + * of workMem: we want workMem/M to be large enough that we read a fair + * amount of data each time we read from a tape, so as to maintain the + * locality of access described above. Nonetheless, with large workMem we + * can have many tapes. The logical "tapes" are implemented by logtape.c, + * which avoids space wastage by recycling disk space as soon as each block + * is read from its "tape". + * + * When the caller requests random access to the sort result, we form + * the final sorted run on a logical tape which is then "frozen", so + * that we can access it randomly. When the caller does not need random + * access, we return from tuplesort_performsort() as soon as we are down + * to one run per logical tape. The final merge is then performed + * on-the-fly as the caller repeatedly calls tuplesort_getXXX; this + * saves one cycle of writing all the data out to disk and reading it in. + * + * This module supports parallel sorting. Parallel sorts involve coordination + * among one or more worker processes, and a leader process, each with its own + * tuplesort state. The leader process (or, more accurately, the + * Tuplesortstate associated with a leader process) creates a full tapeset + * consisting of worker tapes with one run to merge; a run for every + * worker process. This is then merged. Worker processes are guaranteed to + * produce exactly one output run from their partial input. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/utils/sort/tuplesort.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include + +#include "access/hash.h" +#include "access/htup_details.h" +#include "access/nbtree.h" +#include "catalog/index.h" +#include "catalog/pg_am.h" +#include "commands/tablespace.h" +#include "executor/executor.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "utils/datum.h" +#include "utils/logtape.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/pg_rusage.h" +#include "utils/rel.h" +#include "utils/sortsupport.h" +#include "utils/tuplesort.h" + +/* Should be the last include */ +#include "disable_core_macro.h" + +/* sort-type codes for sort__start probes */ +#define HEAP_SORT 0 +#define INDEX_SORT 1 +#define DATUM_SORT 2 +#define CLUSTER_SORT 3 + +/* Sort parallel code from state for sort__start probes */ +#define PARALLEL_SORT(state) ((state)->shared == NULL ? 0 : \ + (state)->worker >= 0 ? 1 : 2) + +/* + * Initial size of memtuples array. We're trying to select this size so that + * array doesn't exceed ALLOCSET_SEPARATE_THRESHOLD and so that the overhead of + * allocation might possibly be lowered. However, we don't consider array sizes + * less than 1024. + * + */ +#define INITIAL_MEMTUPSIZE Max(1024, \ + ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1) + +/* GUC variables */ +#ifdef TRACE_SORT +bool trace_sort = false; +#endif + +#ifdef DEBUG_BOUNDED_SORT +bool optimize_bounded_sort = true; +#endif + + +/* + * The objects we actually sort are SortTuple structs. 
These contain + * a pointer to the tuple proper (might be a MinimalTuple or IndexTuple), + * which is a separate palloc chunk --- we assume it is just one chunk and + * can be freed by a simple pfree() (except during merge, when we use a + * simple slab allocator). SortTuples also contain the tuple's first key + * column in Datum/nullflag format, and a source/input tape number that + * tracks which tape each heap element/slot belongs to during merging. + * + * Storing the first key column lets us save heap_getattr or index_getattr + * calls during tuple comparisons. We could extract and save all the key + * columns not just the first, but this would increase code complexity and + * overhead, and wouldn't actually save any comparison cycles in the common + * case where the first key determines the comparison result. Note that + * for a pass-by-reference datatype, datum1 points into the "tuple" storage. + * + * There is one special case: when the sort support infrastructure provides an + * "abbreviated key" representation, where the key is (typically) a pass by + * value proxy for a pass by reference type. In this case, the abbreviated key + * is stored in datum1 in place of the actual first key column. + * + * When sorting single Datums, the data value is represented directly by + * datum1/isnull1 for pass by value types (or null values). If the datatype is + * pass-by-reference and isnull1 is false, then "tuple" points to a separately + * palloc'd data value, otherwise "tuple" is NULL. The value of datum1 is then + * either the same pointer as "tuple", or is an abbreviated key value as + * described above. Accordingly, "tuple" is always used in preference to + * datum1 as the authoritative value for pass-by-reference cases. + */ +typedef struct +{ + void *tuple; /* the tuple itself */ + Datum datum1; /* value of first key column */ + bool isnull1; /* is first key column NULL? */ + int srctape; /* source tape number */ +} SortTuple; + +/* + * During merge, we use a pre-allocated set of fixed-size slots to hold + * tuples. To avoid palloc/pfree overhead. + * + * Merge doesn't require a lot of memory, so we can afford to waste some, + * by using gratuitously-sized slots. If a tuple is larger than 1 kB, the + * palloc() overhead is not significant anymore. + * + * 'nextfree' is valid when this chunk is in the free list. When in use, the + * slot holds a tuple. + */ +#define SLAB_SLOT_SIZE 1024 + +typedef union SlabSlot +{ + union SlabSlot *nextfree; + char buffer[SLAB_SLOT_SIZE]; +} SlabSlot; + +/* + * Possible states of a Tuplesort object. These denote the states that + * persist between calls of Tuplesort routines. + */ +typedef enum +{ + TSS_INITIAL, /* Loading tuples; still within memory limit */ + TSS_BOUNDED, /* Loading tuples into bounded-size heap */ + TSS_BUILDRUNS, /* Loading tuples; writing to tape */ + TSS_SORTEDINMEM, /* Sort completed entirely in memory */ + TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ + TSS_FINALMERGE /* Performing final merge on-the-fly */ +} TupSortStatus; + +/* + * Parameters for calculation of number of tapes to use --- see inittapes() + * and tuplesort_merge_order(). + * + * In this calculation we assume that each tape will cost us about 1 blocks + * worth of buffer space. This ignores the overhead of all the other data + * structures needed for each tape, but it's probably close enough. + * + * MERGE_BUFFER_SIZE is how much buffer space we'd like to allocate for each + * input tape, for pre-reading (see discussion at top of file). 
This is *in + * addition to* the 1 block already included in TAPE_BUFFER_OVERHEAD. + */ +#define MINORDER 6 /* minimum merge order */ +#define MAXORDER 500 /* maximum merge order */ +#define TAPE_BUFFER_OVERHEAD BLCKSZ +#define MERGE_BUFFER_SIZE (BLCKSZ * 32) + +typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); + +/* + * Private state of a Tuplesort operation. + */ +struct Tuplesortstate +{ + TupSortStatus status; /* enumerated value as shown above */ + int nKeys; /* number of columns in sort key */ + int sortopt; /* Bitmask of flags used to setup sort */ + bool bounded; /* did caller specify a maximum number of + * tuples to return? */ + bool boundUsed; /* true if we made use of a bounded heap */ + int bound; /* if bounded, the maximum number of tuples */ + bool tuples; /* Can SortTuple.tuple ever be set? */ + int64 availMem; /* remaining memory available, in bytes */ + int64 allowedMem; /* total memory allowed, in bytes */ + int maxTapes; /* max number of input tapes to merge in each + * pass */ + int64 maxSpace; /* maximum amount of space occupied among sort + * of groups, either in-memory or on-disk */ + bool isMaxSpaceDisk; /* true when maxSpace is value for on-disk + * space, false when it's value for in-memory + * space */ + TupSortStatus maxSpaceStatus; /* sort status when maxSpace was reached */ + MemoryContext maincontext; /* memory context for tuple sort metadata that + * persists across multiple batches */ + MemoryContext sortcontext; /* memory context holding most sort data */ + MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */ + LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ + + /* + * These function pointers decouple the routines that must know what kind + * of tuple we are sorting from the routines that don't need to know it. + * They are set up by the tuplesort_begin_xxx routines. + * + * Function to compare two tuples; result is per qsort() convention, ie: + * <0, 0, >0 according as ab. The API must match + * qsort_arg_comparator. + */ + SortTupleComparator comparetup; + + /* + * Function to copy a supplied input tuple into palloc'd space and set up + * its SortTuple representation (ie, set tuple/datum1/isnull1). Also, + * state->availMem must be decreased by the amount of space used for the + * tuple copy (note the SortTuple struct itself is not counted). + */ + void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup); + + /* + * Function to write a stored tuple onto tape. The representation of the + * tuple on tape need not be the same as it is in memory; requirements on + * the tape representation are given below. Unless the slab allocator is + * used, after writing the tuple, pfree() the out-of-line data (not the + * SortTuple struct!), and increase state->availMem by the amount of + * memory space thereby released. + */ + void (*writetup) (Tuplesortstate *state, LogicalTape *tape, + SortTuple *stup); + + /* + * Function to read a stored tuple from tape back into memory. 'len' is + * the already-read length of the stored tuple. The tuple is allocated + * from the slab memory arena, or is palloc'd, see readtup_alloc(). + */ + void (*readtup) (Tuplesortstate *state, SortTuple *stup, + LogicalTape *tape, unsigned int len); + + /* + * Whether SortTuple's datum1 and isnull1 members are maintained by the + * above routines. If not, some sort specializations are disabled. + */ + bool haveDatum1; + + /* + * This array holds the tuples now in sort memory. 
If we are in state + * INITIAL, the tuples are in no particular order; if we are in state + * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS + * and FINALMERGE, the tuples are organized in "heap" order per Algorithm + * H. In state SORTEDONTAPE, the array is not used. + */ + SortTuple *memtuples; /* array of SortTuple structs */ + int memtupcount; /* number of tuples currently present */ + int memtupsize; /* allocated length of memtuples array */ + bool growmemtuples; /* memtuples' growth still underway? */ + + /* + * Memory for tuples is sometimes allocated using a simple slab allocator, + * rather than with palloc(). Currently, we switch to slab allocation + * when we start merging. Merging only needs to keep a small, fixed + * number of tuples in memory at any time, so we can avoid the + * palloc/pfree overhead by recycling a fixed number of fixed-size slots + * to hold the tuples. + * + * For the slab, we use one large allocation, divided into SLAB_SLOT_SIZE + * slots. The allocation is sized to have one slot per tape, plus one + * additional slot. We need that many slots to hold all the tuples kept + * in the heap during merge, plus the one we have last returned from the + * sort, with tuplesort_gettuple. + * + * Initially, all the slots are kept in a linked list of free slots. When + * a tuple is read from a tape, it is put to the next available slot, if + * it fits. If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd + * instead. + * + * When we're done processing a tuple, we return the slot back to the free + * list, or pfree() if it was palloc'd. We know that a tuple was + * allocated from the slab, if its pointer value is between + * slabMemoryBegin and -End. + * + * When the slab allocator is used, the USEMEM/LACKMEM mechanism of + * tracking memory usage is not used. + */ + bool slabAllocatorUsed; + + char *slabMemoryBegin; /* beginning of slab memory arena */ + char *slabMemoryEnd; /* end of slab memory arena */ + SlabSlot *slabFreeHead; /* head of free list */ + + /* Memory used for input and output tape buffers. */ + size_t tape_buffer_mem; + + /* + * When we return a tuple to the caller in tuplesort_gettuple_XXX, that + * came from a tape (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE + * modes), we remember the tuple in 'lastReturnedTuple', so that we can + * recycle the memory on next gettuple call. + */ + void *lastReturnedTuple; + + /* + * While building initial runs, this is the current output run number. + * Afterwards, it is the number of initial runs we made. + */ + int currentRun; + + /* + * Logical tapes, for merging. + * + * The initial runs are written in the output tapes. In each merge pass, + * the output tapes of the previous pass become the input tapes, and new + * output tapes are created as needed. When nInputTapes equals + * nInputRuns, there is only one merge pass left. + */ + LogicalTape **inputTapes; + int nInputTapes; + int nInputRuns; + + LogicalTape **outputTapes; + int nOutputTapes; + int nOutputRuns; + + LogicalTape *destTape; /* current output tape */ + + /* + * These variables are used after completion of sorting to keep track of + * the next tuple to return. (In the tape case, the tape's current read + * position is also critical state.) 
+ */ + LogicalTape *result_tape; /* actual tape of finished output */ + int current; /* array index (only used if SORTEDINMEM) */ + bool eof_reached; /* reached EOF (needed for cursors) */ + + /* markpos_xxx holds marked position for mark and restore */ + long markpos_block; /* tape block# (only used if SORTEDONTAPE) */ + int markpos_offset; /* saved "current", or offset in tape block */ + bool markpos_eof; /* saved "eof_reached" */ + + /* + * These variables are used during parallel sorting. + * + * worker is our worker identifier. Follows the general convention that + * -1 value relates to a leader tuplesort, and values >= 0 worker + * tuplesorts. (-1 can also be a serial tuplesort.) + * + * shared is mutable shared memory state, which is used to coordinate + * parallel sorts. + * + * nParticipants is the number of worker Tuplesortstates known by the + * leader to have actually been launched, which implies that they must + * finish a run that the leader needs to merge. Typically includes a + * worker state held by the leader process itself. Set in the leader + * Tuplesortstate only. + */ + int worker; + Sharedsort *shared; + int nParticipants; + + /* + * The sortKeys variable is used by every case other than the hash index + * case; it is set by tuplesort_begin_xxx. tupDesc is only used by the + * MinimalTuple and CLUSTER routines, though. + */ + TupleDesc tupDesc; + SortSupport sortKeys; /* array of length nKeys */ + + /* + * This variable is shared by the single-key MinimalTuple case and the + * Datum case (which both use qsort_ssup()). Otherwise, it's NULL. The + * presence of a value in this field is also checked by various sort + * specialization functions as an optimization when comparing the leading + * key in a tiebreak situation to determine if there are any subsequent + * keys to sort on. + */ + SortSupport onlyKey; + + /* + * Additional state for managing "abbreviated key" sortsupport routines + * (which currently may be used by all cases except the hash index case). + * Tracks the intervals at which the optimization's effectiveness is + * tested. + */ + int64 abbrevNext; /* Tuple # at which to next check + * applicability */ + + /* + * These variables are specific to the CLUSTER case; they are set by + * tuplesort_begin_cluster. + */ + IndexInfo *indexInfo; /* info about index being used for reference */ + EState *estate; /* for evaluating index expressions */ + + /* + * These variables are specific to the IndexTuple case; they are set by + * tuplesort_begin_index_xxx and used only by the IndexTuple routines. + */ + Relation heapRel; /* table the index is being built on */ + Relation indexRel; /* index being built */ + + /* These are specific to the index_btree subcase: */ + bool enforceUnique; /* complain if we find duplicate tuples */ + bool uniqueNullsNotDistinct; /* unique constraint null treatment */ + + /* These are specific to the index_hash subcase: */ + uint32 high_mask; /* masks for sortable part of hash code */ + uint32 low_mask; + uint32 max_buckets; + + /* + * These variables are specific to the Datum case; they are set by + * tuplesort_begin_datum and used only by the DatumTuple routines. + */ + Oid datumType; + /* we need typelen in order to know how to copy the Datums. */ + int datumTypeLen; + + /* + * Resource snapshot for time of sort start. + */ +#ifdef TRACE_SORT + PGRUsage ru_start; +#endif +}; + +/* + * Private mutable state of tuplesort-parallel-operation. This is allocated + * in shared memory. 
+ */ +struct Sharedsort +{ + /* mutex protects all fields prior to tapes */ + slock_t mutex; + + /* + * currentWorker generates ordinal identifier numbers for parallel sort + * workers. These start from 0, and are always gapless. + * + * Workers increment workersFinished to indicate having finished. If this + * is equal to state.nParticipants within the leader, leader is ready to + * merge worker runs. + */ + int currentWorker; + int workersFinished; + + /* Temporary file space */ + SharedFileSet fileset; + + /* Size of tapes flexible array */ + int nTapes; + + /* + * Tapes array used by workers to report back information needed by the + * leader to concatenate all worker tapes into one for merging + */ + TapeShare tapes[FLEXIBLE_ARRAY_MEMBER]; +}; + +/* + * Is the given tuple allocated from the slab memory arena? + */ +#define IS_SLAB_SLOT(state, tuple) \ + ((char *) (tuple) >= (state)->slabMemoryBegin && \ + (char *) (tuple) < (state)->slabMemoryEnd) + +/* + * Return the given tuple to the slab memory free list, or free it + * if it was palloc'd. + */ +#define RELEASE_SLAB_SLOT(state, tuple) \ + do { \ + SlabSlot *buf = (SlabSlot *) tuple; \ + \ + if (IS_SLAB_SLOT((state), buf)) \ + { \ + buf->nextfree = (state)->slabFreeHead; \ + (state)->slabFreeHead = buf; \ + } else \ + pfree(buf); \ + } while(0) + +#define COMPARETUP(state,a,b) ((*(state)->comparetup) (a, b, state)) +#define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup)) +#define WRITETUP(state,tape,stup) ((*(state)->writetup) (state, tape, stup)) +#define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len)) +#define LACKMEM(state) ((state)->availMem < 0 && !(state)->slabAllocatorUsed) +#define USEMEM(state,amt) ((state)->availMem -= (amt)) +#define FREEMEM(state,amt) ((state)->availMem += (amt)) +#define SERIAL(state) ((state)->shared == NULL) +#define WORKER(state) ((state)->shared && (state)->worker != -1) +#define LEADER(state) ((state)->shared && (state)->worker == -1) + +/* + * NOTES about on-tape representation of tuples: + * + * We require the first "unsigned int" of a stored tuple to be the total size + * on-tape of the tuple, including itself (so it is never zero; an all-zero + * unsigned int is used to delimit runs). The remainder of the stored tuple + * may or may not match the in-memory representation of the tuple --- + * any conversion needed is the job of the writetup and readtup routines. + * + * If state->sortopt contains TUPLESORT_RANDOMACCESS, then the stored + * representation of the tuple must be followed by another "unsigned int" that + * is a copy of the length --- so the total tape space used is actually + * sizeof(unsigned int) more than the stored length value. This allows + * read-backwards. When the random access flag was not specified, the + * write/read routines may omit the extra length word. + * + * writetup is expected to write both length words as well as the tuple + * data. When readtup is called, the tape is positioned just after the + * front length word; readtup must read the tuple data and advance past + * the back length word (if present). + * + * The write/read routines can make use of the tuple description data + * stored in the Tuplesortstate record, if needed. They are also expected + * to adjust state->availMem by the amount of memory space (not tape space!) + * released or consumed. There is no error return from either writetup + * or readtup; they should ereport() on failure. 
+ * + * + * NOTES about memory consumption calculations: + * + * We count space allocated for tuples against the workMem limit, plus + * the space used by the variable-size memtuples array. Fixed-size space + * is not counted; it's small enough to not be interesting. + * + * Note that we count actual space used (as shown by GetMemoryChunkSpace) + * rather than the originally-requested size. This is important since + * palloc can add substantial overhead. It's not a complete answer since + * we won't count any wasted space in palloc allocation blocks, but it's + * a lot better than what we were doing before 7.3. As of 9.6, a + * separate memory context is used for caller passed tuples. Resetting + * it at certain key increments significantly ameliorates fragmentation. + * Note that this places a responsibility on copytup routines to use the + * correct memory context for these tuples (and to not use the reset + * context for anything whose lifetime needs to span multiple external + * sort runs). readtup routines use the slab allocator (they cannot use + * the reset context because it gets deleted at the point that merging + * begins). + */ + +/* When using this macro, beware of double evaluation of len */ +#define LogicalTapeReadExact(tape, ptr, len) \ + do { \ + if (LogicalTapeRead(tape, ptr, len) != (size_t) (len)) \ + elog(ERROR, "unexpected end of data"); \ + } while(0) + + +static Tuplesortstate *tuplesort_begin_common(int workMem, + SortCoordinate coordinate, + int sortopt); +static void tuplesort_begin_batch(Tuplesortstate *state); +static void puttuple_common(Tuplesortstate *state, SortTuple *tuple); +static bool consider_abort_common(Tuplesortstate *state); +static void inittapes(Tuplesortstate *state, bool mergeruns); +static void inittapestate(Tuplesortstate *state, int maxTapes); +static void selectnewtape(Tuplesortstate *state); +static void init_slab_allocator(Tuplesortstate *state, int numSlots); +static void mergeruns(Tuplesortstate *state); +static void mergeonerun(Tuplesortstate *state); +static void beginmerge(Tuplesortstate *state); +static bool mergereadnext(Tuplesortstate *state, LogicalTape *srcTape, SortTuple *stup); +static void dumptuples(Tuplesortstate *state, bool alltuples); +static void make_bounded_heap(Tuplesortstate *state); +static void sort_bounded_heap(Tuplesortstate *state); +static void tuplesort_sort_memtuples(Tuplesortstate *state); +static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple); +static void tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple); +static void tuplesort_heap_delete_top(Tuplesortstate *state); +static void reversedirection(Tuplesortstate *state); +static unsigned int getlen(LogicalTape *tape, bool eofOK); +static void markrunend(LogicalTape *tape); +static void *readtup_alloc(Tuplesortstate *state, Size tuplen); +static int comparetup_heap(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_heap(Tuplesortstate *state, LogicalTape *tape, + SortTuple *stup); +static void readtup_heap(Tuplesortstate *state, SortTuple *stup, + LogicalTape *tape, unsigned int len); +static int comparetup_cluster(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_cluster(Tuplesortstate *state, LogicalTape *tape, + SortTuple *stup); +static void readtup_cluster(Tuplesortstate *state, 
SortTuple *stup, + LogicalTape *tape, unsigned int len); +static int comparetup_index_btree(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static int comparetup_index_hash(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_index(Tuplesortstate *state, LogicalTape *tape, + SortTuple *stup); +static void readtup_index(Tuplesortstate *state, SortTuple *stup, + LogicalTape *tape, unsigned int len); +static int comparetup_datum(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_datum(Tuplesortstate *state, LogicalTape *tape, + SortTuple *stup); +static void readtup_datum(Tuplesortstate *state, SortTuple *stup, + LogicalTape *tape, unsigned int len); +static int worker_get_identifier(Tuplesortstate *state); +static void worker_freeze_result_tape(Tuplesortstate *state); +static void worker_nomergeruns(Tuplesortstate *state); +static void leader_takeover_tapes(Tuplesortstate *state); +static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup); +static void tuplesort_free(Tuplesortstate *state); +static void tuplesort_updatemax(Tuplesortstate *state); + +/* + * Specialized comparators that we can inline into specialized sorts. The goal + * is to try to sort two tuples without having to follow the pointers to the + * comparator or the tuple. + * + * XXX: For now, these fall back to comparator functions that will compare the + * leading datum a second time. + * + * XXX: For now, there is no specialization for cases where datum1 is + * authoritative and we don't even need to fall back to a callback at all (that + * would be true for types like int4/int8/timestamp/date, but not true for + * abbreviations of text or multi-key sorts. There could be! Is it worth it? + */ + +/* Used if first key's comparator is ssup_datum_unsigned_compare */ +static pg_attribute_always_inline int +qsort_tuple_unsigned_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state) +{ + int compare; + + compare = ApplyUnsignedSortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + &state->sortKeys[0]); + if (compare != 0) + return compare; + + /* + * No need to waste effort calling the tiebreak function when there are no + * other keys to sort on. + */ + if (state->onlyKey != NULL) + return 0; + + return state->comparetup(a, b, state); +} + +#if SIZEOF_DATUM >= 8 +/* Used if first key's comparator is ssup_datum_signed_compare */ +static pg_attribute_always_inline int +qsort_tuple_signed_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state) +{ + int compare; + + compare = ApplySignedSortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + &state->sortKeys[0]); + + if (compare != 0) + return compare; + + /* + * No need to waste effort calling the tiebreak function when there are no + * other keys to sort on. 
+ */ + if (state->onlyKey != NULL) + return 0; + + return state->comparetup(a, b, state); +} +#endif + +/* Used if first key's comparator is ssup_datum_int32_compare */ +static pg_attribute_always_inline int +qsort_tuple_int32_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state) +{ + int compare; + + compare = ApplyInt32SortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + &state->sortKeys[0]); + + if (compare != 0) + return compare; + + /* + * No need to waste effort calling the tiebreak function when there are no + * other keys to sort on. + */ + if (state->onlyKey != NULL) + return 0; + + return state->comparetup(a, b, state); +} + +/* + * Special versions of qsort just for SortTuple objects. qsort_tuple() sorts + * any variant of SortTuples, using the appropriate comparetup function. + * qsort_ssup() is specialized for the case where the comparetup function + * reduces to ApplySortComparator(), that is single-key MinimalTuple sorts + * and Datum sorts. qsort_tuple_{unsigned,signed,int32} are specialized for + * common comparison functions on pass-by-value leading datums. + */ + +#define ST_SORT qsort_tuple_unsigned +#define ST_ELEMENT_TYPE SortTuple +#define ST_COMPARE(a, b, state) qsort_tuple_unsigned_compare(a, b, state) +#define ST_COMPARE_ARG_TYPE Tuplesortstate +#define ST_CHECK_FOR_INTERRUPTS +#define ST_SCOPE static +#define ST_DEFINE +#include "lib/sort_template.h" + +#if SIZEOF_DATUM >= 8 +#define ST_SORT qsort_tuple_signed +#define ST_ELEMENT_TYPE SortTuple +#define ST_COMPARE(a, b, state) qsort_tuple_signed_compare(a, b, state) +#define ST_COMPARE_ARG_TYPE Tuplesortstate +#define ST_CHECK_FOR_INTERRUPTS +#define ST_SCOPE static +#define ST_DEFINE +#include "lib/sort_template.h" +#endif + +#define ST_SORT qsort_tuple_int32 +#define ST_ELEMENT_TYPE SortTuple +#define ST_COMPARE(a, b, state) qsort_tuple_int32_compare(a, b, state) +#define ST_COMPARE_ARG_TYPE Tuplesortstate +#define ST_CHECK_FOR_INTERRUPTS +#define ST_SCOPE static +#define ST_DEFINE +#include "lib/sort_template.h" + +#define ST_SORT qsort_tuple +#define ST_ELEMENT_TYPE SortTuple +#define ST_COMPARE_RUNTIME_POINTER +#define ST_COMPARE_ARG_TYPE Tuplesortstate +#define ST_CHECK_FOR_INTERRUPTS +#define ST_SCOPE static +#define ST_DECLARE +#define ST_DEFINE +#include "lib/sort_template.h" + +#define ST_SORT qsort_ssup +#define ST_ELEMENT_TYPE SortTuple +#define ST_COMPARE(a, b, ssup) \ + ApplySortComparator((a)->datum1, (a)->isnull1, \ + (b)->datum1, (b)->isnull1, (ssup)) +#define ST_COMPARE_ARG_TYPE SortSupportData +#define ST_CHECK_FOR_INTERRUPTS +#define ST_SCOPE static +#define ST_DEFINE +#include "lib/sort_template.h" + +/* + * tuplesort_begin_xxx + * + * Initialize for a tuple sort operation. + * + * After calling tuplesort_begin, the caller should call tuplesort_putXXX + * zero or more times, then call tuplesort_performsort when all the tuples + * have been supplied. After performsort, retrieve the tuples in sorted + * order by calling tuplesort_getXXX until it returns false/NULL. (If random + * access was requested, rescan, markpos, and restorepos can also be called.) + * Call tuplesort_end to terminate the operation and release memory/disk space. + * + * Each variant of tuplesort_begin has a workMem parameter specifying the + * maximum number of kilobytes of RAM to use before spilling data to disk. + * (The normal value of this parameter is work_mem, but some callers use + * other values.) Each variant also has a sortopt which is a bitmask of + * sort options. 
See TUPLESORT_* definitions in tuplesort.h + */ + +static Tuplesortstate * +tuplesort_begin_common(int workMem, SortCoordinate coordinate, int sortopt) +{ + Tuplesortstate *state; + MemoryContext maincontext; + MemoryContext sortcontext; + MemoryContext oldcontext; + + /* See leader_takeover_tapes() remarks on random access support */ + if (coordinate && (sortopt & TUPLESORT_RANDOMACCESS)) + elog(ERROR, "random access disallowed under parallel sort"); + + /* + * Memory context surviving tuplesort_reset. This memory context holds + * data which is useful to keep while sorting multiple similar batches. + */ + maincontext = AllocSetContextCreate(CurrentMemoryContext, + "TupleSort main", + ALLOCSET_DEFAULT_SIZES); + + /* + * Create a working memory context for one sort operation. The content of + * this context is deleted by tuplesort_reset. + */ + sortcontext = AllocSetContextCreate(maincontext, + "TupleSort sort", + ALLOCSET_DEFAULT_SIZES); + + /* + * Additionally a working memory context for tuples is setup in + * tuplesort_begin_batch. + */ + + /* + * Make the Tuplesortstate within the per-sortstate context. This way, we + * don't need a separate pfree() operation for it at shutdown. + */ + oldcontext = MemoryContextSwitchTo(maincontext); + + state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate)); + +#ifdef TRACE_SORT + if (trace_sort) + pg_rusage_init(&state->ru_start); +#endif + + state->sortopt = sortopt; + state->tuples = true; + + /* + * workMem is forced to be at least 64KB, the current minimum valid value + * for the work_mem GUC. This is a defense against parallel sort callers + * that divide out memory among many workers in a way that leaves each + * with very little memory. + */ + state->allowedMem = Max(workMem, 64) * (int64) 1024; + state->sortcontext = sortcontext; + state->maincontext = maincontext; + + /* + * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD; + * see comments in grow_memtuples(). + */ + state->memtupsize = INITIAL_MEMTUPSIZE; + state->memtuples = NULL; + + /* + * After all of the other non-parallel-related state, we setup all of the + * state needed for each batch. + */ + tuplesort_begin_batch(state); + + /* + * Initialize parallel-related state based on coordination information + * from caller + */ + if (!coordinate) + { + /* Serial sort */ + state->shared = NULL; + state->worker = -1; + state->nParticipants = -1; + } + else if (coordinate->isWorker) + { + /* Parallel worker produces exactly one final run from all input */ + state->shared = coordinate->sharedsort; + state->worker = worker_get_identifier(state); + state->nParticipants = -1; + } + else + { + /* Parallel leader state only used for final merge */ + state->shared = coordinate->sharedsort; + state->worker = -1; + state->nParticipants = coordinate->nParticipants; + Assert(state->nParticipants >= 1); + } + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +/* + * tuplesort_begin_batch + * + * Setup, or reset, all state need for processing a new set of tuples with this + * sort state. Called both from tuplesort_begin_common (the first time sorting + * with this sort state) and tuplesort_reset (for subsequent usages). + */ +static void +tuplesort_begin_batch(Tuplesortstate *state) +{ + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(state->maincontext); + + /* + * Caller tuple (e.g. IndexTuple) memory context. + * + * A dedicated child context used exclusively for caller passed tuples + * eases memory management. 
Resetting at key points reduces + * fragmentation. Note that the memtuples array of SortTuples is allocated + * in the parent context, not this context, because there is no need to + * free memtuples early. For bounded sorts, tuples may be pfreed in any + * order, so we use a regular aset.c context so that it can make use of + * free'd memory. When the sort is not bounded, we make use of a + * generation.c context as this keeps allocations more compact with less + * wastage. Allocations are also slightly more CPU efficient. + */ + if (state->sortopt & TUPLESORT_ALLOWBOUNDED) + state->tuplecontext = AllocSetContextCreate(state->sortcontext, + "Caller tuples", + ALLOCSET_DEFAULT_SIZES); + else + state->tuplecontext = GenerationContextCreate(state->sortcontext, + "Caller tuples", + ALLOCSET_DEFAULT_SIZES); + + + state->status = TSS_INITIAL; + state->bounded = false; + state->boundUsed = false; + + state->availMem = state->allowedMem; + + state->tapeset = NULL; + + state->memtupcount = 0; + + /* + * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD; + * see comments in grow_memtuples(). + */ + state->growmemtuples = true; + state->slabAllocatorUsed = false; + if (state->memtuples != NULL && state->memtupsize != INITIAL_MEMTUPSIZE) + { + pfree(state->memtuples); + state->memtuples = NULL; + state->memtupsize = INITIAL_MEMTUPSIZE; + } + if (state->memtuples == NULL) + { + state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + } + + /* workMem must be large enough for the minimal memtuples array */ + if (LACKMEM(state)) + elog(ERROR, "insufficient memory allowed for sort"); + + state->currentRun = 0; + + /* + * Tape variables (inputTapes, outputTapes, etc.) will be initialized by + * inittapes(), if needed. + */ + + state->result_tape = NULL; /* flag that result tape has not been formed */ + + MemoryContextSwitchTo(oldcontext); +} + +Tuplesortstate * +tuplesort_begin_heap(TupleDesc tupDesc, + int nkeys, AttrNumber *attNums, + Oid *sortOperators, Oid *sortCollations, + bool *nullsFirstFlags, + int workMem, SortCoordinate coordinate, int sortopt) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + sortopt); + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->maincontext); + + AssertArg(nkeys > 0); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", + nkeys, workMem, sortopt & TUPLESORT_RANDOMACCESS ? 
't' : 'f'); +#endif + + state->nKeys = nkeys; + + TRACE_POSTGRESQL_SORT_START(HEAP_SORT, + false, /* no unique check */ + nkeys, + workMem, + sortopt & TUPLESORT_RANDOMACCESS, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_heap; + state->copytup = copytup_heap; + state->writetup = writetup_heap; + state->readtup = readtup_heap; + state->haveDatum1 = true; + + state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + state->abbrevNext = 10; + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData)); + + for (i = 0; i < nkeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + + AssertArg(attNums[i] != 0); + AssertArg(sortOperators[i] != 0); + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = sortCollations[i]; + sortKey->ssup_nulls_first = nullsFirstFlags[i]; + sortKey->ssup_attno = attNums[i]; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0 && state->haveDatum1); + + PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey); + } + + /* + * The "onlyKey" optimization cannot be used with abbreviated keys, since + * tie-breaker comparisons may be required. Typically, the optimization + * is only of value to pass-by-value types anyway, whereas abbreviated + * keys are typically only of value to pass-by-reference types. + */ + if (nkeys == 1 && !state->sortKeys->abbrev_converter) + state->onlyKey = state->sortKeys; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_cluster(TupleDesc tupDesc, + Relation indexRel, + int workMem, + SortCoordinate coordinate, int sortopt) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + sortopt); + BTScanInsert indexScanKey; + MemoryContext oldcontext; + int i; + + Assert(indexRel->rd_rel->relam == BTREE_AM_OID); + + oldcontext = MemoryContextSwitchTo(state->maincontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", + RelationGetNumberOfAttributes(indexRel), + workMem, sortopt & TUPLESORT_RANDOMACCESS ? 't' : 'f'); +#endif + + state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel); + + TRACE_POSTGRESQL_SORT_START(CLUSTER_SORT, + false, /* no unique check */ + state->nKeys, + workMem, + sortopt & TUPLESORT_RANDOMACCESS, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_cluster; + state->copytup = copytup_cluster; + state->writetup = writetup_cluster; + state->readtup = readtup_cluster; + state->abbrevNext = 10; + + state->indexInfo = BuildIndexInfo(indexRel); + + /* + * If we don't have a simple leading attribute, we don't currently + * initialize datum1, so disable optimizations that require it. + */ + if (state->indexInfo->ii_IndexAttrNumbers[0] == 0) + state->haveDatum1 = false; + else + state->haveDatum1 = true; + + state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + + indexScanKey = _bt_mkscankey(indexRel, NULL); + + if (state->indexInfo->ii_Expressions != NULL) + { + TupleTableSlot *slot; + ExprContext *econtext; + + /* + * We will need to use FormIndexDatum to evaluate the index + * expressions. To do that, we need an EState, as well as a + * TupleTableSlot to put the table tuples into. The econtext's + * scantuple has to point to that slot, too. 
+ */ + state->estate = CreateExecutorState(); + slot = MakeSingleTupleTableSlot(tupDesc, &TTSOpsHeapTuple); + econtext = GetPerTupleExprContext(state->estate); + econtext->ecxt_scantuple = slot; + } + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + ScanKey scanKey = indexScanKey->scankeys + i; + int16 strategy; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = scanKey->sk_collation; + sortKey->ssup_nulls_first = + (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; + sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0 && state->haveDatum1); + + AssertState(sortKey->ssup_attno != 0); + + strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? + BTGreaterStrategyNumber : BTLessStrategyNumber; + + PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); + } + + pfree(indexScanKey); + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_btree(Relation heapRel, + Relation indexRel, + bool enforceUnique, + bool uniqueNullsNotDistinct, + int workMem, + SortCoordinate coordinate, + int sortopt) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + sortopt); + BTScanInsert indexScanKey; + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->maincontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: unique = %c, workMem = %d, randomAccess = %c", + enforceUnique ? 't' : 'f', + workMem, sortopt & TUPLESORT_RANDOMACCESS ? 't' : 'f'); +#endif + + state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel); + + TRACE_POSTGRESQL_SORT_START(INDEX_SORT, + enforceUnique, + state->nKeys, + workMem, + sortopt & TUPLESORT_RANDOMACCESS, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_index_btree; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + state->abbrevNext = 10; + state->haveDatum1 = true; + + state->heapRel = heapRel; + state->indexRel = indexRel; + state->enforceUnique = enforceUnique; + state->uniqueNullsNotDistinct = uniqueNullsNotDistinct; + + indexScanKey = _bt_mkscankey(indexRel, NULL); + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + ScanKey scanKey = indexScanKey->scankeys + i; + int16 strategy; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = scanKey->sk_collation; + sortKey->ssup_nulls_first = + (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; + sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0 && state->haveDatum1); + + AssertState(sortKey->ssup_attno != 0); + + strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? 
+ BTGreaterStrategyNumber : BTLessStrategyNumber; + + PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); + } + + pfree(indexScanKey); + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_hash(Relation heapRel, + Relation indexRel, + uint32 high_mask, + uint32 low_mask, + uint32 max_buckets, + int workMem, + SortCoordinate coordinate, + int sortopt) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + sortopt); + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(state->maincontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: high_mask = 0x%x, low_mask = 0x%x, " + "max_buckets = 0x%x, workMem = %d, randomAccess = %c", + high_mask, + low_mask, + max_buckets, + workMem, + sortopt & TUPLESORT_RANDOMACCESS ? 't' : 'f'); +#endif + + state->nKeys = 1; /* Only one sort column, the hash code */ + + state->comparetup = comparetup_index_hash; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + state->haveDatum1 = true; + + state->heapRel = heapRel; + state->indexRel = indexRel; + + state->high_mask = high_mask; + state->low_mask = low_mask; + state->max_buckets = max_buckets; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_gist(Relation heapRel, + Relation indexRel, + int workMem, + SortCoordinate coordinate, + int sortopt) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + sortopt); + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: workMem = %d, randomAccess = %c", + workMem, sortopt & TUPLESORT_RANDOMACCESS ? 't' : 'f'); +#endif + + state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel); + + state->comparetup = comparetup_index_btree; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + state->haveDatum1 = true; + + state->heapRel = heapRel; + state->indexRel = indexRel; + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = indexRel->rd_indcollation[i]; + sortKey->ssup_nulls_first = false; + sortKey->ssup_attno = i + 1; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0 && state->haveDatum1); + + AssertState(sortKey->ssup_attno != 0); + + /* Look for a sort support function */ + PrepareSortSupportFromGistIndexRel(indexRel, sortKey); + } + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, + bool nullsFirstFlag, int workMem, + SortCoordinate coordinate, int sortopt) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate, + sortopt); + MemoryContext oldcontext; + int16 typlen; + bool typbyval; + + oldcontext = MemoryContextSwitchTo(state->maincontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin datum sort: workMem = %d, randomAccess = %c", + workMem, sortopt & TUPLESORT_RANDOMACCESS ? 
't' : 'f'); +#endif + + state->nKeys = 1; /* always a one-column sort */ + + TRACE_POSTGRESQL_SORT_START(DATUM_SORT, + false, /* no unique check */ + 1, + workMem, + sortopt & TUPLESORT_RANDOMACCESS, + PARALLEL_SORT(state)); + + state->comparetup = comparetup_datum; + state->copytup = copytup_datum; + state->writetup = writetup_datum; + state->readtup = readtup_datum; + state->abbrevNext = 10; + state->haveDatum1 = true; + + state->datumType = datumType; + + /* lookup necessary attributes of the datum type */ + get_typlenbyval(datumType, &typlen, &typbyval); + state->datumTypeLen = typlen; + state->tuples = !typbyval; + + /* Prepare SortSupport data */ + state->sortKeys = (SortSupport) palloc0(sizeof(SortSupportData)); + + state->sortKeys->ssup_cxt = CurrentMemoryContext; + state->sortKeys->ssup_collation = sortCollation; + state->sortKeys->ssup_nulls_first = nullsFirstFlag; + + /* + * Abbreviation is possible here only for by-reference types. In theory, + * a pass-by-value datatype could have an abbreviated form that is cheaper + * to compare. In a tuple sort, we could support that, because we can + * always extract the original datum from the tuple as needed. Here, we + * can't, because a datum sort only stores a single copy of the datum; the + * "tuple" field of each SortTuple is NULL. + */ + state->sortKeys->abbreviate = !typbyval; + + PrepareSortSupportFromOrderingOp(sortOperator, state->sortKeys); + + /* + * The "onlyKey" optimization cannot be used with abbreviated keys, since + * tie-breaker comparisons may be required. Typically, the optimization + * is only of value to pass-by-value types anyway, whereas abbreviated + * keys are typically only of value to pass-by-reference types. + */ + if (!state->sortKeys->abbrev_converter) + state->onlyKey = state->sortKeys; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +/* + * tuplesort_set_bound + * + * Advise tuplesort that at most the first N result tuples are required. + * + * Must be called before inserting any tuples. (Actually, we could allow it + * as long as the sort hasn't spilled to disk, but there seems no need for + * delayed calls at the moment.) + * + * This is a hint only. The tuplesort may still return more tuples than + * requested. Parallel leader tuplesorts will always ignore the hint. + */ +void +tuplesort_set_bound(Tuplesortstate *state, int64 bound) +{ + /* Assert we're called before loading any tuples */ + Assert(state->status == TSS_INITIAL && state->memtupcount == 0); + /* Assert we allow bounded sorts */ + Assert(state->sortopt & TUPLESORT_ALLOWBOUNDED); + /* Can't set the bound twice, either */ + Assert(!state->bounded); + /* Also, this shouldn't be called in a parallel worker */ + Assert(!WORKER(state)); + + /* Parallel leader allows but ignores hint */ + if (LEADER(state)) + return; + +#ifdef DEBUG_BOUNDED_SORT + /* Honor GUC setting that disables the feature (for easy testing) */ + if (!optimize_bounded_sort) + return; +#endif + + /* We want to be able to compute bound * 2, so limit the setting */ + if (bound > (int64) (INT_MAX / 2)) + return; + + state->bounded = true; + state->bound = (int) bound; + + /* + * Bounded sorts are not an effective target for abbreviated key + * optimization. Disable by setting state to be consistent with no + * abbreviation support. 
+ */ + state->sortKeys->abbrev_converter = NULL; + if (state->sortKeys->abbrev_full_comparator) + state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator; + + /* Not strictly necessary, but be tidy */ + state->sortKeys->abbrev_abort = NULL; + state->sortKeys->abbrev_full_comparator = NULL; +} + +/* + * tuplesort_used_bound + * + * Allow callers to find out if the sort state was able to use a bound. + */ +bool +tuplesort_used_bound(Tuplesortstate *state) +{ + return state->boundUsed; +} + +/* + * tuplesort_free + * + * Internal routine for freeing resources of tuplesort. + */ +static void +tuplesort_free(Tuplesortstate *state) +{ + /* context swap probably not needed, but let's be safe */ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + long spaceUsed; + + if (state->tapeset) + spaceUsed = LogicalTapeSetBlocks(state->tapeset); + else + spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; +#endif + + /* + * Delete temporary "tape" files, if any. + * + * Note: want to include this in reported total cost of sort, hence need + * for two #ifdef TRACE_SORT sections. + * + * We don't bother to destroy the individual tapes here. They will go away + * with the sortcontext. (In TSS_FINALMERGE state, we have closed + * finished tapes already.) + */ + if (state->tapeset) + LogicalTapeSetClose(state->tapeset); + +#ifdef TRACE_SORT + if (trace_sort) + { + if (state->tapeset) + elog(LOG, "%s of worker %d ended, %ld disk blocks used: %s", + SERIAL(state) ? "external sort" : "parallel external sort", + state->worker, spaceUsed, pg_rusage_show(&state->ru_start)); + else + elog(LOG, "%s of worker %d ended, %ld KB used: %s", + SERIAL(state) ? "internal sort" : "unperformed parallel sort", + state->worker, spaceUsed, pg_rusage_show(&state->ru_start)); + } + + TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, spaceUsed); +#else + + /* + * If you disabled TRACE_SORT, you can still probe sort__done, but you + * ain't getting space-used stats. + */ + TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, 0L); +#endif + + /* Free any execution state created for CLUSTER case */ + if (state->estate != NULL) + { + ExprContext *econtext = GetPerTupleExprContext(state->estate); + + ExecDropSingleTupleTableSlot(econtext->ecxt_scantuple); + FreeExecutorState(state->estate); + } + + MemoryContextSwitchTo(oldcontext); + + /* + * Free the per-sort memory context, thereby releasing all working memory. + */ + MemoryContextReset(state->sortcontext); +} + +/* + * tuplesort_end + * + * Release resources and clean up. + * + * NOTE: after calling this, any pointers returned by tuplesort_getXXX are + * pointing to garbage. Be careful not to attempt to use or free such + * pointers afterwards! + */ +void +tuplesort_end(Tuplesortstate *state) +{ + tuplesort_free(state); + + /* + * Free the main memory context, including the Tuplesortstate struct + * itself. + */ + MemoryContextDelete(state->maincontext); +} + +/* + * tuplesort_updatemax + * + * Update maximum resource usage statistics. + */ +static void +tuplesort_updatemax(Tuplesortstate *state) +{ + int64 spaceUsed; + bool isSpaceDisk; + + /* + * Note: it might seem we should provide both memory and disk usage for a + * disk-based sort. However, the current code doesn't track memory space + * accurately once we have begun to return tuples to the caller (since we + * don't account for pfree's the caller is expected to do), so we cannot + * rely on availMem in a disk sort. 
This does not seem worth the overhead + * to fix. Is it worth creating an API for the memory context code to + * tell us how much is actually used in sortcontext? + */ + if (state->tapeset) + { + isSpaceDisk = true; + spaceUsed = LogicalTapeSetBlocks(state->tapeset) * BLCKSZ; + } + else + { + isSpaceDisk = false; + spaceUsed = state->allowedMem - state->availMem; + } + + /* + * Sort evicts data to the disk when it wasn't able to fit that data into + * main memory. This is why we assume space used on the disk to be more + * important for tracking resource usage than space used in memory. Note + * that the amount of space occupied by some tupleset on the disk might be + * less than amount of space occupied by the same tupleset in memory due + * to more compact representation. + */ + if ((isSpaceDisk && !state->isMaxSpaceDisk) || + (isSpaceDisk == state->isMaxSpaceDisk && spaceUsed > state->maxSpace)) + { + state->maxSpace = spaceUsed; + state->isMaxSpaceDisk = isSpaceDisk; + state->maxSpaceStatus = state->status; + } +} + +/* + * tuplesort_reset + * + * Reset the tuplesort. Reset all the data in the tuplesort, but leave the + * meta-information in. After tuplesort_reset, tuplesort is ready to start + * a new sort. This allows avoiding recreation of tuple sort states (and + * save resources) when sorting multiple small batches. + */ +void +tuplesort_reset(Tuplesortstate *state) +{ + tuplesort_updatemax(state); + tuplesort_free(state); + + /* + * After we've freed up per-batch memory, re-setup all of the state common + * to both the first batch and any subsequent batch. + */ + tuplesort_begin_batch(state); + + state->lastReturnedTuple = NULL; + state->slabMemoryBegin = NULL; + state->slabMemoryEnd = NULL; + state->slabFreeHead = NULL; +} + +/* + * Grow the memtuples[] array, if possible within our memory constraint. We + * must not exceed INT_MAX tuples in memory or the caller-provided memory + * limit. Return true if we were able to enlarge the array, false if not. + * + * Normally, at each increment we double the size of the array. When doing + * that would exceed a limit, we attempt one last, smaller increase (and then + * clear the growmemtuples flag so we don't try any more). That allows us to + * use memory as fully as permitted; sticking to the pure doubling rule could + * result in almost half going unused. Because availMem moves around with + * tuple addition/removal, we need some rule to prevent making repeated small + * increases in memtupsize, which would just be useless thrashing. The + * growmemtuples flag accomplishes that and also prevents useless + * recalculations in this function. + */ +static bool +grow_memtuples(Tuplesortstate *state) +{ + int newmemtupsize; + int memtupsize = state->memtupsize; + int64 memNowUsed = state->allowedMem - state->availMem; + + /* Forget it if we've already maxed out memtuples, per comment above */ + if (!state->growmemtuples) + return false; + + /* Select new value of memtupsize */ + if (memNowUsed <= state->availMem) + { + /* + * We've used no more than half of allowedMem; double our usage, + * clamping at INT_MAX tuples. + */ + if (memtupsize < INT_MAX / 2) + newmemtupsize = memtupsize * 2; + else + { + newmemtupsize = INT_MAX; + state->growmemtuples = false; + } + } + else + { + /* + * This will be the last increment of memtupsize. Abandon doubling + * strategy and instead increase as much as we safely can. + * + * To stay within allowedMem, we can't increase memtupsize by more + * than availMem / sizeof(SortTuple) elements. 
In practice, we want + * to increase it by considerably less, because we need to leave some + * space for the tuples to which the new array slots will refer. We + * assume the new tuples will be about the same size as the tuples + * we've already seen, and thus we can extrapolate from the space + * consumption so far to estimate an appropriate new size for the + * memtuples array. The optimal value might be higher or lower than + * this estimate, but it's hard to know that in advance. We again + * clamp at INT_MAX tuples. + * + * This calculation is safe against enlarging the array so much that + * LACKMEM becomes true, because the memory currently used includes + * the present array; thus, there would be enough allowedMem for the + * new array elements even if no other memory were currently used. + * + * We do the arithmetic in float8, because otherwise the product of + * memtupsize and allowedMem could overflow. Any inaccuracy in the + * result should be insignificant; but even if we computed a + * completely insane result, the checks below will prevent anything + * really bad from happening. + */ + double grow_ratio; + + grow_ratio = (double) state->allowedMem / (double) memNowUsed; + if (memtupsize * grow_ratio < INT_MAX) + newmemtupsize = (int) (memtupsize * grow_ratio); + else + newmemtupsize = INT_MAX; + + /* We won't make any further enlargement attempts */ + state->growmemtuples = false; + } + + /* Must enlarge array by at least one element, else report failure */ + if (newmemtupsize <= memtupsize) + goto noalloc; + + /* + * On a 32-bit machine, allowedMem could exceed MaxAllocHugeSize. Clamp + * to ensure our request won't be rejected. Note that we can easily + * exhaust address space before facing this outcome. (This is presently + * impossible due to guc.c's MAX_KILOBYTES limitation on work_mem, but + * don't rely on that at this distance.) + */ + if ((Size) newmemtupsize >= MaxAllocHugeSize / sizeof(SortTuple)) + { + newmemtupsize = (int) (MaxAllocHugeSize / sizeof(SortTuple)); + state->growmemtuples = false; /* can't grow any more */ + } + + /* + * We need to be sure that we do not cause LACKMEM to become true, else + * the space management algorithm will go nuts. The code above should + * never generate a dangerous request, but to be safe, check explicitly + * that the array growth fits within availMem. (We could still cause + * LACKMEM if the memory chunk overhead associated with the memtuples + * array were to increase. That shouldn't happen because we chose the + * initial array size large enough to ensure that palloc will be treating + * both old and new arrays as separate chunks. But we'll check LACKMEM + * explicitly below just in case.) + */ + if (state->availMem < (int64) ((newmemtupsize - memtupsize) * sizeof(SortTuple))) + goto noalloc; + + /* OK, do it */ + FREEMEM(state, GetMemoryChunkSpace(state->memtuples)); + state->memtupsize = newmemtupsize; + state->memtuples = (SortTuple *) + repalloc_huge(state->memtuples, + state->memtupsize * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + if (LACKMEM(state)) + elog(ERROR, "unexpected out-of-memory situation in tuplesort"); + return true; + +noalloc: + /* If for any reason we didn't realloc, shut off future attempts */ + state->growmemtuples = false; + return false; +} + +/* + * Accept one tuple while collecting input data for sort. + * + * Note that the input data is always copied; the caller need not save it. 
+ */ +void +tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + /* + * Copy the given tuple into memory we control, and decrease availMem. + * Then call the common code. + */ + COPYTUP(state, &stup, (void *) slot); + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Accept one tuple while collecting input data for sort. + * + * Note that the input data is always copied; the caller need not save it. + */ +void +tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + /* + * Copy the given tuple into memory we control, and decrease availMem. + * Then call the common code. + */ + COPYTUP(state, &stup, (void *) tup); + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Collect one index tuple while collecting input data for sort, building + * it from caller-supplied values. + */ +void +tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel, + ItemPointer self, Datum *values, + bool *isnull) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + SortTuple stup; + Datum original; + IndexTuple tuple; + + stup.tuple = index_form_tuple(RelationGetDescr(rel), values, isnull); + tuple = ((IndexTuple) stup.tuple); + tuple->t_tid = *self; + USEMEM(state, GetMemoryChunkSpace(stup.tuple)); + /* set up first-column key value */ + original = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup.isnull1); + + MemoryContextSwitchTo(state->sortcontext); + + if (!state->sortKeys || !state->sortKeys->abbrev_converter || stup.isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup.datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup.datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup.datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = mtup->tuple; + mtup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &mtup->isnull1); + } + } + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Accept one Datum while collecting input data for sort. + * + * If the Datum is pass-by-ref type, the value will be copied. 
+ */ +void +tuplesort_putdatum(Tuplesortstate *state, Datum val, bool isNull) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + SortTuple stup; + + /* + * Pass-by-value types or null values are just stored directly in + * stup.datum1 (and stup.tuple is not used and set to NULL). + * + * Non-null pass-by-reference values need to be copied into memory we + * control, and possibly abbreviated. The copied value is pointed to by + * stup.tuple and is treated as the canonical copy (e.g. to return via + * tuplesort_getdatum or when writing to tape); stup.datum1 gets the + * abbreviated value if abbreviation is happening, otherwise it's + * identical to stup.tuple. + */ + + if (isNull || !state->tuples) + { + /* + * Set datum1 to zeroed representation for NULLs (to be consistent, + * and to support cheap inequality tests for NULL abbreviated keys). + */ + stup.datum1 = !isNull ? val : (Datum) 0; + stup.isnull1 = isNull; + stup.tuple = NULL; /* no separate storage */ + MemoryContextSwitchTo(state->sortcontext); + } + else + { + Datum original = datumCopy(val, false, state->datumTypeLen); + + stup.isnull1 = false; + stup.tuple = DatumGetPointer(original); + USEMEM(state, GetMemoryChunkSpace(stup.tuple)); + MemoryContextSwitchTo(state->sortcontext); + + if (!state->sortKeys->abbrev_converter) + { + stup.datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup.datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup.datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any + * case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + mtup->datum1 = PointerGetDatum(mtup->tuple); + } + } + } + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Shared code for tuple and datum cases. + */ +static void +puttuple_common(Tuplesortstate *state, SortTuple *tuple) +{ + Assert(!LEADER(state)); + + switch (state->status) + { + case TSS_INITIAL: + + /* + * Save the tuple into the unsorted array. First, grow the array + * as needed. Note that we try to grow the array when there is + * still one free slot remaining --- if we fail, there'll still be + * room to store the incoming tuple, and then we'll switch to + * tape-based operation. + */ + if (state->memtupcount >= state->memtupsize - 1) + { + (void) grow_memtuples(state); + Assert(state->memtupcount < state->memtupsize); + } + state->memtuples[state->memtupcount++] = *tuple; + + /* + * Check if it's time to switch over to a bounded heapsort. We do + * so if the input tuple count exceeds twice the desired tuple + * count (this is a heuristic for where heapsort becomes cheaper + * than a quicksort), or if we've just filled workMem and have + * enough tuples to meet the bound. + * + * Note that once we enter TSS_BOUNDED state we will always try to + * complete the sort that way. In the worst case, if later input + * tuples are larger than earlier ones, this might cause us to + * exceed workMem significantly. 
+ */ + if (state->bounded && + (state->memtupcount > state->bound * 2 || + (state->memtupcount > state->bound && LACKMEM(state)))) + { +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "switching to bounded heapsort at %d tuples: %s", + state->memtupcount, + pg_rusage_show(&state->ru_start)); +#endif + make_bounded_heap(state); + return; + } + + /* + * Done if we still fit in available memory and have array slots. + */ + if (state->memtupcount < state->memtupsize && !LACKMEM(state)) + return; + + /* + * Nope; time to switch to tape-based operation. + */ + inittapes(state, true); + + /* + * Dump all tuples. + */ + dumptuples(state, false); + break; + + case TSS_BOUNDED: + + /* + * We don't want to grow the array here, so check whether the new + * tuple can be discarded before putting it in. This should be a + * good speed optimization, too, since when there are many more + * input tuples than the bound, most input tuples can be discarded + * with just this one comparison. Note that because we currently + * have the sort direction reversed, we must check for <= not >=. + */ + if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0) + { + /* new tuple <= top of the heap, so we can discard it */ + free_sort_tuple(state, tuple); + CHECK_FOR_INTERRUPTS(); + } + else + { + /* discard top of heap, replacing it with the new tuple */ + free_sort_tuple(state, &state->memtuples[0]); + tuplesort_heap_replace_top(state, tuple); + } + break; + + case TSS_BUILDRUNS: + + /* + * Save the tuple into the unsorted array (there must be space) + */ + state->memtuples[state->memtupcount++] = *tuple; + + /* + * If we are over the memory limit, dump all tuples. + */ + dumptuples(state, false); + break; + + default: + elog(ERROR, "invalid tuplesort state"); + break; + } +} + +static bool +consider_abort_common(Tuplesortstate *state) +{ + Assert(state->sortKeys[0].abbrev_converter != NULL); + Assert(state->sortKeys[0].abbrev_abort != NULL); + Assert(state->sortKeys[0].abbrev_full_comparator != NULL); + + /* + * Check effectiveness of abbreviation optimization. Consider aborting + * when still within memory limit. + */ + if (state->status == TSS_INITIAL && + state->memtupcount >= state->abbrevNext) + { + state->abbrevNext *= 2; + + /* + * Check opclass-supplied abbreviation abort routine. It may indicate + * that abbreviation should not proceed. + */ + if (!state->sortKeys->abbrev_abort(state->memtupcount, + state->sortKeys)) + return false; + + /* + * Finally, restore authoritative comparator, and indicate that + * abbreviation is not in play by setting abbrev_converter to NULL + */ + state->sortKeys[0].comparator = state->sortKeys[0].abbrev_full_comparator; + state->sortKeys[0].abbrev_converter = NULL; + /* Not strictly necessary, but be tidy */ + state->sortKeys[0].abbrev_abort = NULL; + state->sortKeys[0].abbrev_full_comparator = NULL; + + /* Give up - expect original pass-by-value representation */ + return true; + } + + return false; +} + +/* + * All tuples have been provided; finish the sort. 
+ */ +void +tuplesort_performsort(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "performsort of worker %d starting: %s", + state->worker, pg_rusage_show(&state->ru_start)); +#endif + + switch (state->status) + { + case TSS_INITIAL: + + /* + * We were able to accumulate all the tuples within the allowed + * amount of memory, or leader to take over worker tapes + */ + if (SERIAL(state)) + { + /* Just qsort 'em and we're done */ + tuplesort_sort_memtuples(state); + state->status = TSS_SORTEDINMEM; + } + else if (WORKER(state)) + { + /* + * Parallel workers must still dump out tuples to tape. No + * merge is required to produce single output run, though. + */ + inittapes(state, false); + dumptuples(state, true); + worker_nomergeruns(state); + state->status = TSS_SORTEDONTAPE; + } + else + { + /* + * Leader will take over worker tapes and merge worker runs. + * Note that mergeruns sets the correct state->status. + */ + leader_takeover_tapes(state); + mergeruns(state); + } + state->current = 0; + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + + case TSS_BOUNDED: + + /* + * We were able to accumulate all the tuples required for output + * in memory, using a heap to eliminate excess tuples. Now we + * have to transform the heap to a properly-sorted array. + */ + sort_bounded_heap(state); + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + state->status = TSS_SORTEDINMEM; + break; + + case TSS_BUILDRUNS: + + /* + * Finish tape-based sort. First, flush all tuples remaining in + * memory out to tape; then merge until we have a single remaining + * run (or, if !randomAccess and !WORKER(), one run per tape). + * Note that mergeruns sets the correct state->status. + */ + dumptuples(state, true); + mergeruns(state); + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + +#ifdef TRACE_SORT + if (trace_sort) + { + if (state->status == TSS_FINALMERGE) + elog(LOG, "performsort of worker %d done (except %d-way final merge): %s", + state->worker, state->nInputTapes, + pg_rusage_show(&state->ru_start)); + else + elog(LOG, "performsort of worker %d done: %s", + state->worker, pg_rusage_show(&state->ru_start)); + } +#endif + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Internal routine to fetch the next tuple in either forward or back + * direction into *stup. Returns false if no more tuples. + * Returned tuple belongs to tuplesort memory context, and must not be freed + * by caller. Note that fetched tuple is stored in memory that may be + * recycled by any future fetch. + */ +static bool +tuplesort_gettuple_common(Tuplesortstate *state, bool forward, + SortTuple *stup) +{ + unsigned int tuplen; + size_t nmoved; + + Assert(!WORKER(state)); + + switch (state->status) + { + case TSS_SORTEDINMEM: + Assert(forward || state->sortopt & TUPLESORT_RANDOMACCESS); + Assert(!state->slabAllocatorUsed); + if (forward) + { + if (state->current < state->memtupcount) + { + *stup = state->memtuples[state->current++]; + return true; + } + state->eof_reached = true; + + /* + * Complain if caller tries to retrieve more tuples than + * originally asked for in a bounded sort. This is because + * returning EOF here might be the wrong thing. 
+ */ + if (state->bounded && state->current >= state->bound) + elog(ERROR, "retrieved too many tuples in a bounded sort"); + + return false; + } + else + { + if (state->current <= 0) + return false; + + /* + * if all tuples are fetched already then we return last + * tuple, else - tuple before last returned. + */ + if (state->eof_reached) + state->eof_reached = false; + else + { + state->current--; /* last returned tuple */ + if (state->current <= 0) + return false; + } + *stup = state->memtuples[state->current - 1]; + return true; + } + break; + + case TSS_SORTEDONTAPE: + Assert(forward || state->sortopt & TUPLESORT_RANDOMACCESS); + Assert(state->slabAllocatorUsed); + + /* + * The slot that held the tuple that we returned in previous + * gettuple call can now be reused. + */ + if (state->lastReturnedTuple) + { + RELEASE_SLAB_SLOT(state, state->lastReturnedTuple); + state->lastReturnedTuple = NULL; + } + + if (forward) + { + if (state->eof_reached) + return false; + + if ((tuplen = getlen(state->result_tape, true)) != 0) + { + READTUP(state, stup, state->result_tape, tuplen); + + /* + * Remember the tuple we return, so that we can recycle + * its memory on next call. (This can be NULL, in the + * !state->tuples case). + */ + state->lastReturnedTuple = stup->tuple; + + return true; + } + else + { + state->eof_reached = true; + return false; + } + } + + /* + * Backward. + * + * if all tuples are fetched already then we return last tuple, + * else - tuple before last returned. + */ + if (state->eof_reached) + { + /* + * Seek position is pointing just past the zero tuplen at the + * end of file; back up to fetch last tuple's ending length + * word. If seek fails we must have a completely empty file. + */ + nmoved = LogicalTapeBackspace(state->result_tape, + 2 * sizeof(unsigned int)); + if (nmoved == 0) + return false; + else if (nmoved != 2 * sizeof(unsigned int)) + elog(ERROR, "unexpected tape position"); + state->eof_reached = false; + } + else + { + /* + * Back up and fetch previously-returned tuple's ending length + * word. If seek fails, assume we are at start of file. + */ + nmoved = LogicalTapeBackspace(state->result_tape, + sizeof(unsigned int)); + if (nmoved == 0) + return false; + else if (nmoved != sizeof(unsigned int)) + elog(ERROR, "unexpected tape position"); + tuplen = getlen(state->result_tape, false); + + /* + * Back up to get ending length word of tuple before it. + */ + nmoved = LogicalTapeBackspace(state->result_tape, + tuplen + 2 * sizeof(unsigned int)); + if (nmoved == tuplen + sizeof(unsigned int)) + { + /* + * We backed up over the previous tuple, but there was no + * ending length word before it. That means that the prev + * tuple is the first tuple in the file. It is now the + * next to read in forward direction (not obviously right, + * but that is what in-memory case does). + */ + return false; + } + else if (nmoved != tuplen + 2 * sizeof(unsigned int)) + elog(ERROR, "bogus tuple length in backward scan"); + } + + tuplen = getlen(state->result_tape, false); + + /* + * Now we have the length of the prior tuple, back up and read it. + * Note: READTUP expects we are positioned after the initial + * length word of the tuple, so back up to that point. + */ + nmoved = LogicalTapeBackspace(state->result_tape, + tuplen); + if (nmoved != tuplen) + elog(ERROR, "bogus tuple length in backward scan"); + READTUP(state, stup, state->result_tape, tuplen); + + /* + * Remember the tuple we return, so that we can recycle its memory + * on next call. 
(This can be NULL, in the Datum case). + */ + state->lastReturnedTuple = stup->tuple; + + return true; + + case TSS_FINALMERGE: + Assert(forward); + /* We are managing memory ourselves, with the slab allocator. */ + Assert(state->slabAllocatorUsed); + + /* + * The slab slot holding the tuple that we returned in previous + * gettuple call can now be reused. + */ + if (state->lastReturnedTuple) + { + RELEASE_SLAB_SLOT(state, state->lastReturnedTuple); + state->lastReturnedTuple = NULL; + } + + /* + * This code should match the inner loop of mergeonerun(). + */ + if (state->memtupcount > 0) + { + int srcTapeIndex = state->memtuples[0].srctape; + LogicalTape *srcTape = state->inputTapes[srcTapeIndex]; + SortTuple newtup; + + *stup = state->memtuples[0]; + + /* + * Remember the tuple we return, so that we can recycle its + * memory on next call. (This can be NULL, in the Datum case). + */ + state->lastReturnedTuple = stup->tuple; + + /* + * Pull next tuple from tape, and replace the returned tuple + * at top of the heap with it. + */ + if (!mergereadnext(state, srcTape, &newtup)) + { + /* + * If no more data, we've reached end of run on this tape. + * Remove the top node from the heap. + */ + tuplesort_heap_delete_top(state); + state->nInputRuns--; + + /* + * Close the tape. It'd go away at the end of the sort + * anyway, but better to release the memory early. + */ + LogicalTapeClose(srcTape); + return true; + } + newtup.srctape = srcTapeIndex; + tuplesort_heap_replace_top(state, &newtup); + return true; + } + return false; + + default: + elog(ERROR, "invalid tuplesort state"); + return false; /* keep compiler quiet */ + } +} + +/* + * Fetch the next tuple in either forward or back direction. + * If successful, put tuple in slot and return true; else, clear the slot + * and return false. + * + * Caller may optionally be passed back abbreviated value (on true return + * value) when abbreviation was used, which can be used to cheaply avoid + * equality checks that might otherwise be required. Caller can safely make a + * determination of "non-equal tuple" based on simple binary inequality. A + * NULL value in leading attribute will set abbreviated value to zeroed + * representation, which caller may rely on in abbreviated inequality check. + * + * If copy is true, the slot receives a tuple that's been copied into the + * caller's memory context, so that it will stay valid regardless of future + * manipulations of the tuplesort's state (up to and including deleting the + * tuplesort). If copy is false, the slot will just receive a pointer to a + * tuple held within the tuplesort, which is more efficient, but only safe for + * callers that are prepared to have any subsequent manipulation of the + * tuplesort's state invalidate slot contents. + */ +bool +tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy, + TupleTableSlot *slot, Datum *abbrev) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + if (stup.tuple) + { + /* Record abbreviated key for caller */ + if (state->sortKeys->abbrev_converter && abbrev) + *abbrev = stup.datum1; + + if (copy) + stup.tuple = heap_copy_minimal_tuple((MinimalTuple) stup.tuple); + + ExecStoreMinimalTuple((MinimalTuple) stup.tuple, slot, copy); + return true; + } + else + { + ExecClearTuple(slot); + return false; + } +} + +/* + * Fetch the next tuple in either forward or back direction. 
+ * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory + * context, and must not be freed by caller. Caller may not rely on tuple + * remaining valid after any further manipulation of tuplesort. + */ +HeapTuple +tuplesort_getheaptuple(Tuplesortstate *state, bool forward) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + return stup.tuple; +} + +/* + * Fetch the next index tuple in either forward or back direction. + * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory + * context, and must not be freed by caller. Caller may not rely on tuple + * remaining valid after any further manipulation of tuplesort. + */ +IndexTuple +tuplesort_getindextuple(Tuplesortstate *state, bool forward) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + return (IndexTuple) stup.tuple; +} + +/* + * Fetch the next Datum in either forward or back direction. + * Returns false if no more datums. + * + * If the Datum is pass-by-ref type, the returned value is freshly palloc'd + * in caller's context, and is now owned by the caller (this differs from + * similar routines for other types of tuplesorts). + * + * Caller may optionally be passed back abbreviated value (on true return + * value) when abbreviation was used, which can be used to cheaply avoid + * equality checks that might otherwise be required. Caller can safely make a + * determination of "non-equal tuple" based on simple binary inequality. A + * NULL value will have a zeroed abbreviated value representation, which caller + * may rely on in abbreviated inequality check. + */ +bool +tuplesort_getdatum(Tuplesortstate *state, bool forward, + Datum *val, bool *isNull, Datum *abbrev) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + { + MemoryContextSwitchTo(oldcontext); + return false; + } + + /* Ensure we copy into caller's memory context */ + MemoryContextSwitchTo(oldcontext); + + /* Record abbreviated key for caller */ + if (state->sortKeys->abbrev_converter && abbrev) + *abbrev = stup.datum1; + + if (stup.isnull1 || !state->tuples) + { + *val = stup.datum1; + *isNull = stup.isnull1; + } + else + { + /* use stup.tuple because stup.datum1 may be an abbreviation */ + *val = datumCopy(PointerGetDatum(stup.tuple), false, state->datumTypeLen); + *isNull = false; + } + + return true; +} + +/* + * Advance over N tuples in either forward or back direction, + * without returning any data. N==0 is a no-op. + * Returns true if successful, false if ran out of tuples. + */ +bool +tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, bool forward) +{ + MemoryContext oldcontext; + + /* + * We don't actually support backwards skip yet, because no callers need + * it. The API is designed to allow for that later, though. 
+ */ + Assert(forward); + Assert(ntuples >= 0); + Assert(!WORKER(state)); + + switch (state->status) + { + case TSS_SORTEDINMEM: + if (state->memtupcount - state->current >= ntuples) + { + state->current += ntuples; + return true; + } + state->current = state->memtupcount; + state->eof_reached = true; + + /* + * Complain if caller tries to retrieve more tuples than + * originally asked for in a bounded sort. This is because + * returning EOF here might be the wrong thing. + */ + if (state->bounded && state->current >= state->bound) + elog(ERROR, "retrieved too many tuples in a bounded sort"); + + return false; + + case TSS_SORTEDONTAPE: + case TSS_FINALMERGE: + + /* + * We could probably optimize these cases better, but for now it's + * not worth the trouble. + */ + oldcontext = MemoryContextSwitchTo(state->sortcontext); + while (ntuples-- > 0) + { + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup)) + { + MemoryContextSwitchTo(oldcontext); + return false; + } + CHECK_FOR_INTERRUPTS(); + } + MemoryContextSwitchTo(oldcontext); + return true; + + default: + elog(ERROR, "invalid tuplesort state"); + return false; /* keep compiler quiet */ + } +} + +/* + * tuplesort_merge_order - report merge order we'll use for given memory + * (note: "merge order" just means the number of input tapes in the merge). + * + * This is exported for use by the planner. allowedMem is in bytes. + */ +int +tuplesort_merge_order(int64 allowedMem) +{ + int mOrder; + + /*---------- + * In the merge phase, we need buffer space for each input and output tape. + * Each pass in the balanced merge algorithm reads from M input tapes, and + * writes to N output tapes. Each tape consumes TAPE_BUFFER_OVERHEAD bytes + * of memory. In addition to that, we want MERGE_BUFFER_SIZE workspace per + * input tape. + * + * totalMem = M * (TAPE_BUFFER_OVERHEAD + MERGE_BUFFER_SIZE) + + * N * TAPE_BUFFER_OVERHEAD + * + * Except for the last and next-to-last merge passes, where there can be + * fewer tapes left to process, M = N. We choose M so that we have the + * desired amount of memory available for the input buffers + * (TAPE_BUFFER_OVERHEAD + MERGE_BUFFER_SIZE), given the total memory + * available for the tape buffers (allowedMem). + * + * Note: you might be thinking we need to account for the memtuples[] + * array in this calculation, but we effectively treat that as part of the + * MERGE_BUFFER_SIZE workspace. + *---------- + */ + mOrder = allowedMem / + (2 * TAPE_BUFFER_OVERHEAD + MERGE_BUFFER_SIZE); + + /* + * Even in minimum memory, use at least a MINORDER merge. On the other + * hand, even when we have lots of memory, do not use more than a MAXORDER + * merge. Tapes are pretty cheap, but they're not entirely free. Each + * additional tape reduces the amount of memory available to build runs, + * which in turn can cause the same sort to need more runs, which makes + * merging slower even if it can still be done in a single pass. Also, + * high order merges are quite slow due to CPU cache effects; it can be + * faster to pay the I/O cost of a multi-pass merge than to perform a + * single merge pass across many hundreds of tapes. + */ + mOrder = Max(mOrder, MINORDER); + mOrder = Min(mOrder, MAXORDER); + + return mOrder; +} + +/* + * Helper function to calculate how much memory to allocate for the read buffer + * of each input tape in a merge pass. + * + * 'avail_mem' is the amount of memory available for the buffers of all the + * tapes, both input and output. 
+ * 'nInputTapes' and 'nInputRuns' are the number of input tapes and runs. + * 'maxOutputTapes' is the max. number of output tapes we should produce. + */ +static int64 +merge_read_buffer_size(int64 avail_mem, int nInputTapes, int nInputRuns, + int maxOutputTapes) +{ + int nOutputRuns; + int nOutputTapes; + + /* + * How many output tapes will we produce in this pass? + * + * This is nInputRuns / nInputTapes, rounded up. + */ + nOutputRuns = (nInputRuns + nInputTapes - 1) / nInputTapes; + + nOutputTapes = Min(nOutputRuns, maxOutputTapes); + + /* + * Each output tape consumes TAPE_BUFFER_OVERHEAD bytes of memory. All + * remaining memory is divided evenly between the input tapes. + * + * This also follows from the formula in tuplesort_merge_order, but here + * we derive the input buffer size from the amount of memory available, + * and M and N. + */ + return Max((avail_mem - TAPE_BUFFER_OVERHEAD * nOutputTapes) / nInputTapes, 0); +} + +/* + * inittapes - initialize for tape sorting. + * + * This is called only if we have found we won't sort in memory. + */ +static void +inittapes(Tuplesortstate *state, bool mergeruns) +{ + Assert(!LEADER(state)); + + if (mergeruns) + { + /* Compute number of input tapes to use when merging */ + state->maxTapes = tuplesort_merge_order(state->allowedMem); + } + else + { + /* Workers can sometimes produce single run, output without merge */ + Assert(WORKER(state)); + state->maxTapes = MINORDER; + } + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d switching to external sort with %d tapes: %s", + state->worker, state->maxTapes, pg_rusage_show(&state->ru_start)); +#endif + + /* Create the tape set */ + inittapestate(state, state->maxTapes); + state->tapeset = + LogicalTapeSetCreate(false, + state->shared ? &state->shared->fileset : NULL, + state->worker); + + state->currentRun = 0; + + /* + * Initialize logical tape arrays. + */ + state->inputTapes = NULL; + state->nInputTapes = 0; + state->nInputRuns = 0; + + state->outputTapes = palloc0(state->maxTapes * sizeof(LogicalTape *)); + state->nOutputTapes = 0; + state->nOutputRuns = 0; + + state->status = TSS_BUILDRUNS; + + selectnewtape(state); +} + +/* + * inittapestate - initialize generic tape management state + */ +static void +inittapestate(Tuplesortstate *state, int maxTapes) +{ + int64 tapeSpace; + + /* + * Decrease availMem to reflect the space needed for tape buffers; but + * don't decrease it to the point that we have no room for tuples. (That + * case is only likely to occur if sorting pass-by-value Datums; in all + * other scenarios the memtuples[] array is unlikely to occupy more than + * half of allowedMem. In the pass-by-value case it's not important to + * account for tuple space, so we don't care if LACKMEM becomes + * inaccurate.) + */ + tapeSpace = (int64) maxTapes * TAPE_BUFFER_OVERHEAD; + + if (tapeSpace + GetMemoryChunkSpace(state->memtuples) < state->allowedMem) + USEMEM(state, tapeSpace); + + /* + * Make sure that the temp file(s) underlying the tape set are created in + * suitable temp tablespaces. For parallel sorts, this should have been + * called already, but it doesn't matter if it is called a second time. + */ + PrepareTempTablespaces(); +} + +/* + * selectnewtape -- select next tape to output to. + * + * This is called after finishing a run when we know another run + * must be started. This is used both when building the initial + * runs, and during merge passes. 
+ */ +static void +selectnewtape(Tuplesortstate *state) +{ + /* + * At the beginning of each merge pass, nOutputTapes and nOutputRuns are + * both zero. On each call, we create a new output tape to hold the next + * run, until maxTapes is reached. After that, we assign new runs to the + * existing tapes in a round robin fashion. + */ + if (state->nOutputTapes < state->maxTapes) + { + /* Create a new tape to hold the next run */ + Assert(state->outputTapes[state->nOutputRuns] == NULL); + Assert(state->nOutputRuns == state->nOutputTapes); + state->destTape = LogicalTapeCreate(state->tapeset); + state->outputTapes[state->nOutputTapes] = state->destTape; + state->nOutputTapes++; + state->nOutputRuns++; + } + else + { + /* + * We have reached the max number of tapes. Append to an existing + * tape. + */ + state->destTape = state->outputTapes[state->nOutputRuns % state->nOutputTapes]; + state->nOutputRuns++; + } +} + +/* + * Initialize the slab allocation arena, for the given number of slots. + */ +static void +init_slab_allocator(Tuplesortstate *state, int numSlots) +{ + if (numSlots > 0) + { + char *p; + int i; + + state->slabMemoryBegin = palloc(numSlots * SLAB_SLOT_SIZE); + state->slabMemoryEnd = state->slabMemoryBegin + + numSlots * SLAB_SLOT_SIZE; + state->slabFreeHead = (SlabSlot *) state->slabMemoryBegin; + USEMEM(state, numSlots * SLAB_SLOT_SIZE); + + p = state->slabMemoryBegin; + for (i = 0; i < numSlots - 1; i++) + { + ((SlabSlot *) p)->nextfree = (SlabSlot *) (p + SLAB_SLOT_SIZE); + p += SLAB_SLOT_SIZE; + } + ((SlabSlot *) p)->nextfree = NULL; + } + else + { + state->slabMemoryBegin = state->slabMemoryEnd = NULL; + state->slabFreeHead = NULL; + } + state->slabAllocatorUsed = true; +} + +/* + * mergeruns -- merge all the completed initial runs. + * + * This implements the Balanced k-Way Merge Algorithm. All input data has + * already been written to initial runs on tape (see dumptuples). + */ +static void +mergeruns(Tuplesortstate *state) +{ + int tapenum; + + Assert(state->status == TSS_BUILDRUNS); + Assert(state->memtupcount == 0); + + if (state->sortKeys != NULL && state->sortKeys->abbrev_converter != NULL) + { + /* + * If there are multiple runs to be merged, when we go to read back + * tuples from disk, abbreviated keys will not have been stored, and + * we don't care to regenerate them. Disable abbreviation from this + * point on. + */ + state->sortKeys->abbrev_converter = NULL; + state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator; + + /* Not strictly necessary, but be tidy */ + state->sortKeys->abbrev_abort = NULL; + state->sortKeys->abbrev_full_comparator = NULL; + } + + /* + * Reset tuple memory. We've freed all the tuples that we previously + * allocated. We will use the slab allocator from now on. + */ + MemoryContextResetOnly(state->tuplecontext); + + /* + * We no longer need a large memtuples array. (We will allocate a smaller + * one for the heap later.) + */ + FREEMEM(state, GetMemoryChunkSpace(state->memtuples)); + pfree(state->memtuples); + state->memtuples = NULL; + + /* + * Initialize the slab allocator. We need one slab slot per input tape, + * for the tuples in the heap, plus one to hold the tuple last returned + * from tuplesort_gettuple. (If we're sorting pass-by-val Datums, + * however, we don't need to do allocate anything.) + * + * In a multi-pass merge, we could shrink this allocation for the last + * merge pass, if it has fewer tapes than previous passes, but we don't + * bother. 
+ * + * From this point on, we no longer use the USEMEM()/LACKMEM() mechanism + * to track memory usage of individual tuples. + */ + if (state->tuples) + init_slab_allocator(state, state->nOutputTapes + 1); + else + init_slab_allocator(state, 0); + + /* + * Allocate a new 'memtuples' array, for the heap. It will hold one tuple + * from each input tape. + * + * We could shrink this, too, between passes in a multi-pass merge, but we + * don't bother. (The initial input tapes are still in outputTapes. The + * number of input tapes will not increase between passes.) + */ + state->memtupsize = state->nOutputTapes; + state->memtuples = (SortTuple *) MemoryContextAlloc(state->maincontext, + state->nOutputTapes * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + + /* + * Use all the remaining memory we have available for tape buffers among + * all the input tapes. At the beginning of each merge pass, we will + * divide this memory between the input and output tapes in the pass. + */ + state->tape_buffer_mem = state->availMem; + USEMEM(state, state->tape_buffer_mem); +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d using %zu KB of memory for tape buffers", + state->worker, state->tape_buffer_mem / 1024); +#endif + + for (;;) + { + /* + * On the first iteration, or if we have read all the runs from the + * input tapes in a multi-pass merge, it's time to start a new pass. + * Rewind all the output tapes, and make them inputs for the next + * pass. + */ + if (state->nInputRuns == 0) + { + int64 input_buffer_size; + + /* Close the old, emptied, input tapes */ + if (state->nInputTapes > 0) + { + for (tapenum = 0; tapenum < state->nInputTapes; tapenum++) + LogicalTapeClose(state->inputTapes[tapenum]); + pfree(state->inputTapes); + } + + /* Previous pass's outputs become next pass's inputs. */ + state->inputTapes = state->outputTapes; + state->nInputTapes = state->nOutputTapes; + state->nInputRuns = state->nOutputRuns; + + /* + * Reset output tape variables. The actual LogicalTapes will be + * created as needed, here we only allocate the array to hold + * them. + */ + state->outputTapes = palloc0(state->nInputTapes * sizeof(LogicalTape *)); + state->nOutputTapes = 0; + state->nOutputRuns = 0; + + /* + * Redistribute the memory allocated for tape buffers, among the + * new input and output tapes. + */ + input_buffer_size = merge_read_buffer_size(state->tape_buffer_mem, + state->nInputTapes, + state->nInputRuns, + state->maxTapes); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "starting merge pass of %d input runs on %d tapes, " INT64_FORMAT " KB of memory for each input tape: %s", + state->nInputRuns, state->nInputTapes, input_buffer_size / 1024, + pg_rusage_show(&state->ru_start)); +#endif + + /* Prepare the new input tapes for merge pass. */ + for (tapenum = 0; tapenum < state->nInputTapes; tapenum++) + LogicalTapeRewindForRead(state->inputTapes[tapenum], input_buffer_size); + + /* + * If there's just one run left on each input tape, then only one + * merge pass remains. If we don't have to produce a materialized + * sorted tape, we can stop at this point and do the final merge + * on-the-fly. 
+ */ + if ((state->sortopt & TUPLESORT_RANDOMACCESS) == 0 + && state->nInputRuns <= state->nInputTapes + && !WORKER(state)) + { + /* Tell logtape.c we won't be writing anymore */ + LogicalTapeSetForgetFreeSpace(state->tapeset); + /* Initialize for the final merge pass */ + beginmerge(state); + state->status = TSS_FINALMERGE; + return; + } + } + + /* Select an output tape */ + selectnewtape(state); + + /* Merge one run from each input tape. */ + mergeonerun(state); + + /* + * If the input tapes are empty, and we output only one output run, + * we're done. The current output tape contains the final result. + */ + if (state->nInputRuns == 0 && state->nOutputRuns <= 1) + break; + } + + /* + * Done. The result is on a single run on a single tape. + */ + state->result_tape = state->outputTapes[0]; + if (!WORKER(state)) + LogicalTapeFreeze(state->result_tape, NULL); + else + worker_freeze_result_tape(state); + state->status = TSS_SORTEDONTAPE; + + /* Close all the now-empty input tapes, to release their read buffers. */ + for (tapenum = 0; tapenum < state->nInputTapes; tapenum++) + LogicalTapeClose(state->inputTapes[tapenum]); +} + +/* + * Merge one run from each input tape. + */ +static void +mergeonerun(Tuplesortstate *state) +{ + int srcTapeIndex; + LogicalTape *srcTape; + + /* + * Start the merge by loading one tuple from each active source tape into + * the heap. + */ + beginmerge(state); + + /* + * Execute merge by repeatedly extracting lowest tuple in heap, writing it + * out, and replacing it with next tuple from same tape (if there is + * another one). + */ + while (state->memtupcount > 0) + { + SortTuple stup; + + /* write the tuple to destTape */ + srcTapeIndex = state->memtuples[0].srctape; + srcTape = state->inputTapes[srcTapeIndex]; + WRITETUP(state, state->destTape, &state->memtuples[0]); + + /* recycle the slot of the tuple we just wrote out, for the next read */ + if (state->memtuples[0].tuple) + RELEASE_SLAB_SLOT(state, state->memtuples[0].tuple); + + /* + * pull next tuple from the tape, and replace the written-out tuple in + * the heap with it. + */ + if (mergereadnext(state, srcTape, &stup)) + { + stup.srctape = srcTapeIndex; + tuplesort_heap_replace_top(state, &stup); + } + else + { + tuplesort_heap_delete_top(state); + state->nInputRuns--; + } + } + + /* + * When the heap empties, we're done. Write an end-of-run marker on the + * output tape. + */ + markrunend(state->destTape); +} + +/* + * beginmerge - initialize for a merge pass + * + * Fill the merge heap with the first tuple from each input tape. + */ +static void +beginmerge(Tuplesortstate *state) +{ + int activeTapes; + int srcTapeIndex; + + /* Heap should be empty here */ + Assert(state->memtupcount == 0); + + activeTapes = Min(state->nInputTapes, state->nInputRuns); + + for (srcTapeIndex = 0; srcTapeIndex < activeTapes; srcTapeIndex++) + { + SortTuple tup; + + if (mergereadnext(state, state->inputTapes[srcTapeIndex], &tup)) + { + tup.srctape = srcTapeIndex; + tuplesort_heap_insert(state, &tup); + } + } +} + +/* + * mergereadnext - read next tuple from one merge input tape + * + * Returns false on EOF. 
+ */ +static bool +mergereadnext(Tuplesortstate *state, LogicalTape *srcTape, SortTuple *stup) +{ + unsigned int tuplen; + + /* read next tuple, if any */ + if ((tuplen = getlen(srcTape, true)) == 0) + return false; + READTUP(state, stup, srcTape, tuplen); + + return true; +} + +/* + * dumptuples - remove tuples from memtuples and write initial run to tape + * + * When alltuples = true, dump everything currently in memory. (This case is + * only used at end of input data.) + */ +static void +dumptuples(Tuplesortstate *state, bool alltuples) +{ + int memtupwrite; + int i; + + /* + * Nothing to do if we still fit in available memory and have array slots, + * unless this is the final call during initial run generation. + */ + if (state->memtupcount < state->memtupsize && !LACKMEM(state) && + !alltuples) + return; + + /* + * Final call might require no sorting, in rare cases where we just so + * happen to have previously LACKMEM()'d at the point where exactly all + * remaining tuples are loaded into memory, just before input was + * exhausted. In general, short final runs are quite possible, but avoid + * creating a completely empty run. In a worker, though, we must produce + * at least one tape, even if it's empty. + */ + if (state->memtupcount == 0 && state->currentRun > 0) + return; + + Assert(state->status == TSS_BUILDRUNS); + + /* + * It seems unlikely that this limit will ever be exceeded, but take no + * chances + */ + if (state->currentRun == INT_MAX) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot have more than %d runs for an external sort", + INT_MAX))); + + if (state->currentRun > 0) + selectnewtape(state); + + state->currentRun++; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d starting quicksort of run %d: %s", + state->worker, state->currentRun, + pg_rusage_show(&state->ru_start)); +#endif + + /* + * Sort all tuples accumulated within the allowed amount of memory for + * this run using quicksort + */ + tuplesort_sort_memtuples(state); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d finished quicksort of run %d: %s", + state->worker, state->currentRun, + pg_rusage_show(&state->ru_start)); +#endif + + memtupwrite = state->memtupcount; + for (i = 0; i < memtupwrite; i++) + { + WRITETUP(state, state->destTape, &state->memtuples[i]); + state->memtupcount--; + } + + /* + * Reset tuple memory. We've freed all of the tuples that we previously + * allocated. It's important to avoid fragmentation when there is a stark + * change in the sizes of incoming tuples. Fragmentation due to + * AllocSetFree's bucketing by size class might be particularly bad if + * this step wasn't taken. 
+ */ + MemoryContextReset(state->tuplecontext); + + markrunend(state->destTape); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "worker %d finished writing run %d to tape %d: %s", + state->worker, state->currentRun, (state->currentRun - 1) % state->nOutputTapes + 1, + pg_rusage_show(&state->ru_start)); +#endif +} + +/* + * tuplesort_rescan - rewind and replay the scan + */ +void +tuplesort_rescan(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->sortopt & TUPLESORT_RANDOMACCESS); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + case TSS_SORTEDONTAPE: + LogicalTapeRewindForRead(state->result_tape, 0); + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_markpos - saves current position in the merged sort file + */ +void +tuplesort_markpos(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->sortopt & TUPLESORT_RANDOMACCESS); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->markpos_offset = state->current; + state->markpos_eof = state->eof_reached; + break; + case TSS_SORTEDONTAPE: + LogicalTapeTell(state->result_tape, + &state->markpos_block, + &state->markpos_offset); + state->markpos_eof = state->eof_reached; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_restorepos - restores current position in merged sort file to + * last saved position + */ +void +tuplesort_restorepos(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->sortopt & TUPLESORT_RANDOMACCESS); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->current = state->markpos_offset; + state->eof_reached = state->markpos_eof; + break; + case TSS_SORTEDONTAPE: + LogicalTapeSeek(state->result_tape, + state->markpos_block, + state->markpos_offset); + state->eof_reached = state->markpos_eof; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_get_stats - extract summary statistics + * + * This can be called after tuplesort_performsort() finishes to obtain + * printable summary information about how the sort was performed. + */ +void +tuplesort_get_stats(Tuplesortstate *state, + TuplesortInstrumentation *stats) +{ + /* + * Note: it might seem we should provide both memory and disk usage for a + * disk-based sort. However, the current code doesn't track memory space + * accurately once we have begun to return tuples to the caller (since we + * don't account for pfree's the caller is expected to do), so we cannot + * rely on availMem in a disk sort. This does not seem worth the overhead + * to fix. Is it worth creating an API for the memory context code to + * tell us how much is actually used in sortcontext? 
+ */ + tuplesort_updatemax(state); + + if (state->isMaxSpaceDisk) + stats->spaceType = SORT_SPACE_TYPE_DISK; + else + stats->spaceType = SORT_SPACE_TYPE_MEMORY; + stats->spaceUsed = (state->maxSpace + 1023) / 1024; + + switch (state->maxSpaceStatus) + { + case TSS_SORTEDINMEM: + if (state->boundUsed) + stats->sortMethod = SORT_TYPE_TOP_N_HEAPSORT; + else + stats->sortMethod = SORT_TYPE_QUICKSORT; + break; + case TSS_SORTEDONTAPE: + stats->sortMethod = SORT_TYPE_EXTERNAL_SORT; + break; + case TSS_FINALMERGE: + stats->sortMethod = SORT_TYPE_EXTERNAL_MERGE; + break; + default: + stats->sortMethod = SORT_TYPE_STILL_IN_PROGRESS; + break; + } +} + +/* + * Convert TuplesortMethod to a string. + */ +const char * +tuplesort_method_name(TuplesortMethod m) +{ + switch (m) + { + case SORT_TYPE_STILL_IN_PROGRESS: + return "still in progress"; + case SORT_TYPE_TOP_N_HEAPSORT: + return "top-N heapsort"; + case SORT_TYPE_QUICKSORT: + return "quicksort"; + case SORT_TYPE_EXTERNAL_SORT: + return "external sort"; + case SORT_TYPE_EXTERNAL_MERGE: + return "external merge"; + } + + return "unknown"; +} + +/* + * Convert TuplesortSpaceType to a string. + */ +const char * +tuplesort_space_type_name(TuplesortSpaceType t) +{ + Assert(t == SORT_SPACE_TYPE_DISK || t == SORT_SPACE_TYPE_MEMORY); + return t == SORT_SPACE_TYPE_DISK ? "Disk" : "Memory"; +} + + +/* + * Heap manipulation routines, per Knuth's Algorithm 5.2.3H. + */ + +/* + * Convert the existing unordered array of SortTuples to a bounded heap, + * discarding all but the smallest "state->bound" tuples. + * + * When working with a bounded heap, we want to keep the largest entry + * at the root (array entry zero), instead of the smallest as in the normal + * sort case. This allows us to discard the largest entry cheaply. + * Therefore, we temporarily reverse the sort direction. + */ +static void +make_bounded_heap(Tuplesortstate *state) +{ + int tupcount = state->memtupcount; + int i; + + Assert(state->status == TSS_INITIAL); + Assert(state->bounded); + Assert(tupcount >= state->bound); + Assert(SERIAL(state)); + + /* Reverse sort direction so largest entry will be at root */ + reversedirection(state); + + state->memtupcount = 0; /* make the heap empty */ + for (i = 0; i < tupcount; i++) + { + if (state->memtupcount < state->bound) + { + /* Insert next tuple into heap */ + /* Must copy source tuple to avoid possible overwrite */ + SortTuple stup = state->memtuples[i]; + + tuplesort_heap_insert(state, &stup); + } + else + { + /* + * The heap is full. Replace the largest entry with the new + * tuple, or just discard it, if it's larger than anything already + * in the heap. + */ + if (COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0) + { + free_sort_tuple(state, &state->memtuples[i]); + CHECK_FOR_INTERRUPTS(); + } + else + tuplesort_heap_replace_top(state, &state->memtuples[i]); + } + } + + Assert(state->memtupcount == state->bound); + state->status = TSS_BOUNDED; +} + +/* + * Convert the bounded heap to a properly-sorted array + */ +static void +sort_bounded_heap(Tuplesortstate *state) +{ + int tupcount = state->memtupcount; + + Assert(state->status == TSS_BOUNDED); + Assert(state->bounded); + Assert(tupcount == state->bound); + Assert(SERIAL(state)); + + /* + * We can unheapify in place because each delete-top call will remove the + * largest entry, which we can promptly store in the newly freed slot at + * the end. Once we're down to a single-entry heap, we're done. 
+ */ + while (state->memtupcount > 1) + { + SortTuple stup = state->memtuples[0]; + + /* this sifts-up the next-largest entry and decreases memtupcount */ + tuplesort_heap_delete_top(state); + state->memtuples[state->memtupcount] = stup; + } + state->memtupcount = tupcount; + + /* + * Reverse sort direction back to the original state. This is not + * actually necessary but seems like a good idea for tidiness. + */ + reversedirection(state); + + state->status = TSS_SORTEDINMEM; + state->boundUsed = true; +} + +/* + * Sort all memtuples using specialized qsort() routines. + * + * Quicksort is used for small in-memory sorts, and external sort runs. + */ +static void +tuplesort_sort_memtuples(Tuplesortstate *state) +{ + Assert(!LEADER(state)); + + if (state->memtupcount > 1) + { + /* + * Do we have the leading column's value or abbreviation in datum1, + * and is there a specialization for its comparator? + */ + if (state->haveDatum1 && state->sortKeys) + { + if (state->sortKeys[0].comparator == ssup_datum_unsigned_cmp) + { + qsort_tuple_unsigned(state->memtuples, + state->memtupcount, + state); + return; + } +#if SIZEOF_DATUM >= 8 + else if (state->sortKeys[0].comparator == ssup_datum_signed_cmp) + { + qsort_tuple_signed(state->memtuples, + state->memtupcount, + state); + return; + } +#endif + else if (state->sortKeys[0].comparator == ssup_datum_int32_cmp) + { + qsort_tuple_int32(state->memtuples, + state->memtupcount, + state); + return; + } + } + + /* Can we use the single-key sort function? */ + if (state->onlyKey != NULL) + { + qsort_ssup(state->memtuples, state->memtupcount, + state->onlyKey); + } + else + { + qsort_tuple(state->memtuples, + state->memtupcount, + state->comparetup, + state); + } + } +} + +/* + * Insert a new tuple into an empty or existing heap, maintaining the + * heap invariant. Caller is responsible for ensuring there's room. + * + * Note: For some callers, tuple points to a memtuples[] entry above the + * end of the heap. This is safe as long as it's not immediately adjacent + * to the end of the heap (ie, in the [memtupcount] array entry) --- if it + * is, it might get overwritten before being moved into the heap! + */ +static void +tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple) +{ + SortTuple *memtuples; + int j; + + memtuples = state->memtuples; + Assert(state->memtupcount < state->memtupsize); + + CHECK_FOR_INTERRUPTS(); + + /* + * Sift-up the new entry, per Knuth 5.2.3 exercise 16. Note that Knuth is + * using 1-based array indexes, not 0-based. + */ + j = state->memtupcount++; + while (j > 0) + { + int i = (j - 1) >> 1; + + if (COMPARETUP(state, tuple, &memtuples[i]) >= 0) + break; + memtuples[j] = memtuples[i]; + j = i; + } + memtuples[j] = *tuple; +} + +/* + * Remove the tuple at state->memtuples[0] from the heap. Decrement + * memtupcount, and sift up to maintain the heap invariant. + * + * The caller has already free'd the tuple the top node points to, + * if necessary. + */ +static void +tuplesort_heap_delete_top(Tuplesortstate *state) +{ + SortTuple *memtuples = state->memtuples; + SortTuple *tuple; + + if (--state->memtupcount <= 0) + return; + + /* + * Remove the last tuple in the heap, and re-insert it, by replacing the + * current top node with it. + */ + tuple = &memtuples[state->memtupcount]; + tuplesort_heap_replace_top(state, tuple); +} + +/* + * Replace the tuple at state->memtuples[0] with a new tuple. Sift up to + * maintain the heap invariant. 
+ * + * This corresponds to Knuth's "sift-up" algorithm (Algorithm 5.2.3H, + * Heapsort, steps H3-H8). + */ +static void +tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple) +{ + SortTuple *memtuples = state->memtuples; + unsigned int i, + n; + + Assert(state->memtupcount >= 1); + + CHECK_FOR_INTERRUPTS(); + + /* + * state->memtupcount is "int", but we use "unsigned int" for i, j, n. + * This prevents overflow in the "2 * i + 1" calculation, since at the top + * of the loop we must have i < n <= INT_MAX <= UINT_MAX/2. + */ + n = state->memtupcount; + i = 0; /* i is where the "hole" is */ + for (;;) + { + unsigned int j = 2 * i + 1; + + if (j >= n) + break; + if (j + 1 < n && + COMPARETUP(state, &memtuples[j], &memtuples[j + 1]) > 0) + j++; + if (COMPARETUP(state, tuple, &memtuples[j]) <= 0) + break; + memtuples[i] = memtuples[j]; + i = j; + } + memtuples[i] = *tuple; +} + +/* + * Function to reverse the sort direction from its current state + * + * It is not safe to call this when performing hash tuplesorts + */ +static void +reversedirection(Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + int nkey; + + for (nkey = 0; nkey < state->nKeys; nkey++, sortKey++) + { + sortKey->ssup_reverse = !sortKey->ssup_reverse; + sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first; + } +} + + +/* + * Tape interface routines + */ + +static unsigned int +getlen(LogicalTape *tape, bool eofOK) +{ + unsigned int len; + + if (LogicalTapeRead(tape, + &len, sizeof(len)) != sizeof(len)) + elog(ERROR, "unexpected end of tape"); + if (len == 0 && !eofOK) + elog(ERROR, "unexpected end of data"); + return len; +} + +static void +markrunend(LogicalTape *tape) +{ + unsigned int len = 0; + + LogicalTapeWrite(tape, (void *) &len, sizeof(len)); +} + +/* + * Get memory for tuple from within READTUP() routine. + * + * We use next free slot from the slab allocator, or palloc() if the tuple + * is too large for that. + */ +static void * +readtup_alloc(Tuplesortstate *state, Size tuplen) +{ + SlabSlot *buf; + + /* + * We pre-allocate enough slots in the slab arena that we should never run + * out. 
+ */ + Assert(state->slabFreeHead); + + if (tuplen > SLAB_SLOT_SIZE || !state->slabFreeHead) + return MemoryContextAlloc(state->sortcontext, tuplen); + else + { + buf = state->slabFreeHead; + /* Reuse this slot */ + state->slabFreeHead = buf->nextfree; + + return buf; + } +} + + +/* + * Routines specialized for HeapTuple (actually MinimalTuple) case + */ + +static int +comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + HeapTupleData ltup; + HeapTupleData rtup; + TupleDesc tupDesc; + int nkey; + int32 compare; + AttrNumber attno; + Datum datum1, + datum2; + bool isnull1, + isnull2; + + + /* Compare the leading sort key */ + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + /* Compare additional sort keys */ + ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET; + ltup.t_data = (HeapTupleHeader) ((char *) a->tuple - MINIMAL_TUPLE_OFFSET); + rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET; + rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET); + tupDesc = state->tupDesc; + + if (sortKey->abbrev_converter) + { + attno = sortKey->ssup_attno; + + datum1 = heap_getattr(<up, attno, tupDesc, &isnull1); + datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + sortKey++; + for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++) + { + attno = sortKey->ssup_attno; + + datum1 = heap_getattr(<up, attno, tupDesc, &isnull1); + datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + return 0; +} + +static void +copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* + * We expect the passed "tup" to be a TupleTableSlot, and form a + * MinimalTuple using the exported interface for that. + */ + TupleTableSlot *slot = (TupleTableSlot *) tup; + Datum original; + MinimalTuple tuple; + HeapTupleData htup; + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + + /* copy the tuple into sort storage */ + tuple = ExecCopySlotMinimalTuple(slot); + stup->tuple = (void *) tuple; + USEMEM(state, GetMemoryChunkSpace(tuple)); + /* set up first-column key value */ + htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + original = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &stup->isnull1); + + MemoryContextSwitchTo(oldcontext); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. 
+ * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + htup.t_len = ((MinimalTuple) mtup->tuple)->t_len + + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) mtup->tuple - + MINIMAL_TUPLE_OFFSET); + + mtup->datum1 = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &mtup->isnull1); + } + } +} + +static void +writetup_heap(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup) +{ + MinimalTuple tuple = (MinimalTuple) stup->tuple; + + /* the part of the MinimalTuple we'll write: */ + char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; + unsigned int tupbodylen = tuple->t_len - MINIMAL_TUPLE_DATA_OFFSET; + + /* total on-disk footprint: */ + unsigned int tuplen = tupbodylen + sizeof(int); + + LogicalTapeWrite(tape, (void *) &tuplen, sizeof(tuplen)); + LogicalTapeWrite(tape, (void *) tupbody, tupbodylen); + if (state->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length + * word? */ + LogicalTapeWrite(tape, (void *) &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + heap_free_minimal_tuple(tuple); + } +} + +static void +readtup_heap(Tuplesortstate *state, SortTuple *stup, + LogicalTape *tape, unsigned int len) +{ + unsigned int tupbodylen = len - sizeof(int); + unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET; + MinimalTuple tuple = (MinimalTuple) readtup_alloc(state, tuplen); + char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; + HeapTupleData htup; + + /* read in the tuple proper */ + tuple->t_len = tuplen; + LogicalTapeReadExact(tape, tupbody, tupbodylen); + if (state->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length + * word? 
*/ + LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value */ + htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + stup->datum1 = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &stup->isnull1); +} + +/* + * Routines specialized for the CLUSTER case (HeapTuple data, with + * comparisons per a btree index definition) + */ + +static int +comparetup_cluster(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + HeapTuple ltup; + HeapTuple rtup; + TupleDesc tupDesc; + int nkey; + int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + + /* Be prepared to compare additional sort keys */ + ltup = (HeapTuple) a->tuple; + rtup = (HeapTuple) b->tuple; + tupDesc = state->tupDesc; + + /* Compare the leading sort key, if it's simple */ + if (state->haveDatum1) + { + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + if (sortKey->abbrev_converter) + { + AttrNumber leading = state->indexInfo->ii_IndexAttrNumbers[0]; + + datum1 = heap_getattr(ltup, leading, tupDesc, &isnull1); + datum2 = heap_getattr(rtup, leading, tupDesc, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + } + if (compare != 0 || state->nKeys == 1) + return compare; + /* Compare additional columns the hard way */ + sortKey++; + nkey = 1; + } + else + { + /* Must compare all keys the hard way */ + nkey = 0; + } + + if (state->indexInfo->ii_Expressions == NULL) + { + /* If not expression index, just compare the proper heap attrs */ + + for (; nkey < state->nKeys; nkey++, sortKey++) + { + AttrNumber attno = state->indexInfo->ii_IndexAttrNumbers[nkey]; + + datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1); + datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + } + else + { + /* + * In the expression index case, compute the whole index tuple and + * then compare values. It would perhaps be faster to compute only as + * many columns as we need to compare, but that would require + * duplicating all the logic in FormIndexDatum. 
+ */ + Datum l_index_values[INDEX_MAX_KEYS]; + bool l_index_isnull[INDEX_MAX_KEYS]; + Datum r_index_values[INDEX_MAX_KEYS]; + bool r_index_isnull[INDEX_MAX_KEYS]; + TupleTableSlot *ecxt_scantuple; + + /* Reset context each time to prevent memory leakage */ + ResetPerTupleExprContext(state->estate); + + ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple; + + ExecStoreHeapTuple(ltup, ecxt_scantuple, false); + FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, + l_index_values, l_index_isnull); + + ExecStoreHeapTuple(rtup, ecxt_scantuple, false); + FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, + r_index_values, r_index_isnull); + + for (; nkey < state->nKeys; nkey++, sortKey++) + { + compare = ApplySortComparator(l_index_values[nkey], + l_index_isnull[nkey], + r_index_values[nkey], + r_index_isnull[nkey], + sortKey); + if (compare != 0) + return compare; + } + } + + return 0; +} + +static void +copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + HeapTuple tuple = (HeapTuple) tup; + Datum original; + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + + /* copy the tuple into sort storage */ + tuple = heap_copytuple(tuple); + stup->tuple = (void *) tuple; + USEMEM(state, GetMemoryChunkSpace(tuple)); + + MemoryContextSwitchTo(oldcontext); + + /* + * set up first-column key value, and potentially abbreviate, if it's a + * simple column + */ + if (!state->haveDatum1) + return; + + original = heap_getattr(tuple, + state->indexInfo->ii_IndexAttrNumbers[0], + state->tupDesc, + &stup->isnull1); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = (HeapTuple) mtup->tuple; + mtup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_IndexAttrNumbers[0], + state->tupDesc, + &mtup->isnull1); + } + } +} + +static void +writetup_cluster(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup) +{ + HeapTuple tuple = (HeapTuple) stup->tuple; + unsigned int tuplen = tuple->t_len + sizeof(ItemPointerData) + sizeof(int); + + /* We need to store t_self, but not other fields of HeapTupleData */ + LogicalTapeWrite(tape, &tuplen, sizeof(tuplen)); + LogicalTapeWrite(tape, &tuple->t_self, sizeof(ItemPointerData)); + LogicalTapeWrite(tape, tuple->t_data, tuple->t_len); + if (state->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length + * word? 
*/ + LogicalTapeWrite(tape, &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + heap_freetuple(tuple); + } +} + +static void +readtup_cluster(Tuplesortstate *state, SortTuple *stup, + LogicalTape *tape, unsigned int tuplen) +{ + unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int); + HeapTuple tuple = (HeapTuple) readtup_alloc(state, + t_len + HEAPTUPLESIZE); + + /* Reconstruct the HeapTupleData header */ + tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE); + tuple->t_len = t_len; + LogicalTapeReadExact(tape, &tuple->t_self, sizeof(ItemPointerData)); + /* We don't currently bother to reconstruct t_tableOid */ + tuple->t_tableOid = InvalidOid; + /* Read in the tuple body */ + LogicalTapeReadExact(tape, tuple->t_data, tuple->t_len); + if (state->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length + * word? */ + LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value, if it's a simple column */ + if (state->haveDatum1) + stup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_IndexAttrNumbers[0], + state->tupDesc, + &stup->isnull1); +} + +/* + * Routines specialized for IndexTuple case + * + * The btree and hash cases require separate comparison functions, but the + * IndexTuple representation is the same so the copy/write/read support + * functions can be shared. + */ + +static int +comparetup_index_btree(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + /* + * This is similar to comparetup_heap(), but expects index tuples. There + * is also special handling for enforcing uniqueness, and special + * treatment for equal keys at the end. + */ + SortSupport sortKey = state->sortKeys; + IndexTuple tuple1; + IndexTuple tuple2; + int keysz; + TupleDesc tupDes; + bool equal_hasnull = false; + int nkey; + int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + + + /* Compare the leading sort key */ + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + /* Compare additional sort keys */ + tuple1 = (IndexTuple) a->tuple; + tuple2 = (IndexTuple) b->tuple; + keysz = state->nKeys; + tupDes = RelationGetDescr(state->indexRel); + + if (sortKey->abbrev_converter) + { + datum1 = index_getattr(tuple1, 1, tupDes, &isnull1); + datum2 = index_getattr(tuple2, 1, tupDes, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + /* they are equal, so we only need to examine one null flag */ + if (a->isnull1) + equal_hasnull = true; + + sortKey++; + for (nkey = 2; nkey <= keysz; nkey++, sortKey++) + { + datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1); + datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; /* done when we find unequal attributes */ + + /* they are equal, so we only need to examine one null flag */ + if (isnull1) + equal_hasnull = true; + } + + /* + * If btree has asked us to enforce uniqueness, complain if two equal + * tuples are detected (unless there was at least one NULL field and NULLS + * NOT DISTINCT was not set). 
+ * + * It is sufficient to make the test here, because if two tuples are equal + * they *must* get compared at some stage of the sort --- otherwise the + * sort algorithm wouldn't have checked whether one must appear before the + * other. + */ + if (state->enforceUnique && !(!state->uniqueNullsNotDistinct && equal_hasnull)) + { + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + char *key_desc; + + /* + * Some rather brain-dead implementations of qsort (such as the one in + * QNX 4) will sometimes call the comparison routine to compare a + * value to itself, but we always use our own implementation, which + * does not. + */ + Assert(tuple1 != tuple2); + + index_deform_tuple(tuple1, tupDes, values, isnull); + + key_desc = BuildIndexValueDescription(state->indexRel, values, isnull); + + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("could not create unique index \"%s\"", + RelationGetRelationName(state->indexRel)), + key_desc ? errdetail("Key %s is duplicated.", key_desc) : + errdetail("Duplicate keys exist."), + errtableconstraint(state->heapRel, + RelationGetRelationName(state->indexRel)))); + } + + /* + * If key values are equal, we sort on ItemPointer. This is required for + * btree indexes, since heap TID is treated as an implicit last key + * attribute in order to ensure that all keys in the index are physically + * unique. + */ + { + BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); + BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); + + if (pos1 != pos2) + return (pos1 < pos2) ? -1 : 1; + } + + /* ItemPointer values should never be equal */ + Assert(false); + + return 0; +} + +static int +comparetup_index_hash(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + Bucket bucket1; + Bucket bucket2; + IndexTuple tuple1; + IndexTuple tuple2; + + /* + * Fetch hash keys and mask off bits we don't want to sort by. We know + * that the first column of the index tuple is the hash key. + */ + Assert(!a->isnull1); + bucket1 = _hash_hashkey2bucket(DatumGetUInt32(a->datum1), + state->max_buckets, state->high_mask, + state->low_mask); + Assert(!b->isnull1); + bucket2 = _hash_hashkey2bucket(DatumGetUInt32(b->datum1), + state->max_buckets, state->high_mask, + state->low_mask); + if (bucket1 > bucket2) + return 1; + else if (bucket1 < bucket2) + return -1; + + /* + * If hash values are equal, we sort on ItemPointer. This does not affect + * validity of the finished index, but it may be useful to have index + * scans in physical order. + */ + tuple1 = (IndexTuple) a->tuple; + tuple2 = (IndexTuple) b->tuple; + + { + BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); + BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); + + if (pos1 != pos2) + return (pos1 < pos2) ? 
-1 : 1; + } + + /* ItemPointer values should never be equal */ + Assert(false); + + return 0; +} + +static void +copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* Not currently needed */ + elog(ERROR, "copytup_index() should not be called"); +} + +static void +writetup_index(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup) +{ + IndexTuple tuple = (IndexTuple) stup->tuple; + unsigned int tuplen; + + tuplen = IndexTupleSize(tuple) + sizeof(tuplen); + LogicalTapeWrite(tape, (void *) &tuplen, sizeof(tuplen)); + LogicalTapeWrite(tape, (void *) tuple, IndexTupleSize(tuple)); + if (state->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length + * word? */ + LogicalTapeWrite(tape, (void *) &tuplen, sizeof(tuplen)); + + if (!state->slabAllocatorUsed) + { + FREEMEM(state, GetMemoryChunkSpace(tuple)); + pfree(tuple); + } +} + +static void +readtup_index(Tuplesortstate *state, SortTuple *stup, + LogicalTape *tape, unsigned int len) +{ + unsigned int tuplen = len - sizeof(unsigned int); + IndexTuple tuple = (IndexTuple) readtup_alloc(state, tuplen); + + LogicalTapeReadExact(tape, tuple, tuplen); + if (state->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length + * word? */ + LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value */ + stup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup->isnull1); +} + +/* + * Routines specialized for DatumTuple case + */ + +static int +comparetup_datum(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) +{ + int compare; + + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + state->sortKeys); + if (compare != 0) + return compare; + + /* if we have abbreviations, then "tuple" has the original value */ + + if (state->sortKeys->abbrev_converter) + compare = ApplySortAbbrevFullComparator(PointerGetDatum(a->tuple), a->isnull1, + PointerGetDatum(b->tuple), b->isnull1, + state->sortKeys); + + return compare; +} + +static void +copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* Not currently needed */ + elog(ERROR, "copytup_datum() should not be called"); +} + +static void +writetup_datum(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup) +{ + void *waddr; + unsigned int tuplen; + unsigned int writtenlen; + + if (stup->isnull1) + { + waddr = NULL; + tuplen = 0; + } + else if (!state->tuples) + { + waddr = &stup->datum1; + tuplen = sizeof(Datum); + } + else + { + waddr = stup->tuple; + tuplen = datumGetSize(PointerGetDatum(stup->tuple), false, state->datumTypeLen); + Assert(tuplen != 0); + } + + writtenlen = tuplen + sizeof(unsigned int); + + LogicalTapeWrite(tape, (void *) &writtenlen, sizeof(writtenlen)); + LogicalTapeWrite(tape, waddr, tuplen); + if (state->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length + * word? 
*/ + LogicalTapeWrite(tape, (void *) &writtenlen, sizeof(writtenlen)); + + if (!state->slabAllocatorUsed && stup->tuple) + { + FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); + pfree(stup->tuple); + } +} + +static void +readtup_datum(Tuplesortstate *state, SortTuple *stup, + LogicalTape *tape, unsigned int len) +{ + unsigned int tuplen = len - sizeof(unsigned int); + + if (tuplen == 0) + { + /* it's NULL */ + stup->datum1 = (Datum) 0; + stup->isnull1 = true; + stup->tuple = NULL; + } + else if (!state->tuples) + { + Assert(tuplen == sizeof(Datum)); + LogicalTapeReadExact(tape, &stup->datum1, tuplen); + stup->isnull1 = false; + stup->tuple = NULL; + } + else + { + void *raddr = readtup_alloc(state, tuplen); + + LogicalTapeReadExact(tape, raddr, tuplen); + stup->datum1 = PointerGetDatum(raddr); + stup->isnull1 = false; + stup->tuple = raddr; + } + + if (state->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length + * word? */ + LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen)); +} + +/* + * Parallel sort routines + */ + +/* + * tuplesort_estimate_shared - estimate required shared memory allocation + * + * nWorkers is an estimate of the number of workers (it's the number that + * will be requested). + */ +Size +tuplesort_estimate_shared(int nWorkers) +{ + Size tapesSize; + + Assert(nWorkers > 0); + + /* Make sure that BufFile shared state is MAXALIGN'd */ + tapesSize = mul_size(sizeof(TapeShare), nWorkers); + tapesSize = MAXALIGN(add_size(tapesSize, offsetof(Sharedsort, tapes))); + + return tapesSize; +} + +/* + * tuplesort_initialize_shared - initialize shared tuplesort state + * + * Must be called from leader process before workers are launched, to + * establish state needed up-front for worker tuplesortstates. nWorkers + * should match the argument passed to tuplesort_estimate_shared(). + */ +void +tuplesort_initialize_shared(Sharedsort *shared, int nWorkers, dsm_segment *seg) +{ + int i; + + Assert(nWorkers > 0); + + SpinLockInit(&shared->mutex); + shared->currentWorker = 0; + shared->workersFinished = 0; + SharedFileSetInit(&shared->fileset, seg); + shared->nTapes = nWorkers; + for (i = 0; i < nWorkers; i++) + { + shared->tapes[i].firstblocknumber = 0L; + } +} + +/* + * tuplesort_attach_shared - attach to shared tuplesort state + * + * Must be called by all worker processes. + */ +void +tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg) +{ + /* Attach to SharedFileSet */ + SharedFileSetAttach(&shared->fileset, seg); +} + +/* + * worker_get_identifier - Assign and return ordinal identifier for worker + * + * The order in which these are assigned is not well defined, and should not + * matter; worker numbers across parallel sort participants need only be + * distinct and gapless. logtape.c requires this. + * + * Note that the identifiers assigned from here have no relation to + * ParallelWorkerNumber number, to avoid making any assumption about + * caller's requirements. However, we do follow the ParallelWorkerNumber + * convention of representing a non-worker with worker number -1. This + * includes the leader, as well as serial Tuplesort processes. 
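+ *
+ * (Editorial sketch, not part of the upstream file: the shared-state
+ * handshake implemented by the three functions above looks roughly like
+ * this on the leader side; the toc and seg variables and the use of
+ * shm_toc_allocate() are assumptions about the caller's parallel-context
+ * setup, not something this file provides.)
+ *
+ *     Size        sz = tuplesort_estimate_shared(nworkers);
+ *     Sharedsort *shared = (Sharedsort *) shm_toc_allocate(toc, sz);
+ *
+ *     tuplesort_initialize_shared(shared, nworkers, seg);
+ *     ... launch workers; each worker calls
+ *     tuplesort_attach_shared(shared, seg) before creating its own
+ *     Tuplesortstate ...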
+ */ +static int +worker_get_identifier(Tuplesortstate *state) +{ + Sharedsort *shared = state->shared; + int worker; + + Assert(WORKER(state)); + + SpinLockAcquire(&shared->mutex); + worker = shared->currentWorker++; + SpinLockRelease(&shared->mutex); + + return worker; +} + +/* + * worker_freeze_result_tape - freeze worker's result tape for leader + * + * This is called by workers just after the result tape has been determined, + * instead of calling LogicalTapeFreeze() directly. They do so because + * workers require a few additional steps over similar serial + * TSS_SORTEDONTAPE external sort cases, which also happen here. The extra + * steps are around freeing now unneeded resources, and representing to + * leader that worker's input run is available for its merge. + * + * There should only be one final output run for each worker, which consists + * of all tuples that were originally input into worker. + */ +static void +worker_freeze_result_tape(Tuplesortstate *state) +{ + Sharedsort *shared = state->shared; + TapeShare output; + + Assert(WORKER(state)); + Assert(state->result_tape != NULL); + Assert(state->memtupcount == 0); + + /* + * Free most remaining memory, in case caller is sensitive to our holding + * on to it. memtuples may not be a tiny merge heap at this point. + */ + pfree(state->memtuples); + /* Be tidy */ + state->memtuples = NULL; + state->memtupsize = 0; + + /* + * Parallel worker requires result tape metadata, which is to be stored in + * shared memory for leader + */ + LogicalTapeFreeze(state->result_tape, &output); + + /* Store properties of output tape, and update finished worker count */ + SpinLockAcquire(&shared->mutex); + shared->tapes[state->worker] = output; + shared->workersFinished++; + SpinLockRelease(&shared->mutex); +} + +/* + * worker_nomergeruns - dump memtuples in worker, without merging + * + * This called as an alternative to mergeruns() with a worker when no + * merging is required. + */ +static void +worker_nomergeruns(Tuplesortstate *state) +{ + Assert(WORKER(state)); + Assert(state->result_tape == NULL); + Assert(state->nOutputRuns == 1); + + state->result_tape = state->destTape; + worker_freeze_result_tape(state); +} + +/* + * leader_takeover_tapes - create tapeset for leader from worker tapes + * + * So far, leader Tuplesortstate has performed no actual sorting. By now, all + * sorting has occurred in workers, all of which must have already returned + * from tuplesort_performsort(). + * + * When this returns, leader process is left in a state that is virtually + * indistinguishable from it having generated runs as a serial external sort + * might have. + */ +static void +leader_takeover_tapes(Tuplesortstate *state) +{ + Sharedsort *shared = state->shared; + int nParticipants = state->nParticipants; + int workersFinished; + int j; + + Assert(LEADER(state)); + Assert(nParticipants >= 1); + + SpinLockAcquire(&shared->mutex); + workersFinished = shared->workersFinished; + SpinLockRelease(&shared->mutex); + + if (nParticipants != workersFinished) + elog(ERROR, "cannot take over tapes before all workers finish"); + + /* + * Create the tapeset from worker tapes, including a leader-owned tape at + * the end. Parallel workers are far more expensive than logical tapes, + * so the number of tapes allocated here should never be excessive. 
+ */ + inittapestate(state, nParticipants); + state->tapeset = LogicalTapeSetCreate(false, &shared->fileset, -1); + + /* + * Set currentRun to reflect the number of runs we will merge (it's not + * used for anything, this is just pro forma) + */ + state->currentRun = nParticipants; + + /* + * Initialize the state to look the same as after building the initial + * runs. + * + * There will always be exactly 1 run per worker, and exactly one input + * tape per run, because workers always output exactly 1 run, even when + * there were no input tuples for workers to sort. + */ + state->inputTapes = NULL; + state->nInputTapes = 0; + state->nInputRuns = 0; + + state->outputTapes = palloc0(nParticipants * sizeof(LogicalTape *)); + state->nOutputTapes = nParticipants; + state->nOutputRuns = nParticipants; + + for (j = 0; j < nParticipants; j++) + { + state->outputTapes[j] = LogicalTapeImport(state->tapeset, j, &shared->tapes[j]); + } + + state->status = TSS_BUILDRUNS; +} + +/* + * Convenience routine to free a tuple previously loaded into sort memory + */ +static void +free_sort_tuple(Tuplesortstate *state, SortTuple *stup) +{ + if (stup->tuple) + { + FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); + pfree(stup->tuple); + stup->tuple = NULL; + } +} + +int +ssup_datum_unsigned_cmp(Datum x, Datum y, SortSupport ssup) +{ + if (x < y) + return -1; + else if (x > y) + return 1; + else + return 0; +} + +#if SIZEOF_DATUM >= 8 +int +ssup_datum_signed_cmp(Datum x, Datum y, SortSupport ssup) +{ + int64 xx = DatumGetInt64(x); + int64 yy = DatumGetInt64(y); + + if (xx < yy) + return -1; + else if (xx > yy) + return 1; + else + return 0; +} +#endif + +int +ssup_datum_int32_cmp(Datum x, Datum y, SortSupport ssup) +{ + int32 xx = DatumGetInt32(x); + int32 yy = DatumGetInt32(y); + + if (xx < yy) + return -1; + else if (xx > yy) + return 1; + else + return 0; +} diff --git a/src/tuplesort96.c b/src/tuplesort96.c new file mode 100644 index 0000000000..743e025b86 --- /dev/null +++ b/src/tuplesort96.c @@ -0,0 +1,4838 @@ +/*------------------------------------------------------------------------- + * + * tuplesort.c + * Generalized tuple sorting routines. + * + * This module handles sorting of heap tuples, index tuples, or single + * Datums (and could easily support other kinds of sortable objects, + * if necessary). It works efficiently for both small and large amounts + * of data. Small amounts are sorted in-memory using qsort(). Large + * amounts are sorted using temporary files and a standard external sort + * algorithm. + * + * See Knuth, volume 3, for more than you want to know about the external + * sorting algorithm. Historically, we divided the input into sorted runs + * using replacement selection, in the form of a priority tree implemented + * as a heap (essentially his Algorithm 5.2.3H), but now we only do that + * for the first run, and only if the run would otherwise end up being very + * short. We merge the runs using polyphase merge, Knuth's Algorithm + * 5.4.2D. The logical "tapes" used by Algorithm D are implemented by + * logtape.c, which avoids space wastage by recycling disk space as soon + * as each block is read from its "tape". + * + * We do not use Knuth's recommended data structure (Algorithm 5.4.1R) for + * the replacement selection, because it uses a fixed number of records + * in memory at all times. 
Since we are dealing with tuples that may vary + * considerably in size, we want to be able to vary the number of records + * kept in memory to ensure full utilization of the allowed sort memory + * space. So, we keep the tuples in a variable-size heap, with the next + * record to go out at the top of the heap. Like Algorithm 5.4.1R, each + * record is stored with the run number that it must go into, and we use + * (run number, key) as the ordering key for the heap. When the run number + * at the top of the heap changes, we know that no more records of the prior + * run are left in the heap. Note that there are in practice only ever two + * distinct run numbers, because since PostgreSQL 9.6, we only use + * replacement selection to form the first run. + * + * In PostgreSQL 9.6, a heap (based on Knuth's Algorithm H, with some small + * customizations) is only used with the aim of producing just one run, + * thereby avoiding all merging. Only the first run can use replacement + * selection, which is why there are now only two possible valid run + * numbers, and why heapification is customized to not distinguish between + * tuples in the second run (those will be quicksorted). We generally + * prefer a simple hybrid sort-merge strategy, where runs are sorted in much + * the same way as the entire input of an internal sort is sorted (using + * qsort()). The replacement_sort_tuples GUC controls the limited remaining + * use of replacement selection for the first run. + * + * There are several reasons to favor a hybrid sort-merge strategy. + * Maintaining a priority tree/heap has poor CPU cache characteristics. + * Furthermore, the growth in main memory sizes has greatly diminished the + * value of having runs that are larger than available memory, even in the + * case where there is partially sorted input and runs can be made far + * larger by using a heap. In most cases, a single-pass merge step is all + * that is required even when runs are no larger than available memory. + * Avoiding multiple merge passes was traditionally considered to be the + * major advantage of using replacement selection. + * + * The approximate amount of memory allowed for any one sort operation + * is specified in kilobytes by the caller (most pass work_mem). Initially, + * we absorb tuples and simply store them in an unsorted array as long as + * we haven't exceeded workMem. If we reach the end of the input without + * exceeding workMem, we sort the array using qsort() and subsequently return + * tuples just by scanning the tuple array sequentially. If we do exceed + * workMem, we begin to emit tuples into sorted runs in temporary tapes. + * When tuples are dumped in batch after quicksorting, we begin a new run + * with a new output tape (selected per Algorithm D). After the end of the + * input is reached, we dump out remaining tuples in memory into a final run + * (or two, when replacement selection is still used), then merge the runs + * using Algorithm D. + * + * When merging runs, we use a heap containing just the frontmost tuple from + * each source run; we repeatedly output the smallest tuple and insert the + * next tuple from its source tape (if any). When the heap empties, the merge + * is complete. The basic merge algorithm thus needs very little memory --- + * only M tuples for an M-way merge, and M is constrained to a small number. + * However, we can still make good use of our full workMem allocation by + * pre-reading additional tuples from each source tape. 
Without prereading, + * our access pattern to the temporary file would be very erratic; on average + * we'd read one block from each of M source tapes during the same time that + * we're writing M blocks to the output tape, so there is no sequentiality of + * access at all, defeating the read-ahead methods used by most Unix kernels. + * Worse, the output tape gets written into a very random sequence of blocks + * of the temp file, ensuring that things will be even worse when it comes + * time to read that tape. A straightforward merge pass thus ends up doing a + * lot of waiting for disk seeks. We can improve matters by prereading from + * each source tape sequentially, loading about workMem/M bytes from each tape + * in turn. Then we run the merge algorithm, writing but not reading until + * one of the preloaded tuple series runs out. Then we switch back to preread + * mode, fill memory again, and repeat. This approach helps to localize both + * read and write accesses. + * + * When the caller requests random access to the sort result, we form + * the final sorted run on a logical tape which is then "frozen", so + * that we can access it randomly. When the caller does not need random + * access, we return from tuplesort_performsort() as soon as we are down + * to one run per logical tape. The final merge is then performed + * on-the-fly as the caller repeatedly calls tuplesort_getXXX; this + * saves one cycle of writing all the data out to disk and reading it in. + * + * Before Postgres 8.2, we always used a seven-tape polyphase merge, on the + * grounds that 7 is the "sweet spot" on the tapes-to-passes curve according + * to Knuth's figure 70 (section 5.4.2). However, Knuth is assuming that + * tape drives are expensive beasts, and in particular that there will always + * be many more runs than tape drives. In our implementation a "tape drive" + * doesn't cost much more than a few Kb of memory buffers, so we can afford + * to have lots of them. In particular, if we can have as many tape drives + * as sorted runs, we can eliminate any repeated I/O at all. In the current + * code we determine the number of tapes M on the basis of workMem: we want + * workMem/M to be large enough that we read a fair amount of data each time + * we preread from a tape, so as to maintain the locality of access described + * above. Nonetheless, with large workMem we can have many tapes. 
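+ *
+ * (Editorial sketch, not part of the upstream header: the tape-count
+ * calculation described above boils down to roughly the following, using
+ * the MINORDER, TAPE_BUFFER_OVERHEAD and MERGE_BUFFER_SIZE constants
+ * defined later in this file; the exact logic lives in
+ * tuplesort_merge_order() and inittapes(), and may differ in detail.)
+ *
+ *     mergeOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) /
+ *                  (MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD);
+ *     mergeOrder = Max(mergeOrder, MINORDER);
+ *     maxTapes   = mergeOrder + 1;    (one tape reserved for output)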
+ * + * + * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/utils/sort/tuplesort.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include + +#include "access/htup_details.h" +#include "access/nbtree.h" +#include "catalog/index.h" +#include "catalog/pg_am.h" +#include "commands/tablespace.h" +#include "executor/executor.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "utils/datum.h" +#include "utils/logtape.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/pg_rusage.h" +#include "utils/rel.h" +#include "utils/sortsupport.h" +#include "utils/tuplesort.h" + +/* Should be the last include */ +#include "disable_core_macro.h" + +/* sort-type codes for sort__start probes */ +#define HEAP_SORT 0 +#define INDEX_SORT 1 +#define DATUM_SORT 2 +#define CLUSTER_SORT 3 + +/* GUC variables */ +#ifdef TRACE_SORT +bool trace_sort = false; +#endif + +#ifdef DEBUG_BOUNDED_SORT +bool optimize_bounded_sort = true; +#endif + + +/* + * The objects we actually sort are SortTuple structs. These contain + * a pointer to the tuple proper (might be a MinimalTuple or IndexTuple), + * which is a separate palloc chunk --- we assume it is just one chunk and + * can be freed by a simple pfree() (except during final on-the-fly merge, + * when memory is used in batch). SortTuples also contain the tuple's + * first key column in Datum/nullflag format, and an index integer. + * + * Storing the first key column lets us save heap_getattr or index_getattr + * calls during tuple comparisons. We could extract and save all the key + * columns not just the first, but this would increase code complexity and + * overhead, and wouldn't actually save any comparison cycles in the common + * case where the first key determines the comparison result. Note that + * for a pass-by-reference datatype, datum1 points into the "tuple" storage. + * + * There is one special case: when the sort support infrastructure provides an + * "abbreviated key" representation, where the key is (typically) a pass by + * value proxy for a pass by reference type. In this case, the abbreviated key + * is stored in datum1 in place of the actual first key column. + * + * When sorting single Datums, the data value is represented directly by + * datum1/isnull1 for pass by value types (or null values). If the datatype is + * pass-by-reference and isnull1 is false, then "tuple" points to a separately + * palloc'd data value, otherwise "tuple" is NULL. The value of datum1 is then + * either the same pointer as "tuple", or is an abbreviated key value as + * described above. Accordingly, "tuple" is always used in preference to + * datum1 as the authoritative value for pass-by-reference cases. + * + * While building initial runs, tupindex holds the tuple's run number. + * Historically, the run number could meaningfully distinguish many runs, but + * it now only distinguishes RUN_FIRST and HEAP_RUN_NEXT, since replacement + * selection is always abandoned after the first run; no other run number + * should be represented here. During merge passes, we re-use it to hold the + * input tape number that each tuple in the heap was read from, or to hold the + * index of the next tuple pre-read from the same tape in the case of pre-read + * entries. tupindex goes unused if the sort occurs entirely in memory. 
+ */ +typedef struct +{ + void *tuple; /* the tuple itself */ + Datum datum1; /* value of first key column */ + bool isnull1; /* is first key column NULL? */ + int tupindex; /* see notes above */ +} SortTuple; + + +/* + * Possible states of a Tuplesort object. These denote the states that + * persist between calls of Tuplesort routines. + */ +typedef enum +{ + TSS_INITIAL, /* Loading tuples; still within memory limit */ + TSS_BOUNDED, /* Loading tuples into bounded-size heap */ + TSS_BUILDRUNS, /* Loading tuples; writing to tape */ + TSS_SORTEDINMEM, /* Sort completed entirely in memory */ + TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ + TSS_FINALMERGE /* Performing final merge on-the-fly */ +} TupSortStatus; + +/* + * Parameters for calculation of number of tapes to use --- see inittapes() + * and tuplesort_merge_order(). + * + * In this calculation we assume that each tape will cost us about 3 blocks + * worth of buffer space (which is an underestimate for very large data + * volumes, but it's probably close enough --- see logtape.c). + * + * MERGE_BUFFER_SIZE is how much data we'd like to read from each input + * tape during a preread cycle (see discussion at top of file). + */ +#define MINORDER 6 /* minimum merge order */ +#define TAPE_BUFFER_OVERHEAD (BLCKSZ * 3) +#define MERGE_BUFFER_SIZE (BLCKSZ * 32) + + /* + * Run numbers, used during external sort operations. + * + * HEAP_RUN_NEXT is only used for SortTuple.tupindex, never state.currentRun. + */ +#define RUN_FIRST 0 +#define HEAP_RUN_NEXT INT_MAX +#define RUN_SECOND 1 + +typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); + +/* + * Private state of a Tuplesort operation. + */ +struct Tuplesortstate +{ + TupSortStatus status; /* enumerated value as shown above */ + int nKeys; /* number of columns in sort key */ + bool randomAccess; /* did caller request random access? */ + bool bounded; /* did caller specify a maximum number of + * tuples to return? */ + bool boundUsed; /* true if we made use of a bounded heap */ + int bound; /* if bounded, the maximum number of tuples */ + bool tuples; /* Can SortTuple.tuple ever be set? */ + int64 availMem; /* remaining memory available, in bytes */ + int64 allowedMem; /* total memory allowed, in bytes */ + int maxTapes; /* number of tapes (Knuth's T) */ + int tapeRange; /* maxTapes-1 (Knuth's P) */ + MemoryContext sortcontext; /* memory context holding most sort data */ + MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */ + LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ + + /* + * These function pointers decouple the routines that must know what kind + * of tuple we are sorting from the routines that don't need to know it. + * They are set up by the tuplesort_begin_xxx routines. + * + * Function to compare two tuples; result is per qsort() convention, ie: + * <0, 0, >0 according as ab. The API must match + * qsort_arg_comparator. + */ + SortTupleComparator comparetup; + + /* + * Function to copy a supplied input tuple into palloc'd space and set up + * its SortTuple representation (ie, set tuple/datum1/isnull1). Also, + * state->availMem must be decreased by the amount of space used for the + * tuple copy (note the SortTuple struct itself is not counted). + */ + void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup); + + /* + * Function to write a stored tuple onto tape. 
The representation of the + * tuple on tape need not be the same as it is in memory; requirements on + * the tape representation are given below. After writing the tuple, + * pfree() the out-of-line data (not the SortTuple struct!), and increase + * state->availMem by the amount of memory space thereby released. + */ + void (*writetup) (Tuplesortstate *state, int tapenum, + SortTuple *stup); + + /* + * Function to read a stored tuple from tape back into memory. 'len' is + * the already-read length of the stored tuple. Create a palloc'd copy, + * initialize tuple/datum1/isnull1 in the target SortTuple struct, and + * decrease state->availMem by the amount of memory space consumed. (See + * batchUsed notes for details on how memory is handled when incremental + * accounting is abandoned.) + */ + void (*readtup) (Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); + + /* + * Function to move a caller tuple. This is usually implemented as a + * memmove() shim, but function may also perform additional fix-up of + * caller tuple where needed. Batch memory support requires the movement + * of caller tuples from one location in memory to another. + */ + void (*movetup) (void *dest, void *src, unsigned int len); + + /* + * This array holds the tuples now in sort memory. If we are in state + * INITIAL, the tuples are in no particular order; if we are in state + * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS + * and FINALMERGE, the tuples are organized in "heap" order per Algorithm + * H. (Note that memtupcount only counts the tuples that are part of the + * heap --- during merge passes, memtuples[] entries beyond tapeRange are + * never in the heap and are used to hold pre-read tuples.) In state + * SORTEDONTAPE, the array is not used. + */ + SortTuple *memtuples; /* array of SortTuple structs */ + int memtupcount; /* number of tuples currently present */ + int memtupsize; /* allocated length of memtuples array */ + bool growmemtuples; /* memtuples' growth still underway? */ + + /* + * Memory for tuples is sometimes allocated in batch, rather than + * incrementally. This implies that incremental memory accounting has + * been abandoned. Currently, this only happens for the final on-the-fly + * merge step. Large batch allocations can store tuples (e.g. + * IndexTuples) without palloc() fragmentation and other overhead. + */ + bool batchUsed; + + /* + * While building initial runs, this indicates if the replacement + * selection strategy is in use. When it isn't, then a simple hybrid + * sort-merge strategy is in use instead (runs are quicksorted). + */ + bool replaceActive; + + /* + * While building initial runs, this is the current output run number + * (starting at RUN_FIRST). Afterwards, it is the number of initial runs + * we made. + */ + int currentRun; + + /* + * Unless otherwise noted, all pointer variables below are pointers to + * arrays of length maxTapes, holding per-tape data. + */ + + /* + * These variables are only used during merge passes. mergeactive[i] is + * true if we are reading an input run from (actual) tape number i and + * have not yet exhausted that run. mergenext[i] is the memtuples index + * of the next pre-read tuple (next to be loaded into the heap) for tape + * i, or 0 if we are out of pre-read tuples. mergelast[i] similarly + * points to the last pre-read tuple from each tape. 
mergeavailslots[i] + * is the number of unused memtuples[] slots reserved for tape i, and + * mergeavailmem[i] is the amount of unused space allocated for tape i. + * mergefreelist and mergefirstfree keep track of unused locations in the + * memtuples[] array. The memtuples[].tupindex fields link together + * pre-read tuples for each tape as well as recycled locations in + * mergefreelist. It is OK to use 0 as a null link in these lists, because + * memtuples[0] is part of the merge heap and is never a pre-read tuple. + */ + bool *mergeactive; /* active input run source? */ + int *mergenext; /* first preread tuple for each source */ + int *mergelast; /* last preread tuple for each source */ + int *mergeavailslots; /* slots left for prereading each tape */ + int64 *mergeavailmem; /* availMem for prereading each tape */ + int mergefreelist; /* head of freelist of recycled slots */ + int mergefirstfree; /* first slot never used in this merge */ + + /* + * Per-tape batch state, when final on-the-fly merge consumes memory from + * just a few large allocations. + * + * Aside from the general benefits of performing fewer individual retail + * palloc() calls, this also helps make merging more cache efficient, + * since each tape's tuples must naturally be accessed sequentially (in + * sorted order). + */ + int64 spacePerTape; /* Space (memory) for tuples (not slots) */ + char **mergetuples; /* Each tape's memory allocation */ + char **mergecurrent; /* Current offset into each tape's memory */ + char **mergetail; /* Last item's start point for each tape */ + char **mergeoverflow; /* Retail palloc() "overflow" for each tape */ + + /* + * Variables for Algorithm D. Note that destTape is a "logical" tape + * number, ie, an index into the tp_xxx[] arrays. Be careful to keep + * "logical" and "actual" tape numbers straight! + */ + int Level; /* Knuth's l */ + int destTape; /* current output tape (Knuth's j, less 1) */ + int *tp_fib; /* Target Fibonacci run counts (A[]) */ + int *tp_runs; /* # of real runs on each tape */ + int *tp_dummy; /* # of dummy runs for each tape (D[]) */ + int *tp_tapenum; /* Actual tape numbers (TAPE[]) */ + int activeTapes; /* # of active input tapes in merge pass */ + + /* + * These variables are used after completion of sorting to keep track of + * the next tuple to return. (In the tape case, the tape's current read + * position is also critical state.) + */ + int result_tape; /* actual tape number of finished output */ + int current; /* array index (only used if SORTEDINMEM) */ + bool eof_reached; /* reached EOF (needed for cursors) */ + + /* markpos_xxx holds marked position for mark and restore */ + long markpos_block; /* tape block# (only used if SORTEDONTAPE) */ + int markpos_offset; /* saved "current", or offset in tape block */ + bool markpos_eof; /* saved "eof_reached" */ + + /* + * The sortKeys variable is used by every case other than the hash index + * case; it is set by tuplesort_begin_xxx. tupDesc is only used by the + * MinimalTuple and CLUSTER routines, though. + */ + TupleDesc tupDesc; + SortSupport sortKeys; /* array of length nKeys */ + + /* + * This variable is shared by the single-key MinimalTuple case and the + * Datum case (which both use qsort_ssup()). Otherwise it's NULL. + */ + SortSupport onlyKey; + + /* + * Additional state for managing "abbreviated key" sortsupport routines + * (which currently may be used by all cases except the hash index case). + * Tracks the intervals at which the optimization's effectiveness is + * tested. 
+ */ + int64 abbrevNext; /* Tuple # at which to next check + * applicability */ + + /* + * These variables are specific to the CLUSTER case; they are set by + * tuplesort_begin_cluster. + */ + IndexInfo *indexInfo; /* info about index being used for reference */ + EState *estate; /* for evaluating index expressions */ + + /* + * These variables are specific to the IndexTuple case; they are set by + * tuplesort_begin_index_xxx and used only by the IndexTuple routines. + */ + Relation heapRel; /* table the index is being built on */ + Relation indexRel; /* index being built */ + + /* These are specific to the index_btree subcase: */ + bool enforceUnique; /* complain if we find duplicate tuples */ + + /* These are specific to the index_hash subcase: */ + uint32 hash_mask; /* mask for sortable part of hash code */ + + /* + * These variables are specific to the Datum case; they are set by + * tuplesort_begin_datum and used only by the DatumTuple routines. + */ + Oid datumType; + /* we need typelen in order to know how to copy the Datums. */ + int datumTypeLen; + + /* + * Resource snapshot for time of sort start. + */ +#ifdef TRACE_SORT + PGRUsage ru_start; +#endif +}; + +#define COMPARETUP(state,a,b) ((*(state)->comparetup) (a, b, state)) +#define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup)) +#define WRITETUP(state,tape,stup) ((*(state)->writetup) (state, tape, stup)) +#define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len)) +#define MOVETUP(dest,src,len) ((*(state)->movetup) (dest, src, len)) +#define LACKMEM(state) ((state)->availMem < 0 && !(state)->batchUsed) +#define USEMEM(state,amt) ((state)->availMem -= (amt)) +#define FREEMEM(state,amt) ((state)->availMem += (amt)) + +/* + * NOTES about on-tape representation of tuples: + * + * We require the first "unsigned int" of a stored tuple to be the total size + * on-tape of the tuple, including itself (so it is never zero; an all-zero + * unsigned int is used to delimit runs). The remainder of the stored tuple + * may or may not match the in-memory representation of the tuple --- + * any conversion needed is the job of the writetup and readtup routines. + * + * If state->randomAccess is true, then the stored representation of the + * tuple must be followed by another "unsigned int" that is a copy of the + * length --- so the total tape space used is actually sizeof(unsigned int) + * more than the stored length value. This allows read-backwards. When + * randomAccess is not true, the write/read routines may omit the extra + * length word. + * + * writetup is expected to write both length words as well as the tuple + * data. When readtup is called, the tape is positioned just after the + * front length word; readtup must read the tuple data and advance past + * the back length word (if present). + * + * The write/read routines can make use of the tuple description data + * stored in the Tuplesortstate record, if needed. They are also expected + * to adjust state->availMem by the amount of memory space (not tape space!) + * released or consumed. There is no error return from either writetup + * or readtup; they should ereport() on failure. + * + * + * NOTES about memory consumption calculations: + * + * We count space allocated for tuples against the workMem limit, plus + * the space used by the variable-size memtuples array. Fixed-size space + * is not counted; it's small enough to not be interesting. 
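The framing rules given in the on-tape representation notes above (a leading length word that counts itself, so it is never zero, plus a trailing copy of the length when backward scans must be supported) can be illustrated with plain stdio standing in for the logical tape machinery. The function names below are invented for this sketch; the real code goes through LogicalTapeWrite/LogicalTapeRead and also charges the space to availMem.

#include <stdio.h>
#include <stdlib.h>

/* Write one record: leading length word, payload, optional trailing copy. */
static void
write_framed(FILE *fp, const void *data, unsigned int datalen, int random_access)
{
    unsigned int len = sizeof(unsigned int) + datalen;

    fwrite(&len, sizeof(len), 1, fp);
    fwrite(data, 1, datalen, fp);
    if (random_access)
        fwrite(&len, sizeof(len), 1, fp);   /* enables reading backwards */
}

/* Read the next record moving forward; returns a malloc'd payload or NULL. */
static void *
read_framed(FILE *fp, int random_access, unsigned int *datalen)
{
    unsigned int len;
    void       *buf;

    if (fread(&len, sizeof(len), 1, fp) != 1 || len == 0)
        return NULL;            /* a zero length word would delimit a run */
    *datalen = len - sizeof(unsigned int);
    buf = malloc(*datalen);
    if (fread(buf, 1, *datalen, fp) != *datalen)
    {
        free(buf);
        return NULL;
    }
    if (random_access)
        fseek(fp, sizeof(unsigned int), SEEK_CUR);  /* skip trailing copy */
    return buf;
}

int
main(void)
{
    FILE       *fp = tmpfile();
    unsigned int n;
    char       *s;

    write_framed(fp, "hello", 5, 1);
    write_framed(fp, "tape", 4, 1);
    rewind(fp);
    while ((s = read_framed(fp, 1, &n)) != NULL)
    {
        printf("%.*s\n", (int) n, s);
        free(s);
    }
    return 0;
}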
+ * + * Note that we count actual space used (as shown by GetMemoryChunkSpace) + * rather than the originally-requested size. This is important since + * palloc can add substantial overhead. It's not a complete answer since + * we won't count any wasted space in palloc allocation blocks, but it's + * a lot better than what we were doing before 7.3. As of 9.6, a + * separate memory context is used for caller passed tuples. Resetting + * it at certain key increments significantly ameliorates fragmentation. + * Note that this places a responsibility on readtup and copytup routines + * to use the right memory context for these tuples (and to not use the + * reset context for anything whose lifetime needs to span multiple + * external sort runs). + */ + +/* When using this macro, beware of double evaluation of len */ +#define LogicalTapeReadExact(tapeset, tapenum, ptr, len) \ + do { \ + if (LogicalTapeRead(tapeset, tapenum, ptr, len) != (size_t) (len)) \ + elog(ERROR, "unexpected end of data"); \ + } while(0) + + +static Tuplesortstate *tuplesort_begin_common(int workMem, bool randomAccess); +static void puttuple_common(Tuplesortstate *state, SortTuple *tuple); +static bool consider_abort_common(Tuplesortstate *state); +static bool useselection(Tuplesortstate *state); +static void inittapes(Tuplesortstate *state); +static void selectnewtape(Tuplesortstate *state); +static void mergeruns(Tuplesortstate *state); +static void mergeonerun(Tuplesortstate *state); +static void beginmerge(Tuplesortstate *state, bool finalMergeBatch); +static void batchmemtuples(Tuplesortstate *state); +static void mergebatch(Tuplesortstate *state, int64 spacePerTape); +static void mergebatchone(Tuplesortstate *state, int srcTape, + SortTuple *stup, bool *should_free); +static void mergebatchfreetape(Tuplesortstate *state, int srcTape, + SortTuple *rtup, bool *should_free); +static void *mergebatchalloc(Tuplesortstate *state, int tapenum, Size tuplen); +static void mergepreread(Tuplesortstate *state); +static void mergeprereadone(Tuplesortstate *state, int srcTape); +static void dumptuples(Tuplesortstate *state, bool alltuples); +static void dumpbatch(Tuplesortstate *state, bool alltuples); +static void make_bounded_heap(Tuplesortstate *state); +static void sort_bounded_heap(Tuplesortstate *state); +static void tuplesort_sort_memtuples(Tuplesortstate *state); +static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple, + int tupleindex, bool checkIndex); +static void tuplesort_heap_siftup(Tuplesortstate *state, bool checkIndex); +static void reversedirection(Tuplesortstate *state); +static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK); +static void markrunend(Tuplesortstate *state, int tapenum); +static void *readtup_alloc(Tuplesortstate *state, int tapenum, Size tuplen); +static int comparetup_heap(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_heap(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_heap(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static void movetup_heap(void *dest, void *src, unsigned int len); +static int comparetup_cluster(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_cluster(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void 
readtup_cluster(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static void movetup_cluster(void *dest, void *src, unsigned int len); +static int comparetup_index_btree(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static int comparetup_index_hash(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_index(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_index(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static void movetup_index(void *dest, void *src, unsigned int len); +static int comparetup_datum(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_datum(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_datum(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); +static void movetup_datum(void *dest, void *src, unsigned int len); +static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup); + +/* + * Special versions of qsort just for SortTuple objects. qsort_tuple() sorts + * any variant of SortTuples, using the appropriate comparetup function. + * qsort_ssup() is specialized for the case where the comparetup function + * reduces to ApplySortComparator(), that is single-key MinimalTuple sorts + * and Datum sorts. + */ +#include "qsort_tuple.c" + + +/* + * tuplesort_begin_xxx + * + * Initialize for a tuple sort operation. + * + * After calling tuplesort_begin, the caller should call tuplesort_putXXX + * zero or more times, then call tuplesort_performsort when all the tuples + * have been supplied. After performsort, retrieve the tuples in sorted + * order by calling tuplesort_getXXX until it returns false/NULL. (If random + * access was requested, rescan, markpos, and restorepos can also be called.) + * Call tuplesort_end to terminate the operation and release memory/disk space. + * + * Each variant of tuplesort_begin has a workMem parameter specifying the + * maximum number of kilobytes of RAM to use before spilling data to disk. + * (The normal value of this parameter is work_mem, but some callers use + * other values.) Each variant also has a randomAccess parameter specifying + * whether the caller needs non-sequential access to the sort result. + */ + +static Tuplesortstate * +tuplesort_begin_common(int workMem, bool randomAccess) +{ + Tuplesortstate *state; + MemoryContext sortcontext; + MemoryContext tuplecontext; + MemoryContext oldcontext; + + /* + * Create a working memory context for this sort operation. All data + * needed by the sort will live inside this context. + */ + sortcontext = AllocSetContextCreate(CurrentMemoryContext, + "TupleSort main", + ALLOCSET_DEFAULT_SIZES); + + /* + * Caller tuple (e.g. IndexTuple) memory context. + * + * A dedicated child context used exclusively for caller passed tuples + * eases memory management. Resetting at key points reduces + * fragmentation. Note that the memtuples array of SortTuples is allocated + * in the parent context, not this context, because there is no need to + * free memtuples early. + */ + tuplecontext = AllocSetContextCreate(sortcontext, + "Caller tuples", + ALLOCSET_DEFAULT_SIZES); + + /* + * Make the Tuplesortstate within the per-sort context. This way, we + * don't need a separate pfree() operation for it at shutdown. 
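The calling convention spelled out in the tuplesort_begin_xxx comment above amounts to begin, put zero or more times, performsort, get until exhausted, end. A minimal sketch using the Datum variant is shown below; it is backend code, the header paths and OID constants (INT4OID, Int4LessOperator, work_mem) are taken from the core tree and are assumptions here, and error/NULL handling is omitted.

#include "postgres.h"
#include "catalog/pg_operator.h"
#include "catalog/pg_type.h"
#include "miscadmin.h"
#include "utils/tuplesort.h"

static void
sort_some_ints(void)
{
    Tuplesortstate *sortstate;
    Datum       val;
    bool        isnull;

    sortstate = tuplesort_begin_datum(INT4OID, Int4LessOperator,
                                      InvalidOid,   /* default collation */
                                      false,        /* nulls last */
                                      work_mem,
                                      false);       /* no random access */

    tuplesort_putdatum(sortstate, Int32GetDatum(3), false);
    tuplesort_putdatum(sortstate, Int32GetDatum(1), false);
    tuplesort_putdatum(sortstate, Int32GetDatum(2), false);

    tuplesort_performsort(sortstate);

    while (tuplesort_getdatum(sortstate, true, &val, &isnull, NULL))
        elog(LOG, "next value: %d", DatumGetInt32(val));

    tuplesort_end(sortstate);
}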
+ */ + oldcontext = MemoryContextSwitchTo(sortcontext); + + state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate)); + +#ifdef TRACE_SORT + if (trace_sort) + pg_rusage_init(&state->ru_start); +#endif + + state->status = TSS_INITIAL; + state->randomAccess = randomAccess; + state->bounded = false; + state->tuples = true; + state->boundUsed = false; + state->allowedMem = workMem * (int64) 1024; + state->availMem = state->allowedMem; + state->sortcontext = sortcontext; + state->tuplecontext = tuplecontext; + state->tapeset = NULL; + + state->memtupcount = 0; + + /* + * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD; + * see comments in grow_memtuples(). + */ + state->memtupsize = Max(1024, + ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1); + + state->growmemtuples = true; + state->batchUsed = false; + state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple)); + + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + + /* workMem must be large enough for the minimal memtuples array */ + if (LACKMEM(state)) + elog(ERROR, "insufficient memory allowed for sort"); + + state->currentRun = RUN_FIRST; + + /* + * maxTapes, tapeRange, and Algorithm D variables will be initialized by + * inittapes(), if needed + */ + + state->result_tape = -1; /* flag that result tape has not been formed */ + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_heap(TupleDesc tupDesc, + int nkeys, AttrNumber *attNums, + Oid *sortOperators, Oid *sortCollations, + bool *nullsFirstFlags, + int workMem, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess); + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + + AssertArg(nkeys > 0); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", + nkeys, workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = nkeys; + + TRACE_POSTGRESQL_SORT_START(HEAP_SORT, + false, /* no unique check */ + nkeys, + workMem, + randomAccess); + + state->comparetup = comparetup_heap; + state->copytup = copytup_heap; + state->writetup = writetup_heap; + state->readtup = readtup_heap; + state->movetup = movetup_heap; + + state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + state->abbrevNext = 10; + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData)); + + for (i = 0; i < nkeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + + AssertArg(attNums[i] != 0); + AssertArg(sortOperators[i] != 0); + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = sortCollations[i]; + sortKey->ssup_nulls_first = nullsFirstFlags[i]; + sortKey->ssup_attno = attNums[i]; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey); + } + + /* + * The "onlyKey" optimization cannot be used with abbreviated keys, since + * tie-breaker comparisons may be required. Typically, the optimization + * is only of value to pass-by-value types anyway, whereas abbreviated + * keys are typically only of value to pass-by-reference types. 
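Why abbreviation rules out the onlyKey fast path can be seen with a generic sketch of an abbreviated comparator: equal abbreviations prove nothing, so a full, authoritative comparison must remain available as the tie-breaker. The packing scheme and names below are invented for illustration; the real mechanism lives in the opclass sortsupport routines.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Pack the first bytes of a string into an integer ordered so that
 * unsigned comparison of the abbreviation agrees with memcmp on those
 * bytes.  Ties must fall back to the full comparator.
 */
static uint64_t
abbreviate(const char *s)
{
    uint64_t    abbr = 0;

    for (int i = 0; i < 8 && s[i] != '\0'; i++)
        abbr |= (uint64_t) (unsigned char) s[i] << (8 * (7 - i));
    return abbr;
}

static int
compare_with_abbrev(const char *a, const char *b)
{
    uint64_t    aa = abbreviate(a);
    uint64_t    ab = abbreviate(b);

    if (aa != ab)
        return (aa < ab) ? -1 : 1;      /* cheap comparison decided it */
    return strcmp(a, b);                /* authoritative tie-breaker */
}

int
main(void)
{
    printf("%d\n", compare_with_abbrev("abcdefgh-x", "abcdefgh-y")); /* tie-breaker */
    printf("%d\n", compare_with_abbrev("apple", "banana"));          /* abbreviation */
    return 0;
}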
+ */ + if (nkeys == 1 && !state->sortKeys->abbrev_converter) + state->onlyKey = state->sortKeys; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_cluster(TupleDesc tupDesc, + Relation indexRel, + int workMem, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess); + ScanKey indexScanKey; + MemoryContext oldcontext; + int i; + + Assert(indexRel->rd_rel->relam == BTREE_AM_OID); + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", + RelationGetNumberOfAttributes(indexRel), + workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = RelationGetNumberOfAttributes(indexRel); + + TRACE_POSTGRESQL_SORT_START(CLUSTER_SORT, + false, /* no unique check */ + state->nKeys, + workMem, + randomAccess); + + state->comparetup = comparetup_cluster; + state->copytup = copytup_cluster; + state->writetup = writetup_cluster; + state->readtup = readtup_cluster; + state->movetup = movetup_cluster; + state->abbrevNext = 10; + + state->indexInfo = BuildIndexInfo(indexRel); + + state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + + indexScanKey = _bt_mkscankey_nodata(indexRel); + + if (state->indexInfo->ii_Expressions != NULL) + { + TupleTableSlot *slot; + ExprContext *econtext; + + /* + * We will need to use FormIndexDatum to evaluate the index + * expressions. To do that, we need an EState, as well as a + * TupleTableSlot to put the table tuples into. The econtext's + * scantuple has to point to that slot, too. + */ + state->estate = CreateExecutorState(); + slot = MakeSingleTupleTableSlot(tupDesc); + econtext = GetPerTupleExprContext(state->estate); + econtext->ecxt_scantuple = slot; + } + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + ScanKey scanKey = indexScanKey + i; + int16 strategy; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = scanKey->sk_collation; + sortKey->ssup_nulls_first = + (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; + sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + AssertState(sortKey->ssup_attno != 0); + + strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? + BTGreaterStrategyNumber : BTLessStrategyNumber; + + PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); + } + + _bt_freeskey(indexScanKey); + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_btree(Relation heapRel, + Relation indexRel, + bool enforceUnique, + int workMem, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess); + ScanKey indexScanKey; + MemoryContext oldcontext; + int i; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: unique = %c, workMem = %d, randomAccess = %c", + enforceUnique ? 't' : 'f', + workMem, randomAccess ? 
't' : 'f'); +#endif + + state->nKeys = RelationGetNumberOfAttributes(indexRel); + + TRACE_POSTGRESQL_SORT_START(INDEX_SORT, + enforceUnique, + state->nKeys, + workMem, + randomAccess); + + state->comparetup = comparetup_index_btree; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + state->movetup = movetup_index; + state->abbrevNext = 10; + + state->heapRel = heapRel; + state->indexRel = indexRel; + state->enforceUnique = enforceUnique; + + indexScanKey = _bt_mkscankey_nodata(indexRel); + state->nKeys = RelationGetNumberOfAttributes(indexRel); + + /* Prepare SortSupport data for each column */ + state->sortKeys = (SortSupport) palloc0(state->nKeys * + sizeof(SortSupportData)); + + for (i = 0; i < state->nKeys; i++) + { + SortSupport sortKey = state->sortKeys + i; + ScanKey scanKey = indexScanKey + i; + int16 strategy; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = scanKey->sk_collation; + sortKey->ssup_nulls_first = + (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; + sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); + + AssertState(sortKey->ssup_attno != 0); + + strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? + BTGreaterStrategyNumber : BTLessStrategyNumber; + + PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey); + } + + _bt_freeskey(indexScanKey); + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_index_hash(Relation heapRel, + Relation indexRel, + uint32 hash_mask, + int workMem, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess); + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin index sort: hash_mask = 0x%x, workMem = %d, randomAccess = %c", + hash_mask, + workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = 1; /* Only one sort column, the hash code */ + + state->comparetup = comparetup_index_hash; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + state->movetup = movetup_index; + + state->heapRel = heapRel; + state->indexRel = indexRel; + + state->hash_mask = hash_mask; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +Tuplesortstate * +tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, + bool nullsFirstFlag, + int workMem, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess); + MemoryContext oldcontext; + int16 typlen; + bool typbyval; + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin datum sort: workMem = %d, randomAccess = %c", + workMem, randomAccess ? 
't' : 'f'); +#endif + + state->nKeys = 1; /* always a one-column sort */ + + TRACE_POSTGRESQL_SORT_START(DATUM_SORT, + false, /* no unique check */ + 1, + workMem, + randomAccess); + + state->comparetup = comparetup_datum; + state->copytup = copytup_datum; + state->writetup = writetup_datum; + state->readtup = readtup_datum; + state->movetup = movetup_datum; + state->abbrevNext = 10; + + state->datumType = datumType; + + /* lookup necessary attributes of the datum type */ + get_typlenbyval(datumType, &typlen, &typbyval); + state->datumTypeLen = typlen; + state->tuples = !typbyval; + + /* Prepare SortSupport data */ + state->sortKeys = (SortSupport) palloc0(sizeof(SortSupportData)); + + state->sortKeys->ssup_cxt = CurrentMemoryContext; + state->sortKeys->ssup_collation = sortCollation; + state->sortKeys->ssup_nulls_first = nullsFirstFlag; + + /* + * Abbreviation is possible here only for by-reference types. In theory, + * a pass-by-value datatype could have an abbreviated form that is cheaper + * to compare. In a tuple sort, we could support that, because we can + * always extract the original datum from the tuple is needed. Here, we + * can't, because a datum sort only stores a single copy of the datum; the + * "tuple" field of each sortTuple is NULL. + */ + state->sortKeys->abbreviate = !typbyval; + + PrepareSortSupportFromOrderingOp(sortOperator, state->sortKeys); + + /* + * The "onlyKey" optimization cannot be used with abbreviated keys, since + * tie-breaker comparisons may be required. Typically, the optimization + * is only of value to pass-by-value types anyway, whereas abbreviated + * keys are typically only of value to pass-by-reference types. + */ + if (!state->sortKeys->abbrev_converter) + state->onlyKey = state->sortKeys; + + MemoryContextSwitchTo(oldcontext); + + return state; +} + +/* + * tuplesort_set_bound + * + * Advise tuplesort that at most the first N result tuples are required. + * + * Must be called before inserting any tuples. (Actually, we could allow it + * as long as the sort hasn't spilled to disk, but there seems no need for + * delayed calls at the moment.) + * + * This is a hint only. The tuplesort may still return more tuples than + * requested. + */ +void +tuplesort_set_bound(Tuplesortstate *state, int64 bound) +{ + /* Assert we're called before loading any tuples */ + Assert(state->status == TSS_INITIAL); + Assert(state->memtupcount == 0); + Assert(!state->bounded); + +#ifdef DEBUG_BOUNDED_SORT + /* Honor GUC setting that disables the feature (for easy testing) */ + if (!optimize_bounded_sort) + return; +#endif + + /* We want to be able to compute bound * 2, so limit the setting */ + if (bound > (int64) (INT_MAX / 2)) + return; + + state->bounded = true; + state->bound = (int) bound; + + /* + * Bounded sorts are not an effective target for abbreviated key + * optimization. Disable by setting state to be consistent with no + * abbreviation support. + */ + state->sortKeys->abbrev_converter = NULL; + if (state->sortKeys->abbrev_full_comparator) + state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator; + + /* Not strictly necessary, but be tidy */ + state->sortKeys->abbrev_abort = NULL; + state->sortKeys->abbrev_full_comparator = NULL; +} + +/* + * tuplesort_end + * + * Release resources and clean up. + * + * NOTE: after calling this, any pointers returned by tuplesort_getXXX are + * pointing to garbage. Be careful not to attempt to use or free such + * pointers afterwards! 
+ */ +void +tuplesort_end(Tuplesortstate *state) +{ + /* context swap probably not needed, but let's be safe */ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + long spaceUsed; + + if (state->tapeset) + spaceUsed = LogicalTapeSetBlocks(state->tapeset); + else + spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; +#endif + + /* + * Delete temporary "tape" files, if any. + * + * Note: want to include this in reported total cost of sort, hence need + * for two #ifdef TRACE_SORT sections. + */ + if (state->tapeset) + LogicalTapeSetClose(state->tapeset); + +#ifdef TRACE_SORT + if (trace_sort) + { + if (state->tapeset) + elog(LOG, "external sort ended, %ld disk blocks used: %s", + spaceUsed, pg_rusage_show(&state->ru_start)); + else + elog(LOG, "internal sort ended, %ld KB used: %s", + spaceUsed, pg_rusage_show(&state->ru_start)); + } + + TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, spaceUsed); +#else + + /* + * If you disabled TRACE_SORT, you can still probe sort__done, but you + * ain't getting space-used stats. + */ + TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, 0L); +#endif + + /* Free any execution state created for CLUSTER case */ + if (state->estate != NULL) + { + ExprContext *econtext = GetPerTupleExprContext(state->estate); + + ExecDropSingleTupleTableSlot(econtext->ecxt_scantuple); + FreeExecutorState(state->estate); + } + + MemoryContextSwitchTo(oldcontext); + + /* + * Free the per-sort memory context, thereby releasing all working memory, + * including the Tuplesortstate struct itself. + */ + MemoryContextDelete(state->sortcontext); +} + +/* + * Grow the memtuples[] array, if possible within our memory constraint. We + * must not exceed INT_MAX tuples in memory or the caller-provided memory + * limit. Return TRUE if we were able to enlarge the array, FALSE if not. + * + * Normally, at each increment we double the size of the array. When doing + * that would exceed a limit, we attempt one last, smaller increase (and then + * clear the growmemtuples flag so we don't try any more). That allows us to + * use memory as fully as permitted; sticking to the pure doubling rule could + * result in almost half going unused. Because availMem moves around with + * tuple addition/removal, we need some rule to prevent making repeated small + * increases in memtupsize, which would just be useless thrashing. The + * growmemtuples flag accomplishes that and also prevents useless + * recalculations in this function. + */ +static bool +grow_memtuples(Tuplesortstate *state) +{ + int newmemtupsize; + int memtupsize = state->memtupsize; + int64 memNowUsed = state->allowedMem - state->availMem; + + /* Forget it if we've already maxed out memtuples, per comment above */ + if (!state->growmemtuples) + return false; + + /* Select new value of memtupsize */ + if (memNowUsed <= state->availMem) + { + /* + * We've used no more than half of allowedMem; double our usage, + * clamping at INT_MAX tuples. + */ + if (memtupsize < INT_MAX / 2) + newmemtupsize = memtupsize * 2; + else + { + newmemtupsize = INT_MAX; + state->growmemtuples = false; + } + } + else + { + /* + * This will be the last increment of memtupsize. Abandon doubling + * strategy and instead increase as much as we safely can. + * + * To stay within allowedMem, we can't increase memtupsize by more + * than availMem / sizeof(SortTuple) elements. 
In practice, we want + * to increase it by considerably less, because we need to leave some + * space for the tuples to which the new array slots will refer. We + * assume the new tuples will be about the same size as the tuples + * we've already seen, and thus we can extrapolate from the space + * consumption so far to estimate an appropriate new size for the + * memtuples array. The optimal value might be higher or lower than + * this estimate, but it's hard to know that in advance. We again + * clamp at INT_MAX tuples. + * + * This calculation is safe against enlarging the array so much that + * LACKMEM becomes true, because the memory currently used includes + * the present array; thus, there would be enough allowedMem for the + * new array elements even if no other memory were currently used. + * + * We do the arithmetic in float8, because otherwise the product of + * memtupsize and allowedMem could overflow. Any inaccuracy in the + * result should be insignificant; but even if we computed a + * completely insane result, the checks below will prevent anything + * really bad from happening. + */ + double grow_ratio; + + grow_ratio = (double) state->allowedMem / (double) memNowUsed; + if (memtupsize * grow_ratio < INT_MAX) + newmemtupsize = (int) (memtupsize * grow_ratio); + else + newmemtupsize = INT_MAX; + + /* We won't make any further enlargement attempts */ + state->growmemtuples = false; + } + + /* Must enlarge array by at least one element, else report failure */ + if (newmemtupsize <= memtupsize) + goto noalloc; + + /* + * On a 32-bit machine, allowedMem could exceed MaxAllocHugeSize. Clamp + * to ensure our request won't be rejected. Note that we can easily + * exhaust address space before facing this outcome. (This is presently + * impossible due to guc.c's MAX_KILOBYTES limitation on work_mem, but + * don't rely on that at this distance.) + */ + if ((Size) newmemtupsize >= MaxAllocHugeSize / sizeof(SortTuple)) + { + newmemtupsize = (int) (MaxAllocHugeSize / sizeof(SortTuple)); + state->growmemtuples = false; /* can't grow any more */ + } + + /* + * We need to be sure that we do not cause LACKMEM to become true, else + * the space management algorithm will go nuts. The code above should + * never generate a dangerous request, but to be safe, check explicitly + * that the array growth fits within availMem. (We could still cause + * LACKMEM if the memory chunk overhead associated with the memtuples + * array were to increase. That shouldn't happen because we chose the + * initial array size large enough to ensure that palloc will be treating + * both old and new arrays as separate chunks. But we'll check LACKMEM + * explicitly below just in case.) + */ + if (state->availMem < (int64) ((newmemtupsize - memtupsize) * sizeof(SortTuple))) + goto noalloc; + + /* OK, do it */ + FREEMEM(state, GetMemoryChunkSpace(state->memtuples)); + state->memtupsize = newmemtupsize; + state->memtuples = (SortTuple *) + repalloc_huge(state->memtuples, + state->memtupsize * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + if (LACKMEM(state)) + elog(ERROR, "unexpected out-of-memory situation in tuplesort"); + return true; + +noalloc: + /* If for any reason we didn't realloc, shut off future attempts */ + state->growmemtuples = false; + return false; +} + +/* + * Accept one tuple while collecting input data for sort. + * + * Note that the input data is always copied; the caller need not save it. 
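The policy grow_memtuples() implements above can be reduced to a toy model: double the slot array while no more than half of the budget is in use, otherwise make one final, proportional enlargement and stop. Units and names below are invented, and the INT_MAX/MaxAllocHugeSize clamps of the real function are omitted.

#include <stdio.h>

static int
grow_slots(int nslots, double used_bytes, double allowed_bytes, int *keep_growing)
{
    if (!*keep_growing)
        return nslots;

    if (used_bytes <= allowed_bytes - used_bytes)
        return nslots * 2;          /* still under half the budget: double */

    /* Final increase: scale by the remaining headroom, then stop growing. */
    *keep_growing = 0;
    return (int) (nslots * (allowed_bytes / used_bytes));
}

int
main(void)
{
    int     nslots = 1024;
    int     growing = 1;

    nslots = grow_slots(nslots, 100000.0, 4194304.0, &growing);
    printf("%d\n", nslots);                     /* 2048: plain doubling */
    nslots = grow_slots(nslots, 3000000.0, 4194304.0, &growing);
    printf("%d growing=%d\n", nslots, growing); /* ~2863, growth disabled */
    return 0;
}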
+ */ +void +tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + /* + * Copy the given tuple into memory we control, and decrease availMem. + * Then call the common code. + */ + COPYTUP(state, &stup, (void *) slot); + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Accept one tuple while collecting input data for sort. + * + * Note that the input data is always copied; the caller need not save it. + */ +void +tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + /* + * Copy the given tuple into memory we control, and decrease availMem. + * Then call the common code. + */ + COPYTUP(state, &stup, (void *) tup); + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Collect one index tuple while collecting input data for sort, building + * it from caller-supplied values. + */ +void +tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel, + ItemPointer self, Datum *values, + bool *isnull) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + SortTuple stup; + Datum original; + IndexTuple tuple; + + stup.tuple = index_form_tuple(RelationGetDescr(rel), values, isnull); + tuple = ((IndexTuple) stup.tuple); + tuple->t_tid = *self; + USEMEM(state, GetMemoryChunkSpace(stup.tuple)); + /* set up first-column key value */ + original = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup.isnull1); + + MemoryContextSwitchTo(state->sortcontext); + + if (!state->sortKeys || !state->sortKeys->abbrev_converter || stup.isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup.datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup.datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup.datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = mtup->tuple; + mtup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &mtup->isnull1); + } + } + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Accept one Datum while collecting input data for sort. + * + * If the Datum is pass-by-ref type, the value will be copied. 
+ */ +void +tuplesort_putdatum(Tuplesortstate *state, Datum val, bool isNull) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + SortTuple stup; + + /* + * Pass-by-value types or null values are just stored directly in + * stup.datum1 (and stup.tuple is not used and set to NULL). + * + * Non-null pass-by-reference values need to be copied into memory we + * control, and possibly abbreviated. The copied value is pointed to by + * stup.tuple and is treated as the canonical copy (e.g. to return via + * tuplesort_getdatum or when writing to tape); stup.datum1 gets the + * abbreviated value if abbreviation is happening, otherwise it's + * identical to stup.tuple. + */ + + if (isNull || !state->tuples) + { + /* + * Set datum1 to zeroed representation for NULLs (to be consistent, + * and to support cheap inequality tests for NULL abbreviated keys). + */ + stup.datum1 = !isNull ? val : (Datum) 0; + stup.isnull1 = isNull; + stup.tuple = NULL; /* no separate storage */ + MemoryContextSwitchTo(state->sortcontext); + } + else + { + Datum original = datumCopy(val, false, state->datumTypeLen); + + stup.isnull1 = false; + stup.tuple = DatumGetPointer(original); + USEMEM(state, GetMemoryChunkSpace(stup.tuple)); + MemoryContextSwitchTo(state->sortcontext); + + if (!state->sortKeys->abbrev_converter) + { + stup.datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup.datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup.datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any + * case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + mtup->datum1 = PointerGetDatum(mtup->tuple); + } + } + } + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Shared code for tuple and datum cases. + */ +static void +puttuple_common(Tuplesortstate *state, SortTuple *tuple) +{ + switch (state->status) + { + case TSS_INITIAL: + + /* + * Save the tuple into the unsorted array. First, grow the array + * as needed. Note that we try to grow the array when there is + * still one free slot remaining --- if we fail, there'll still be + * room to store the incoming tuple, and then we'll switch to + * tape-based operation. + */ + if (state->memtupcount >= state->memtupsize - 1) + { + (void) grow_memtuples(state); + Assert(state->memtupcount < state->memtupsize); + } + state->memtuples[state->memtupcount++] = *tuple; + + /* + * Check if it's time to switch over to a bounded heapsort. We do + * so if the input tuple count exceeds twice the desired tuple + * count (this is a heuristic for where heapsort becomes cheaper + * than a quicksort), or if we've just filled workMem and have + * enough tuples to meet the bound. + * + * Note that once we enter TSS_BOUNDED state we will always try to + * complete the sort that way. In the worst case, if later input + * tuples are larger than earlier ones, this might cause us to + * exceed workMem significantly. 
+ */ + if (state->bounded && + (state->memtupcount > state->bound * 2 || + (state->memtupcount > state->bound && LACKMEM(state)))) + { +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "switching to bounded heapsort at %d tuples: %s", + state->memtupcount, + pg_rusage_show(&state->ru_start)); +#endif + make_bounded_heap(state); + return; + } + + /* + * Done if we still fit in available memory and have array slots. + */ + if (state->memtupcount < state->memtupsize && !LACKMEM(state)) + return; + + /* + * Nope; time to switch to tape-based operation. + */ + inittapes(state); + + /* + * Dump tuples until we are back under the limit. + */ + dumptuples(state, false); + break; + + case TSS_BOUNDED: + + /* + * We don't want to grow the array here, so check whether the new + * tuple can be discarded before putting it in. This should be a + * good speed optimization, too, since when there are many more + * input tuples than the bound, most input tuples can be discarded + * with just this one comparison. Note that because we currently + * have the sort direction reversed, we must check for <= not >=. + */ + if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0) + { + /* new tuple <= top of the heap, so we can discard it */ + free_sort_tuple(state, tuple); + CHECK_FOR_INTERRUPTS(); + } + else + { + /* discard top of heap, sift up, insert new tuple */ + free_sort_tuple(state, &state->memtuples[0]); + tuplesort_heap_siftup(state, false); + tuplesort_heap_insert(state, tuple, 0, false); + } + break; + + case TSS_BUILDRUNS: + + /* + * Insert the tuple into the heap, with run number currentRun if + * it can go into the current run, else HEAP_RUN_NEXT. The tuple + * can go into the current run if it is >= the first + * not-yet-output tuple. (Actually, it could go into the current + * run if it is >= the most recently output tuple ... but that + * would require keeping around the tuple we last output, and it's + * simplest to let writetup free each tuple as soon as it's + * written.) + * + * Note that this only applies when: + * + * - currentRun is RUN_FIRST + * + * - Replacement selection is in use (typically it is never used). + * + * When these two conditions are not both true, all tuples are + * appended indifferently, much like the TSS_INITIAL case. + * + * There should always be room to store the incoming tuple. + */ + Assert(!state->replaceActive || state->memtupcount > 0); + if (state->replaceActive && + COMPARETUP(state, tuple, &state->memtuples[0]) >= 0) + { + Assert(state->currentRun == RUN_FIRST); + + /* + * Insert tuple into first, fully heapified run. + * + * Unlike classic replacement selection, which this module was + * previously based on, only RUN_FIRST tuples are fully + * heapified. Any second/next run tuples are appended + * indifferently. While HEAP_RUN_NEXT tuples may be sifted + * out of the way of first run tuples, COMPARETUP() will never + * be called for the run's tuples during sifting (only our + * initial COMPARETUP() call is required for the tuple, to + * determine that the tuple does not belong in RUN_FIRST). + */ + tuplesort_heap_insert(state, tuple, state->currentRun, true); + } + else + { + /* + * Tuple was determined to not belong to heapified RUN_FIRST, + * or replacement selection not in play. Append the tuple to + * memtuples indifferently. + * + * dumptuples() does not trust that the next run's tuples are + * heapified. Anything past the first run will always be + * quicksorted even when replacement selection is initially + * used. 
(When it's never used, every tuple still takes this + * path.) + */ + tuple->tupindex = HEAP_RUN_NEXT; + state->memtuples[state->memtupcount++] = *tuple; + } + + /* + * If we are over the memory limit, dump tuples till we're under. + */ + dumptuples(state, false); + break; + + default: + elog(ERROR, "invalid tuplesort state"); + break; + } +} + +static bool +consider_abort_common(Tuplesortstate *state) +{ + Assert(state->sortKeys[0].abbrev_converter != NULL); + Assert(state->sortKeys[0].abbrev_abort != NULL); + Assert(state->sortKeys[0].abbrev_full_comparator != NULL); + + /* + * Check effectiveness of abbreviation optimization. Consider aborting + * when still within memory limit. + */ + if (state->status == TSS_INITIAL && + state->memtupcount >= state->abbrevNext) + { + state->abbrevNext *= 2; + + /* + * Check opclass-supplied abbreviation abort routine. It may indicate + * that abbreviation should not proceed. + */ + if (!state->sortKeys->abbrev_abort(state->memtupcount, + state->sortKeys)) + return false; + + /* + * Finally, restore authoritative comparator, and indicate that + * abbreviation is not in play by setting abbrev_converter to NULL + */ + state->sortKeys[0].comparator = state->sortKeys[0].abbrev_full_comparator; + state->sortKeys[0].abbrev_converter = NULL; + /* Not strictly necessary, but be tidy */ + state->sortKeys[0].abbrev_abort = NULL; + state->sortKeys[0].abbrev_full_comparator = NULL; + + /* Give up - expect original pass-by-value representation */ + return true; + } + + return false; +} + +/* + * All tuples have been provided; finish the sort. + */ +void +tuplesort_performsort(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "performsort starting: %s", + pg_rusage_show(&state->ru_start)); +#endif + + switch (state->status) + { + case TSS_INITIAL: + + /* + * We were able to accumulate all the tuples within the allowed + * amount of memory. Just qsort 'em and we're done. + */ + tuplesort_sort_memtuples(state); + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + state->status = TSS_SORTEDINMEM; + break; + + case TSS_BOUNDED: + + /* + * We were able to accumulate all the tuples required for output + * in memory, using a heap to eliminate excess tuples. Now we + * have to transform the heap to a properly-sorted array. + */ + sort_bounded_heap(state); + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + state->status = TSS_SORTEDINMEM; + break; + + case TSS_BUILDRUNS: + + /* + * Finish tape-based sort. First, flush all tuples remaining in + * memory out to tape; then merge until we have a single remaining + * run (or, if !randomAccess, one run per tape). Note that + * mergeruns sets the correct state->status. 
+ */ + dumptuples(state, true); + mergeruns(state); + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + +#ifdef TRACE_SORT + if (trace_sort) + { + if (state->status == TSS_FINALMERGE) + elog(LOG, "performsort done (except %d-way final merge): %s", + state->activeTapes, + pg_rusage_show(&state->ru_start)); + else + elog(LOG, "performsort done: %s", + pg_rusage_show(&state->ru_start)); + } +#endif + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Internal routine to fetch the next tuple in either forward or back + * direction into *stup. Returns FALSE if no more tuples. + * If *should_free is set, the caller must pfree stup.tuple when done with it. + * Otherwise, caller should not use tuple following next call here. + * + * Note: Public tuplesort fetch routine callers cannot rely on tuple being + * allocated in their own memory context when should_free is TRUE. It may be + * necessary to create a new copy of the tuple to meet the requirements of + * public fetch routine callers. + */ +static bool +tuplesort_gettuple_common(Tuplesortstate *state, bool forward, + SortTuple *stup, bool *should_free) +{ + unsigned int tuplen; + + switch (state->status) + { + case TSS_SORTEDINMEM: + Assert(forward || state->randomAccess); + Assert(!state->batchUsed); + *should_free = false; + if (forward) + { + if (state->current < state->memtupcount) + { + *stup = state->memtuples[state->current++]; + return true; + } + state->eof_reached = true; + + /* + * Complain if caller tries to retrieve more tuples than + * originally asked for in a bounded sort. This is because + * returning EOF here might be the wrong thing. + */ + if (state->bounded && state->current >= state->bound) + elog(ERROR, "retrieved too many tuples in a bounded sort"); + + return false; + } + else + { + if (state->current <= 0) + return false; + + /* + * if all tuples are fetched already then we return last + * tuple, else - tuple before last returned. + */ + if (state->eof_reached) + state->eof_reached = false; + else + { + state->current--; /* last returned tuple */ + if (state->current <= 0) + return false; + } + *stup = state->memtuples[state->current - 1]; + return true; + } + break; + + case TSS_SORTEDONTAPE: + Assert(forward || state->randomAccess); + Assert(!state->batchUsed); + *should_free = true; + if (forward) + { + if (state->eof_reached) + return false; + if ((tuplen = getlen(state, state->result_tape, true)) != 0) + { + READTUP(state, stup, state->result_tape, tuplen); + return true; + } + else + { + state->eof_reached = true; + return false; + } + } + + /* + * Backward. + * + * if all tuples are fetched already then we return last tuple, + * else - tuple before last returned. + */ + if (state->eof_reached) + { + /* + * Seek position is pointing just past the zero tuplen at the + * end of file; back up to fetch last tuple's ending length + * word. If seek fails we must have a completely empty file. + */ + if (!LogicalTapeBackspace(state->tapeset, + state->result_tape, + 2 * sizeof(unsigned int))) + return false; + state->eof_reached = false; + } + else + { + /* + * Back up and fetch previously-returned tuple's ending length + * word. If seek fails, assume we are at start of file. 
+ */ + if (!LogicalTapeBackspace(state->tapeset, + state->result_tape, + sizeof(unsigned int))) + return false; + tuplen = getlen(state, state->result_tape, false); + + /* + * Back up to get ending length word of tuple before it. + */ + if (!LogicalTapeBackspace(state->tapeset, + state->result_tape, + tuplen + 2 * sizeof(unsigned int))) + { + /* + * If that fails, presumably the prev tuple is the first + * in the file. Back up so that it becomes next to read + * in forward direction (not obviously right, but that is + * what in-memory case does). + */ + if (!LogicalTapeBackspace(state->tapeset, + state->result_tape, + tuplen + sizeof(unsigned int))) + elog(ERROR, "bogus tuple length in backward scan"); + return false; + } + } + + tuplen = getlen(state, state->result_tape, false); + + /* + * Now we have the length of the prior tuple, back up and read it. + * Note: READTUP expects we are positioned after the initial + * length word of the tuple, so back up to that point. + */ + if (!LogicalTapeBackspace(state->tapeset, + state->result_tape, + tuplen)) + elog(ERROR, "bogus tuple length in backward scan"); + READTUP(state, stup, state->result_tape, tuplen); + return true; + + case TSS_FINALMERGE: + Assert(forward); + Assert(state->batchUsed || !state->tuples); + /* For now, assume tuple is stored in tape's batch memory */ + *should_free = false; + + /* + * This code should match the inner loop of mergeonerun(). + */ + if (state->memtupcount > 0) + { + int srcTape = state->memtuples[0].tupindex; + int tupIndex; + SortTuple *newtup; + + /* + * Returned tuple is still counted in our memory space most of + * the time. See mergebatchone() for discussion of why caller + * may occasionally be required to free returned tuple, and + * how preread memory is managed with regard to edge cases + * more generally. + */ + *stup = state->memtuples[0]; + tuplesort_heap_siftup(state, false); + if ((tupIndex = state->mergenext[srcTape]) == 0) + { + /* + * out of preloaded data on this tape, try to read more + * + * Unlike mergeonerun(), we only preload from the single + * tape that's run dry, though not before preparing its + * batch memory for a new round of sequential consumption. + * See mergepreread() comments. + */ + if (state->batchUsed) + mergebatchone(state, srcTape, stup, should_free); + + mergeprereadone(state, srcTape); + + /* + * if still no data, we've reached end of run on this tape + */ + if ((tupIndex = state->mergenext[srcTape]) == 0) + { + /* Free tape's buffer, avoiding dangling pointer */ + if (state->batchUsed) + mergebatchfreetape(state, srcTape, stup, should_free); + return true; + } + } + /* pull next preread tuple from list, insert in heap */ + newtup = &state->memtuples[tupIndex]; + state->mergenext[srcTape] = newtup->tupindex; + if (state->mergenext[srcTape] == 0) + state->mergelast[srcTape] = 0; + tuplesort_heap_insert(state, newtup, srcTape, false); + /* put the now-unused memtuples entry on the freelist */ + newtup->tupindex = state->mergefreelist; + state->mergefreelist = tupIndex; + state->mergeavailslots[srcTape]++; + return true; + } + return false; + + default: + elog(ERROR, "invalid tuplesort state"); + return false; /* keep compiler quiet */ + } +} + +/* + * Fetch the next tuple in either forward or back direction. + * If successful, put tuple in slot and return TRUE; else, clear the slot + * and return FALSE. 
+ * + * Caller may optionally be passed back abbreviated value (on TRUE return + * value) when abbreviation was used, which can be used to cheaply avoid + * equality checks that might otherwise be required. Caller can safely make a + * determination of "non-equal tuple" based on simple binary inequality. A + * NULL value in leading attribute will set abbreviated value to zeroed + * representation, which caller may rely on in abbreviated inequality check. + * + * The slot receives a tuple that's been copied into the caller's memory + * context, so that it will stay valid regardless of future manipulations of + * the tuplesort's state (up to and including deleting the tuplesort). + * This differs from similar routines for other types of tuplesorts. + */ +bool +tuplesort_gettupleslot(Tuplesortstate *state, bool forward, + TupleTableSlot *slot, Datum *abbrev) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + bool should_free; + + if (!tuplesort_gettuple_common(state, forward, &stup, &should_free)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + if (stup.tuple) + { + /* Record abbreviated key for caller */ + if (state->sortKeys->abbrev_converter && abbrev) + *abbrev = stup.datum1; + + /* + * Callers rely on tuple being in their own memory context, which is + * not guaranteed by tuplesort_gettuple_common(), even when should_free + * is set to TRUE. We must always copy here, since our interface does + * not allow callers to opt into arrangement where tuple memory can go + * away on the next call here, or after tuplesort_end() is called. + */ + ExecStoreMinimalTuple(heap_copy_minimal_tuple((MinimalTuple) stup.tuple), + slot, true); + + /* + * Free local copy if needed. It would be very invasive to get + * tuplesort_gettuple_common() to allocate tuple in caller's context + * for us, so we just do this instead. + */ + if (should_free) + pfree(stup.tuple); + + return true; + } + else + { + ExecClearTuple(slot); + return false; + } +} + +/* + * Fetch the next tuple in either forward or back direction. + * Returns NULL if no more tuples. If *should_free is set, the + * caller must pfree the returned tuple when done with it. + * If it is not set, caller should not use tuple following next + * call here. It's never okay to use it after tuplesort_end(). + */ +HeapTuple +tuplesort_getheaptuple(Tuplesortstate *state, bool forward, bool *should_free) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup, should_free)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + return stup.tuple; +} + +/* + * Fetch the next index tuple in either forward or back direction. + * Returns NULL if no more tuples. If *should_free is set, the + * caller must pfree the returned tuple when done with it. + * If it is not set, caller should not use tuple following next + * call here. It's never okay to use it after tuplesort_end(). + */ +IndexTuple +tuplesort_getindextuple(Tuplesortstate *state, bool forward, + bool *should_free) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup, should_free)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + return (IndexTuple) stup.tuple; +} + +/* + * Fetch the next Datum in either forward or back direction. + * Returns FALSE if no more datums. 
+ * + * If the Datum is pass-by-ref type, the returned value is freshly palloc'd + * in caller's context, and is now owned by the caller (this differs from + * similar routines for other types of tuplesorts). + * + * Caller may optionally be passed back abbreviated value (on TRUE return + * value) when abbreviation was used, which can be used to cheaply avoid + * equality checks that might otherwise be required. Caller can safely make a + * determination of "non-equal tuple" based on simple binary inequality. A + * NULL value will have a zeroed abbreviated value representation, which caller + * may rely on in abbreviated inequality check. + */ +bool +tuplesort_getdatum(Tuplesortstate *state, bool forward, + Datum *val, bool *isNull, Datum *abbrev) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + bool should_free; + + if (!tuplesort_gettuple_common(state, forward, &stup, &should_free)) + { + MemoryContextSwitchTo(oldcontext); + return false; + } + + /* Ensure we copy into caller's memory context */ + MemoryContextSwitchTo(oldcontext); + + /* Record abbreviated key for caller */ + if (state->sortKeys->abbrev_converter && abbrev) + *abbrev = stup.datum1; + + if (stup.isnull1 || !state->tuples) + { + *val = stup.datum1; + *isNull = stup.isnull1; + } + else + { + /* + * Callers rely on datum being in their own memory context, which is + * not guaranteed by tuplesort_gettuple_common(), even when should_free + * is set to TRUE. We must always copy here, since our interface does + * not allow callers to opt into arrangement where tuple memory can go + * away on the next call here, or after tuplesort_end() is called. + * + * Use stup.tuple because stup.datum1 may be an abbreviation. + */ + *val = datumCopy(PointerGetDatum(stup.tuple), false, state->datumTypeLen); + *isNull = false; + + /* + * Free local copy if needed. It would be very invasive to get + * tuplesort_gettuple_common() to allocate tuple in caller's context + * for us, so we just do this instead. + */ + if (should_free) + pfree(stup.tuple); + } + + return true; +} + +/* + * Advance over N tuples in either forward or back direction, + * without returning any data. N==0 is a no-op. + * Returns TRUE if successful, FALSE if ran out of tuples. + */ +bool +tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, bool forward) +{ + MemoryContext oldcontext; + + /* + * We don't actually support backwards skip yet, because no callers need + * it. The API is designed to allow for that later, though. + */ + Assert(forward); + Assert(ntuples >= 0); + + switch (state->status) + { + case TSS_SORTEDINMEM: + if (state->memtupcount - state->current >= ntuples) + { + state->current += ntuples; + return true; + } + state->current = state->memtupcount; + state->eof_reached = true; + + /* + * Complain if caller tries to retrieve more tuples than + * originally asked for in a bounded sort. This is because + * returning EOF here might be the wrong thing. + */ + if (state->bounded && state->current >= state->bound) + elog(ERROR, "retrieved too many tuples in a bounded sort"); + + return false; + + case TSS_SORTEDONTAPE: + case TSS_FINALMERGE: + + /* + * We could probably optimize these cases better, but for now it's + * not worth the trouble. 
+ */ + oldcontext = MemoryContextSwitchTo(state->sortcontext); + while (ntuples-- > 0) + { + SortTuple stup; + bool should_free; + + if (!tuplesort_gettuple_common(state, forward, + &stup, &should_free)) + { + MemoryContextSwitchTo(oldcontext); + return false; + } + if (should_free && stup.tuple) + pfree(stup.tuple); + CHECK_FOR_INTERRUPTS(); + } + MemoryContextSwitchTo(oldcontext); + return true; + + default: + elog(ERROR, "invalid tuplesort state"); + return false; /* keep compiler quiet */ + } +} + +/* + * tuplesort_merge_order - report merge order we'll use for given memory + * (note: "merge order" just means the number of input tapes in the merge). + * + * This is exported for use by the planner. allowedMem is in bytes. + */ +int +tuplesort_merge_order(int64 allowedMem) +{ + int mOrder; + + /* + * We need one tape for each merge input, plus another one for the output, + * and each of these tapes needs buffer space. In addition we want + * MERGE_BUFFER_SIZE workspace per input tape (but the output tape doesn't + * count). + * + * Note: you might be thinking we need to account for the memtuples[] + * array in this calculation, but we effectively treat that as part of the + * MERGE_BUFFER_SIZE workspace. + */ + mOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) / + (MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD); + + /* Even in minimum memory, use at least a MINORDER merge */ + mOrder = Max(mOrder, MINORDER); + + return mOrder; +} + +/* + * useselection - determine algorithm to use to sort first run. + * + * It can sometimes be useful to use the replacement selection algorithm if it + * results in one large run, and there is little available workMem. See + * remarks on RUN_SECOND optimization within dumptuples(). + */ +static bool +useselection(Tuplesortstate *state) +{ + /* + * memtupsize might be noticeably higher than memtupcount here in atypical + * cases. It seems slightly preferable to not allow recent outliers to + * impact this determination. Note that caller's trace_sort output + * reports memtupcount instead. + */ + if (state->memtupsize <= replacement_sort_tuples) + return true; + + return false; +} + +/* + * inittapes - initialize for tape sorting. + * + * This is called only if we have found we don't have room to sort in memory. + */ +static void +inittapes(Tuplesortstate *state) +{ + int maxTapes, + j; + int64 tapeSpace; + + /* Compute number of tapes to use: merge order plus 1 */ + maxTapes = tuplesort_merge_order(state->allowedMem) + 1; + + /* + * We must have at least 2*maxTapes slots in the memtuples[] array, else + * we'd not have room for merge heap plus preread. It seems unlikely that + * this case would ever occur, but be safe. + */ + maxTapes = Min(maxTapes, state->memtupsize / 2); + + state->maxTapes = maxTapes; + state->tapeRange = maxTapes - 1; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "switching to external sort with %d tapes: %s", + maxTapes, pg_rusage_show(&state->ru_start)); +#endif + + /* + * Decrease availMem to reflect the space needed for tape buffers; but + * don't decrease it to the point that we have no room for tuples. (That + * case is only likely to occur if sorting pass-by-value Datums; in all + * other scenarios the memtuples[] array is unlikely to occupy more than + * half of allowedMem. In the pass-by-value case it's not important to + * account for tuple space, so we don't care if LACKMEM becomes + * inaccurate.) 
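For intuition about the merge-order formula above, here is a rough standalone sketch of the same arithmetic with demo-only names; the constant values are assumptions for illustration only (the real definitions live elsewhere in this file and depend on BLCKSZ).

#include <stdio.h>
#include <stdint.h>

/* Assumed placeholder constants, for illustration only. */
#define DEMO_TAPE_BUFFER_OVERHEAD   (8192 * 3)
#define DEMO_MERGE_BUFFER_SIZE      (8192 * 32)
#define DEMO_MINORDER               6

/* Mirror of the merge-order calculation: memory left after the output
 * tape's buffer is split among input tapes, each needing a preread
 * workspace plus its own buffer; never drop below the minimum order. */
static int
demo_merge_order(int64_t allowedMem)
{
    int64_t mOrder = (allowedMem - DEMO_TAPE_BUFFER_OVERHEAD) /
        (DEMO_MERGE_BUFFER_SIZE + DEMO_TAPE_BUFFER_OVERHEAD);

    return (mOrder > DEMO_MINORDER) ? (int) mOrder : DEMO_MINORDER;
}

int
main(void)
{
    /* e.g. 4 MB of sort memory */
    printf("merge order for 4MB: %d\n", demo_merge_order(4 * 1024 * 1024));
    /* very small memory still yields the minimum order */
    printf("merge order for 64kB: %d\n", demo_merge_order(64 * 1024));
    return 0;
}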
+ */ + tapeSpace = (int64) maxTapes *TAPE_BUFFER_OVERHEAD; + + if (tapeSpace + GetMemoryChunkSpace(state->memtuples) < state->allowedMem) + USEMEM(state, tapeSpace); + + /* + * Make sure that the temp file(s) underlying the tape set are created in + * suitable temp tablespaces. + */ + PrepareTempTablespaces(); + + /* + * Create the tape set and allocate the per-tape data arrays. + */ + state->tapeset = LogicalTapeSetCreate(maxTapes); + + state->mergeactive = (bool *) palloc0(maxTapes * sizeof(bool)); + state->mergenext = (int *) palloc0(maxTapes * sizeof(int)); + state->mergelast = (int *) palloc0(maxTapes * sizeof(int)); + state->mergeavailslots = (int *) palloc0(maxTapes * sizeof(int)); + state->mergeavailmem = (int64 *) palloc0(maxTapes * sizeof(int64)); + state->mergetuples = (char **) palloc0(maxTapes * sizeof(char *)); + state->mergecurrent = (char **) palloc0(maxTapes * sizeof(char *)); + state->mergetail = (char **) palloc0(maxTapes * sizeof(char *)); + state->mergeoverflow = (char **) palloc0(maxTapes * sizeof(char *)); + state->tp_fib = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_runs = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_dummy = (int *) palloc0(maxTapes * sizeof(int)); + state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int)); + + /* + * Give replacement selection a try based on user setting. There will be + * a switch to a simple hybrid sort-merge strategy after the first run + * (iff we could not output one long run). + */ + state->replaceActive = useselection(state); + + if (state->replaceActive) + { + /* + * Convert the unsorted contents of memtuples[] into a heap. Each + * tuple is marked as belonging to run number zero. + * + * NOTE: we pass false for checkIndex since there's no point in + * comparing indexes in this step, even though we do intend the + * indexes to be part of the sort key... + */ + int ntuples = state->memtupcount; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "replacement selection will sort %d first run tuples", + state->memtupcount); +#endif + state->memtupcount = 0; /* make the heap empty */ + + for (j = 0; j < ntuples; j++) + { + /* Must copy source tuple to avoid possible overwrite */ + SortTuple stup = state->memtuples[j]; + + tuplesort_heap_insert(state, &stup, 0, false); + } + Assert(state->memtupcount == ntuples); + } + + state->currentRun = RUN_FIRST; + + /* + * Initialize variables of Algorithm D (step D1). + */ + for (j = 0; j < maxTapes; j++) + { + state->tp_fib[j] = 1; + state->tp_runs[j] = 0; + state->tp_dummy[j] = 1; + state->tp_tapenum[j] = j; + } + state->tp_fib[state->tapeRange] = 0; + state->tp_dummy[state->tapeRange] = 0; + + state->Level = 1; + state->destTape = 0; + + state->status = TSS_BUILDRUNS; +} + +/* + * selectnewtape -- select new tape for new initial run. + * + * This is called after finishing a run when we know another run + * must be started. This implements steps D3, D4 of Algorithm D. 
+ */ +static void +selectnewtape(Tuplesortstate *state) +{ + int j; + int a; + + /* Step D3: advance j (destTape) */ + if (state->tp_dummy[state->destTape] < state->tp_dummy[state->destTape + 1]) + { + state->destTape++; + return; + } + if (state->tp_dummy[state->destTape] != 0) + { + state->destTape = 0; + return; + } + + /* Step D4: increase level */ + state->Level++; + a = state->tp_fib[0]; + for (j = 0; j < state->tapeRange; j++) + { + state->tp_dummy[j] = a + state->tp_fib[j + 1] - state->tp_fib[j]; + state->tp_fib[j] = a + state->tp_fib[j + 1]; + } + state->destTape = 0; +} + +/* + * mergeruns -- merge all the completed initial runs. + * + * This implements steps D5, D6 of Algorithm D. All input data has + * already been written to initial runs on tape (see dumptuples). + */ +static void +mergeruns(Tuplesortstate *state) +{ + int tapenum, + svTape, + svRuns, + svDummy; + + Assert(state->status == TSS_BUILDRUNS); + Assert(state->memtupcount == 0); + + if (state->sortKeys != NULL && state->sortKeys->abbrev_converter != NULL) + { + /* + * If there are multiple runs to be merged, when we go to read back + * tuples from disk, abbreviated keys will not have been stored, and + * we don't care to regenerate them. Disable abbreviation from this + * point on. + */ + state->sortKeys->abbrev_converter = NULL; + state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator; + + /* Not strictly necessary, but be tidy */ + state->sortKeys->abbrev_abort = NULL; + state->sortKeys->abbrev_full_comparator = NULL; + } + + /* + * If we produced only one initial run (quite likely if the total data + * volume is between 1X and 2X workMem when replacement selection is used, + * but something we particular count on when input is presorted), we can + * just use that tape as the finished output, rather than doing a useless + * merge. (This obvious optimization is not in Knuth's algorithm.) + */ + if (state->currentRun == RUN_SECOND) + { + state->result_tape = state->tp_tapenum[state->destTape]; + /* must freeze and rewind the finished output tape */ + LogicalTapeFreeze(state->tapeset, state->result_tape); + state->status = TSS_SORTEDONTAPE; + return; + } + + /* End of step D2: rewind all output tapes to prepare for merging */ + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + LogicalTapeRewind(state->tapeset, tapenum, false); + + for (;;) + { + /* + * At this point we know that tape[T] is empty. If there's just one + * (real or dummy) run left on each input tape, then only one merge + * pass remains. If we don't have to produce a materialized sorted + * tape, we can stop at this point and do the final merge on-the-fly. 
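As a rough standalone illustration of what selectnewtape()'s step D4 is aiming for, the toy below (demo-only names, three input tapes assumed) applies the same target-run update once per level and prints the resulting per-tape targets, which grow as a generalized Fibonacci sequence.

#include <stdio.h>

/* Toy reproduction of the "increase level" bookkeeping (step D4):
 * tp_fib[] holds the target number of runs per input tape.  All names
 * here are local to the demo. */
#define INPUT_TAPES 3

int
main(void)
{
    int     tp_fib[INPUT_TAPES + 1];
    int     level, j;

    for (j = 0; j < INPUT_TAPES; j++)
        tp_fib[j] = 1;
    tp_fib[INPUT_TAPES] = 0;    /* output tape holds no target runs */

    for (level = 1; level <= 6; level++)
    {
        int     total = 0;
        int     a = tp_fib[0];

        printf("level %d targets:", level);
        for (j = 0; j < INPUT_TAPES; j++)
        {
            printf(" %d", tp_fib[j]);
            total += tp_fib[j];
        }
        printf("  (total runs %d)\n", total);

        /* step D4: advance to the next level's targets */
        for (j = 0; j < INPUT_TAPES; j++)
            tp_fib[j] = a + tp_fib[j + 1];
    }
    return 0;
}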
+ */ + if (!state->randomAccess) + { + bool allOneRun = true; + + Assert(state->tp_runs[state->tapeRange] == 0); + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_runs[tapenum] + state->tp_dummy[tapenum] != 1) + { + allOneRun = false; + break; + } + } + if (allOneRun) + { + /* Tell logtape.c we won't be writing anymore */ + LogicalTapeSetForgetFreeSpace(state->tapeset); + /* Initialize for the final merge pass */ + beginmerge(state, state->tuples); + state->status = TSS_FINALMERGE; + return; + } + } + + /* Step D5: merge runs onto tape[T] until tape[P] is empty */ + while (state->tp_runs[state->tapeRange - 1] || + state->tp_dummy[state->tapeRange - 1]) + { + bool allDummy = true; + + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_dummy[tapenum] == 0) + { + allDummy = false; + break; + } + } + + if (allDummy) + { + state->tp_dummy[state->tapeRange]++; + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + state->tp_dummy[tapenum]--; + } + else + mergeonerun(state); + } + + /* Step D6: decrease level */ + if (--state->Level == 0) + break; + /* rewind output tape T to use as new input */ + LogicalTapeRewind(state->tapeset, state->tp_tapenum[state->tapeRange], + false); + /* rewind used-up input tape P, and prepare it for write pass */ + LogicalTapeRewind(state->tapeset, state->tp_tapenum[state->tapeRange - 1], + true); + state->tp_runs[state->tapeRange - 1] = 0; + + /* + * reassign tape units per step D6; note we no longer care about A[] + */ + svTape = state->tp_tapenum[state->tapeRange]; + svDummy = state->tp_dummy[state->tapeRange]; + svRuns = state->tp_runs[state->tapeRange]; + for (tapenum = state->tapeRange; tapenum > 0; tapenum--) + { + state->tp_tapenum[tapenum] = state->tp_tapenum[tapenum - 1]; + state->tp_dummy[tapenum] = state->tp_dummy[tapenum - 1]; + state->tp_runs[tapenum] = state->tp_runs[tapenum - 1]; + } + state->tp_tapenum[0] = svTape; + state->tp_dummy[0] = svDummy; + state->tp_runs[0] = svRuns; + } + + /* + * Done. Knuth says that the result is on TAPE[1], but since we exited + * the loop without performing the last iteration of step D6, we have not + * rearranged the tape unit assignment, and therefore the result is on + * TAPE[T]. We need to do it this way so that we can freeze the final + * output tape while rewinding it. The last iteration of step D6 would be + * a waste of cycles anyway... + */ + state->result_tape = state->tp_tapenum[state->tapeRange]; + LogicalTapeFreeze(state->tapeset, state->result_tape); + state->status = TSS_SORTEDONTAPE; +} + +/* + * Merge one run from each input tape, except ones with dummy runs. + * + * This is the inner loop of Algorithm D step D5. We know that the + * output tape is TAPE[T]. + */ +static void +mergeonerun(Tuplesortstate *state) +{ + int destTape = state->tp_tapenum[state->tapeRange]; + int srcTape; + int tupIndex; + SortTuple *tup; + int64 priorAvail, + spaceFreed; + + /* + * Start the merge by loading one tuple from each active source tape into + * the heap. We can also decrease the input run/dummy run counts. + */ + beginmerge(state, false); + + /* + * Execute merge by repeatedly extracting lowest tuple in heap, writing it + * out, and replacing it with next tuple from same tape (if there is + * another one). 
+ */ + while (state->memtupcount > 0) + { + /* write the tuple to destTape */ + priorAvail = state->availMem; + srcTape = state->memtuples[0].tupindex; + WRITETUP(state, destTape, &state->memtuples[0]); + /* writetup adjusted total free space, now fix per-tape space */ + spaceFreed = state->availMem - priorAvail; + state->mergeavailmem[srcTape] += spaceFreed; + /* compact the heap */ + tuplesort_heap_siftup(state, false); + if ((tupIndex = state->mergenext[srcTape]) == 0) + { + /* out of preloaded data on this tape, try to read more */ + mergepreread(state); + /* if still no data, we've reached end of run on this tape */ + if ((tupIndex = state->mergenext[srcTape]) == 0) + continue; + } + /* pull next preread tuple from list, insert in heap */ + tup = &state->memtuples[tupIndex]; + state->mergenext[srcTape] = tup->tupindex; + if (state->mergenext[srcTape] == 0) + state->mergelast[srcTape] = 0; + tuplesort_heap_insert(state, tup, srcTape, false); + /* put the now-unused memtuples entry on the freelist */ + tup->tupindex = state->mergefreelist; + state->mergefreelist = tupIndex; + state->mergeavailslots[srcTape]++; + } + + /* + * Reset tuple memory. We've freed all of the tuples that we previously + * allocated, but AllocSetFree will have put those chunks of memory on + * particular free lists, bucketed by size class. Thus, although all of + * that memory is free, it is effectively fragmented. Resetting the + * context gets us out from under that problem. + */ + MemoryContextReset(state->tuplecontext); + + /* + * When the heap empties, we're done. Write an end-of-run marker on the + * output tape, and increment its count of real runs. + */ + markrunend(state, destTape); + state->tp_runs[state->tapeRange]++; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "finished %d-way merge step: %s", state->activeTapes, + pg_rusage_show(&state->ru_start)); +#endif +} + +/* + * beginmerge - initialize for a merge pass + * + * We decrease the counts of real and dummy runs for each tape, and mark + * which tapes contain active input runs in mergeactive[]. Then, load + * as many tuples as we can from each active input tape, and finally + * fill the merge heap with the first tuple from each active tape. + * + * finalMergeBatch indicates if this is the beginning of a final on-the-fly + * merge where a batched allocation of tuple memory is required. 
+ */ +static void +beginmerge(Tuplesortstate *state, bool finalMergeBatch) +{ + int activeTapes; + int tapenum; + int srcTape; + int slotsPerTape; + int64 spacePerTape; + + /* Heap should be empty here */ + Assert(state->memtupcount == 0); + + /* Adjust run counts and mark the active tapes */ + memset(state->mergeactive, 0, + state->maxTapes * sizeof(*state->mergeactive)); + activeTapes = 0; + for (tapenum = 0; tapenum < state->tapeRange; tapenum++) + { + if (state->tp_dummy[tapenum] > 0) + state->tp_dummy[tapenum]--; + else + { + Assert(state->tp_runs[tapenum] > 0); + state->tp_runs[tapenum]--; + srcTape = state->tp_tapenum[tapenum]; + state->mergeactive[srcTape] = true; + activeTapes++; + } + } + state->activeTapes = activeTapes; + + /* Clear merge-pass state variables */ + memset(state->mergenext, 0, + state->maxTapes * sizeof(*state->mergenext)); + memset(state->mergelast, 0, + state->maxTapes * sizeof(*state->mergelast)); + state->mergefreelist = 0; /* nothing in the freelist */ + state->mergefirstfree = activeTapes; /* 1st slot avail for preread */ + + if (finalMergeBatch) + { + /* Free outright buffers for tape never actually allocated */ + FREEMEM(state, (state->maxTapes - activeTapes) * TAPE_BUFFER_OVERHEAD); + + /* + * Grow memtuples one last time, since the palloc() overhead no longer + * incurred can make a big difference + */ + batchmemtuples(state); + } + + /* + * Initialize space allocation to let each active input tape have an equal + * share of preread space. + */ + Assert(activeTapes > 0); + slotsPerTape = (state->memtupsize - state->mergefirstfree) / activeTapes; + Assert(slotsPerTape > 0); + spacePerTape = MAXALIGN_DOWN(state->availMem / activeTapes); + for (srcTape = 0; srcTape < state->maxTapes; srcTape++) + { + if (state->mergeactive[srcTape]) + { + state->mergeavailslots[srcTape] = slotsPerTape; + state->mergeavailmem[srcTape] = spacePerTape; + } + } + + /* + * Preallocate tuple batch memory for each tape. This is the memory used + * for tuples themselves (not SortTuples), so it's never used by + * pass-by-value datum sorts. Memory allocation is performed here at most + * once per sort, just in advance of the final on-the-fly merge step. + */ + if (finalMergeBatch) + mergebatch(state, spacePerTape); + + /* + * Preread as many tuples as possible (and at least one) from each active + * tape + */ + mergepreread(state); + + /* Load the merge heap with the first tuple from each input tape */ + for (srcTape = 0; srcTape < state->maxTapes; srcTape++) + { + int tupIndex = state->mergenext[srcTape]; + SortTuple *tup; + + if (tupIndex) + { + tup = &state->memtuples[tupIndex]; + state->mergenext[srcTape] = tup->tupindex; + if (state->mergenext[srcTape] == 0) + state->mergelast[srcTape] = 0; + tuplesort_heap_insert(state, tup, srcTape, false); + /* put the now-unused memtuples entry on the freelist */ + tup->tupindex = state->mergefreelist; + state->mergefreelist = tupIndex; + state->mergeavailslots[srcTape]++; + +#ifdef TRACE_SORT + if (trace_sort && finalMergeBatch) + { + int64 perTapeKB = (spacePerTape + 1023) / 1024; + int64 usedSpaceKB; + int usedSlots; + + /* + * Report how effective batchmemtuples() was in balancing the + * number of slots against the need for memory for the + * underlying tuples (e.g. IndexTuples). 
The big preread of + * all tapes when switching to FINALMERGE state should be + * fairly representative of memory utilization during the + * final merge step, and in any case is the only point at + * which all tapes are guaranteed to have depleted either + * their batch memory allowance or slot allowance. Ideally, + * both will be completely depleted for every tape by now. + */ + usedSpaceKB = (state->mergecurrent[srcTape] - + state->mergetuples[srcTape] + 1023) / 1024; + usedSlots = slotsPerTape - state->mergeavailslots[srcTape]; + + elog(LOG, "tape %d initially used " INT64_FORMAT " KB of " + INT64_FORMAT " KB batch (%2.3f) and %d out of %d slots " + "(%2.3f)", srcTape, + usedSpaceKB, perTapeKB, + (double) usedSpaceKB / (double) perTapeKB, + usedSlots, slotsPerTape, + (double) usedSlots / (double) slotsPerTape); + } +#endif + } + } +} + +/* + * batchmemtuples - grow memtuples without palloc overhead + * + * When called, availMem should be approximately the amount of memory we'd + * require to allocate memtupsize - memtupcount tuples (not SortTuples/slots) + * that were allocated with palloc() overhead, and in doing so use up all + * allocated slots. However, though slots and tuple memory is in balance + * following the last grow_memtuples() call, that's predicated on the observed + * average tuple size for the "final" grow_memtuples() call, which includes + * palloc overhead. During the final merge pass, where we will arrange to + * squeeze out the palloc overhead, we might need more slots in the memtuples + * array. + * + * To make that happen, arrange for the amount of remaining memory to be + * exactly equal to the palloc overhead multiplied by the current size of + * the memtuples array, force the grow_memtuples flag back to true (it's + * probably but not necessarily false on entry to this routine), and then + * call grow_memtuples. This simulates loading enough tuples to fill the + * whole memtuples array and then having some space left over because of the + * elided palloc overhead. We expect that grow_memtuples() will conclude that + * it can't double the size of the memtuples array but that it can increase + * it by some percentage; but if it does decide to double it, that just means + * that we've never managed to use many slots in the memtuples array, in which + * case doubling it shouldn't hurt anything anyway. + */ +static void +batchmemtuples(Tuplesortstate *state) +{ + int64 refund; + int64 availMemLessRefund; + int memtupsize = state->memtupsize; + + /* Caller error if we have no tapes */ + Assert(state->activeTapes > 0); + + /* For simplicity, assume no memtuples are actually currently counted */ + Assert(state->memtupcount == 0); + + /* + * Refund STANDARDCHUNKHEADERSIZE per tuple. + * + * This sometimes fails to make memory use perfectly balanced, but it + * should never make the situation worse. Note that Assert-enabled builds + * get a larger refund, due to a varying STANDARDCHUNKHEADERSIZE. + */ + refund = memtupsize * STANDARDCHUNKHEADERSIZE; + availMemLessRefund = state->availMem - refund; + + /* + * We need to be sure that we do not cause LACKMEM to become true, else + * the batch allocation size could be calculated as negative, causing + * havoc. Hence, if availMemLessRefund is negative at this point, we must + * do nothing. Moreover, if it's positive but rather small, there's + * little point in proceeding because we could only increase memtuples by + * a small amount, not worth the cost of the repalloc's. 
We somewhat + * arbitrarily set the threshold at ALLOCSET_DEFAULT_INITSIZE per tape. + * (Note that this does not represent any assumption about tuple sizes.) + */ + if (availMemLessRefund <= + (int64) state->activeTapes * ALLOCSET_DEFAULT_INITSIZE) + return; + + /* + * To establish balanced memory use after refunding palloc overhead, + * temporarily have our accounting indicate that we've allocated all + * memory we're allowed to less that refund, and call grow_memtuples() to + * have it increase the number of slots. + */ + state->growmemtuples = true; + USEMEM(state, availMemLessRefund); + (void) grow_memtuples(state); + state->growmemtuples = false; + /* availMem must stay accurate for spacePerTape calculation */ + FREEMEM(state, availMemLessRefund); + if (LACKMEM(state)) + elog(ERROR, "unexpected out-of-memory situation in tuplesort"); + +#ifdef TRACE_SORT + if (trace_sort) + { + Size OldKb = (memtupsize * sizeof(SortTuple) + 1023) / 1024; + Size NewKb = (state->memtupsize * sizeof(SortTuple) + 1023) / 1024; + + elog(LOG, "grew memtuples %1.2fx from %d (%zu KB) to %d (%zu KB) for final merge", + (double) NewKb / (double) OldKb, + memtupsize, OldKb, + state->memtupsize, NewKb); + } +#endif +} + +/* + * mergebatch - initialize tuple memory in batch + * + * This allows sequential access to sorted tuples buffered in memory from + * tapes/runs on disk during a final on-the-fly merge step. Note that the + * memory is not used for SortTuples, but for the underlying tuples (e.g. + * MinimalTuples). + * + * Note that when batch memory is used, there is a simple division of space + * into large buffers (one per active tape). The conventional incremental + * memory accounting (calling USEMEM() and FREEMEM()) is abandoned. Instead, + * when each tape's memory budget is exceeded, a retail palloc() "overflow" is + * performed, which is then immediately detected in a way that is analogous to + * LACKMEM(). This keeps each tape's use of memory fair, which is always a + * goal. + */ +static void +mergebatch(Tuplesortstate *state, int64 spacePerTape) +{ + int srcTape; + + Assert(state->activeTapes > 0); + Assert(state->tuples); + + /* + * For the purposes of tuplesort's memory accounting, the batch allocation + * is special, and regular memory accounting through USEMEM() calls is + * abandoned (see mergeprereadone()). + */ + for (srcTape = 0; srcTape < state->maxTapes; srcTape++) + { + char *mergetuples; + + if (!state->mergeactive[srcTape]) + continue; + + /* Allocate buffer for each active tape */ + mergetuples = MemoryContextAllocHuge(state->tuplecontext, + spacePerTape); + + /* Initialize state for tape */ + state->mergetuples[srcTape] = mergetuples; + state->mergecurrent[srcTape] = mergetuples; + state->mergetail[srcTape] = mergetuples; + state->mergeoverflow[srcTape] = NULL; + } + + state->batchUsed = true; + state->spacePerTape = spacePerTape; +} + +/* + * mergebatchone - prepare batch memory for one merge input tape + * + * This is called following the exhaustion of preread tuples for one input + * tape. All that actually occurs is that the state for the source tape is + * reset to indicate that all memory may be reused. + * + * This routine must deal with fixing up the tuple that is about to be returned + * to the client, due to "overflow" allocations. 
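The per-tape batch buffers set up above hand out tuple space by bumping a pointer and fall back to a one-off "overflow" allocation when a buffer cannot satisfy a request; that overflow is what the surrounding mergebatch* routines later detect and clean up. A self-contained toy of that allocation pattern, with demo-only names and sizes:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Toy per-tape batch allocator: one big buffer per tape, a bump pointer
 * for normal requests, and a one-off malloc "overflow" otherwise. */
typedef struct DemoTapeBatch
{
    char       *buf;        /* start of the tape's batch buffer */
    size_t      size;       /* total buffer size */
    size_t      used;       /* bump offset */
    void       *overflow;   /* at most one outstanding overflow chunk */
} DemoTapeBatch;

static void *
demo_batch_alloc(DemoTapeBatch *tape, size_t len)
{
    if (tape->used + len <= tape->size)
    {
        void       *ret = tape->buf + tape->used;

        tape->used += len;
        return ret;
    }

    /* Buffer exhausted: take a retail allocation and remember it so the
     * caller can detect the condition and recycle the buffer. */
    tape->overflow = malloc(len);
    return tape->overflow;
}

static void
demo_batch_reset(DemoTapeBatch *tape)
{
    tape->used = 0;
    free(tape->overflow);
    tape->overflow = NULL;
}

int
main(void)
{
    DemoTapeBatch tape = {malloc(64), 64, 0, NULL};
    int         i;

    for (i = 0; i < 5; i++)
    {
        void       *p = demo_batch_alloc(&tape, 20);

        memset(p, 0, 20);
        printf("alloc %d: %s\n", i,
               tape.overflow == p ? "overflow" : "from batch buffer");
        if (tape.overflow)
            demo_batch_reset(&tape);    /* simulate a new preload round */
    }
    free(tape.buf);
    return 0;
}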
+ */ +static void +mergebatchone(Tuplesortstate *state, int srcTape, SortTuple *rtup, + bool *should_free) +{ + Assert(state->batchUsed); + + /* + * Tuple about to be returned to caller ("stup") is final preread tuple + * from tape, just removed from the top of the heap. Special steps around + * memory management must be performed for that tuple, to make sure it + * isn't overwritten early. + */ + if (!state->mergeoverflow[srcTape]) + { + Size tupLen; + + /* + * Mark tuple buffer range for reuse, but be careful to move final, + * tail tuple to start of space for next run so that it's available to + * caller when stup is returned, and remains available at least until + * the next tuple is requested. + */ + tupLen = state->mergecurrent[srcTape] - state->mergetail[srcTape]; + state->mergecurrent[srcTape] = state->mergetuples[srcTape]; + MOVETUP(state->mergecurrent[srcTape], state->mergetail[srcTape], + tupLen); + + /* Make SortTuple at top of the merge heap point to new tuple */ + rtup->tuple = (void *) state->mergecurrent[srcTape]; + + state->mergetail[srcTape] = state->mergecurrent[srcTape]; + state->mergecurrent[srcTape] += tupLen; + } + else + { + /* + * Handle an "overflow" retail palloc. + * + * This is needed when we run out of tuple memory for the tape. + */ + state->mergecurrent[srcTape] = state->mergetuples[srcTape]; + state->mergetail[srcTape] = state->mergetuples[srcTape]; + + if (rtup->tuple) + { + Assert(rtup->tuple == (void *) state->mergeoverflow[srcTape]); + /* Caller should free palloc'd tuple */ + *should_free = true; + } + state->mergeoverflow[srcTape] = NULL; + } +} + +/* + * mergebatchfreetape - handle final clean-up for batch memory once tape is + * about to become exhausted + * + * All tuples are returned from tape, but a single final tuple, *rtup, is to be + * passed back to caller. Free tape's batch allocation buffer while ensuring + * that the final tuple is managed appropriately. + */ +static void +mergebatchfreetape(Tuplesortstate *state, int srcTape, SortTuple *rtup, + bool *should_free) +{ + Assert(state->batchUsed); + Assert(state->status == TSS_FINALMERGE); + + /* + * Tuple may or may not already be an overflow allocation from + * mergebatchone() + */ + if (!*should_free && rtup->tuple) + { + /* + * Final tuple still in tape's batch allocation. + * + * Return palloc()'d copy to caller, and have it freed in a similar + * manner to overflow allocation. Otherwise, we'd free batch memory + * and pass back a pointer to garbage. Note that we deliberately + * allocate this in the parent tuplesort context, to be on the safe + * side. + */ + Size tuplen; + void *oldTuple = rtup->tuple; + + tuplen = state->mergecurrent[srcTape] - state->mergetail[srcTape]; + rtup->tuple = MemoryContextAlloc(state->sortcontext, tuplen); + MOVETUP(rtup->tuple, oldTuple, tuplen); + *should_free = true; + } + + /* Free spacePerTape-sized buffer */ + pfree(state->mergetuples[srcTape]); +} + +/* + * mergebatchalloc - allocate memory for one tuple using a batch memory + * "logical allocation". + * + * This is used for the final on-the-fly merge phase only. READTUP() routines + * receive memory from here in place of palloc() and USEMEM() calls. + * + * Tuple tapenum is passed, ensuring each tape's tuples are stored in sorted, + * contiguous order (while allowing safe reuse of memory made available to + * each tape). This maximizes locality of access as tuples are returned by + * final merge. + * + * Caller must not subsequently attempt to free memory returned here. 
In + * general, only mergebatch* functions know about how memory returned from + * here should be freed, and this function's caller must ensure that batch + * memory management code will definitely have the opportunity to do the right + * thing during the final on-the-fly merge. + */ +static void * +mergebatchalloc(Tuplesortstate *state, int tapenum, Size tuplen) +{ + Size reserve_tuplen = MAXALIGN(tuplen); + char *ret; + + /* Should overflow at most once before mergebatchone() call: */ + Assert(state->mergeoverflow[tapenum] == NULL); + Assert(state->batchUsed); + + /* It should be possible to use precisely spacePerTape memory at once */ + if (state->mergecurrent[tapenum] + reserve_tuplen <= + state->mergetuples[tapenum] + state->spacePerTape) + { + /* + * Usual case -- caller is returned pointer into its tape's buffer, + * and an offset from that point is recorded as where tape has + * consumed up to for current round of preloading. + */ + ret = state->mergetail[tapenum] = state->mergecurrent[tapenum]; + state->mergecurrent[tapenum] += reserve_tuplen; + } + else + { + /* + * Allocate memory, and record as tape's overflow allocation. This + * will be detected quickly, in a similar fashion to a LACKMEM() + * condition, and should not happen again before a new round of + * preloading for caller's tape. Note that we deliberately allocate + * this in the parent tuplesort context, to be on the safe side. + * + * Sometimes, this does not happen because merging runs out of slots + * before running out of memory. + */ + ret = state->mergeoverflow[tapenum] = + MemoryContextAlloc(state->sortcontext, tuplen); + } + + return ret; +} + +/* + * mergepreread - load tuples from merge input tapes + * + * This routine exists to improve sequentiality of reads during a merge pass, + * as explained in the header comments of this file. Load tuples from each + * active source tape until the tape's run is exhausted or it has used up + * its fair share of available memory. In any case, we guarantee that there + * is at least one preread tuple available from each unexhausted input tape. + * + * We invoke this routine at the start of a merge pass for initial load, + * and then whenever any tape's preread data runs out. Note that we load + * as much data as possible from all tapes, not just the one that ran out. + * This is because logtape.c works best with a usage pattern that alternates + * between reading a lot of data and writing a lot of data, so whenever we + * are forced to read, we should fill working memory completely. + * + * In FINALMERGE state, we *don't* use this routine, but instead just preread + * from the single tape that ran dry. There's no read/write alternation in + * that state and so no point in scanning through all the tapes to fix one. + * (Moreover, there may be quite a lot of inactive tapes in that state, since + * we might have had many fewer runs than tapes. In a regular tape-to-tape + * merge we can expect most of the tapes to be active. Plus, only + * FINALMERGE state has to consider memory management for a batch + * allocation.) + */ +static void +mergepreread(Tuplesortstate *state) +{ + int srcTape; + + for (srcTape = 0; srcTape < state->maxTapes; srcTape++) + mergeprereadone(state, srcTape); +} + +/* + * mergeprereadone - load tuples from one merge input tape + * + * Read tuples from the specified tape until it has used up its free memory + * or array slots; but ensure that we have at least one tuple, if any are + * to be had. 
+ */ +static void +mergeprereadone(Tuplesortstate *state, int srcTape) +{ + unsigned int tuplen; + SortTuple stup; + int tupIndex; + int64 priorAvail, + spaceUsed; + + if (!state->mergeactive[srcTape]) + return; /* tape's run is already exhausted */ + + /* + * Manage per-tape availMem. Only actually matters when batch memory not + * in use. + */ + priorAvail = state->availMem; + state->availMem = state->mergeavailmem[srcTape]; + + /* + * When batch memory is used if final on-the-fly merge, only mergeoverflow + * test is relevant; otherwise, only LACKMEM() test is relevant. + */ + while ((state->mergeavailslots[srcTape] > 0 && + state->mergeoverflow[srcTape] == NULL && !LACKMEM(state)) || + state->mergenext[srcTape] == 0) + { + /* read next tuple, if any */ + if ((tuplen = getlen(state, srcTape, true)) == 0) + { + state->mergeactive[srcTape] = false; + break; + } + READTUP(state, &stup, srcTape, tuplen); + /* find a free slot in memtuples[] for it */ + tupIndex = state->mergefreelist; + if (tupIndex) + state->mergefreelist = state->memtuples[tupIndex].tupindex; + else + { + tupIndex = state->mergefirstfree++; + Assert(tupIndex < state->memtupsize); + } + state->mergeavailslots[srcTape]--; + /* store tuple, append to list for its tape */ + stup.tupindex = 0; + state->memtuples[tupIndex] = stup; + if (state->mergelast[srcTape]) + state->memtuples[state->mergelast[srcTape]].tupindex = tupIndex; + else + state->mergenext[srcTape] = tupIndex; + state->mergelast[srcTape] = tupIndex; + } + /* update per-tape and global availmem counts */ + spaceUsed = state->mergeavailmem[srcTape] - state->availMem; + state->mergeavailmem[srcTape] = state->availMem; + state->availMem = priorAvail - spaceUsed; +} + +/* + * dumptuples - remove tuples from memtuples and write to tape + * + * This is used during initial-run building, but not during merging. + * + * When alltuples = false and replacement selection is still active, dump + * only enough tuples to get under the availMem limit (and leave at least + * one tuple in memtuples, since puttuple will then assume it is a heap that + * has a tuple to compare to). We always insist there be at least one free + * slot in the memtuples[] array. + * + * When alltuples = true, dump everything currently in memory. (This + * case is only used at end of input data, although in practice only the + * first run could fail to dump all tuples when we LACKMEM(), and only + * when replacement selection is active.) + * + * If, when replacement selection is active, we see that the tuple run + * number at the top of the heap has changed, start a new run. This must be + * the first run, because replacement selection is always abandoned for all + * further runs. + */ +static void +dumptuples(Tuplesortstate *state, bool alltuples) +{ + while (alltuples || + (LACKMEM(state) && state->memtupcount > 1) || + state->memtupcount >= state->memtupsize) + { + if (state->replaceActive) + { + /* + * Still holding out for a case favorable to replacement + * selection. Still incrementally spilling using heap. + * + * Dump the heap's frontmost entry, and sift up to remove it from + * the heap. 
+ */ + Assert(state->memtupcount > 0); + WRITETUP(state, state->tp_tapenum[state->destTape], + &state->memtuples[0]); + tuplesort_heap_siftup(state, true); + } + else + { + /* + * Once committed to quicksorting runs, never incrementally spill + */ + dumpbatch(state, alltuples); + break; + } + + /* + * If top run number has changed, we've finished the current run (this + * can only be the first run), and will no longer spill incrementally. + */ + if (state->memtupcount == 0 || + state->memtuples[0].tupindex == HEAP_RUN_NEXT) + { + markrunend(state, state->tp_tapenum[state->destTape]); + Assert(state->currentRun == RUN_FIRST); + state->currentRun++; + state->tp_runs[state->destTape]++; + state->tp_dummy[state->destTape]--; /* per Alg D step D2 */ + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "finished incrementally writing %s run %d to tape %d: %s", + (state->memtupcount == 0) ? "only" : "first", + state->currentRun, state->destTape, + pg_rusage_show(&state->ru_start)); +#endif + + /* + * Done if heap is empty, which is possible when there is only one + * long run. + */ + Assert(state->currentRun == RUN_SECOND); + if (state->memtupcount == 0) + { + /* + * Replacement selection best case; no final merge required, + * because there was only one initial run (second run has no + * tuples). See RUN_SECOND case in mergeruns(). + */ + break; + } + + /* + * Abandon replacement selection for second run (as well as any + * subsequent runs). + */ + state->replaceActive = false; + + /* + * First tuple of next run should not be heapified, and so will + * bear placeholder run number. In practice this must actually be + * the second run, which just became the currentRun, so we're + * clear to quicksort and dump the tuples in batch next time + * memtuples becomes full. + */ + Assert(state->memtuples[0].tupindex == HEAP_RUN_NEXT); + selectnewtape(state); + } + } +} + +/* + * dumpbatch - sort and dump all memtuples, forming one run on tape + * + * Second or subsequent runs are never heapified by this module (although + * heapification still respects run number differences between the first and + * second runs), and a heap (replacement selection priority queue) is often + * avoided in the first place. + */ +static void +dumpbatch(Tuplesortstate *state, bool alltuples) +{ + int memtupwrite; + int i; + + /* + * Final call might require no sorting, in rare cases where we just so + * happen to have previously LACKMEM()'d at the point where exactly all + * remaining tuples are loaded into memory, just before input was + * exhausted. + * + * In general, short final runs are quite possible. Rather than allowing + * a special case where there was a superfluous selectnewtape() call (i.e. + * a call with no subsequent run actually written to destTape), we prefer + * to write out a 0 tuple run. + * + * mergepreread()/mergeprereadone() are prepared for 0 tuple runs, and + * will reliably mark the tape inactive for the merge when called from + * beginmerge(). This case is therefore similar to the case where + * mergeonerun() finds a dummy run for the tape, and so doesn't need to + * merge a run from the tape (or conceptually "merges" the dummy run, if + * you prefer). According to Knuth, Algorithm D "isn't strictly optimal" + * in its method of distribution and dummy run assignment; this edge case + * seems very unlikely to make that appreciably worse. 
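For intuition about why replacement selection can produce one long first run from partially ordered input, here is a self-contained toy (demo-only names, a linear scan standing in for the heap): it keeps a few values in memory and always emits the smallest one that can still extend the current run, starting a new run only when nothing fits.

#include <stdio.h>
#include <limits.h>

/* Toy replacement selection over ints with WORKMEM in-memory slots. */
#define WORKMEM 3

int
main(void)
{
    int     input[] = {5, 7, 3, 9, 12, 2, 14, 20, 1, 25};
    int     ninput = sizeof(input) / sizeof(input[0]);
    int     mem[WORKMEM];
    int     next = 0;           /* next input value to load */
    int     run = 1;
    int     last = INT_MIN;     /* last value written to current run */
    int     nmem = 0;

    while (next < ninput && nmem < WORKMEM)
        mem[nmem++] = input[next++];

    printf("run %d:", run);
    while (nmem > 0)
    {
        int     best = -1;
        int     i;

        /* pick the smallest in-memory value that can extend the run */
        for (i = 0; i < nmem; i++)
            if (mem[i] >= last && (best < 0 || mem[i] < mem[best]))
                best = i;

        if (best < 0)
        {
            /* nothing fits the current run: start a new one */
            run++;
            last = INT_MIN;
            printf("\nrun %d:", run);
            continue;
        }

        printf(" %d", mem[best]);
        last = mem[best];

        /* replace the emitted value with the next input value, if any */
        if (next < ninput)
            mem[best] = input[next++];
        else
            mem[best] = mem[--nmem];
    }
    printf("\n");
    return 0;
}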
+ */ + Assert(state->status == TSS_BUILDRUNS); + + /* + * It seems unlikely that this limit will ever be exceeded, but take no + * chances + */ + if (state->currentRun == INT_MAX) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot have more than %d runs for an external sort", + INT_MAX))); + + state->currentRun++; + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "starting quicksort of run %d: %s", + state->currentRun, pg_rusage_show(&state->ru_start)); +#endif + + /* + * Sort all tuples accumulated within the allowed amount of memory for + * this run using quicksort + */ + tuplesort_sort_memtuples(state); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "finished quicksort of run %d: %s", + state->currentRun, pg_rusage_show(&state->ru_start)); +#endif + + memtupwrite = state->memtupcount; + for (i = 0; i < memtupwrite; i++) + { + WRITETUP(state, state->tp_tapenum[state->destTape], + &state->memtuples[i]); + state->memtupcount--; + } + + /* + * Reset tuple memory. We've freed all of the tuples that we previously + * allocated. It's important to avoid fragmentation when there is a stark + * change in allocation patterns due to the use of batch memory. + * Fragmentation due to AllocSetFree's bucketing by size class might be + * particularly bad if this step wasn't taken. + */ + MemoryContextReset(state->tuplecontext); + + markrunend(state, state->tp_tapenum[state->destTape]); + state->tp_runs[state->destTape]++; + state->tp_dummy[state->destTape]--; /* per Alg D step D2 */ + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "finished writing run %d to tape %d: %s", + state->currentRun, state->destTape, + pg_rusage_show(&state->ru_start)); +#endif + + if (!alltuples) + selectnewtape(state); +} + +/* + * tuplesort_rescan - rewind and replay the scan + */ +void +tuplesort_rescan(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + case TSS_SORTEDONTAPE: + LogicalTapeRewind(state->tapeset, + state->result_tape, + false); + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_markpos - saves current position in the merged sort file + */ +void +tuplesort_markpos(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->markpos_offset = state->current; + state->markpos_eof = state->eof_reached; + break; + case TSS_SORTEDONTAPE: + LogicalTapeTell(state->tapeset, + state->result_tape, + &state->markpos_block, + &state->markpos_offset); + state->markpos_eof = state->eof_reached; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_restorepos - restores current position in merged sort file to + * last saved position + */ +void +tuplesort_restorepos(Tuplesortstate *state) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->current = state->markpos_offset; + state->eof_reached = 
state->markpos_eof; + break; + case TSS_SORTEDONTAPE: + if (!LogicalTapeSeek(state->tapeset, + state->result_tape, + state->markpos_block, + state->markpos_offset)) + elog(ERROR, "tuplesort_restorepos failed"); + state->eof_reached = state->markpos_eof; + break; + default: + elog(ERROR, "invalid tuplesort state"); + break; + } + + MemoryContextSwitchTo(oldcontext); +} + +/* + * tuplesort_get_stats - extract summary statistics + * + * This can be called after tuplesort_performsort() finishes to obtain + * printable summary information about how the sort was performed. + * spaceUsed is measured in kilobytes. + */ +void +tuplesort_get_stats(Tuplesortstate *state, + const char **sortMethod, + const char **spaceType, + long *spaceUsed) +{ + /* + * Note: it might seem we should provide both memory and disk usage for a + * disk-based sort. However, the current code doesn't track memory space + * accurately once we have begun to return tuples to the caller (since we + * don't account for pfree's the caller is expected to do), so we cannot + * rely on availMem in a disk sort. This does not seem worth the overhead + * to fix. Is it worth creating an API for the memory context code to + * tell us how much is actually used in sortcontext? + */ + if (state->tapeset) + { + *spaceType = "Disk"; + *spaceUsed = LogicalTapeSetBlocks(state->tapeset) * (BLCKSZ / 1024); + } + else + { + *spaceType = "Memory"; + *spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; + } + + switch (state->status) + { + case TSS_SORTEDINMEM: + if (state->boundUsed) + *sortMethod = "top-N heapsort"; + else + *sortMethod = "quicksort"; + break; + case TSS_SORTEDONTAPE: + *sortMethod = "external sort"; + break; + case TSS_FINALMERGE: + *sortMethod = "external merge"; + break; + default: + *sortMethod = "still in progress"; + break; + } +} + + +/* + * Heap manipulation routines, per Knuth's Algorithm 5.2.3H. + * + * Compare two SortTuples. If checkIndex is true, use the tuple index + * as the front of the sort key; otherwise, no. + * + * Note that for checkIndex callers, the heap invariant is never + * maintained beyond the first run, and so there are no COMPARETUP() + * calls needed to distinguish tuples in HEAP_RUN_NEXT. + */ + +#define HEAPCOMPARE(tup1,tup2) \ + (checkIndex && ((tup1)->tupindex != (tup2)->tupindex || \ + (tup1)->tupindex == HEAP_RUN_NEXT) ? \ + ((tup1)->tupindex) - ((tup2)->tupindex) : \ + COMPARETUP(state, tup1, tup2)) + +/* + * Convert the existing unordered array of SortTuples to a bounded heap, + * discarding all but the smallest "state->bound" tuples. + * + * When working with a bounded heap, we want to keep the largest entry + * at the root (array entry zero), instead of the smallest as in the normal + * sort case. This allows us to discard the largest entry cheaply. + * Therefore, we temporarily reverse the sort direction. + * + * We assume that all entries in a bounded heap will always have tupindex + * zero; it therefore doesn't matter that HEAPCOMPARE() doesn't reverse + * the direction of comparison for tupindexes. 
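A rough standalone sketch of the bounded-heap idea (demo-only names): keep the candidate top-N in a max-heap so the current worst of them sits at the root, discard anything larger outright, and replace the root otherwise. The real code gets the same effect by temporarily reversing the sort direction of its min-heap.

#include <stdio.h>

/* Keep only the BOUND smallest inputs using a bounded max-heap. */
#define BOUND 3

static int  heap[BOUND];
static int  heapcount = 0;

static void
heap_sift_down(int value)
{
    int     i = 0;

    for (;;)
    {
        int     j = 2 * i + 1;

        if (j >= heapcount)
            break;
        if (j + 1 < heapcount && heap[j + 1] > heap[j])
            j++;                    /* pick the larger child */
        if (value >= heap[j])
            break;
        heap[i] = heap[j];
        i = j;
    }
    heap[i] = value;
}

static void
bounded_insert(int value)
{
    if (heapcount < BOUND)
    {
        /* heap not full yet: sift the new value up from the bottom */
        int     j = heapcount++;

        while (j > 0 && heap[(j - 1) >> 1] < value)
        {
            heap[j] = heap[(j - 1) >> 1];
            j = (j - 1) >> 1;
        }
        heap[j] = value;
    }
    else if (value < heap[0])
    {
        /* beats the current worst of the best BOUND: replace the root */
        heap_sift_down(value);
    }
    /* else: value can never be in the top BOUND, drop it */
}

int
main(void)
{
    int     input[] = {8, 3, 9, 1, 7, 4, 6};
    int     i;

    for (i = 0; i < 7; i++)
        bounded_insert(input[i]);
    for (i = 0; i < heapcount; i++)
        printf("%d ", heap[i]);     /* the 3 smallest, in heap order */
    printf("\n");
    return 0;
}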
+ */ +static void +make_bounded_heap(Tuplesortstate *state) +{ + int tupcount = state->memtupcount; + int i; + + Assert(state->status == TSS_INITIAL); + Assert(state->bounded); + Assert(tupcount >= state->bound); + + /* Reverse sort direction so largest entry will be at root */ + reversedirection(state); + + state->memtupcount = 0; /* make the heap empty */ + for (i = 0; i < tupcount; i++) + { + if (state->memtupcount >= state->bound && + COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0) + { + /* New tuple would just get thrown out, so skip it */ + free_sort_tuple(state, &state->memtuples[i]); + CHECK_FOR_INTERRUPTS(); + } + else + { + /* Insert next tuple into heap */ + /* Must copy source tuple to avoid possible overwrite */ + SortTuple stup = state->memtuples[i]; + + tuplesort_heap_insert(state, &stup, 0, false); + + /* If heap too full, discard largest entry */ + if (state->memtupcount > state->bound) + { + free_sort_tuple(state, &state->memtuples[0]); + tuplesort_heap_siftup(state, false); + } + } + } + + Assert(state->memtupcount == state->bound); + state->status = TSS_BOUNDED; +} + +/* + * Convert the bounded heap to a properly-sorted array + */ +static void +sort_bounded_heap(Tuplesortstate *state) +{ + int tupcount = state->memtupcount; + + Assert(state->status == TSS_BOUNDED); + Assert(state->bounded); + Assert(tupcount == state->bound); + + /* + * We can unheapify in place because each sift-up will remove the largest + * entry, which we can promptly store in the newly freed slot at the end. + * Once we're down to a single-entry heap, we're done. + */ + while (state->memtupcount > 1) + { + SortTuple stup = state->memtuples[0]; + + /* this sifts-up the next-largest entry and decreases memtupcount */ + tuplesort_heap_siftup(state, false); + state->memtuples[state->memtupcount] = stup; + } + state->memtupcount = tupcount; + + /* + * Reverse sort direction back to the original state. This is not + * actually necessary but seems like a good idea for tidiness. + */ + reversedirection(state); + + state->status = TSS_SORTEDINMEM; + state->boundUsed = true; +} + +/* + * Sort all memtuples using specialized qsort() routines. + * + * Quicksort is used for small in-memory sorts. Quicksort is also generally + * preferred to replacement selection for generating runs during external sort + * operations, although replacement selection is sometimes used for the first + * run. + */ +static void +tuplesort_sort_memtuples(Tuplesortstate *state) +{ + if (state->memtupcount > 1) + { + /* Can we use the single-key sort function? */ + if (state->onlyKey != NULL) + qsort_ssup(state->memtuples, state->memtupcount, + state->onlyKey); + else + qsort_tuple(state->memtuples, + state->memtupcount, + state->comparetup, + state); + } +} + +/* + * Insert a new tuple into an empty or existing heap, maintaining the + * heap invariant. Caller is responsible for ensuring there's room. + * + * Note: we assume *tuple is a temporary variable that can be scribbled on. + * For some callers, tuple actually points to a memtuples[] entry above the + * end of the heap. This is safe as long as it's not immediately adjacent + * to the end of the heap (ie, in the [memtupcount] array entry) --- if it + * is, it might get overwritten before being moved into the heap! + */ +static void +tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple, + int tupleindex, bool checkIndex) +{ + SortTuple *memtuples; + int j; + + /* + * Save the tupleindex --- see notes above about writing on *tuple. 
It's a + * historical artifact that tupleindex is passed as a separate argument + * and not in *tuple, but it's notationally convenient so let's leave it + * that way. + */ + tuple->tupindex = tupleindex; + + memtuples = state->memtuples; + Assert(state->memtupcount < state->memtupsize); + Assert(!checkIndex || tupleindex == RUN_FIRST); + + CHECK_FOR_INTERRUPTS(); + + /* + * Sift-up the new entry, per Knuth 5.2.3 exercise 16. Note that Knuth is + * using 1-based array indexes, not 0-based. + */ + j = state->memtupcount++; + while (j > 0) + { + int i = (j - 1) >> 1; + + if (HEAPCOMPARE(tuple, &memtuples[i]) >= 0) + break; + memtuples[j] = memtuples[i]; + j = i; + } + memtuples[j] = *tuple; +} + +/* + * The tuple at state->memtuples[0] has been removed from the heap. + * Decrement memtupcount, and sift up to maintain the heap invariant. + */ +static void +tuplesort_heap_siftup(Tuplesortstate *state, bool checkIndex) +{ + SortTuple *memtuples = state->memtuples; + SortTuple *tuple; + unsigned int i, + n; + + Assert(!checkIndex || state->currentRun == RUN_FIRST); + if (--state->memtupcount <= 0) + return; + + CHECK_FOR_INTERRUPTS(); + + /* + * state->memtupcount is "int", but we use "unsigned int" for i, j, n. + * This prevents overflow in the "2 * i + 1" calculation, since at the top + * of the loop we must have i < n <= INT_MAX <= UINT_MAX/2. + */ + n = state->memtupcount; + tuple = &memtuples[n]; /* tuple that must be reinserted */ + i = 0; /* i is where the "hole" is */ + for (;;) + { + unsigned int j = 2 * i + 1; + + if (j >= n) + break; + if (j + 1 < n && + HEAPCOMPARE(&memtuples[j], &memtuples[j + 1]) > 0) + j++; + if (HEAPCOMPARE(tuple, &memtuples[j]) <= 0) + break; + memtuples[i] = memtuples[j]; + i = j; + } + memtuples[i] = *tuple; +} + +/* + * Function to reverse the sort direction from its current state + * + * It is not safe to call this when performing hash tuplesorts + */ +static void +reversedirection(Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + int nkey; + + for (nkey = 0; nkey < state->nKeys; nkey++, sortKey++) + { + sortKey->ssup_reverse = !sortKey->ssup_reverse; + sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first; + } +} + + +/* + * Tape interface routines + */ + +static unsigned int +getlen(Tuplesortstate *state, int tapenum, bool eofOK) +{ + unsigned int len; + + if (LogicalTapeRead(state->tapeset, tapenum, + &len, sizeof(len)) != sizeof(len)) + elog(ERROR, "unexpected end of tape"); + if (len == 0 && !eofOK) + elog(ERROR, "unexpected end of data"); + return len; +} + +static void +markrunend(Tuplesortstate *state, int tapenum) +{ + unsigned int len = 0; + + LogicalTapeWrite(state->tapeset, tapenum, (void *) &len, sizeof(len)); +} + +/* + * Get memory for tuple from within READTUP() routine. Allocate + * memory and account for that, or consume from tape's batch + * allocation. + * + * Memory returned here in the final on-the-fly merge case is recycled + * from tape's batch allocation. Otherwise, callers must pfree() or + * reset tuple child memory context, and account for that with a + * FREEMEM(). Currently, this only ever needs to happen in WRITETUP() + * routines. + */ +static void * +readtup_alloc(Tuplesortstate *state, int tapenum, Size tuplen) +{ + if (state->batchUsed) + { + /* + * No USEMEM() call, because during final on-the-fly merge accounting + * is based on tape-private state. ("Overflow" allocations are + * detected as an indication that a new round or preloading is + * required. 
Preloading marks existing contents of tape's batch buffer + * for reuse.) + */ + return mergebatchalloc(state, tapenum, tuplen); + } + else + { + char *ret; + + /* Batch allocation yet to be performed */ + ret = MemoryContextAlloc(state->tuplecontext, tuplen); + USEMEM(state, GetMemoryChunkSpace(ret)); + return ret; + } +} + + +/* + * Routines specialized for HeapTuple (actually MinimalTuple) case + */ + +static int +comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + HeapTupleData ltup; + HeapTupleData rtup; + TupleDesc tupDesc; + int nkey; + int32 compare; + AttrNumber attno; + Datum datum1, + datum2; + bool isnull1, + isnull2; + + + /* Compare the leading sort key */ + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + /* Compare additional sort keys */ + ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET; + ltup.t_data = (HeapTupleHeader) ((char *) a->tuple - MINIMAL_TUPLE_OFFSET); + rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET; + rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET); + tupDesc = state->tupDesc; + + if (sortKey->abbrev_converter) + { + attno = sortKey->ssup_attno; + + datum1 = heap_getattr(<up, attno, tupDesc, &isnull1); + datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + sortKey++; + for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++) + { + attno = sortKey->ssup_attno; + + datum1 = heap_getattr(<up, attno, tupDesc, &isnull1); + datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + return 0; +} + +static void +copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* + * We expect the passed "tup" to be a TupleTableSlot, and form a + * MinimalTuple using the exported interface for that. + */ + TupleTableSlot *slot = (TupleTableSlot *) tup; + Datum original; + MinimalTuple tuple; + HeapTupleData htup; + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + + /* copy the tuple into sort storage */ + tuple = ExecCopySlotMinimalTuple(slot); + stup->tuple = (void *) tuple; + USEMEM(state, GetMemoryChunkSpace(tuple)); + /* set up first-column key value */ + htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + original = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &stup->isnull1); + + MemoryContextSwitchTo(oldcontext); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). 
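As a rough standalone illustration of abbreviated keys (demo-only code, not the actual datum machinery): a cheap fixed-size stand-in for the full value resolves most comparisons, and only ties between abbreviations fall back to the authoritative comparator.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Abbreviate a string as its first 8 bytes packed big-endian style into
 * a uint64_t, so that unsigned comparison of abbreviations agrees with
 * memcmp() on those (zero-padded) bytes. */
static uint64_t
abbrev_string(const char *s)
{
    uint64_t    key = 0;
    size_t      len = strlen(s);
    size_t      i;

    for (i = 0; i < 8; i++)
    {
        key <<= 8;
        if (i < len)
            key |= (unsigned char) s[i];
    }
    return key;
}

/* Compare using abbreviations first; fall back to the full comparison
 * only when the abbreviations tie (they are a prefix, so a tie proves
 * nothing about the remaining bytes). */
static int
compare_abbrev(const char *a, const char *b)
{
    uint64_t    ka = abbrev_string(a);
    uint64_t    kb = abbrev_string(b);

    if (ka < kb)
        return -1;
    if (ka > kb)
        return 1;
    return strcmp(a, b);        /* authoritative tie-breaker */
}

int
main(void)
{
    printf("%d\n", compare_abbrev("apple", "banana"));            /* < 0, no tie-break */
    printf("%d\n", compare_abbrev("abbreviate", "abbreviation")); /* tie-break needed */
    return 0;
}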
+ */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + htup.t_len = ((MinimalTuple) mtup->tuple)->t_len + + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) mtup->tuple - + MINIMAL_TUPLE_OFFSET); + + mtup->datum1 = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &mtup->isnull1); + } + } +} + +static void +writetup_heap(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + MinimalTuple tuple = (MinimalTuple) stup->tuple; + + /* the part of the MinimalTuple we'll write: */ + char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; + unsigned int tupbodylen = tuple->t_len - MINIMAL_TUPLE_DATA_OFFSET; + + /* total on-disk footprint: */ + unsigned int tuplen = tupbodylen + sizeof(int); + + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) tupbody, tupbodylen); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + + FREEMEM(state, GetMemoryChunkSpace(tuple)); + heap_free_minimal_tuple(tuple); +} + +static void +readtup_heap(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tupbodylen = len - sizeof(int); + unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET; + MinimalTuple tuple = (MinimalTuple) readtup_alloc(state, tapenum, tuplen); + char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET; + HeapTupleData htup; + + /* read in the tuple proper */ + tuple->t_len = tuplen; + LogicalTapeReadExact(state->tapeset, tapenum, + tupbody, tupbodylen); + if (state->randomAccess) /* need trailing length word? 
*/ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value */ + htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + stup->datum1 = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &stup->isnull1); +} + +static void +movetup_heap(void *dest, void *src, unsigned int len) +{ + memmove(dest, src, len); +} + +/* + * Routines specialized for the CLUSTER case (HeapTuple data, with + * comparisons per a btree index definition) + */ + +static int +comparetup_cluster(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + SortSupport sortKey = state->sortKeys; + HeapTuple ltup; + HeapTuple rtup; + TupleDesc tupDesc; + int nkey; + int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + AttrNumber leading = state->indexInfo->ii_KeyAttrNumbers[0]; + + /* Be prepared to compare additional sort keys */ + ltup = (HeapTuple) a->tuple; + rtup = (HeapTuple) b->tuple; + tupDesc = state->tupDesc; + + /* Compare the leading sort key, if it's simple */ + if (leading != 0) + { + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + if (sortKey->abbrev_converter) + { + datum1 = heap_getattr(ltup, leading, tupDesc, &isnull1); + datum2 = heap_getattr(rtup, leading, tupDesc, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + } + if (compare != 0 || state->nKeys == 1) + return compare; + /* Compare additional columns the hard way */ + sortKey++; + nkey = 1; + } + else + { + /* Must compare all keys the hard way */ + nkey = 0; + } + + if (state->indexInfo->ii_Expressions == NULL) + { + /* If not expression index, just compare the proper heap attrs */ + + for (; nkey < state->nKeys; nkey++, sortKey++) + { + AttrNumber attno = state->indexInfo->ii_KeyAttrNumbers[nkey]; + + datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1); + datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + } + else + { + /* + * In the expression index case, compute the whole index tuple and + * then compare values. It would perhaps be faster to compute only as + * many columns as we need to compare, but that would require + * duplicating all the logic in FormIndexDatum. 
+ */ + Datum l_index_values[INDEX_MAX_KEYS]; + bool l_index_isnull[INDEX_MAX_KEYS]; + Datum r_index_values[INDEX_MAX_KEYS]; + bool r_index_isnull[INDEX_MAX_KEYS]; + TupleTableSlot *ecxt_scantuple; + + /* Reset context each time to prevent memory leakage */ + ResetPerTupleExprContext(state->estate); + + ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple; + + ExecStoreTuple(ltup, ecxt_scantuple, InvalidBuffer, false); + FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, + l_index_values, l_index_isnull); + + ExecStoreTuple(rtup, ecxt_scantuple, InvalidBuffer, false); + FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, + r_index_values, r_index_isnull); + + for (; nkey < state->nKeys; nkey++, sortKey++) + { + compare = ApplySortComparator(l_index_values[nkey], + l_index_isnull[nkey], + r_index_values[nkey], + r_index_isnull[nkey], + sortKey); + if (compare != 0) + return compare; + } + } + + return 0; +} + +static void +copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + HeapTuple tuple = (HeapTuple) tup; + Datum original; + MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext); + + /* copy the tuple into sort storage */ + tuple = heap_copytuple(tuple); + stup->tuple = (void *) tuple; + USEMEM(state, GetMemoryChunkSpace(tuple)); + + MemoryContextSwitchTo(oldcontext); + + /* + * set up first-column key value, and potentially abbreviate, if it's a + * simple column + */ + if (state->indexInfo->ii_KeyAttrNumbers[0] == 0) + return; + + original = heap_getattr(tuple, + state->indexInfo->ii_KeyAttrNumbers[0], + state->tupDesc, + &stup->isnull1); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). 
+ */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = (HeapTuple) mtup->tuple; + mtup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_KeyAttrNumbers[0], + state->tupDesc, + &mtup->isnull1); + } + } +} + +static void +writetup_cluster(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + HeapTuple tuple = (HeapTuple) stup->tuple; + unsigned int tuplen = tuple->t_len + sizeof(ItemPointerData) + sizeof(int); + + /* We need to store t_self, but not other fields of HeapTupleData */ + LogicalTapeWrite(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + &tuple->t_self, sizeof(ItemPointerData)); + LogicalTapeWrite(state->tapeset, tapenum, + tuple->t_data, tuple->t_len); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + + FREEMEM(state, GetMemoryChunkSpace(tuple)); + heap_freetuple(tuple); +} + +static void +readtup_cluster(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int tuplen) +{ + unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int); + HeapTuple tuple = (HeapTuple) readtup_alloc(state, + tapenum, + t_len + HEAPTUPLESIZE); + + /* Reconstruct the HeapTupleData header */ + tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE); + tuple->t_len = t_len; + LogicalTapeReadExact(state->tapeset, tapenum, + &tuple->t_self, sizeof(ItemPointerData)); + /* We don't currently bother to reconstruct t_tableOid */ + tuple->t_tableOid = InvalidOid; + /* Read in the tuple body */ + LogicalTapeReadExact(state->tapeset, tapenum, + tuple->t_data, tuple->t_len); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value, if it's a simple column */ + if (state->indexInfo->ii_KeyAttrNumbers[0] != 0) + stup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_KeyAttrNumbers[0], + state->tupDesc, + &stup->isnull1); +} + +static void +movetup_cluster(void *dest, void *src, unsigned int len) +{ + HeapTuple tuple; + + memmove(dest, src, len); + + /* Repoint the HeapTupleData header */ + tuple = (HeapTuple) dest; + tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE); +} + + +/* + * Routines specialized for IndexTuple case + * + * The btree and hash cases require separate comparison functions, but the + * IndexTuple representation is the same so the copy/write/read support + * functions can be shared. + */ + +static int +comparetup_index_btree(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + /* + * This is similar to comparetup_heap(), but expects index tuples. There + * is also special handling for enforcing uniqueness, and special + * treatment for equal keys at the end. 
+ */ + SortSupport sortKey = state->sortKeys; + IndexTuple tuple1; + IndexTuple tuple2; + int keysz; + TupleDesc tupDes; + bool equal_hasnull = false; + int nkey; + int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + + + /* Compare the leading sort key */ + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + /* Compare additional sort keys */ + tuple1 = (IndexTuple) a->tuple; + tuple2 = (IndexTuple) b->tuple; + keysz = state->nKeys; + tupDes = RelationGetDescr(state->indexRel); + + if (sortKey->abbrev_converter) + { + datum1 = index_getattr(tuple1, 1, tupDes, &isnull1); + datum2 = index_getattr(tuple2, 1, tupDes, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + /* they are equal, so we only need to examine one null flag */ + if (a->isnull1) + equal_hasnull = true; + + sortKey++; + for (nkey = 2; nkey <= keysz; nkey++, sortKey++) + { + datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1); + datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; /* done when we find unequal attributes */ + + /* they are equal, so we only need to examine one null flag */ + if (isnull1) + equal_hasnull = true; + } + + /* + * If btree has asked us to enforce uniqueness, complain if two equal + * tuples are detected (unless there was at least one NULL field). + * + * It is sufficient to make the test here, because if two tuples are equal + * they *must* get compared at some stage of the sort --- otherwise the + * sort algorithm wouldn't have checked whether one must appear before the + * other. + */ + if (state->enforceUnique && !equal_hasnull) + { + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + char *key_desc; + + /* + * Some rather brain-dead implementations of qsort (such as the one in + * QNX 4) will sometimes call the comparison routine to compare a + * value to itself, but we always use our own implementation, which + * does not. + */ + Assert(tuple1 != tuple2); + + index_deform_tuple(tuple1, tupDes, values, isnull); + + key_desc = BuildIndexValueDescription(state->indexRel, values, isnull); + + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("could not create unique index \"%s\"", + RelationGetRelationName(state->indexRel)), + key_desc ? errdetail("Key %s is duplicated.", key_desc) : + errdetail("Duplicate keys exist."), + errtableconstraint(state->heapRel, + RelationGetRelationName(state->indexRel)))); + } + + /* + * If key values are equal, we sort on ItemPointer. This does not affect + * validity of the finished index, but it may be useful to have index + * scans in physical order. + */ + { + BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); + BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); + + if (pos1 != pos2) + return (pos1 < pos2) ? 
-1 : 1; + } + + /* ItemPointer values should never be equal */ + Assert(false); + + return 0; +} + +static int +comparetup_index_hash(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + uint32 hash1; + uint32 hash2; + IndexTuple tuple1; + IndexTuple tuple2; + + /* + * Fetch hash keys and mask off bits we don't want to sort by. We know + * that the first column of the index tuple is the hash key. + */ + Assert(!a->isnull1); + hash1 = DatumGetUInt32(a->datum1) & state->hash_mask; + Assert(!b->isnull1); + hash2 = DatumGetUInt32(b->datum1) & state->hash_mask; + + if (hash1 > hash2) + return 1; + else if (hash1 < hash2) + return -1; + + /* + * If hash values are equal, we sort on ItemPointer. This does not affect + * validity of the finished index, but it may be useful to have index + * scans in physical order. + */ + tuple1 = (IndexTuple) a->tuple; + tuple2 = (IndexTuple) b->tuple; + + { + BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); + BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); + + if (pos1 != pos2) + return (pos1 < pos2) ? -1 : 1; + } + + /* ItemPointer values should never be equal */ + Assert(false); + + return 0; +} + +static void +copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + IndexTuple tuple = (IndexTuple) tup; + unsigned int tuplen = IndexTupleSize(tuple); + IndexTuple newtuple; + Datum original; + + /* copy the tuple into sort storage */ + newtuple = (IndexTuple) MemoryContextAlloc(state->tuplecontext, tuplen); + memcpy(newtuple, tuple, tuplen); + USEMEM(state, GetMemoryChunkSpace(newtuple)); + stup->tuple = (void *) newtuple; + /* set up first-column key value */ + original = index_getattr(newtuple, + 1, + RelationGetDescr(state->indexRel), + &stup->isnull1); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to zeroed representation (to be + * consistent, and to support cheap inequality tests for NULL + * abbreviated keys). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just + * handled). It does not matter if some dumped tuples are already + * sorted on tape, since serialized tuples lack abbreviated keys + * (TSS_BUILDRUNS state prevents control reaching here in any case). 
+ */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = (IndexTuple) mtup->tuple; + mtup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &mtup->isnull1); + } + } +} + +static void +writetup_index(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + IndexTuple tuple = (IndexTuple) stup->tuple; + unsigned int tuplen; + + tuplen = IndexTupleSize(tuple) + sizeof(tuplen); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + (void *) tuple, IndexTupleSize(tuple)); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &tuplen, sizeof(tuplen)); + + FREEMEM(state, GetMemoryChunkSpace(tuple)); + pfree(tuple); +} + +static void +readtup_index(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tuplen = len - sizeof(unsigned int); + IndexTuple tuple = (IndexTuple) readtup_alloc(state, tapenum, tuplen); + + LogicalTapeReadExact(state->tapeset, tapenum, + tuple, tuplen); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + stup->tuple = (void *) tuple; + /* set up first-column key value */ + stup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup->isnull1); +} + +static void +movetup_index(void *dest, void *src, unsigned int len) +{ + memmove(dest, src, len); +} + +/* + * Routines specialized for DatumTuple case + */ + +static int +comparetup_datum(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) +{ + int compare; + + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + state->sortKeys); + if (compare != 0) + return compare; + + /* if we have abbreviations, then "tuple" has the original value */ + + if (state->sortKeys->abbrev_converter) + compare = ApplySortAbbrevFullComparator(PointerGetDatum(a->tuple), a->isnull1, + PointerGetDatum(b->tuple), b->isnull1, + state->sortKeys); + + return compare; +} + +static void +copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + /* Not currently needed */ + elog(ERROR, "copytup_datum() should not be called"); +} + +static void +writetup_datum(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + void *waddr; + unsigned int tuplen; + unsigned int writtenlen; + + if (stup->isnull1) + { + waddr = NULL; + tuplen = 0; + } + else if (!state->tuples) + { + waddr = &stup->datum1; + tuplen = sizeof(Datum); + } + else + { + waddr = stup->tuple; + tuplen = datumGetSize(PointerGetDatum(stup->tuple), false, state->datumTypeLen); + Assert(tuplen != 0); + } + + writtenlen = tuplen + sizeof(unsigned int); + + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &writtenlen, sizeof(writtenlen)); + LogicalTapeWrite(state->tapeset, tapenum, + waddr, tuplen); + if (state->randomAccess) /* need trailing length word? 
*/ + LogicalTapeWrite(state->tapeset, tapenum, + (void *) &writtenlen, sizeof(writtenlen)); + + if (stup->tuple) + { + FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); + pfree(stup->tuple); + stup->tuple = NULL; + } +} + +static void +readtup_datum(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len) +{ + unsigned int tuplen = len - sizeof(unsigned int); + + if (tuplen == 0) + { + /* it's NULL */ + stup->datum1 = (Datum) 0; + stup->isnull1 = true; + stup->tuple = NULL; + } + else if (!state->tuples) + { + Assert(tuplen == sizeof(Datum)); + LogicalTapeReadExact(state->tapeset, tapenum, + &stup->datum1, tuplen); + stup->isnull1 = false; + stup->tuple = NULL; + } + else + { + void *raddr = readtup_alloc(state, tapenum, tuplen); + + LogicalTapeReadExact(state->tapeset, tapenum, + raddr, tuplen); + stup->datum1 = PointerGetDatum(raddr); + stup->isnull1 = false; + stup->tuple = raddr; + } + + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeReadExact(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); +} + +static void +movetup_datum(void *dest, void *src, unsigned int len) +{ + memmove(dest, src, len); +} + +/* + * Convenience routine to free a tuple previously loaded into sort memory + */ +static void +free_sort_tuple(Tuplesortstate *state, SortTuple *stup) +{ + if (stup->tuple) + { + FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); + pfree(stup->tuple); + } +} diff --git a/t/001_wal.pl b/t/001_wal.pl index 6cd507da86..a169683ee6 100644 --- a/t/001_wal.pl +++ b/t/001_wal.pl @@ -1,10 +1,31 @@ # Test generic xlog record work for rum index replication. use strict; use warnings; -use PostgresNode; -use TestLib; use Test::More tests => 31; +my $pg_15_modules; + +BEGIN +{ + $pg_15_modules = eval + { + require PostgreSQL::Test::Cluster; + require PostgreSQL::Test::Utils; + return 1; + }; + + unless (defined $pg_15_modules) + { + $pg_15_modules = 0; + + require PostgresNode; + require TestLib; + } +} + +note('PostgreSQL 15 modules are used: ' . ($pg_15_modules ? 'yes' : 'no')); + + my $node_master; my $node_standby; @@ -23,12 +44,12 @@ sub test_index_replay if ($server_version < 100000) { $caughtup_query = - "SELECT pg_current_xlog_location() <= write_location FROM pg_stat_replication WHERE application_name = '$applname';"; + "SELECT pg_current_xlog_location() <= replay_location FROM pg_stat_replication WHERE application_name = '$applname';"; } else { $caughtup_query = - "SELECT pg_current_wal_lsn() <= write_lsn FROM pg_stat_replication WHERE application_name = '$applname';"; + "SELECT pg_current_wal_lsn() <= replay_lsn FROM pg_stat_replication WHERE application_name = '$applname';"; } $node_master->poll_query_until('postgres', $caughtup_query) or die "Timed out while waiting for standby 1 to catch up"; @@ -50,7 +71,23 @@ sub test_index_replay } # Initialize master node -$node_master = get_new_node('master'); + +# Create node. +# Older versions of PostgreSQL modules use get_new_node function. +# Newer use standard perl object constructor syntax. +# Also applies for node_standby (below). 
+eval
+{
+    if ($pg_15_modules)
+    {
+        $node_master = PostgreSQL::Test::Cluster->new("master");
+    }
+    else
+    {
+        $node_master = PostgresNode::get_new_node("master");
+    }
+};
+
 $node_master->init(allows_streaming => 1);
 $node_master->start;
 my $backup_name = 'my_backup';
@@ -59,7 +96,18 @@ sub test_index_replay
 $node_master->backup($backup_name);
 
 # Create streaming standby linking to master
-$node_standby = get_new_node('standby');
+eval
+{
+    if ($pg_15_modules)
+    {
+        $node_standby = PostgreSQL::Test::Cluster->new("standby");
+    }
+    else
+    {
+        $node_standby = PostgresNode::get_new_node("standby");
+    }
+};
+
 $node_standby->init_from_backup($node_master, $backup_name,
     has_streaming => 1);
 $node_standby->start;
@@ -71,7 +119,7 @@ sub test_index_replay
         to_tsvector('simple', array_to_string(array(
             select substr('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ',
                 trunc(random() * 52)::integer + 1, 1)
             FROM generate_series(i, i + 4)), ''))
-    FROM generate_series(1,100000) i;");
+    FROM generate_series(1,16000) i;");
 $node_master->psql("postgres", "CREATE INDEX rumidx ON tst USING rum (t rum_tsvector_ops);");
 
 # Test that queries give same result
diff --git a/t/002_pglist.pl b/t/002_pglist.pl
new file mode 100644
index 0000000000..7b2d76c058
--- /dev/null
+++ b/t/002_pglist.pl
@@ -0,0 +1,207 @@
+# Test RUM index with the big 'pglist' database.
+use strict;
+use warnings;
+use Config;
+use Test::More;
+
+plan skip_all => 'This test requires downloading a 1GB archive. ' .
+    'The unpacked dump is almost 3GB. ' .
+    'Run it only if big_values is enabled in PG_TEST_EXTRA'
+    unless $ENV{PG_TEST_EXTRA} && $ENV{PG_TEST_EXTRA} =~ /\bbig_values\b/;
+
+plan tests => 4;
+
+my $node;
+
+# Utility function
+
+sub file_exists
+{
+    my ($file) = @_;
+    return -e $file;
+}
+
+# Check for the test database and install it if necessary
+
+sub install_pglist
+{
+    my $dir = Cwd->getcwd;    # current directory
+
+    my %config = (
+        # directory with the pglist dump must be inside the current directory
+        pglist_tmp_dir => $dir . '/pglist_tmp/',
+        dump_name => 'pglist-28-04-16.dump',
+        dump_url => 'http://www.sai.msu.su/~megera/postgres/files/pglist-28-04-16.dump.gz',
+        pglist_archive => $dir . '/pglist_tmp/' . 'pglist-28-04-16.dump.gz',
+        );
+
+    my $path_to_dump = $config{pglist_tmp_dir} . $config{dump_name};
+
+    if (file_exists($path_to_dump))
+    {
+        note($config{dump_name} . ' already installed');
+    }
+    else
+    {
+        # Create the folder /contrib/rum/pglist_tmp if it does not already exist
+        mkdir($config{pglist_tmp_dir}, 0700)
+            unless file_exists($config{pglist_tmp_dir});
+
+        # Download the archive pglist-28-04-16.dump.gz if it does not already exist
+        unless (file_exists($config{pglist_archive}))
+        {
+            note('Downloading pglist dump to ' . $config{pglist_archive});
+
+            # The "-nv" flag keeps frequent download-status messages
+            # out of the log.
+            # Drop it if the details are needed for debugging.
+            system("wget -P $config{pglist_tmp_dir} -nv $config{dump_url}") == 0
+                or die "Couldn't download the archive: $?";
+        }
+
+        # Unzip the dump; delete the archive to save disk space
+        system("gzip -d $config{pglist_archive}") == 0
+            or die "Couldn't extract archive: $?";
+
+        file_exists($path_to_dump)
+            or die "Failed to get " . $config{dump_name};
+
+        note($config{dump_name} . ' is ready to use');
+    }
+
+    $node->psql("postgres", "CREATE DATABASE pglist");
+    $node->psql("postgres", "CREATE ROLE oleg");
+    my $command = "'" . $path_to_dump . "'";
+    my $result = $node->psql("pglist", '\i ' . $command);
+}
+
+# Test SELECT queries against the 'pglist' database
+
+sub test_select
+{
+    note("Creating index 'rumidx_orderby_sent'");
+
+    $node->safe_psql("pglist", "CREATE INDEX rumidx_orderby_sent ON pglist " .
+        "USING rum (fts rum_tsvector_timestamp_ops, sent) " .
+        "WITH (attach=sent, to=fts, order_by_attach=t)");
+
+    note("Test ORDER BY timestamp");
+
+    my $result1 = $node->safe_psql("pglist",
+        "SELECT sent, subject FROM pglist WHERE fts @@ " .
+        "to_tsquery('english', 'backend <-> crushed') " .
+        "ORDER BY sent <=| '2016-01-01 00:01' LIMIT 5");
+
+    is($result1, '1999-06-02 11:52:46|Re: [HACKERS] PID of backend');
+
+    note("Test tsvector filter");
+
+    my $result2 = $node->safe_psql("pglist",
+        "SELECT count(*) FROM pglist " .
+        "WHERE fts @@ to_tsquery('english', 'tom & lane')");
+
+    is($result2, '222813');
+
+    $node->safe_psql("pglist", "DROP INDEX rumidx_orderby_sent");
+}
+
+sub test_order_by
+{
+    note("Creating index 'pglist_rum_idx'");
+
+    $node->safe_psql("pglist",
+        "CREATE INDEX pglist_rum_idx ON pglist " .
+        "USING rum (fts rum_tsvector_ops)");
+
+    note("Test ORDER BY tsvector");
+
+    my $result3 = $node->safe_psql("pglist",
+        "SELECT id FROM pglist " .
+        "WHERE fts @@ to_tsquery('english', 'postgres:*') " .
+        "ORDER BY fts <=> " .
+        "to_tsquery('english', 'postgres:*') LIMIT 9");
+
+    is((split(" ", $result3))[0], '816114');
+
+    # Autovacuum after a large update used to crash postgres when a RUM index was active
+    note("Test Issue #19");
+
+    my $stderr;
+    $node->safe_psql("pglist", "DELETE FROM pglist WHERE id < 100000");
+    $node->safe_psql("pglist", "vacuum", stderr => \$stderr);
+
+    is($stderr, undef);
+
+    $node->safe_psql("pglist", "DROP INDEX pglist_rum_idx");
+}
+
+# Start the node
+
+my $pg_15_modules;
+
+BEGIN
+{
+    $pg_15_modules = eval
+    {
+        require PostgreSQL::Test::Cluster;
+        require PostgreSQL::Test::Utils;
+        return 1;
+    };
+
+    unless (defined $pg_15_modules)
+    {
+        $pg_15_modules = 0;
+
+        require PostgresNode;
+        require TestLib;
+    }
+}
+
+note('PostgreSQL 15 modules are used: ' . ($pg_15_modules ? 'yes' : 'no'));
+
+if ($pg_15_modules)
+{
+    $node = PostgreSQL::Test::Cluster->new("master");
+}
+else
+{
+    $node = PostgresNode::get_new_node("master");
+}
+
+$node->init(allows_streaming => 1);
+$node->append_conf("postgresql.conf", "shared_buffers='4GB'\n" .
+    "maintenance_work_mem='2GB'\n" .
+    "max_wal_size='2GB'\n" .
+    "work_mem='50MB'");
+$node->start;
+
+# Check whether the pglist database already exists
+
+note('Checking whether the pglist database exists...');
+my $check_pglist = $node->psql('postgres', "SELECT count(*) FROM pg_database " .
+    "WHERE datistemplate = false AND " .
+    "datname = 'pglist'");
+if ($check_pglist == 1)
+{
+    note("pglist already exists");
+}
+else
+{
+    note("Creating pglist database");
+    install_pglist();
+}
+
+$node->psql("pglist", "CREATE EXTENSION rum");
+note('Setup completed successfully');
+
+eval
+{
+    test_select();
+    test_order_by();
+    $node->stop();
+    done_testing();
+    1;
+} or do {
+    note("Something went wrong: $@\n");
+};
+
diff --git a/tests/README.md b/tests/README.md
deleted file mode 100644
index de04c4d617..0000000000
--- a/tests/README.md
+++ /dev/null
@@ -1,14 +0,0 @@
-## Running tests
-
-Install testgres:
-
-```
-pip install testgres
-```
-
-Run command:
-
-```
-python -m unittest pglist_tests
-```
-
diff --git a/tests/__init__.py b/tests/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/tests/pglist_tests.py b/tests/pglist_tests.py
deleted file mode 100644
index a693a1b606..0000000000
--- a/tests/pglist_tests.py
+++ /dev/null
@@ -1,157 +0,0 @@
-# coding: utf-8
-"""
-    Test RUM index with big base 'pglist'
-
-    Copyright (c) 2015-2016, Postgres Professional
-"""
-import unittest
-import os
-import sys
-import gzip
-import testgres as tg
-
-if sys.version_info[0] < 3:
-    import urllib as request
-else:
-    import urllib.request as request
-
-from os.path import expanduser
-
-
-class PglistTests(unittest.TestCase):
-
-    def setUp(self):
-        current_dir = os.path.dirname(os.path.abspath(__file__))
-
-        self.node = tg.get_new_node("pglist",
-                                    os.path.join(current_dir, "tmp_install"))
-        try:
-            self.node.init()
-            self.node.append_conf("postgresql.conf",
-                                  "shared_buffers='4GB'\n"
-                                  "maintenance_work_mem='2GB'\n"
-                                  "max_wal_size='2GB'\n"
-                                  "work_mem='50MB'")
-            self.node.start()
-        except Exception as e:
-            self.printlog(os.path.join(self.node.logs_dir, "postgresql.log"))
-            raise e
-
-    def tearDown(self):
-        tg.stop_all()
-
-    def init_pglist_data(self, node):
-        # Check if 'pglist' base exists
-        bases = node.execute("postgres",
-                             "SELECT count(*) FROM pg_database "
-                             "WHERE datistemplate = false AND "
-                             " datname = 'pglist'")
-        if bases[0][0] != 0:
-            return
-
-        # Check if 'pglist' dump exists
-        home = expanduser("~")
-        pglist_dump = os.path.join(home, "pglist-28-04-16.dump")
-        if not os.path.isfile(pglist_dump):
-            pglist_dumpgz = pglist_dump + ".gz"
-            if not os.path.isfile(pglist_dumpgz):
-                print("Downloading: {0}".format(pglist_dumpgz))
-                request.urlretrieve(
-                    "http://www.sai.msu.su/~megera/postgres/files/pglist-28-04-16.dump.gz",
-                    pglist_dumpgz)
-
-            print("Decompressing: {0}".format(pglist_dumpgz))
-            gz = gzip.open(pglist_dumpgz, 'rb')
-            with open(pglist_dump, 'wb') as f:
-                f.write(gz.read())
-
-            os.remove(pglist_dumpgz)
-
-        # Restore dump file
-        print("Restoring 'pglist'")
-        node.safe_psql("postgres", "CREATE DATABASE pglist")
-        node.psql("pglist", filename=pglist_dump)
-
-        node.safe_psql("pglist", "CREATE EXTENSION rum")
-
-    def printlog(self, logfile):
-        with open(logfile, 'r') as log:
-            for line in log.readlines():
-                print(line)
-
-    def test_order_by(self):
-        """Tests SELECT constructions to 'pglist' base"""
-        try:
-            self.init_pglist_data(self.node)
-
-            print("Creating index 'rumidx_orderby_sent'")
-
-            self.node.safe_psql(
-                "pglist",
-                "CREATE INDEX rumidx_orderby_sent ON pglist USING rum ("
-                "  fts rum_tsvector_timestamp_ops, sent) "
-                "  WITH (attach=sent, to=fts, order_by_attach=t)")
-
-            print("Running tests")
-
-            self.assertEqual(
-                self.node.safe_psql(
-                    "pglist",
-                    "SELECT sent, subject "
-                    "  FROM pglist "
-                    "  WHERE fts @@ "
-                    "      to_tsquery('english', 'backend <-> crushed') "
-
" ORDER BY sent <=| '2016-01-01 00:01' LIMIT 5" - ), - b'1999-06-02 11:52:46|Re: [HACKERS] PID of backend\n' - ) - - self.assertEqual( - self.node.safe_psql( - "pglist", - "SELECT count(*) FROM pglist " - "WHERE fts @@ to_tsquery('english', 'tom & lane')" - ), - b'222813\n' - ) - - self.node.safe_psql("pglist", "DROP INDEX rumidx_orderby_sent"); - - print("Creating index 'pglist_rum_idx'") - - self.node.safe_psql( - "pglist", - "CREATE INDEX pglist_rum_idx ON pglist USING rum (" - " fts rum_tsvector_ops)") - - print("Running tests") - - self.assertEqual( - self.node.execute( - "pglist", - "SELECT id FROM pglist " - "WHERE fts @@ to_tsquery('english', 'postgres:*') " - "ORDER BY fts <=> to_tsquery('english', 'postgres:*') " - "LIMIT 9" - )[0][0], - 816114 - ) - - # Autovacuum after large update, with active RUM index crashes postgres - print("Test Issue #19") - - self.node.safe_psql( - "pglist", - "DELETE FROM pglist WHERE id < 100000") - self.node.safe_psql( - "pglist", - "vacuum") - - self.node.safe_psql("pglist", "DROP INDEX pglist_rum_idx"); - - except Exception as e: - self.printlog(os.path.join(self.node.logs_dir, "postgresql.log")) - raise e - -if __name__ == "__main__": - unittest.main() diff --git a/travis/Dockerfile.in b/travis/Dockerfile.in new file mode 100644 index 0000000000..66625248cc --- /dev/null +++ b/travis/Dockerfile.in @@ -0,0 +1,33 @@ +FROM postgres:${PG_VERSION}-alpine + +# Install dependencies +RUN apk add --no-cache \ + linux-headers \ + openssl curl \ + perl perl-ipc-run perl-dev perl-app-cpanminus perl-dbi \ + make musl-dev gcc bison flex coreutils \ + zlib-dev libedit-dev \ + pkgconf icu-dev clang clang15 clang-analyzer; + +# Environment +ENV LANG=C.UTF-8 PGDATA=/pg/data + +# Make directories +RUN mkdir -p ${PGDATA} && \ + mkdir -p /pg/testdir + +COPY run_tests.sh /run.sh +RUN chmod 755 /run.sh + +COPY . /pg/testdir +WORKDIR /pg/testdir + +# Grant privileges +RUN chown postgres:postgres ${PGDATA} && \ + chown -R postgres:postgres /pg/testdir && \ + chown postgres:postgres /usr/local/include/postgresql/server/ && \ + chmod a+rwx /usr/local/lib/postgresql && \ + chmod a+rwx /usr/local/share/postgresql/extension + +USER postgres +ENTRYPOINT LEVEL=${LEVEL} /run.sh diff --git a/travis/docker-compose.yml b/travis/docker-compose.yml new file mode 100644 index 0000000000..0544d8597d --- /dev/null +++ b/travis/docker-compose.yml @@ -0,0 +1,3 @@ +services: + tests: + build: . diff --git a/travis/mk_dockerfile.sh b/travis/mk_dockerfile.sh new file mode 100755 index 0000000000..9108d2c68d --- /dev/null +++ b/travis/mk_dockerfile.sh @@ -0,0 +1,16 @@ +if [ -z ${PG_VERSION+x} ]; then + echo PG_VERSION is not set! 
+    exit 1
+fi
+
+if [ -z ${LEVEL+x} ]; then
+    LEVEL=standard
+fi
+
+echo PG_VERSION=${PG_VERSION}
+echo LEVEL=${LEVEL}
+
+sed \
+    -e 's/${PG_VERSION}/'${PG_VERSION}/g \
+    -e 's/${LEVEL}/'${LEVEL}/g \
+Dockerfile.in > Dockerfile
diff --git a/travis/run_tests.sh b/travis/run_tests.sh
new file mode 100644
index 0000000000..37bba84d64
--- /dev/null
+++ b/travis/run_tests.sh
@@ -0,0 +1,114 @@
+#!/usr/bin/env bash
+
+#
+# Copyright (c) 2019-2024, Postgres Professional
+#
+# supported levels:
+#   * standard
+#   * hardcore
+#
+
+set -ux
+status=0
+
+
+# rebuild PostgreSQL with cassert support
+if [ "$LEVEL" = "hardcore" ]; then
+
+    set -e
+
+    CUSTOM_PG_BIN=$PWD/pg_bin
+    CUSTOM_PG_SRC=$PWD/postgresql
+
+    # here PG_VERSION is provided by postgres:X-alpine docker image
+    curl "https://ftp.postgresql.org/pub/source/v$PG_VERSION/postgresql-$PG_VERSION.tar.bz2" -o postgresql.tar.bz2
+    echo "$PG_SHA256 *postgresql.tar.bz2" | sha256sum -c -
+
+    mkdir $CUSTOM_PG_SRC
+
+    tar \
+        --extract \
+        --file postgresql.tar.bz2 \
+        --directory $CUSTOM_PG_SRC \
+        --strip-components 1
+
+    cd $CUSTOM_PG_SRC
+
+    # enable additional options
+    ./configure \
+        CFLAGS='-fno-omit-frame-pointer' \
+        --enable-cassert \
+        --enable-tap-tests \
+        --prefix=$CUSTOM_PG_BIN \
+        --quiet
+
+    time make -s -j$(nproc) && make -s install
+
+    # override default PostgreSQL instance
+    export PATH=$CUSTOM_PG_BIN/bin:$PATH
+    export LD_LIBRARY_PATH=$CUSTOM_PG_BIN/lib
+
+    # show pg_config path (just in case)
+    which pg_config
+
+    cd -
+
+    set +e
+fi
+
+# show pg_config just in case
+pg_config
+
+# perform code checks if asked to
+if [ "$LEVEL" = "hardcore" ]; then
+
+    # perform static analysis
+    scan-build --status-bugs \
+        -disable-checker core.UndefinedBinaryOperatorResult \
+        -disable-checker core.DivideZero \
+        -disable-checker deadcode.DeadStores \
+        make USE_PGXS=1 || status=$?
+
+    # something's wrong, exit now!
+    if [ $status -ne 0 ]; then exit 1; fi
+
+    # don't forget to "make clean"
+    make USE_PGXS=1 clean
+fi
+
+
+# build and install extension (using PG_CPPFLAGS and SHLIB_LINK for gcov)
+make USE_PGXS=1 PG_CPPFLAGS="-coverage" SHLIB_LINK="-coverage" install
+
+# initialize database
+initdb -D $PGDATA
+
+# set appropriate port
+export PGPORT=55435
+echo "port = $PGPORT" >> $PGDATA/postgresql.conf
+
+# start the cluster
+pg_ctl start -l /tmp/postgres.log -w || status=$?
+
+# something's wrong, exit now!
+if [ $status -ne 0 ]; then cat /tmp/postgres.log; exit 1; fi
+
+# run regression tests
+export PG_REGRESS_DIFF_OPTS="-w -U3" # for alpine's diff (BusyBox)
+make USE_PGXS=1 installcheck || status=$?
+
+# show diff if it exists
+if test -f regression.diffs; then cat regression.diffs; fi
+
+# something's wrong, exit now!
+if [ $status -ne 0 ]; then exit 1; fi
+
+# generate *.gcov files
+gcov src/*.c src/*.h
+
+
+set +ux
+
+
+# send coverage stats to Codecov
+bash <(curl -s https://codecov.io/bash)
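
For anyone who wants to exercise the new containerized harness outside of Travis, the sketch below shows one way the pieces above fit together. It is only an illustration, not part of the patch: it assumes Docker and docker-compose are installed, that it is run from the repository root, and that the travis/ helper files are first copied there so that "COPY . /pg/testdir" in Dockerfile.in picks up the extension sources. PG_VERSION and LEVEL are the same variables consumed by mk_dockerfile.sh and run_tests.sh, and "tests" is the service name defined in travis/docker-compose.yml.

```
# Hypothetical local run of the containerized test suite.
cp travis/* .                                     # put Dockerfile.in, run_tests.sh, docker-compose.yml next to the sources
PG_VERSION=16 LEVEL=standard ./mk_dockerfile.sh   # render Dockerfile for the chosen major version and test level
docker-compose build                              # build the postgres:16-alpine based image
docker-compose run tests                          # executes run_tests.sh as the container entrypoint
```

Setting LEVEL=hardcore instead rebuilds PostgreSQL with --enable-cassert and runs scan-build first, as run_tests.sh does above; the optional t/002_pglist.pl scenario additionally requires big_values to be listed in PG_TEST_EXTRA, otherwise it skips itself.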