Comments and cleanup in the connection pool manager

author Tomas Vondra <[email protected]>

Sun, 22 Oct 2017 13:00:06 +0000 (15:00 +0200)

committer Tomas Vondra <[email protected]>

Sat, 4 Nov 2017 16:19:06 +0000 (17:19 +0100)
author Tomas Vondra <[email protected]>
Sun, 22 Oct 2017 13:00:06 +0000 (15:00 +0200)
committer Tomas Vondra <[email protected]>
Sat, 4 Nov 2017 16:19:06 +0000 (17:19 +0100)
diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c

index 66b993f53b1c43a9983291fd9734db44d5fe51cc..a664cc22dae9537c63f915d555fcff1820721d5d 100644 (file)
--- a/src/backend/pgxc/pool/pgxcnode.c
+++ b/src/backend/pgxc/pool/pgxcnode.c
@@ -1,9 +1,103 @@
  /*-------------------------------------------------------------------------
   *
   * pgxcnode.c
+ *       Functions for communication with nodes through pooled connections.
   *
- *       Functions for the Coordinator communicating with the PGXC nodes:
- *       Datanodes and Coordinators
+ * This is mostly a backend-side counterpart to the pool manager. Each
+ * session acquires connections to remote nodes, and uses them to execute
+ * queries.
+ *
+ * Currently, we only allow a single connection to each remote node. If
+ * a query includes multiple nodes that communicate with a given remote
+ * node (e.g. Append with multiple RemoteSubquery children), then the
+ * connection may need to be buffered (see BufferConnection).
+ *
+ * Following is an overview of the basic methods for node management and
+ * communication over the handles.
+ *
+ *
+ * node handle management
+ * ----------------------
+ * get_any_handle      - acquire handle for replicated table
+ * get_handles         - acquire handles to all specified nodes
+ * get_current_handles - return already acquired handles
+ * release_handles     - release all connection (back to pool)
+ *
+ *
+ * connection functions (TODO move to poolmgr.c)
+ * --------------------
+ * PGXCNodeConnect    - open libpq connection using connection string
+ * PGXCNodePing       - ping node using connection string
+ * PGXCNodeClose      - close libpq connection
+ * PGXCNodeConnected  - verify connection status
+ * PGXCNodeConnStr    - build connection string
+ *
+ *
+ * node handle management
+ * ----------------------
+ * PGXCNodeGetNodeOid        - OID for node by index in handle array
+ * PGXCNodeGetNodeIdFromName - determine index in handle array by name
+ * PGXCNodeGetNodeId         - determine index in handle array from OID
+ *
+ *
+ * session/transaction parameters
+ * ------------------------------
+ * PGXCNodeSetParam               - add new parameter
+ * PGXCNodeResetParams            - reset (local or session) parameters
+ * PGXCNodeGetTransactionParamStr - generate SET with transaction params
+ * PGXCNodeGetSessionParamStr     - generate SET with session params
+ *
+ *
+ * low-level TCP buffer access
+ * ---------------------------
+ * pgxc_node_receive   - receive data into input buffers for connections
+ * pgxc_node_read_data - read data for one particular connection
+ * get_message         - read one complete message from a handle
+ * send_some           - send a chunk of data to remote node
+ *
+ *
+ * send higher-level messages to remote node
+ * -----------------------------------------
+ * pgxc_node_send_parse    - sends PARSE (part of extended protocol)
+ * pgxc_node_send_bind     - sends BIND (part of extended protocol)
+ * pgxc_node_send_describe - sends DESCRIBE (part of extended protocol)
+ * pgxc_node_send_execute  - sends EXECUTE (part of extended protocol)
+ * pgxc_node_send_flush    - sends FLUSH (part of extended protocol)
+ * pgxc_node_send_close    - sends close (C)
+ * pgxc_node_send_sync     - sends sync (S)
+ * pgxc_node_send_query    - simple query protocol (Q)
+ * pgxc_node_send_rollback - simple query on failed connection (Q)
+ * pgxc_node_send_query_extended - extended query protocol (PARSE, ...)
+ *
+ *
+ * XL-specific messages to remote nodes
+ * ------------------------------------
+ * pgxc_node_send_plan       - sends plan to remote node (p)
+ * pgxc_node_send_gxid       - sends GXID to remote node (g)
+ * pgxc_node_send_cmd_id     - sends CommandId to remote node (M)
+ * pgxc_node_send_snapshot   - sends snapshot to remote node (s)
+ * pgxc_node_send_timestamp  - sends timestamp to remote node (t)
+ *
+ *
+ * misc functions
+ * --------------
+ * pgxc_node_set_query  - send SET by simple protocol, wait for "ready"
+ * pgxc_node_flush      - flush all data from the output buffer
+ *
+ *
+ * XXX We should add the custom messages (gxid, snapshot, ...) to the SGML
+ * documentation describing message formats.
+ *
+ * XXX What about using simple list, instead of the arrays? Or define new
+ * structure grouping all the important parameters (buffer, size, maxsize).
+ *
+ * XXX The comments claim that dn_handles and co_handles are allocated in
+ * Transaction context, but in fact those are allocated in TopMemoryContext.
+ * Otherwise we wouldn't be able to use persistent connections, which keeps
+ * connections for the whole session.
+ *
+ * XXX The comment at pgxc_node_free mentions TopTransactionContext, so
+ * perhaps we should consider using that?
   *
   *
   * Portions Copyright (c) 2012-2014, TransLattice, Inc.
@@ -11,8 +105,7 @@
   * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
   * IDENTIFICATION
- *       $$
- *
+ *       src/backend/pgxc/pool/pgxcnode.c
   *
   *-------------------------------------------------------------------------
   */
@@ -31,24 +124,28 @@
  #include <string.h>
  #include <unistd.h>
  #include <errno.h>
+
  #include "access/gtm.h"
  #include "access/transam.h"
  #include "access/xact.h"
  #include "access/htup_details.h"
  #include "catalog/pg_type.h"
+#include "catalog/pg_collation.h"
+#include "catalog/pgxc_node.h"
  #include "commands/prepare.h"
  #include "gtm/gtm_c.h"
+#include "miscadmin.h"
  #include "nodes/nodes.h"
-#include "pgxc/pgxcnode.h"
  #include "pgxc/execRemote.h"
-#include "catalog/pgxc_node.h"
-#include "catalog/pg_collation.h"
  #include "pgxc/locator.h"
  #include "pgxc/nodemgr.h"
+#include "pgxc/pause.h"
  #include "pgxc/pgxc.h"
+#include "pgxc/pgxcnode.h"
  #include "pgxc/poolmgr.h"
-#include "tcop/dest.h"
+#include "storage/ipc.h"
  #include "storage/lwlock.h"
+#include "tcop/dest.h"
  #include "utils/builtins.h"
  #include "utils/elog.h"
  #include "utils/memutils.h"
@@ -57,14 +154,9 @@
  #include "utils/syscache.h"
  #include "utils/lsyscache.h"
  #include "utils/formatting.h"
+#include "utils/snapmgr.h"
  #include "utils/tqual.h"
  #include "../interfaces/libpq/libpq-fe.h"
-#ifdef XCP
-#include "miscadmin.h"
-#include "storage/ipc.h"
-#include "pgxc/pause.h"
-#include "utils/snapmgr.h"
-#endif
  
  #define CMD_ID_MSG_LEN 8
  
@@ -73,35 +165,29 @@ static int datanode_count = 0;
  static int     coord_count = 0;
  
  /*
- * Datanode handles saved in Transaction memory context
- * when PostgresMain is launched.
- * Those handles are used inside a transaction by Coordinator to Datanodes.
- */
-static PGXCNodeHandle *dn_handles = NULL;
-
-/*
- * Coordinator handles saved in Transaction memory context
- * when PostgresMain is launched.
- * Those handles are used inside a transaction by Coordinator to Coordinators
+ * Datanode and coordinator handles (sockets obtained from the pooler),
+ * initialized in the TopMemoryContext memory context. Those connections
+ * are used during query execution to communicate wit the nodes.
+ *
+ * XXX At this point we have only a single connection to each node, and
+ * use multiplex it for multiple cursors (see BufferConnection).
   */
-static PGXCNodeHandle *co_handles = NULL;
+static PGXCNodeHandle *dn_handles = NULL;      /* datanodes */
+static PGXCNodeHandle *co_handles = NULL;      /* coordinators */
  
-/* Current size of dn_handles and co_handles */
+/* Current number of datanode and coordinator handles. */
  int                    NumDataNodes;
  int            NumCoords;
  
-
-#ifdef XCP
  volatile bool HandlesInvalidatePending = false;
  volatile bool HandlesRefreshPending = false;
  
  /*
- * Session and transaction parameters need to to be set on newly connected
- * remote nodes.
+ * Session/transaction parameters that need to to be set on new connections.
   */
  static List *session_param_list = NIL;
  static List    *local_param_list = NIL;
-static StringInfo      session_params;
+static StringInfo      session_params;
  static StringInfo      local_params;
  
  typedef struct
@@ -114,14 +200,9 @@ typedef struct
  
  static bool DoInvalidateRemoteHandles(void);
  static bool DoRefreshRemoteHandles(void);
-#endif
  
-#ifdef XCP
  static void pgxc_node_init(PGXCNodeHandle *handle, int sock,
                 bool global_session, int pid);
-#else
-static void pgxc_node_init(PGXCNodeHandle *handle, int sock);
-#endif
  static void pgxc_node_free(PGXCNodeHandle *handle);
  static void pgxc_node_all_free(void);
  
@@ -130,7 +211,7 @@ static int  get_char(PGXCNodeHandle * conn, char *out);
  
  
  /*
- * Initialize PGXCNodeHandle struct
+ * Initialize empty PGXCNodeHandle struct
   */
  static void
  init_pgxc_handle(PGXCNodeHandle *pgxc_handle)
@@ -165,17 +246,21 @@ init_pgxc_handle(PGXCNodeHandle *pgxc_handle)
  
  
  /*
- * Allocate and initialize memory to store Datanode and Coordinator handles.
+ * InitMultinodeExecutor
+ *       Initialize datanode and coordinator handles.
+ *
+ * Acquires list of nodes from the node manager, and initializes handle
+ * for each one.
+ *
+ * Also determines PGXCNodeId to index in the proper array of handles
+ * (co_handles or dn_handles), depending on the type of this node.
   */
  void
  InitMultinodeExecutor(bool is_force)
  {
         int                             count;
         Oid                             *coOids, *dnOids;
-#ifdef XCP
         MemoryContext   oldcontext;
-#endif
-
  
         /* Free all the existing information first */
         if (is_force)
@@ -192,13 +277,11 @@ InitMultinodeExecutor(bool is_force)
         /* Get classified list of node Oids */
         PgxcNodeGetOids(&coOids, &dnOids, &NumCoords, &NumDataNodes, true);
  
-#ifdef XCP
         /*
          * Coordinator and datanode handles should be available during all the
          * session lifetime
          */
         oldcontext = MemoryContextSwitchTo(TopMemoryContext);
-#endif
  
         /* Do proper initialization of handles */
         if (NumDataNodes > 0)
@@ -244,6 +327,15 @@ InitMultinodeExecutor(bool is_force)
  
         MemoryContextSwitchTo(oldcontext);
  
+       /*
+        * Determine index of a handle representing this node, either in the
+        * coordinator or datanode handles, depending on the type of this
+        * node. The index gets stored in PGXCNodeId.
+        *
+        * XXX It's a bit confusing that this may point either to co_handles
+        * or dn_handles, and may easily lead to bugs when used with the
+        * incorrect array.
+        */
         if (IS_PGXC_COORDINATOR)
         {
                 for (count = 0; count < NumCoords; count++)
@@ -265,7 +357,13 @@ InitMultinodeExecutor(bool is_force)
  }
  
  /*
- * Builds up a connection string
+ * PGXCNodeConnStr
+ *       Builds a connection string for the provided connection parameters.
+ *
+ * Aside from the usual connection parameters (host, port, ...) we also
+ * pass information about type of the parent node and remote node type.
+ *
+ * XXX Shouldn't this rather throw an ERROR instead of returning NULL?
   */
  char *
  PGXCNodeConnStr(char *host, int port, char *dbname,
@@ -278,6 +376,8 @@ PGXCNodeConnStr(char *host, int port, char *dbname,
         /*
          * Build up connection string
          * remote type can be Coordinator, Datanode or application.
+        *
+        * XXX What's application remote type?
          */
         num = snprintf(connstr, sizeof(connstr),
                                    "host=%s port=%d dbname=%s user=%s application_name='pgxc:%s' sslmode=disable options='-c remotetype=%s -c parentnode=%s %s'",
@@ -299,7 +399,8 @@ PGXCNodeConnStr(char *host, int port, char *dbname,
  
  
  /*
- * Connect to a Datanode using a connection string
+ * PGXCNodeConnect
+ *       Connect to a Datanode using a constructed connection string.
   */
  NODE_CONNECTION *
  PGXCNodeConnect(char *connstr)
@@ -311,7 +412,12 @@ PGXCNodeConnect(char *connstr)
         return (NODE_CONNECTION *) conn;
  }
  
-int PGXCNodePing(const char *connstr)
+/*
+ * PGXCNodePing
+ *       Check that a node (identified the connstring) responds correctly.
+ */
+int
+PGXCNodePing(const char *connstr)
  {
         if (connstr[0])
         {
@@ -326,22 +432,23 @@ int PGXCNodePing(const char *connstr)
  }
  
  /*
- * Close specified connection
+ * PGXCNodeClose
+ *       Close connection connection.
   */
  void
  PGXCNodeClose(NODE_CONNECTION *conn)
  {
-       /* Delegate call to the pglib */
+       /* Delegate call to the libpq */
         PQfinish((PGconn *) conn);
  }
  
  /*
- * Checks if connection active
+ * PGXCNodeConnected
+ *       Check if the provided connection is open and valid.
   */
  int
  PGXCNodeConnected(NODE_CONNECTION *conn)
  {
-       /* Delegate call to the pglib */
         PGconn     *pgconn = (PGconn *) conn;
  
         /*
@@ -352,12 +459,13 @@ PGXCNodeConnected(NODE_CONNECTION *conn)
  }
  
  
-
-/* Close the socket handle (this process' copy) and free occupied memory
+/*
+ * pgxc_node_free
+ *       Close the socket handle (local copy) and free occupied memory.
   *
- * Note that we do not free the handle and its members. This will be
- * taken care of when the transaction ends, when TopTransactionContext
- * is destroyed in xact.c.
+ * Note that this only closes the socket, but we do not free the handle
+ * and its members. This will be taken care of when the transaction ends,
+ * when TopTransactionContext is destroyed in xact.c.
   */
  static void
  pgxc_node_free(PGXCNodeHandle *handle)
@@ -368,7 +476,8 @@ pgxc_node_free(PGXCNodeHandle *handle)
  }
  
  /*
- * Free all the node handles cached
+ * pgxc_node_all_free
+ *       Free all the node handles cached in TopMemoryContext.
   */
  static void
  pgxc_node_all_free(void)
@@ -410,9 +519,11 @@ pgxc_node_all_free(void)
  }
  
  /*
- * Create and initialise internal structure to communicate to
- * Datanode via supplied socket descriptor.
- * Structure stores state info and I/O buffers
+ * pgxc_node_init
+ *       Initialize the handle to communicate to node throught the socket.
+ *
+ * Stored PID of the remote backend, and of requested, sends the global
+ * session string to the remote node.
   */
  static void
  pgxc_node_init(PGXCNodeHandle *handle, int sock, bool global_session, int pid)
@@ -435,9 +546,10 @@ pgxc_node_init(PGXCNodeHandle *handle, int sock, bool global_session, int pid)
         handle->inEnd = 0;
         handle->inCursor = 0;
         handle->needSync = false;
+
         /*
          * We got a new connection, set on the remote node the session parameters
-        * if defined. The transaction parameter should be sent after BEGIN
+        * if defined. The transaction parameter should be sent after BEGIN.
          */
         if (global_session)
         {
@@ -451,8 +563,9 @@ pgxc_node_init(PGXCNodeHandle *handle, int sock, bool global_session, int pid)
  
  
  /*
- * Wait while at least one of specified connections has data available and read
- * the data into the buffer
+ * pgxc_node_receive
+ *       Wait while at least one of the connections has data available, and
+ * read the data into the buffer.
   */
  bool
  pgxc_node_receive(const int conn_count,
@@ -605,28 +718,10 @@ retry:
         return NO_ERROR_OCCURED;
  }
  
-/*
- * Is there any data enqueued in the TCP input buffer waiting
- * to be read sent by the PGXC node connection
- */
-
-int
-pgxc_node_is_data_enqueued(PGXCNodeHandle *conn)
-{
-       int ret;
-       int enqueued;
-
-       if (conn->sock < 0)
-               return 0;
-       ret = ioctl(conn->sock, FIONREAD, &enqueued);
-       if (ret != 0)
-               return 0;
-
-       return enqueued;
-}
  
  /*
- * Read up incoming messages from the PGXC node connection
+ * pgxc_node_read_data
+ *       Read incoming data from the node TCP connection.
   */
  int
  pgxc_node_read_data(PGXCNodeHandle *conn, bool close_if_error)
@@ -769,7 +864,10 @@ retry:
  
  
  /*
- * Get one character from the connection buffer and advance cursor
+ * Get one character from the connection buffer and advance cursor.
+ *
+ * Returns 0 if enough data is available in the buffer (and the value is
+ * returned in the 'out' parameter). Otherwise the function returns EOF.
   */
  static int
  get_char(PGXCNodeHandle * conn, char *out)
@@ -783,7 +881,12 @@ get_char(PGXCNodeHandle * conn, char *out)
  }
  
  /*
- * Read an integer from the connection buffer and advance cursor
+ * Try reading an integer from the connection buffer and advance cursor.
+ *
+ * Returns 0 if enough data is available in the buffer (and the value is
+ * returned in the 'out' parameter). Otherwise the function returns EOF.
+ *
+ * XXX We only ever call this once with len=4, so simplify the function.
   */
  static int
  get_int(PGXCNodeHandle *conn, size_t len, int *out)
@@ -791,6 +894,10 @@ get_int(PGXCNodeHandle *conn, size_t len, int *out)
         unsigned short tmp2;
         unsigned int tmp4;
  
+       /*
+        * XXX This seems somewhat inconsistent with get_char(). Perhaps this
+        * should use >= to behave in the same way?
+        */
         if (conn->inCursor + len > conn->inEnd)
                 return EOF;
  
@@ -817,49 +924,70 @@ get_int(PGXCNodeHandle *conn, size_t len, int *out)
  
  /*
   * get_message
- * If connection has enough data read entire message from the connection buffer
- * and returns message type. Message data and data length are returned as
- * var parameters.
- * If buffer does not have enough data leaves cursor unchanged, changes
- * connection status to DN_CONNECTION_STATE_QUERY indicating it needs to
- * receive more and returns \0
+ *       Attempt to read the whole message from the input buffer, if possible.
+ *
+ * If the entire message is in the input buffer of the connection, reads it
+ * into a buffer (len and msg parameters) and returns the message type.
+ *
+ * If the input buffer does not contain the whole message, the cursor is
+ * left unchanged, the connection status is se to DN_CONNECTION_STATE_QUERY
+ * indicating it needs to receive more data, and \0 is returned (instead of
+ * an actual message type).
+ *
   * conn - connection to read from
   * len - returned length of the data where msg is pointing to
- * msg - returns pointer to memory in the incoming buffer. The buffer probably
- * will be overwritten upon next receive, so if caller wants to refer it later
- * it should make a copy.
+ * msg - returns pointer to position in the incoming buffer
+ *
+ * The buffer probably will be overwritten upon next receive, so if caller
+ * wants to refer it later it should make a copy.
   */
  char
  get_message(PGXCNodeHandle *conn, int *len, char **msg)
  {
         char            msgtype;
  
+       /*
+        * Try reading the first char (message type) and integer (message length).
+        *
+        * Both functions return 0 (false) in case of success, and EOF (true) in
+        * case of failure. So we call get_char() first, and only if it succeeds
+        * the get_int() gets called.
+        */
         if (get_char(conn, &msgtype) || get_int(conn, 4, len))
         {
-               /* Successful get_char would move cursor, restore position */
+               /* Successful get_char/get_int would move cursor, restore position. */
                 conn->inCursor = conn->inStart;
                 return '\0';
         }
  
+       /* The message length includes the length header too, so subtract it. */
         *len -= 4;
  
+       /*
+        * If the whole message is not in the buffer, we need to read more data.
+        *
+        * Reading function will discard already consumed data in the buffer till
+        * conn->inCursor. To avoid extra/handle cycles we need to fit the whole
+        * message (and not just a part of it) into the buffer. So let's ensure
+        * the buffer is large enough.
+        *
+        * We need 1 byte for for message type, 4 bytes for message length and
+        * the message itself (the length is currently in *len). The buffer may
+        * already be large enough, in which case ensure_in_buffer_capacity()
+        * will return immediately .
+        */
         if (conn->inCursor + *len > conn->inEnd)
         {
-               /*
-                * Not enough data in the buffer, we should read more.
-                * Reading function will discard already consumed data in the buffer
-                * till conn->inBegin. Then we want the message that is partly in the
-                * buffer now has been read completely, to avoid extra read/handle
-                * cycles. The space needed is 1 byte for message type, 4 bytes for
-                * message length and message itself which size is currently in *len.
-                * The buffer may already be large enough, in this case the function
-                * ensure_in_buffer_capacity() will immediately return
+               /* ensure space for the whole message (including 5B header)
+                *
+                * FIXME Add check of the return value. Non-zero value means failure.
                  */
                 ensure_in_buffer_capacity(5 + (size_t) *len, conn);
                 conn->inCursor = conn->inStart;
                 return '\0';
         }
  
+       /* Great, the whole message in the buffer. */
         *msg = conn->inBuffer + conn->inCursor;
         conn->inCursor += *len;
         conn->inStart = conn->inCursor;
@@ -868,8 +996,8 @@ get_message(PGXCNodeHandle *conn, int *len, char **msg)
  
  
  /*
- * Release all Datanode and Coordinator connections
- * back to pool and release occupied memory
+ * release_handles
+ *       Release all node connections back to pool and free the memory.
   */
  void
  release_handles(void)
@@ -887,6 +1015,7 @@ release_handles(void)
         if (cluster_ex_lock_held)
                 return;
  
+       /* quick exit if we have no connections to release */
         if (datanode_count == 0 && coord_count == 0)
                 return;
  
@@ -917,9 +1046,14 @@ release_handles(void)
                 }
         }
  
+       /*
+        * XXX Not sure why we coordinator connections are only released when on
+        * a coordinator. Perhaps we never acquire connections to coordinators on
+        * datanodes? Seems like a rather minor optimization anyway.
+        */
         if (IS_PGXC_COORDINATOR)
         {
-               /* Collect Coordinator handles */
+               /* Free Coordinator handles */
                 for (i = 0; i < NumCoords; i++)
                 {
                         PGXCNodeHandle *handle = &co_handles[i];
@@ -943,7 +1077,10 @@ release_handles(void)
                 }
         }
  
-       /* And finally release all the connections on pooler */
+       /*
+        * And finally release all the connections held by this backend back
+        * to the connection pool.
+        */
         PoolManagerReleaseConnections(destroy);
  
         datanode_count = 0;
@@ -951,15 +1088,20 @@ release_handles(void)
  }
  
  /*
- * Ensure that the supplied buffer has enough capacity and if not, it's
- * extended to an appropriate size.
+ * ensure_buffer_capacity
+ *       Ensure that the supplied buffer has at least the required capacity.
+ *
+ * currbuf  - the currently allocated buffer
+ * currsize - size of the current buffer (in bytes)
+ * bytes_needed - required capacity (in bytes)
+ *
+ * We shall return the new buffer, if allocated successfully and set newsize_p
+ * to contain the size of the repalloc-ed buffer.
   *
- * currbuf is the currently used buffer of currsize. bytes_needed is the
- * minimum size required. We shall return the new buffer, if allocated
- * successfully and set newsize_p to contain the size of the repalloced buffer.
   * If allocation fails, NULL is returned.
   *
- * The function checks for requests beyond MaxAllocSize and throw an error.
+ * The function checks for requests beyond MaxAllocSize and throws an error
+ * if the request exceeds the limit.
   */
  static char *
  ensure_buffer_capacity(char *currbuf, size_t currsize, size_t bytes_needed, size_t *newsize_p)
@@ -967,6 +1109,7 @@ ensure_buffer_capacity(char *currbuf, size_t currsize, size_t bytes_needed, size
         char       *newbuf;
         Size            newsize = (Size) currsize;
  
+       /* XXX Perhaps use AllocSizeIsValid instead? */
         if (((Size) bytes_needed) >= MaxAllocSize)
                 ereport(ERROR,
                                 (ENOSPC,
@@ -974,6 +1117,7 @@ ensure_buffer_capacity(char *currbuf, size_t currsize, size_t bytes_needed, size
                                  errdetail("Cannot enlarge buffer containing %ld bytes by %ld more bytes.",
                                                    currsize, bytes_needed)));
  
+       /* if the buffer is already large enough, we're done */
         if (bytes_needed <= newsize)
         {
                 *newsize_p = currsize;
@@ -1028,8 +1172,10 @@ ensure_buffer_capacity(char *currbuf, size_t currsize, size_t bytes_needed, size
  }
  
  /*
- * Ensure specified amount of data can fit to the incoming buffer and
- * increase it if necessary
+ * ensure_in_buffer_capacity
+ *       Ensure specified amount of data can fit to the input buffer of a handle.
+ *
+ * Returns 0 in case of success, EOF otherwise.
   */
  int
  ensure_in_buffer_capacity(size_t bytes_needed, PGXCNodeHandle *handle)
@@ -1047,8 +1193,10 @@ ensure_in_buffer_capacity(size_t bytes_needed, PGXCNodeHandle *handle)
  }
  
  /*
- * Ensure specified amount of data can fit to the outgoing buffer and
- * increase it if necessary
+ * ensure_out_buffer_capacity
+ *       Ensure specified amount of data can fit to the output buffer of a handle.
+ *
+ * Returns 0 in case of success, EOF otherwise.
   */
  int
  ensure_out_buffer_capacity(size_t bytes_needed, PGXCNodeHandle *handle)
@@ -1067,7 +1215,8 @@ ensure_out_buffer_capacity(size_t bytes_needed, PGXCNodeHandle *handle)
  
  
  /*
- * Send specified amount of data from the outgoing buffer over the connection
+ * send_some
+ *       Send specified amount of data from the output buffer over the handle.
   */
  int
  send_some(PGXCNodeHandle *handle, int len)
@@ -1195,11 +1344,12 @@ send_some(PGXCNodeHandle *handle, int len)
  }
  
  /*
- * Send PARSE message with specified statement down to the Datanode
+ * pgxc_node_send_parse
+ *       Send PARSE message with specified statement down to the datanode.
   */
  int
  pgxc_node_send_parse(PGXCNodeHandle * handle, const char* statement,
-                                               const char *query, short num_params, Oid *param_types)
+                                        const char *query, short num_params, Oid *param_types)
  {
         /* statement name size (allow NULL) */
         int                     stmtLen = statement ? strlen(statement) + 1 : 1;
@@ -1283,7 +1433,8 @@ pgxc_node_send_parse(PGXCNodeHandle * handle, const char* statement,
  }
  
  /*
- * Send PLAN message down to the Data node
+ * pgxc_node_send_plan
+ *       Send PLAN message down to the datanode.
   */
  int
  pgxc_node_send_plan(PGXCNodeHandle * handle, const char *statement,
@@ -1364,7 +1515,8 @@ pgxc_node_send_plan(PGXCNodeHandle * handle, const char *statement,
  }
  
  /*
- * Send BIND message down to the Datanode
+ * pgxc_node_send_bind
+ *       Send BIND message down to the datanode.
   */
  int
  pgxc_node_send_bind(PGXCNodeHandle * handle, const char *portal,
@@ -1446,7 +1598,8 @@ pgxc_node_send_bind(PGXCNodeHandle * handle, const char *portal,
  
  
  /*
- * Send DESCRIBE message (portal or statement) down to the Datanode
+ * pgxc_node_send_describe
+ *       Send DESCRIBE message (portal or statement) down to the datanode.
   */
  int
  pgxc_node_send_describe(PGXCNodeHandle * handle, bool is_statement,
@@ -1494,7 +1647,8 @@ pgxc_node_send_describe(PGXCNodeHandle * handle, bool is_statement,
  
  
  /*
- * Send CLOSE message (portal or statement) down to the Datanode
+ * pgxc_node_send_close
+ *       Send CLOSE message (portal or statement) down to the datanode.
   */
  int
  pgxc_node_send_close(PGXCNodeHandle * handle, bool is_statement,
@@ -1534,7 +1688,8 @@ pgxc_node_send_close(PGXCNodeHandle * handle, bool is_statement,
  }
  
  /*
- * Send EXECUTE message down to the Datanode
+ * pgxc_node_send_execute
+ *       Send EXECUTE message down to the datanode.
   */
  int
  pgxc_node_send_execute(PGXCNodeHandle * handle, const char *portal, int fetch)
@@ -1579,7 +1734,8 @@ pgxc_node_send_execute(PGXCNodeHandle * handle, const char *portal, int fetch)
  
  
  /*
- * Send FLUSH message down to the Datanode
+ * pgxc_node_send_flush
+ *       Send FLUSH message down to the datanode.
   */
  int
  pgxc_node_send_flush(PGXCNodeHandle * handle)
@@ -1606,7 +1762,8 @@ pgxc_node_send_flush(PGXCNodeHandle * handle)
  
  
  /*
- * Send SYNC message down to the Datanode
+ * pgxc_node_send_sync
+ *       Send SYNC message down to the datanode.
   */
  int
  pgxc_node_send_sync(PGXCNodeHandle * handle)
@@ -1635,7 +1792,8 @@ pgxc_node_send_sync(PGXCNodeHandle * handle)
  
  
  /*
- * Send series of Extended Query protocol messages to the data node
+ * pgxc_node_send_query_extended
+ *       Send series of Extended Query protocol messages to the datanode.
   */
  int
  pgxc_node_send_query_extended(PGXCNodeHandle *handle, const char *query,
@@ -1664,8 +1822,11 @@ pgxc_node_send_query_extended(PGXCNodeHandle *handle, const char *query,
  
  
  /*
- * This method won't return until connection buffer is empty or error occurs
- * To ensure all data are on the wire before waiting for response
+ * pgxc_node_flush
+ *       Flush all data from the output buffer of a node handle.
+ *
+ * This method won't return until connection buffer is empty or error occurs.
+ * To ensure all data are on the wire before waiting for a response.
   */
  int
  pgxc_node_flush(PGXCNodeHandle *handle)
@@ -1693,39 +1854,10 @@ pgxc_node_flush(PGXCNodeHandle *handle)
         return 0;
  }
  
-/*
- * This method won't return until network buffer is empty or error occurs
- * To ensure all data in network buffers is read and wasted
- */
-void
-pgxc_node_flush_read(PGXCNodeHandle *handle)
-{
-       bool    is_ready;
-       int     read_result;
-
-       if (handle == NULL)
-               return;
-
-       /*
-        * Before reading input send Sync to make sure
-        * we will eventually receive ReadyForQuery
-        */
-       pgxc_node_send_sync(handle);
-       while(true)
-       {
-               read_result = pgxc_node_read_data(handle, false);
-               if (read_result < 0)
-                       break;
-
-               is_ready = is_data_node_ready(handle);
-               if (is_ready == true)
-                       break;
-
-       }
-}
  
  /*
- * Send specified statement down to the PGXC node
+ * pgxc_node_send_query_internal
+ *       Send the statement down to the PGXC node.
   */
  static int
  pgxc_node_send_query_internal(PGXCNodeHandle * handle, const char *query,
@@ -1768,21 +1900,32 @@ pgxc_node_send_query_internal(PGXCNodeHandle * handle, const char *query,
         return pgxc_node_flush(handle);
  }
  
+/*
+ * pgxc_node_send_rollback
+ *       Send the rollback command to the remote node.
+ *
+ * XXX The only effect of the "rollback" is that we try sending the query
+ * even on invalid/failed connections (when everything else is prohibited).
+ */
  int
  pgxc_node_send_rollback(PGXCNodeHandle *handle, const char *query)
  {
         return pgxc_node_send_query_internal(handle, query, true);
  }
  
+/*
+ * pgxc_node_send_query
+ *       Send the query to the remote node.
+ */
  int
  pgxc_node_send_query(PGXCNodeHandle *handle, const char *query)
  {
         return pgxc_node_send_query_internal(handle, query, false);
  }
  
-
  /*
- * Send the GXID down to the PGXC node
+ * pgxc_node_send_gxid
+ *       Send the GXID (global transaction ID) down to the remote node.
   */
  int
  pgxc_node_send_gxid(PGXCNodeHandle *handle, GlobalTransactionId gxid)
@@ -1812,7 +1955,8 @@ pgxc_node_send_gxid(PGXCNodeHandle *handle, GlobalTransactionId gxid)
  }
  
  /*
- * Send the Command ID down to the PGXC node
+ * pgxc_node_send_cmd_id
+ *       Send the Command ID down to the remote node
   */
  int
  pgxc_node_send_cmd_id(PGXCNodeHandle *handle, CommandId cid)
@@ -1847,7 +1991,8 @@ pgxc_node_send_cmd_id(PGXCNodeHandle *handle, CommandId cid)
  }
  
  /*
- * Send the snapshot down to the PGXC node
+ * pgxc_node_send_snapshot
+ *       Send the snapshot down to the remote node.
   */
  int
  pgxc_node_send_snapshot(PGXCNodeHandle *handle, Snapshot snapshot)
@@ -1901,7 +2046,8 @@ pgxc_node_send_snapshot(PGXCNodeHandle *handle, Snapshot snapshot)
  }
  
  /*
- * Send the timestamp down to the PGXC node
+ * pgxc_node_send_timestamp
+ *       Send the timestamp down to the remote node
   */
  int
  pgxc_node_send_timestamp(PGXCNodeHandle *handle, TimestampTz timestamp)
@@ -1947,8 +2093,9 @@ pgxc_node_send_timestamp(PGXCNodeHandle *handle, TimestampTz timestamp)
  
  
  /*
- * Add another message to the list of errors to be returned back to the client
- * at the convenient time
+ * add_error_message
+ *       Add a message to the list of errors to be returned back to the client
+ * at a convenient time.
   */
  void
  add_error_message(PGXCNodeHandle *handle, const char *message)
@@ -1964,11 +2111,17 @@ add_error_message(PGXCNodeHandle *handle, const char *message)
                 handle->error = pstrdup(message);
  }
  
+/* index of the last node returned by get_any_handled (round-robin) */
  static int load_balancer = 0;
+
  /*
- * Get one of the specified nodes to query replicated data source.
- * If session already owns one or more  of the requested connection,
- * the function returns existing one to avoid contacting pooler.
+ * get_any_handle
+ *       Get one of the specified nodes to query replicated data source.
+ *
+ * If session already owns one or more of requested datanode connections,
+ * the function returns one of those existing ones to avoid unnecessary
+ * pooler requests.
+ *
   * Performs basic load balancing.
   */
  PGXCNodeHandle *
@@ -1998,6 +2151,7 @@ get_any_handle(List *datanodelist)
                 /* At the moment node is an index in the array, and we may need to wrap it */
                 if (node >= NumDataNodes)
                         node -= NumDataNodes;
+
                 /* See if handle is already used */
                 if (dn_handles[node].sock != NO_SOCKET)
                 {
@@ -2079,13 +2233,16 @@ get_any_handle(List *datanodelist)
  }
  
  /*
- * for specified list return array of PGXCNodeHandles
- * acquire from pool if needed.
- * the lenth of returned array is the same as of nodelist
- * For Datanodes, Special case is empty or NIL nodeList, in this case return all the nodes.
- * The returned list should be pfree'd when no longer needed.
- * For Coordinator, do not get a connection if Coordinator list is NIL,
- * Coordinator fds is returned only if transaction uses a DDL
+ * get_handles
+ *       Return array of node handles (PGXCNodeHandles) for requested nodes.
+ *
+ * If we don't have the handles in the pool, acquire from pool if needed.
+ *
+ * For datanodes, the specified list may be set to NIL, in which case we
+ * return handles for all datanodes.
+ *
+ * For coordinators, we do not acquire any handles when NIL list is used.
+ * Coordinator handles are needed only for transaction performing DDL.
   */
  PGXCNodeAllHandles *
  get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool is_global_session)
@@ -2360,6 +2517,10 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool
         return result;
  }
  
+/*
+ * get_current_handles
+ *       Return currently acquired handles.
+ */
  PGXCNodeAllHandles *
  get_current_handles(void)
  {
@@ -2414,7 +2575,10 @@ get_current_handles(void)
         return result;
  }
  
-/* Free PGXCNodeAllHandles structure */
+/*
+ * pfree_pgxc_all_handles
+ *       Free memory allocated for the PGXCNodeAllHandles structure.
+ */
  void
  pfree_pgxc_all_handles(PGXCNodeAllHandles *pgxc_handles)
  {
@@ -2433,11 +2597,14 @@ pfree_pgxc_all_handles(PGXCNodeAllHandles *pgxc_handles)
  
  /*
   * PGXCNodeGetNodeId
- *             Look at the data cached for handles and return node position
- *             If node type is PGXC_NODE_COORDINATOR look only in coordinator list,
- *             if node type is PGXC_NODE_DATANODE look only in datanode list,
- *             if other (assume PGXC_NODE_NODE) search both, in last case return actual
- *             node type.
+ *       Lookup index of the requested node (by OID) in the cached handles.
+ *
+ * Optionally, the node type may be restricted using the second parameter.
+ * If the type is PGXC_NODE_COORDINATOR, we only look in coordinator list.
+ * If the node is PGXC_NODE_DATANODE, we only look in datanode list.
+ *
+ * For other values (assume PGXC_NODE_NONE) we search for both node types,
+ * and then also return the actual node type in the second parameter.
   */
  int
  PGXCNodeGetNodeId(Oid nodeoid, char *node_type)
@@ -2478,7 +2645,9 @@ PGXCNodeGetNodeId(Oid nodeoid, char *node_type)
  
  /*
   * PGXCNodeGetNodeOid
- *             Look at the data cached for handles and return node Oid
+ *       Look at the data cached for handles and return node Oid.
+ *
+ * XXX Unlike PGXCNodeGetNodeId, this requires node type parameter.
   */
  Oid
  PGXCNodeGetNodeOid(int nodeid, char node_type)
@@ -2504,8 +2673,7 @@ PGXCNodeGetNodeOid(int nodeid, char node_type)
  
  /*
   * pgxc_node_str
- *
- * get the name of the node
+ *       get the name of the current node
   */
  Datum
  pgxc_node_str(PG_FUNCTION_ARGS)
@@ -2515,7 +2683,7 @@ pgxc_node_str(PG_FUNCTION_ARGS)
  
  /*
   * PGXCNodeGetNodeIdFromName
- *             Return node position in handles array
+ *       Return position of the node (specified by name) in handles array.
   */
  int
  PGXCNodeGetNodeIdFromName(char *node_name, char *node_type)
@@ -2544,42 +2712,48 @@ PGXCNodeGetNodeIdFromName(char *node_name, char *node_type)
         return PGXCNodeGetNodeId(nodeoid, node_type);
  }
  
+/*
+ * paramlist_delete_param
+ *       Delete parameter with the specified name from the parameter list.
+ */
  static List *
  paramlist_delete_param(List *param_list, const char *name)
  {
-          ListCell   *cur_item;
-          ListCell   *prev_item;
-
-          prev_item = NULL;
-          cur_item = list_head(param_list);
-
-          while (cur_item != NULL)
-          {
-                          ParamEntry *entry = (ParamEntry *) lfirst(cur_item);
-
-                          if (strcmp(NameStr(entry->name), name) == 0)
-                          {
-                                          /* cur_item must be removed */
-                                          param_list = list_delete_cell(param_list, cur_item, prev_item);
-                                          pfree(entry);
-                                          if (prev_item)
-                                                          cur_item = lnext(prev_item);
-                                          else
-                                                          cur_item = list_head(param_list);
-                          }
-                          else
-                          {
-                                          prev_item = cur_item;
-                                          cur_item = lnext(prev_item);
-                          }
-          }
-
-          return param_list;
+       ListCell   *cur_item;
+       ListCell   *prev_item;
+
+       prev_item = NULL;
+       cur_item = list_head(param_list);
+
+       while (cur_item != NULL)
+       {
+               ParamEntry *entry = (ParamEntry *) lfirst(cur_item);
+
+               if (strcmp(NameStr(entry->name), name) == 0)
+               {
+                       /* cur_item must be removed */
+                       param_list = list_delete_cell(param_list, cur_item, prev_item);
+                       pfree(entry);
+                       if (prev_item)
+                               cur_item = lnext(prev_item);
+                       else
+                               cur_item = list_head(param_list);
+               }
+               else
+               {
+                       prev_item = cur_item;
+                       cur_item = lnext(prev_item);
+               }
+       }
+
+       return param_list;
  }
  
  /*
- * Remember new value of a session or transaction parameter, and set same
- * values on newly connected remote nodes.
+ * PGXCNodeSetParam
+ *       Remember new value of a session/transaction parameter.
+ *
+ * We'll set this parameter value for new connections to remote nodes.
   */
  void
  PGXCNodeSetParam(bool local, const char *name, const char *value, int flags)
@@ -2617,8 +2791,9 @@ PGXCNodeSetParam(bool local, const char *name, const char *value, int flags)
  
         /*
          * Special case for
-        *      RESET SESSION AUTHORIZATION
-        *      SET SESSION AUTHORIZATION TO DEFAULT
+        *
+        *      RESET SESSION AUTHORIZATION
+        *      SET SESSION AUTHORIZATION TO DEFAULT
          *
          * We must also forget any SET ROLE commands since RESET SESSION
          * AUTHORIZATION also resets current role to session default
@@ -2636,8 +2811,8 @@ PGXCNodeSetParam(bool local, const char *name, const char *value, int flags)
  
  
  /*
- * Forget all parameter values set either for transaction or both transaction
- * and session.
+ * PGXCNodeResetParams
+ *       Forget all transaction (or session too) parameters.
   */
  void
  PGXCNodeResetParams(bool only_local)
@@ -2662,6 +2837,10 @@ PGXCNodeResetParams(bool only_local)
         local_params = NULL;
  }
  
+/*
+ * get_set_command
+ *       Construct a command setting all parameters from a given list.
+ */
  static void
  get_set_command(List *param_list, StringInfo command, bool local)
  {
@@ -2687,22 +2866,29 @@ get_set_command(List *param_list, StringInfo command, bool local)
  
  
  /*
- * Returns SET commands needed to initialize remote session.
- * The command may already be biult and valid, return it right away if the case.
- * Otherwise build it up.
- * To support Distributed Session machinery coordinator should generate and
- * send a distributed session identifier to remote nodes. Generate it here.
+ * PGXCNodeGetSessionParamStr
+ *       Returns SET commands needed to initialize remote session.
+ *
+ * The SET command may already be built and valid (in the session_params),
+ * in which case we simply return it. Otherwise we build if from session
+ * parameter list.
+ *
+ * To support "Distributed Session" machinery, the coordinator should
+ * generate and send a distributed session identifier to remote nodes.
+ * Generate it here (simply as nodename_PID).
+ *
+ * We always define a parameter with PID of the parent process (which is
+ * this backend).
   */
  char *
  PGXCNodeGetSessionParamStr(void)
  {
         /*
-        * If no session parameters are set and that is a coordinator we need to set
-        * global_session anyway, even if there were no other parameters.
-        * We do not want this string to disappear, so create it in the
-        * TopMemoryContext. However if we add first session parameter we will need
-        * to free the buffer and recreate it in the same context as the hash table
-        * to avoid memory leakage.
+        * If no session parameters are set and this is a coordinator node, we
+        * need to set global_session anyway, even if there are no other params.
+        *
+        * We do not want this string to simply disappear, so create it in the
+        * TopMemoryContext.
          */
         if (session_params == NULL)
         {
@@ -2711,7 +2897,7 @@ PGXCNodeGetSessionParamStr(void)
                 MemoryContextSwitchTo(oldcontext);
         }
  
-       /* If the paramstr invalid build it up */
+       /* If the parameter string is empty, build it up. */
         if (session_params->len == 0)
         {
                 if (IS_PGXC_COORDINATOR)
@@ -2726,9 +2912,11 @@ PGXCNodeGetSessionParamStr(void)
  
  
  /*
- * Returns SET commands needed to initialize transaction on a remote session.
- * The command may already be biult and valid, return it right away if the case.
- * Otherwise build it up.
+ * PGXCNodeGetTransactionParamStr
+ *       Returns SET commands needed to initialize transaction on a remote node.
+ *
+ * The command may already be built and valid (in local_params StringInfo), in
+ * which case we return it right away. Otherwise build it up.
   */
  char *
  PGXCNodeGetTransactionParamStr(void)
@@ -2738,7 +2926,7 @@ PGXCNodeGetTransactionParamStr(void)
                 return NULL;
  
         /*
-        * If the paramstr invalid build it up.
+        * If the StringInfo is not allocated yed, do it in TopTransactionContext.
          */
         if (local_params == NULL)
         {
@@ -2746,25 +2934,30 @@ PGXCNodeGetTransactionParamStr(void)
                 local_params = makeStringInfo();
                 MemoryContextSwitchTo(oldcontext);
         }
+
         /*
-        * If parameter string exists it is valid, it is truncated when parameters
-        * are modified.
+        * If the parameter string is empty, it was reset in PGXCNodeSetParam. So
+        * recompute it, using the current local_param_list (we know it's not
+        * empty, otherwise we wound't get here through the first condition).
          */
         if (local_params->len == 0)
         {
                 get_set_command(local_param_list, local_params, true);
         }
+
         return local_params->len == 0 ? NULL : local_params->data;
  }
  
  
  /*
- * Send down specified query, read and discard all responses until ReadyForQuery
+ * pgxc_node_set_query
+ *       Send down specified query, discard all responses until ReadyForQuery.
   */
  void
  pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query)
  {
         pgxc_node_send_query(handle, set_query);
+
         /*
          * Now read responses until ReadyForQuery.
          * XXX We may need to handle possible errors here.
diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c

index 140907d8728407d66b649459264b99483f4acbc6..3722e9e04de77be665b919d0cf9cde4dd2474aa4 100644 (file)
--- a/src/backend/pgxc/pool/poolmgr.c
+++ b/src/backend/pgxc/pool/poolmgr.c
@@ -2,34 +2,217 @@
   *
   * poolmgr.c
   *
- *       Connection pool manager handles connections to Datanodes
+ *       Connection pool manager handles connections to other nodes.
   *
- * The pooler runs as a separate process and is forked off from a
- * Coordinator postmaster. If the Coordinator needs a connection from a
- * Datanode, it asks for one from the pooler, which maintains separate
- * pools for each Datanode. A group of connections can be requested in
- * a single request, and the pooler returns a list of file descriptors
- * to use for the connections.
   *
- * Note the current implementation does not yet shrink the pool over time
- * as connections are idle.  Also, it does not queue requests; if a
- * connection is unavailable, it will simply fail. This should be implemented
- * one day, although there is a chance for deadlocks. For now, limiting
- * connections should be done between the application and Coordinator.
- * Still, this is useful to avoid having to re-establish connections to the
- * Datanodes all the time for multiple Coordinator backend sessions.
+ * During query execution, nodes in the cluster often need communicate
+ * with other nodes. This applies both to coordinators (which generally
+ * delegate the query execution to the datanodes) and datanodes (that
+ * may need to exchange data with other datanodes, e.g. to redistribute
+ * one side of a join).
   *
- * The term "agent" here refers to a session manager, one for each backend
- * Coordinator connection to the pooler. It will contain a list of connections
- * allocated to a session, at most one per Datanode.
+ * Opening a new connection every time would be very inefficient (and
+ * would quickly become a major bottleneck in OLTP workloads with short
+ * queries/transactions), so XL pools and reuses the connections.
   *
+ * The pool manager runs as a separate auxiliary process and is forked
+ * from the postmaster in AuxiliaryProcessMain(), similarly to other
+ * auxiliary processes (checkpointer, bgwriter, ...).
+ *
+ * When a backend needs a connection to another node, it does not open
+ * it on it's own, but instead asks the pool manager. The pool manager
+ * maintains lists of connections for other nodes, so in most cases it
+ * can quickly provide an existing connection.
+ *
+ * Backends often need multiple connections at the same time (unless the
+ * query gets pushed to just a single node), so to reduce the overhead
+ * it's also possible to request multiple connections at once. In that
+ * case the pool manager handles all of them at once, and returns file
+ * descriptors for all the nodes at once.
+ *
+ *
+ * Note: The connection requests are not queued; if a connection is not
+ * unavailable (and can't be opened right away), the request will simply
+ * fail. This should be implemented one day, although there is a chance
+ * for deadlocks. For now, limiting connections should be done between
+ * the application and the coordinator. Still, this is useful to avoid
+ * having to re-establish connections to the datanodes all the time for
+ * multiple coordinator backend sessions.
+ *
+ * XXX Well, we try to do pools_maintenance(), which closes all old idle
+ * connections. But we try to do that only once, to prevent infinite
+ * loops.
+ *
+ * The term "pool agent" here refers to a session manager, one for each
+ * backend accessing the pooler. It manages a list of connections
+ * allocated to a session, at most one per datanode.
+ *
+ *
+ * entities of the pooler
+ * ======================
+ *
+ * This section is an overview of basic entities in the connection pool
+ * implementation. With the exception of PoolManager, all the entities
+ * are represented by a struct.
+ *
+ *
+ * PoolManager
+ * -----------
+ *
+ * - The auxiliary process started by postmaster, managing all requests
+ *   from sessions (from backend processes).
+ *
+ * - Requests arrive through PoolHandle (from sessions) and responses
+ *   (back to sessions) are sent through PoolAgent.
+ *
+ * PoolHandle
+ * ----------
+ *
+ * - Connection to PoolManager from sessions, i.e. when the sessions
+ *   needs something from the pool manager (e.g. new connection), it
+ *   sends a request a request through the handle (which pretty much
+ *   represents a unix socket).
+ *
+ * - Created and initialized in the backend process.
+ *
+ * PoolAgent
+ * ---------
+ *
+ * - Represents a session in the connection pool manager process, and
+ *   associates it with a database pool.
+ *
+ * - Tracks open connections to other nodes in the cluster, so that
+ *   we can release or close them automatically if needed.
+ *
+ * DatabasePool
+ * ------------
+ *
+ * - A connection pool for a particular database/user combination, or
+ *   rather a collection of per-node connection pools, one for each
+ *   node in the cluster.
+ *
+ * PGXCNodePool
+ * ------------
+ *
+ * - A pool of connections for a particular node in the cluster, part
+ *   of a DatabasePool (i.e. for a database/user combination).
+ *
+ * PGXCNodePoolSlot
+ * ----------------
+ *
+ * - A pooled connection, tracked in PGXCNodePool.
+ *
+ *
+ * interaction with the pooler
+ * ===========================
+ *
+ * When a session needs to open connections to other nodes, this is very
+ * roughly what happens:
+ *
+ * 1) PoolManagerConnect (backend session)
+ *
+ *    Initializes connection to the pool manager process (through the
+ *    unix socket), so that the session can send messages to the pool.
+ *    The connection is represented by "pool handle".
+ *
+ *    Note: This is not called directly, but automatically from the
+ *    functions that require connection to connection pool.
+ *
+ * 2) agent_create/agent_init (pool manager)
+ * 
+ *    Accepts the connection from the session, and opens a socket used
+ *    to respond to the session (e.g. with pooled connections).
+ *
+ *    Initializes the PoolAgent responsible for managing the pooled
+ *    connections assigned to this session, and associates it with
+ *    a database pool (dbname/user combination).
+ *
+ * 3) PoolManagerGetConnections (backend session)
+ *
+ *    Sends a request to the pool manager (through the pool handle).
+ *    The pool manager handles this in handle_get_connections(), and
+ *    sends back a list of file descriptors (pooled connections).
+ *
+ * 4) PoolManagerReleaseConnections (backend session)
+ *
+ *    Sends a request to the pool manager, notifying it that the
+ *    connections can be returned to the shared connection pool (or
+ *    have to be closed, in case of error).
+ *
+ *    The pool manager handles this in agent_release_connections().
+ *
+ * 5) PoolManagerDisconnect (backend session)
+ *
+ *    Sends a 'disconnect' message to the pool manager, and resets
+ *    the pool handle to NULL (if the session needs more connections,
+ *    it'll reconnect and start from scratch).
+ *
+ *    The pool manager handles the message by calling agent_destroy(),
+ *    which releases all remaining connections associated with the
+ *    agent, and then releases all the memory.
+ *
+ *
+ * public connection pool API
+ * ==========================
+ *
+ * The previous section briefly discussed the simplest interaction with
+ * the pool manager. This section provides a more complete overview of
+ * the pooler API, with some additional functions.
+ *
+ * These functions are meant to be used from the backends, and mostly
+ * "only" send requests to the pool manager (through the socket). The
+ * pool manager then processes those requests and does all the work.
+ *
+ * The primary use case (pooling) is handled by two functions:
+ *
+ * - PoolManagerGetConnections         acquire connection from the pool
+ * - PoolManagerReleaseConnections     release pooled connections back
+ *
+ * To cancel a query or abort a transaction in a distributed database,
+ * we need to forward the cancel/abort requests to all participating
+ * connection (tracked by PoolAgent). This is done by:
+ *
+ * - PoolManagerCancelQuery            forward "query cancel"
+ * - PoolManagerAbortTransactions      forward "abort transaction"
+ *
+ * The API also includes a number of 'maintenance' functions, which are
+ * useful e.g. when changing configuration of the cluster.
+ *
+ * - PoolManagerCleanConnection        close all unused connections
+ * - PoolManagerCheckConnectionInfo    check connection consistency
+ * - PoolManagerRefreshConnectionInfo  close mismatching connections
+ * - PoolManagerReloadConnectionInfo   close all connections
+ *
+ * There's a number of additional helper functions, but those are mostly
+ * internal and marked as static.
+ *
+ *
+ * XXX Why do we even need a separate connection pool manager? Can't we
+ * simply track the connections in a shared memory, somehow? That should
+ * be fairly simple, and it would remove the need for a separate process
+ * managing requests from all backends, no?
+ *
+ * XXX Apparently there's no "max_db_connections" option, that would
+ * limit the number of connections per node (similarly to what pgbouncer
+ * does for each DB pool, by grouping all per-user connections).
+ *
+ * XXX Make POOL_CHECK_SUCCESS and POOL_CHECK_FAILED an enum.
+ *
+ * XXX Some of the functions expect two separate lists of nodes, one for
+ * datanodes and one for coordinators. Not sure why that is necessary,
+ * and it makes the code more complicated.
+ *
+ * XXX The message types are hard-coded in the various methods as magic
+ * constants (e.g. PoolManagerAbortTransactions uses 'a'). Perhaps
+ * define this somewhere in a clear manner, e.g. like a #define.
   *
   * Portions Copyright (c) 2012-2014, TransLattice, Inc.
   * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
   * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
+ *
   * IDENTIFICATION
- *       $$
+ *       src/backend/pgxc/pool/poolmgr.c
   *
   *-------------------------------------------------------------------------
   */
@@ -75,8 +258,7 @@ int                  PoolConnKeepAlive = 600;
  int                    PoolMaintenanceTimeout = 30;
  int                    MaxPoolSize = 100;
  int                    PoolerPort = 6667;
-
-bool                   PersistentConnections = false;
+bool           PersistentConnections = false;
  
  /* Flag to tell if we are Postgres-XC pooler process */
  static bool am_pgxc_pooler = false;
@@ -89,35 +271,66 @@ typedef struct
         int     port;
  } PGXCNodeConnectionInfo;
  
-/* Handle to the pool manager (Session's side) */
+/* Handle to the pool manager (from each session) */
  typedef struct
  {
         /* communication channel */
         PoolPort        port;
  } PoolHandle;
  
-/* The root memory context */
+/* The pooler root memory context */
  static MemoryContext PoolerMemoryContext = NULL;
-/*
- * Allocations of core objects: Datanode connections, upper level structures,
- * connection strings, etc.
- */
+
+/* Core objects: connections, connection strings, etc. */
  static MemoryContext PoolerCoreContext = NULL;
-/*
- * Memory to store Agents
- */
+
+/* Pool Agents */
  static MemoryContext PoolerAgentContext = NULL;
  
-/* Pool to all the databases (linked list) */
+/*
+ * A list of connection pools per (one for each db/user combination).
+ *
+ * XXX The DatabasePool are organized in a simple linked list. That may
+ * be an issue with many databases/users, so perhaps we should consider
+ * organizing this in a hash table  or something. But for now linked
+ * list is good enough.
+ */
  static DatabasePool *databasePools = NULL;
  
-/* PoolAgents and the poll array*/
+/*
+ * An array of allocated PoolAgents (one for each session).
+ *
+ * There's a 1:1 mapping between sessions and agents, so the number of
+ * agents is limited by MaxConnections. Also, we can access the agents
+ * directly using MyBackendId, so there's not much point in building a
+ * more complicated structure here (like a hash table for example).
+ *
+ * XXX That however does not happen, because agent_create() simply adds
+ * the agents at the end of the poolAgents array. So PoolerLoop and
+ * agent_destroy have to loop through the agents, etc. Seems expensive.
+ *
+ * XXX We do know that there will never be more than MaxConnections
+ * agents, so we can simply pre-allocate all of them in PoolManagerInit,
+ * and then only flag them as 'used/unused' intead of palloc/pfree.
+ */
  static int     agentCount = 0;
  static PoolAgent **poolAgents;
  
+/*
+ * A connection to the pool manager (essentially a PQ connection).
+ */
  static PoolHandle *poolHandle = NULL;
  
+/*
+ * PoolManager "lock" flag. The manager runs as a separate process, so
+ * we can use this very simple approach to locking.
+ */
  static int     is_pool_locked = false;
+
+/*
+ * File descriptor representing the pool manager UNIX socket. Sessions
+ * are communicating with the pool manager though this file descriptor.
+ */
  static int     server_fd = -1;
  
  static int     node_info_check(PoolAgent *agent);
@@ -141,10 +354,18 @@ static PGXCNodePoolSlot *acquire_connection(DatabasePool *dbPool, Oid node);
  static void agent_release_connections(PoolAgent *agent, bool force_destroy);
  static void release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot,
                                                            Oid node, bool force_destroy);
+
  static void destroy_slot(PGXCNodePoolSlot *slot);
-static PGXCNodePool *grow_pool(DatabasePool *dbPool, Oid node);
  static void destroy_node_pool(PGXCNodePool *node_pool);
+
+static PGXCNodePool *grow_pool(DatabasePool *dbPool, Oid node);
+static bool shrink_pool(DatabasePool *pool);
+static void pools_maintenance(void);
+
  static void PoolerLoop(void);
+static void PoolManagerConnect(const char *database, const char *user_name,
+               const char *pgoptions);
+
  static int clean_connection(List *node_discard,
                                                         const char *database,
                                                         const char *user_name);
@@ -153,14 +374,12 @@ static int *abort_pids(int *count,
                                            const char *database,
                                            const char *user_name);
  static char *build_node_conn_str(Oid node, DatabasePool *dbPool);
+
  /* Signal handlers */
  static void pooler_die(SIGNAL_ARGS);
  static void pooler_quickdie(SIGNAL_ARGS);
-static void PoolManagerConnect(const char *database, const char *user_name,
-               const char *pgoptions);
  static void pooler_sighup(SIGNAL_ARGS);
-static bool shrink_pool(DatabasePool *pool);
-static void pools_maintenance(void);
+
  static void TryPingUnhealthyNode(Oid nodeoid);
  
  /*
@@ -182,7 +401,7 @@ IsPGXCPoolerProcess(void)
  }
  
  /*
- * Initialize internal structures
+ * Initialize internal PoolManager structures.
   */
  int
  PoolManagerInit()
@@ -208,7 +427,8 @@ PoolManagerInit()
                                                                                            ALLOCSET_DEFAULT_INITSIZE,
                                                                                            ALLOCSET_DEFAULT_MAXSIZE);
  
-       ForgetLockFiles();      
+       /* XXX Not sure what this is ... */
+       ForgetLockFiles();
  
         /*
          * Properly accept or ignore signals the postmaster might send us
@@ -230,6 +450,7 @@ PoolManagerInit()
         /* Allocate pooler structures in the Pooler context */
         MemoryContextSwitchTo(PoolerMemoryContext);
  
+       /* Allocate pool agents, one for each connection (session). */
         poolAgents = (PoolAgent **) palloc(MaxConnections * sizeof(PoolAgent *));
         if (poolAgents == NULL)
         {
@@ -244,7 +465,11 @@ PoolManagerInit()
  
  
  /*
- * Check connection info consistency with system catalogs
+ * node_info_check
+ *       Check that connection info is consistent with system catalogs.
+ *
+ * Returns POOL_CHECK_SUCCESS when all the information (number of nodes,
+ * node OIDs and connection strings) match. POOL_CHECK_FAILED otherwise.
   */
  static int
  node_info_check(PoolAgent *agent)
@@ -258,8 +483,9 @@ node_info_check(PoolAgent *agent)
         int                             numDn;
  
         /*
-        * First check if agent's node information matches to current content of the
-        * shared memory table.
+        * First check if agent's node information (number of node OIDs and
+        * the OID values) matches the current contents of the shared memory
+        * table (with authoritative node information).
          */
         PgxcNodeGetOids(&coOids, &dnOids, &numCo, &numDn, false);
  
@@ -274,8 +500,14 @@ node_info_check(PoolAgent *agent)
         pfree(dnOids);
  
         /*
-        * Iterate over all dbnode pools and check if connection strings
-        * are matching node definitions.
+        * Iterate over all database pools and check if connection strings
+        * (in all node pools) match node definitions from node catalog.
+        *
+        * XXX Does this behave correctly with multiple database pools? We
+        * remember which nodes were already checked in a 'checked' list,
+        * so that we check each node just once. But doesn't that mean we
+        * only really check the first DatabasePool and fail to check the
+        * following ones?
          */
         while (res == POOL_CHECK_SUCCESS && dbPool)
         {
@@ -287,22 +519,30 @@ node_info_check(PoolAgent *agent)
                 {
                         char               *connstr_chk;
  
-                       /* No need to check same Datanode twice */
+                       /* No need to check same node twice */
                         if (list_member_oid(checked, nodePool->nodeoid))
                                 continue;
+
                         checked = lappend_oid(checked, nodePool->nodeoid);
  
                         connstr_chk = build_node_conn_str(nodePool->nodeoid, dbPool);
                         if (connstr_chk == NULL)
                         {
                                 /* Problem of constructing connection string */
+                               ereport(INFO,
+                                               (errmsg("failed to construct connection string for node %d",
+                                                               nodePool->nodeoid)));
                                 hash_seq_term(&hseq_status);
                                 res = POOL_CHECK_FAILED;
                                 break;
                         }
+
                         /* return error if there is difference */
                         if (strcmp(connstr_chk, nodePool->connstr))
                         {
+                               ereport(INFO,
+                                               (errmsg("mismatching connection string for node %d ('%s' != '%s')",
+                                                               nodePool->nodeoid, nodePool->connstr, connstr_chk)));
                                 pfree(connstr_chk);
                                 hash_seq_term(&hseq_status);
                                 res = POOL_CHECK_FAILED;
@@ -313,29 +553,21 @@ node_info_check(PoolAgent *agent)
                 }
                 dbPool = dbPool->next;
         }
+
         list_free(checked);
         return res;
  }
  
  /*
- * Destroy internal structures
- */
-int
-PoolManagerDestroy(void)
-{
-       int                     status = 0;
-
-       if (PoolerMemoryContext)
-       {
-               MemoryContextDelete(PoolerMemoryContext);
-               PoolerMemoryContext = NULL;
-       }
-
-       return status;
-}
-
-/*
- * Connect to the pooler process
+ * GetPoolManagerHandle
+ *       Connect to pool manager (through a UNIX socket).
+ *
+ * We know the pooler always runs on the same system (as it's just an
+ * auxiliary process forked from postmaster), so we only support UNIX
+ * sockets.
+ *
+ * XXX Perhaps this should fail at compile time when HAVE_UNIX_SOCKETS
+ * is not defined?
   */
  static void
  GetPoolManagerHandle(void)
@@ -343,8 +575,8 @@ GetPoolManagerHandle(void)
         PoolHandle *handle;
         int                     fdsock = -1;
  
+       /* do nothing if a session is already connected to pool manager */
         if (poolHandle)
-               /* already connected */
                 return;
  
  #ifdef HAVE_UNIX_SOCKETS
@@ -409,10 +641,11 @@ GetPoolManagerHandle(void)
                                 (errmsg("failed to connect to pool manager: %m")));
  
         /*
-        * Allocate handle
+        * Allocate the handle
+        *
+        * XXX We may change malloc to palloc here, but first ensure that
+        * the CurrentMemoryContext is set properly.
          *
-        * XXX we may change malloc here to palloc but first ensure
-        * the CurrentMemoryContext is properly set.
          * The handle allocated just before new session is forked off and
          * inherited by the session process. It should remain valid for all
          * the session lifetime.
@@ -432,7 +665,12 @@ GetPoolManagerHandle(void)
  }
  
  /*
- * Create agent
+ * agent_create
+ *       Create a PoolAgent for a new session.
+ *
+ * PoolAgent represents the session within pool manager process. So when
+ * the session wants to communicate with the pool manager, it sends the
+ * data through PoolHandle, and pool manager responds through PoolAgent.
   */
  static void
  agent_create(void)
@@ -493,21 +731,22 @@ agent_create(void)
  
  /*
   * session_options
- * Returns the pgoptions string generated using a particular
- * list of parameters that are required to be propagated to Datanodes.
- * These parameters then become default values for the pooler sessions.
+ *       Generates a pgoptions string to propagete to the other nodes.
+ *
+ * These parameters then become default values for the pooled sessions.
   * For e.g., a psql user sets PGDATESTYLE. This value should be set
   * as the default connection parameter in the pooler session that is
- * connected to the Datanodes. There are various parameters which need to
- * be analysed individually to determine whether these should be set on
- * Datanodes.
+ * connected to the other nodes.
   *
- * Note: These parameters values are the default values of the particular
- * Coordinator backend session, and not the new values set by SET command.
+ * There are various parameters which need to be analysed individually
+ * to determine whether these should be tracked and propagated.
   *
+ * Note: These parameters values are the default values of each backend
+ * session, and not the new values set by SET command. We simply get
+ * the default value using GetConfigOptionResetString().
   */
-
-char *session_options(void)
+char *
+session_options(void)
  {
         int                              i;
         char                    *pgoptions[] = {"DateStyle", "timezone", "geqo", "intervalstyle", "lc_monetary"};
@@ -547,8 +786,20 @@ char *session_options(void)
  
  
  /*
- * Associate session with specified database and respective connection pool
- * Invoked from Session process
+ * PoolManagerConnect
+ *       Connect session to a pool manager.
+ *
+ * Used from a backend to open a connection to the pool manager. The
+ * backends do not call this directly, though - it's called automatically
+ * from functions that need to communicate with the pool manager.
+ *
+ * Opens a communication channel by acquiring a "pool manger handle"
+ * (which opens a two-way connection through a UNIX socket), and then
+ * sends enough information (particularly dbname and username) to lookup
+ * the right connection pool.
+ *
+ * This only sends the message to the pool manager, but does not wait
+ * for response.
   */
  static void
  PoolManagerConnect(const char *database, const char *user_name,
@@ -561,7 +812,7 @@ PoolManagerConnect(const char *database, const char *user_name,
         int             pgoptionslen = strlen(pgoptions);
         char    atchar = ' ';
  
-       /* Connect to the pooler process if not yet connected */
+       /* Make sure we're connected to the pool manager process.*/
         GetPoolManagerHandle();
         if (poolHandle == NULL)
                 ereport(ERROR,
@@ -573,9 +824,10 @@ PoolManagerConnect(const char *database, const char *user_name,
  
         /*
          * Special handling for db_user_namespace=on
+        *
          * We need to handle per-db users and global users. The per-db users will
          * arrive with @dbname and global users just as username. Handle both of
-        * them appropriately
+        * them appropriately.
          */
         if (strcmp(GetConfigOption("db_user_namespace", false, false), "on") == 0)
         {
@@ -623,6 +875,7 @@ PoolManagerConnect(const char *database, const char *user_name,
         }
         else
                 pool_putbytes(&poolHandle->port, user_name, unamelen);
+
         pool_putbytes(&poolHandle->port, "\0", 1);
  
         /* Length of pgoptions string */
@@ -636,54 +889,11 @@ PoolManagerConnect(const char *database, const char *user_name,
  }
  
  /*
- * Reconnect to pool manager
- * It simply does a disconnection and a reconnection.
- */
-void
-PoolManagerReconnect(void)
-{
-       elog(DEBUG1, "Reconnecting to PoolManager");
-
-       /* Connected, disconnect */
-       if (poolHandle)
-               PoolManagerDisconnect();
-
-       PoolManagerConnect(get_database_name(MyDatabaseId), GetClusterUserName(),
-                       session_options());
-}
-
-/*
- * Lock/unlock pool manager
- * During locking, the only operations not permitted are abort, connection and
- * connection obtention.
- */
-void
-PoolManagerLock(bool is_lock)
-{
-       char msgtype = 'o';
-       int n32;
-       int msglen = 8;
-       if (poolHandle == NULL)
-               PoolManagerConnect(get_database_name(MyDatabaseId),
-                                                  GetClusterUserName(), "");
-
-       elog(DEBUG1, "Locking PoolManager");
-
-       /* Message type */
-       pool_putbytes(&poolHandle->port, &msgtype, 1);
-
-       /* Message length */
-       n32 = htonl(msglen);
-       pool_putbytes(&poolHandle->port, (char *) &n32, 4);
-
-       /* Lock information */
-       n32 = htonl((int) is_lock);
-       pool_putbytes(&poolHandle->port, (char *) &n32, 4);
-       pool_flush(&poolHandle->port);
-}
-
-/*
- * Init PoolAgent
+ * agent_init
+ *       Initialize a PoolAgent instance (allocate memory, etc.).
+ *
+ * Allocates memory for coordinator and datanode connections (in the
+ * per-agent memory context), and links it to the correct database pool.
   */
  static void
  agent_init(PoolAgent *agent, const char *database, const char *user_name,
@@ -695,6 +905,9 @@ agent_init(PoolAgent *agent, const char *database, const char *user_name,
         Assert(database);
         Assert(user_name);
  
+       elog(DEBUG1, "Initializing PoolAgent (user_name %s, database %s, "
+                       "pgoptions %s", user_name, database, pgoptions);
+
         /* disconnect if we are still connected */
         if (agent->pool)
                 agent_release_connections(agent, false);
@@ -709,20 +922,34 @@ agent_init(PoolAgent *agent, const char *database, const char *user_name,
                         palloc0(agent->num_coord_connections * sizeof(PGXCNodePoolSlot *));
         agent->dn_connections = (PGXCNodePoolSlot **)
                         palloc0(agent->num_dn_connections * sizeof(PGXCNodePoolSlot *));
-       /* find database */
+
+       /* find the right database pool */
         agent->pool = find_database_pool(database, user_name, pgoptions);
  
         /* create if not found */
         if (agent->pool == NULL)
                 agent->pool = create_database_pool(database, user_name, pgoptions);
  
+       Assert(agent->pool);
+
         MemoryContextSwitchTo(oldcontext);
  
         return;
  }
  
  /*
- * Destroy PoolAgent
+ * agent_destroy
+ *             Close remaining connections, release agent's memory.
+ *
+ * Under normal conditions, all connections managed by the agent should
+ * have been closed by this point. If there are some connections still
+ * associated with the agent, something must have gone wrong (error),
+ * in which case we have no idea in what state the connections are and
+ * we have no reliable / cheap way to find out. So just close them.
+ *
+ * XXX This is one of the places where we have to loop through the array
+ * of agents to find the "current" one. Seems expensive, especially when
+ * there are many short-lived sessions (as typical in OLTP).
   */
  static void
  agent_destroy(PoolAgent *agent)
@@ -733,17 +960,17 @@ agent_destroy(PoolAgent *agent)
  
         close(Socket(agent->port));
  
-       /* Discard connections if any remaining */
+       /*
+        * Release all connections the session might be still holding.
+        * 
+        * If the session is disconnecting while still holding some open
+        * connections, we have no idea if those connections are clean
+        * or not. So force destroying them.
+        */
         if (agent->pool)
-       {
-               /*
-                * If session is disconnecting while there are active connections
-                * we can not know if they clean or not, so force destroy them
-                */
                 agent_release_connections(agent, true);
-       }
  
-       /* find agent in the list */
+       /* Remove the agent from the poolAgents array. */
         for (i = 0; i < agentCount; i++)
         {
                 if (poolAgents[i] == agent)
@@ -762,8 +989,13 @@ agent_destroy(PoolAgent *agent)
  }
  
  /*
- * Ping an UNHEALTHY node and if it succeeds, update SHARED node
- * information
+ * TryPingUnhealthyNode
+ *       Try pinging a node marked as unhealthy, and update shared info.
+ *
+ * Try pinging a node previously marked as UNHEALTHY, and if it succeeds
+ * then update the SHARED node information (marking it as healthy).
+ *
+ * XXX Perhaps this should track timestamp of the last attempted ping?
   */
  static void
  TryPingUnhealthyNode(Oid nodeoid)
@@ -773,6 +1005,7 @@ TryPingUnhealthyNode(Oid nodeoid)
         char connstr[MAXPGPATH * 2 + 256];
  
         nodeDef = PgxcNodeGetDefinition(nodeoid);
+
         if (nodeDef == NULL)
         {
                 /* No such definition, node dropped? */
@@ -780,6 +1013,8 @@ TryPingUnhealthyNode(Oid nodeoid)
                          " skipping health check", nodeoid);
                 return;
         }
+
+       /* XXX This fails to release the nodeDef, which is a memory leak. */
         if (nodeDef->nodeishealthy)
         {
                 /* hmm, can this happen? */
@@ -790,9 +1025,11 @@ TryPingUnhealthyNode(Oid nodeoid)
  
         elog(LOG, "node (%s:%u) down! Trying ping",
                  NameStr(nodeDef->nodename), nodeoid);
+
         sprintf(connstr,
                         "host=%s port=%d", NameStr(nodeDef->nodehost),
                         nodeDef->nodeport);
+
         status = PGXCNodePing(connstr);
         if (status != 0)
         {
@@ -813,8 +1050,10 @@ TryPingUnhealthyNode(Oid nodeoid)
  }
  
  /*
- * Check if a node is indeed down and if it is update its UNHEALTHY
- * status
+ * PoolPingNodeRecheck
+ *       Check if a node is down, and if it is then mark it as UNHEALTHY.
+ *
+ * XXX Move to pgxcnode.c (as static), it's not used anywhere else.
   */
  void
  PoolPingNodeRecheck(Oid nodeoid)
@@ -858,7 +1097,11 @@ PoolPingNodeRecheck(Oid nodeoid)
  }
  
  /*
- * Ping UNHEALTHY nodes as part of the maintenance window
+ * PoolPingNodes
+ *       Ping nodes currently marked as UNHEALTHY.
+ *
+ * XXX Perhaps we should fetch only the unhealthy nodes, instead of
+ * fetching everything and then looping over them.
   */
  void
  PoolPingNodes()
@@ -875,7 +1118,7 @@ PoolPingNodes()
                                                  coHealthMap, dnHealthMap);
  
         /*
-        * Find unhealthy datanodes and try to re-ping them
+        * Find unhealthy datanodes and try to re-ping them.
          */
         for (i = 0; i < numDn; i++)
         {
@@ -885,8 +1128,9 @@ PoolPingNodes()
                         TryPingUnhealthyNode(nodeoid);
                 }
         }
+
         /*
-        * Find unhealthy coordinators and try to re-ping them
+        * Find unhealthy coordinators and try to re-ping them.
          */
         for (i = 0; i < numCo; i++)
         {
@@ -898,8 +1142,18 @@ PoolPingNodes()
         }
  }
  
+/***********************************************************************
+ * Communication with a pool manager (sending messages through socket).
+ **********************************************************************/
+
+
  /*
- * Release handle to pool manager
+ * PoolManagerDisconnect
+ *       Close connection to the pool manager and reset it to NULL.
+ *
+ * When everything goes well, the session notifies the pool manager by
+ * sending an exit message ('d'), closes the port and releases all
+ * memory associated with it.
   */
  void
  PoolManagerDisconnect(void)
@@ -917,7 +1171,12 @@ PoolManagerDisconnect(void)
  
  
  /*
- * Get pooled connections
+ * PoolManagerGetConnections
+ *       Acquire connections for requested nodes, along with their PIDs.
+ *
+ * Acquires pooled connections for the specified nodes, and returns an
+ * array of file descriptors, representing connections to the nodes.
+ * It also provides array of PIDs of the backends (on remote nodes).
   */
  int *
  PoolManagerGetConnections(List *datanodelist, List *coordlist, int **pids)
@@ -926,20 +1185,27 @@ PoolManagerGetConnections(List *datanodelist, List *coordlist, int **pids)
         ListCell   *nodelist_item;
         int                *fds;
         int                     totlen = list_length(datanodelist) + list_length(coordlist);
-       int                     nodes[totlen + 2];
+       int                     nodes[totlen + 2]; /* node OIDs + two node counts */
  
+       /* Make sure we're connected to the pool manager. */
         if (poolHandle == NULL)
                 PoolManagerConnect(get_database_name(MyDatabaseId),
                                                    GetClusterUserName(), session_options());
  
         /*
-        * Prepare end send message to pool manager.
-        * First with Datanode list.
-        * This list can be NULL for a query that does not need
-        * Datanode Connections (Sequence DDLs)
+        * Prepare a message we send to the pool manager. We build it in the
+        * nodes array, as all the fields are int-sized.
+        *
+        * - number of datanodes
+        * - datanode OIDs
+        * - number of coordinators
+        * - coordinator OIDs
+        * 
+        * The datanode list may be empty when the query does not need talk
+        * to datanodes (e.g. sequence DDL).
          */
-       nodes[0] = htonl(list_length(datanodelist));
-       i = 1;
+       i = 0;
+       nodes[i++] = htonl(list_length(datanodelist));
         if (list_length(datanodelist) != 0)
         {
                 foreach(nodelist_item, datanodelist)
@@ -947,7 +1213,11 @@ PoolManagerGetConnections(List *datanodelist, List *coordlist, int **pids)
                         nodes[i++] = htonl(lfirst_int(nodelist_item));
                 }
         }
-       /* Then with Coordinator list (can be nul) */
+
+       /*
+        * Similarly for coordinators, some queries don't need them and in
+        * that case the list may be NULL.
+        */
         nodes[i++] = htonl(list_length(coordlist));
         if (list_length(coordlist) != 0)
         {
@@ -957,10 +1227,14 @@ PoolManagerGetConnections(List *datanodelist, List *coordlist, int **pids)
                 }
         }
  
+       /*
+        * Send the encoded datanode/coordinator OIDs to the pool manager,
+        * flush the message nd wait for the response.
+        */
         pool_putmessage(&poolHandle->port, 'g', (char *) nodes, sizeof(int) * (totlen + 2));
         pool_flush(&poolHandle->port);
  
-       /* Receive response */
+       /* Allocate memory for file descriptors (node connections). */
         fds = (int *) palloc(sizeof(int) * totlen);
         if (fds == NULL)
         {
@@ -968,14 +1242,19 @@ PoolManagerGetConnections(List *datanodelist, List *coordlist, int **pids)
                                 (errcode(ERRCODE_OUT_OF_MEMORY),
                                  errmsg("out of memory")));
         }
+
+       /* receive file descriptors */
         if (pool_recvfds(&poolHandle->port, fds, totlen))
         {
+               elog(WARNING, "failed to receive file descriptors for connections");
                 pfree(fds);
                 fds = NULL;
         }
  
+       /* receive PIDs for remote backends */
         if (pool_recvpids(&poolHandle->port, pids) != totlen)
         {
+               elog(WARNING, "failed to receive PIDs of remote backends");
                 pfree(*pids);
                 *pids = NULL;
                 return NULL;
@@ -984,16 +1263,26 @@ PoolManagerGetConnections(List *datanodelist, List *coordlist, int **pids)
         return fds;
  }
  
+
  /*
- * Abort active transactions using pooler.
- * Take a lock forbidding access to Pooler for new transactions.
+ * PoolManagerAbortTransactions
+ *       Abort active transactions on connections in a particular pool.
+ *
+ * Simply send an 'abort' message to the pool manager, which then aborts
+ * in-progress transaction on all connections in a matching DatabasePool
+ * (identified by dbname/username).
+ *
+ * Currently this point this only happens during CLEAN CONNECTION.
+ *
+ * An array of PIDs on which transactions were aborted is returned
+ * through the proc_pids argument, the number of PIDs as a return value.
   */
  int
  PoolManagerAbortTransactions(char *dbname, char *username, int **proc_pids)
  {
         int             num_proc_ids = 0;
         int             n32, msglen;
-       char            msgtype = 'a';
+       char    msgtype = 'a';
         int             dblen = dbname ? strlen(dbname) + 1 : 0;
         int             userlen = username ? strlen(username) + 1 : 0;
  
@@ -1039,10 +1328,12 @@ PoolManagerAbortTransactions(char *dbname, char *username, int **proc_pids)
  
  
  /*
- * Clean up Pooled connections
+ * PoolManagerCleanConnection
+ *       Performs a cleanup of pooled connections.
   */
  void
-PoolManagerCleanConnection(List *datanodelist, List *coordlist, char *dbname, char *username)
+PoolManagerCleanConnection(List *datanodelist, List *coordlist,
+                                                  char *dbname, char *username)
  {
         int                     totlen = list_length(datanodelist) + list_length(coordlist);
         int                     nodes[totlen + 2];
@@ -1052,16 +1343,25 @@ PoolManagerCleanConnection(List *datanodelist, List *coordlist, char *dbname, ch
         int                     userlen = username ? strlen(username) + 1 : 0;
         int                     dblen = dbname ? strlen(dbname) + 1 : 0;
  
-       /*
-        * New connection may be established to clean connections to
-        * specified nodes and databases.
-        */
+       /* Make sure we're connected to the pool manager. */
         if (poolHandle == NULL)
                 PoolManagerConnect(get_database_name(MyDatabaseId),
                                                    GetClusterUserName(), session_options());
  
-       nodes[0] = htonl(list_length(datanodelist));
-       i = 1;
+       /*
+        * Prepare a message we send to the pool manager. We build it in the
+        * nodes array, as all the fields are int-sized.
+        *
+        * - number of datanodes
+        * - datanode OIDs
+        * - number of coordinators
+        * - coordinator OIDs
+        * 
+        * The datanode list may be empty when the query does not need talk
+        * to datanodes (e.g. sequence DDL).
+        */
+       i = 0;
+       nodes[i++] = htonl(list_length(datanodelist));
         if (list_length(datanodelist) != 0)
         {
                 foreach(nodelist_item, datanodelist)
@@ -1069,7 +1369,11 @@ PoolManagerCleanConnection(List *datanodelist, List *coordlist, char *dbname, ch
                         nodes[i++] = htonl(lfirst_int(nodelist_item));
                 }
         }
-       /* Then with Coordinator list (can be nul) */
+
+       /*
+        * Similarly for coordinators, some queries don't need them and in
+        * that case the list may be NULL.
+        */
         nodes[i++] = htonl(list_length(coordlist));
         if (list_length(coordlist) != 0)
         {
@@ -1117,21 +1421,32 @@ PoolManagerCleanConnection(List *datanodelist, List *coordlist, char *dbname, ch
  
  
  /*
- * Check connection information consistency cached in pooler with catalog information
+ * PoolManagerCheckConnectionInfo
+ *       Check that pool manager info is consistent with the node catalog.
+ *
+ * Check that information used by the pool manager (for open connections)
+ * is consistent with the system catalog.
+ *
+ * Returns 'true' when everything seems consistent, and 'false' in case
+ * of some inconsistency.
   */
  bool
  PoolManagerCheckConnectionInfo(void)
  {
         int res;
  
-       /*
-        * New connection may be established to clean connections to
-        * specified nodes and databases.
-        */
+       /* Make sure we're connected to the pool manager. */
         if (poolHandle == NULL)
                 PoolManagerConnect(get_database_name(MyDatabaseId),
                                                    GetClusterUserName(), session_options());
+
+       /*
+        * The name is a bit misleading, but PgxcNodeListAndCount updates
+        * information about nodes in shared memory from system catalog.
+        */
         PgxcNodeListAndCount();
+
+       /* Send message to the pool manager and wait for a response. */
         pool_putmessage(&poolHandle->port, 'q', NULL, 0);
         pool_flush(&poolHandle->port);
  
@@ -1145,7 +1460,8 @@ PoolManagerCheckConnectionInfo(void)
  
  
  /*
- * Reload connection data in pooler and drop all the existing connections of pooler
+ * PoolManagerReloadConnectionInfo
+ *       Reload connection metadata and close all open connections.
   */
  void
  PoolManagerReloadConnectionInfo(void)
@@ -1156,11 +1472,14 @@ PoolManagerReloadConnectionInfo(void)
         pool_flush(&poolHandle->port);
  }
  
+
  /*
- * Refresh connection data in pooler and drop connections for those nodes
- * that have changed. Thus, this operation is less destructive as compared
- * to PoolManagerReloadConnectionInfo and should typically be called when
- * NODE ALTER has been performed
+ * PoolManagerRefreshConnectionInfo
+ *       Refresh connection metadata and close stale connections.
+ *
+ * Unlike PoolManagerReloadConnectionInfo, this only closes connections
+ * to nodes where the metadata changed. Thus, this operation is less
+ * destructive, and should typically be called after NODE ALTER.
   */
  int
  PoolManagerRefreshConnectionInfo(void)
@@ -1180,6 +1499,17 @@ PoolManagerRefreshConnectionInfo(void)
         return false;
  }
  
+
+/***********************************************************************
+ * Handling of messages sent to the pool manager (through the socket).
+ **********************************************************************/
+
+/*
+ * handle_abort
+ *       Handles 'abort transaction' action.
+ *
+ * The message is built and sent by PoolManagerAbortTransactions.
+ */
  static void
  handle_abort(PoolAgent * agent, StringInfo s)
  {
@@ -1206,6 +1536,15 @@ handle_abort(PoolAgent * agent, StringInfo s)
                 pfree(pids);
  }
  
+/*
+ * handle_connect
+ *       Initializes a PoolAgent object and associates is with a pool.
+ *
+ * Once the connect is complete, the agent is associated with a database
+ * pool and can provide pooled connections.
+ *
+ * The message is built and sent by PoolManagerConnect.
+ */
  static void
  handle_connect(PoolAgent * agent, StringInfo s)
  {
@@ -1226,14 +1565,19 @@ handle_connect(PoolAgent * agent, StringInfo s)
         len = pq_getmsgint(s, 4);
         pgoptions = pq_getmsgbytes(s, len);
  
-       /*
-        * Coordinator pool is not initialized.
-        * With that it would be impossible to create a Database by default.
-        */
+       /* Initialize the agent - find the proper DatabasePool, etc. */
         agent_init(agent, database, user_name, pgoptions);
+
+       /* XXX Shouldn't this be before the agent_init? */
         pq_getmsgend(s);
  }
  
+/*
+ * handle_clean_connection
+ *       Handles CLEAN CONNECTION command.
+ *
+ * The message is built and sent by PoolManagerCleanConnection.
+ */
  static void
  handle_clean_connection(PoolAgent * agent, StringInfo s)
  {
@@ -1275,15 +1619,21 @@ handle_clean_connection(PoolAgent * agent, StringInfo s)
  
         pq_getmsgend(s);
  
-       /* Clean up connections here */
+       /* perform the actual connection cleanup */
         res = clean_connection(nodelist, database, user_name);
  
         list_free(nodelist);
  
-       /* Send success result */
+       /* send result (success/failure) back */
         pool_sendres(&agent->port, res);
  }
  
+/*
+ * handle_get_connections
+ *       Acquire pooled connections to the specified nodes.
+ *
+ * The message is built and sent by PoolManagerGetConnections.
+ */
  static void
  handle_get_connections(PoolAgent * agent, StringInfo s)
  {
@@ -1294,22 +1644,26 @@ handle_get_connections(PoolAgent * agent, StringInfo s)
         List   *coordlist = NIL;
  
         /*
-        * Length of message is caused by:
-        * - Message header = 4bytes
-        * - List of Datanodes = NumPoolDataNodes * 4bytes (max)
-        * - List of Coordinators = NumPoolCoords * 4bytes (max)
-        * - Number of Datanodes sent = 4bytes
-        * - Number of Coordinators sent = 4bytes
-        * It is better to send in a same message the list of Co and Dn at the same
-        * time, this permits to reduce interactions between postmaster and pooler
+        * The message consists of:
+        *
+        * - Message header = 4B
+        * - Number of Datanodes sent = 4B
+        * - List of Datanodes = NumPoolDataNodes * 4B (max)
+        * - Number of Coordinators sent = 4B
+        * - List of Coordinators = NumPoolCoords * 4B (max)
          */
+
         pool_getmessage(&agent->port, s, 4 * agent->num_dn_connections + 4 * agent->num_coord_connections + 12);
  
+       /* decode the datanode OIDs */
         datanodecount = pq_getmsgint(s, 4);
         for (i = 0; i < datanodecount; i++)
                 datanodelist = lappend_int(datanodelist, pq_getmsgint(s, 4));
  
-       /* It is possible that no Coordinators are involved in the transaction */
+       /*
+        * decode the coordinator OIDs (there may be none, if no coordinators
+        * are involved in the transaction)
+        */
         coordcount = pq_getmsgint(s, 4);
         for (i = 0; i < coordcount; i++)
                 coordlist = lappend_int(coordlist, pq_getmsgint(s, 4));
@@ -1327,19 +1681,23 @@ handle_get_connections(PoolAgent * agent, StringInfo s)
         list_free(datanodelist);
         list_free(coordlist);
  
+       /* Send the file descriptors back, along with the correct count. */
         pool_sendfds(&agent->port, fds, fds ? datanodecount + coordcount : 0);
         if (fds)
                 pfree(fds);
  
-       /*
-        * Also send the PIDs of the remote backend processes serving
-        * these connections
-        */
+       /* Also send PIDs of the remote backends serving the connections. */
         pool_sendpids(&agent->port, pids, pids ? datanodecount + coordcount : 0);
         if (pids)
                 pfree(pids);
  }
  
+/*
+ * handle_query_cancel
+ *       Cancel query executed on connections associated with the agent.
+ *
+ * PoolManagerCancelQuery
+ */
  static void
  handle_query_cancel(PoolAgent * agent, StringInfo s)
  {
@@ -1378,7 +1736,8 @@ handle_query_cancel(PoolAgent * agent, StringInfo s)
  }
  
  /*
- * Handle messages to agent
+ * agent_handle_input
+ *       Handle messages passed to the pool agent from PoolerLoop().
   */
  static void
  agent_handle_input(PoolAgent * agent, StringInfo s)
@@ -1500,7 +1859,12 @@ agent_handle_input(PoolAgent * agent, StringInfo s)
  }
  
  /*
- * acquire connection
+ * agent_acquire_connections
+ *             Acquire connections to specified nodes, associate them with agent.
+ *
+ * Returns an array of file descriptors representing the connections, with
+ * order matching the datanode/coordinator list. Also returns an array of
+ * backend PIDs, handling those connections (on the remote nodes).
   */
  static int *
  agent_acquire_connections(PoolAgent *agent, List *datanodelist,
@@ -1526,12 +1890,17 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist,
         }
  
         /*
-        * Allocate memory
-        * File descriptors of Datanodes and Coordinators are saved in the same array,
-        * This array will be sent back to the postmaster.
-        * It has a length equal to the length of the Datanode list
-        * plus the length of the Coordinator list.
-        * Datanode fds are saved first, then Coordinator fds are saved.
+        * Allocate memory for the file descriptors and backend PIDs.
+        *
+        * File descriptors of datanodes and coordinators are both saved in
+        * a single array, which is then sent back to the backend. Datanodes
+        * are stored first, coordinators second, and the order matches the
+        * order of input lists.
+        *
+        * And similarly for the PIDs - single array, datanodes first.
+        *
+        * XXX How expensive is it to do the list_length over and over? Maybe
+        * do the count once and then use the value elsewhere?
          */
         result = (int *) palloc((list_length(datanodelist) + list_length(coordlist)) * sizeof(int));
         if (result == NULL)
@@ -1550,15 +1919,13 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist,
         }
  
         /*
-        * There are possible memory allocations in the core pooler, we want
-        * these allocations in the contect of the database pool
+        * Make sure the results (connections) are allocated in the memory
+        * context for the DatabasePool.
          */
         oldcontext = MemoryContextSwitchTo(agent->pool->mcxt);
  
-
-       /* Initialize result */
+       /* first open connections to the datanodes */
         i = 0;
-       /* Save in array fds of Datanodes first */
         foreach(nodelist_item, datanodelist)
         {
                 int                     node = lfirst_int(nodelist_item);
@@ -1586,6 +1953,8 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist,
                          * Update newly-acquired slot with session parameters.
                          * Local parameters are fired only once BEGIN has been launched on
                          * remote nodes.
+                        *
+                        * FIXME Perhaps we should be doing something here?
                          */
                 }
  
@@ -1593,7 +1962,10 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist,
                 (*pids)[i++] = ((PGconn *) agent->dn_connections[node]->conn)->be_pid;
         }
  
-       /* Save then in the array fds for Coordinators */
+       /* make sure we got the expected number of datanode connections */
+       Assert(i == list_length(datanodelist));
+
+       /* and then the coordinators */
         foreach(nodelist_item, coordlist)
         {
                 int                     node = lfirst_int(nodelist_item);
@@ -1620,6 +1992,8 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist,
                          * Update newly-acquired slot with session parameters.
                          * Local parameters are fired only once BEGIN has been launched on
                          * remote nodes.
+                        *
+                        * FIXME Perhaps we should be doing something here?
                          */
                 }
  
@@ -1629,11 +2003,15 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist,
  
         MemoryContextSwitchTo(oldcontext);
  
+       /* make sure we got the expected total number of connections */
+       Assert(i == list_length(datanodelist) + list_length(coordlist));
+
         return result;
  }
  
  /*
- * Cancel query
+ * cancel_query_on_connections
+ *       Cancel query running on connections managed by a PoolAgent.
   */
  static int
  cancel_query_on_connections(PoolAgent *agent, List *datanodelist, List *coordlist)
@@ -1706,7 +2084,8 @@ cancel_query_on_connections(PoolAgent *agent, List *datanodelist, List *coordlis
  }
  
  /*
- * Return connections back to the pool
+ * PoolManagerReleaseConnections
+ *       Return all connections back to the pool.
   */
  void
  PoolManagerReleaseConnections(bool force)
@@ -1715,7 +2094,10 @@ PoolManagerReleaseConnections(bool force)
         int n32;
         int msglen = 8;
  
-       /* If disconnected from pooler all the connections already released */
+       /*
+        * If disconnected from the pool manager, all the connections were
+        * already released.
+        */
         if (!poolHandle)
                 return;
  
@@ -1735,7 +2117,8 @@ PoolManagerReleaseConnections(bool force)
  }
  
  /*
- * Cancel Query
+ * PoolManagerCancelQuery
+ *       Cancel query on all nodes where it's running.
   */
  void
  PoolManagerCancelQuery(int dn_count, int* dn_list, int co_count, int* co_list)
@@ -1794,7 +2177,10 @@ PoolManagerCancelQuery(int dn_count, int* dn_list, int co_count, int* co_list)
  }
  
  /*
- * Release connections for Datanodes and Coordinators
+ * agent_release_connections
+ *       Release connections associated with a PoolAgent instance.
+ *
+ * 
   */
  static void
  agent_release_connections(PoolAgent *agent, bool force_destroy)
@@ -1802,8 +2188,15 @@ agent_release_connections(PoolAgent *agent, bool force_destroy)
         MemoryContext oldcontext;
         int                     i;
  
+       /* If there are no open connections in the agent, we're done. */
         if (!agent->dn_connections && !agent->coord_connections)
                 return;
+
+       /*
+        * In PAUSED cluster (see src/backend/pgxc/cluster/pause.c) we can't
+        * return any connections to the connection pools, we can only close
+        * them, so we require 'force'.
+        */
         if (!force_destroy && cluster_ex_lock_held)
         {
                 elog(LOG, "Not releasing connection with cluster lock");
@@ -1811,29 +2204,33 @@ agent_release_connections(PoolAgent *agent, bool force_destroy)
         }
  
         /*
-        * There are possible memory allocations in the core pooler, we want
-        * these allocations in the contect of the database pool
+        * Make sure all allocations happen in the DatabasePool memory context
+        * (and not for example in the main pooler context, which would cause
+        * memory leaks, or in caller's context, likely causing crashes).
          */
         oldcontext = MemoryContextSwitchTo(agent->pool->mcxt);
  
         /*
-        * Remaining connections are assumed to be clean.
-        * First clean up for Datanodes
+        * All currently open connections are assumed to be 'clean' so just
+        * return them back to the pool (or close them, with force_destroy).
+        * First the datanodes, then coordinators.
          */
         for (i = 0; i < agent->num_dn_connections; i++)
         {
                 PGXCNodePoolSlot *slot = agent->dn_connections[i];
  
                 /*
-                * Release connection.
+                * Release the connection.
+                *
                  * If connection has temporary objects on it, destroy connection slot.
                  */
                 if (slot)
                         release_connection(agent->pool, slot, agent->dn_conn_oids[i], force_destroy);
+
                 agent->dn_connections[i] = NULL;
                 elog(DEBUG1, "Released connection to node %d", agent->dn_conn_oids[i]);
         }
-       /* Then clean up for Coordinator connections */
+
         for (i = 0; i < agent->num_coord_connections; i++)
         {
                 PGXCNodePoolSlot *slot = agent->coord_connections[i];
@@ -1844,6 +2241,7 @@ agent_release_connections(PoolAgent *agent, bool force_destroy)
                  */
                 if (slot)
                         release_connection(agent->pool, slot, agent->coord_conn_oids[i], force_destroy);
+
                 agent->coord_connections[i] = NULL;
                 elog(DEBUG1, "Released connection to node %d", agent->coord_conn_oids[i]);
         }
@@ -1851,7 +2249,7 @@ agent_release_connections(PoolAgent *agent, bool force_destroy)
         /*
          * Released connections are now in the pool and we may want to close
          * them eventually. Update the oldest_idle value to reflect the latest
-        * last access time if not already updated..
+        * last access time if not already updated.
          */
         if (!force_destroy && agent->pool->oldest_idle == (time_t) 0)
                 agent->pool->oldest_idle = time(NULL);
@@ -1859,13 +2257,24 @@ agent_release_connections(PoolAgent *agent, bool force_destroy)
         MemoryContextSwitchTo(oldcontext);
  }
  
+
+/***********************************************************************
+ * Pool Management
+ **********************************************************************/
+
  /*
- * Create new empty pool for a database.
- * By default Database Pools have a size null so as to avoid interactions
- * between PGXC nodes in the cluster (Co/Co, Dn/Dn and Co/Dn).
- * Pool is increased at the first GET_CONNECTION message received.
- * Returns POOL_OK if operation succeed POOL_FAIL in case of OutOfMemory
- * error and POOL_WEXIST if poll for this database already exist.
+ * create_database_pool
+ *       Create new empty pool for a database/user combination.
+ *
+ * We only initialize the database pool and add it to the global list,
+ * but do not try to preallocate any connections. That only happens when
+ * the first request for connection arrives.
+ *
+ * Returns a pointer to the new DatabasePool in case of success, NULL
+ * when something fails (out of memory, etc.)
+ *
+ * XXX Should we add some protection against duplicate pools? Probably
+ * not really necessary.
   */
  static DatabasePool *
  create_database_pool(const char *database, const char *user_name, const char *pgoptions)
@@ -1878,14 +2287,18 @@ create_database_pool(const char *database, const char *user_name, const char *pg
         elog(DEBUG1, "Creating a connection pool for database %s, user %s,"
                         " with pgoptions %s", database, user_name, pgoptions);
  
+       /* create a memory context for the database pool */
         dbcontext = AllocSetContextCreate(PoolerCoreContext,
-                                                                         "DB Context",
+                                                                         "Database Pool Context",
                                                                           ALLOCSET_DEFAULT_MINSIZE,
                                                                           ALLOCSET_DEFAULT_INITSIZE,
                                                                           ALLOCSET_DEFAULT_MAXSIZE);
+
         oldcontext = MemoryContextSwitchTo(dbcontext);
-       /* Allocate memory */
+
+       /* Allocate memory (already in the dbpool memory context) */
         databasePool = (DatabasePool *) palloc(sizeof(DatabasePool));
+
         if (!databasePool)
         {
                 /* out of memory */
@@ -1896,15 +2309,16 @@ create_database_pool(const char *database, const char *user_name, const char *pg
         }
  
         databasePool->mcxt = dbcontext;
-        /* Copy the database name */
+
+       /* copy the basic details about the pool */
         databasePool->database = pstrdup(database);
-        /* Copy the user name */
         databasePool->user_name = pstrdup(user_name);
-       /* Reset the oldest_idle value */
-       databasePool->oldest_idle = (time_t) 0;
-        /* Copy the pgoptions */
         databasePool->pgoptions = pstrdup(pgoptions);
  
+       /* reset the oldest_idle value */
+       databasePool->oldest_idle = (time_t) 0;
+
+       /* FIXME We should check all the parameters we just copied. */
         if (!databasePool->database)
         {
                 /* out of memory */
@@ -1931,7 +2345,7 @@ create_database_pool(const char *database, const char *user_name, const char *pg
  
         MemoryContextSwitchTo(oldcontext);
  
-       /* Insert into the list */
+       /* insert the new database pool into the global list */
         insert_database_pool(databasePool);
  
         return databasePool;
@@ -1939,7 +2353,16 @@ create_database_pool(const char *database, const char *user_name, const char *pg
  
  
  /*
- * Destroy the pool and free memory
+ * destroy_database_pool
+ *       Destroy a database pool for a user/dbname combination.
+ *
+ * When a matching database pool exists, we destroy all the node pools
+ * (which closes all the connection), and release the memory context.
+ *
+ * Returns 1 in case of success (when pool exists), 0 when a matching
+ * pool was not found.
+ *
+ * XXX Maybe return true/false instead?
   */
  static int
  destroy_database_pool(const char *database, const char *user_name)
@@ -1965,19 +2388,28 @@ destroy_database_pool(const char *database, const char *user_name)
                 MemoryContextDelete(databasePool->mcxt);
                 return 1;
         }
+
+       elog(DEBUG1, "Connection pool for database %s, user %s not found",
+                       database, user_name);
+
         return 0;
  }
  
  
  /*
- * Insert new database pool to the list
+ * insert_database_pool
+ *       Insert the newly created pool to the head of the global pool list.
   */
  static void
  insert_database_pool(DatabasePool *databasePool)
  {
         Assert(databasePool);
  
-       /* Reference existing list or null the tail */
+       /*
+        * Reference existing list or null the tail
+        *
+        * XXX The 'if' seems somewhat unnecessary I guess ...
+        */
         if (databasePools)
                 databasePool->next = databasePools;
         else
@@ -1989,7 +2421,10 @@ insert_database_pool(DatabasePool *databasePool)
  
  /*
   * reload_database_pools
- *     rebuild connection information for all database pools
+ *       Rebuild connection information for all database pools.
+ *
+ * Connection information reload applies to all database pools (not
+ * just the one associated with a the current pool agent).
   *
   * A database pool is reloaded as follows for each remote node:
   *
@@ -1999,21 +2434,22 @@ insert_database_pool(DatabasePool *databasePool)
   * - node pool is deleted if its port or host information is changed.
   *   Subsequently all its connections are dropped.
   *
- * - node pool is kept unchanged with existing connection information
- *   is not changed. However its index position in node pool is changed
- *   according to the alphabetical order of the node name in new
- *   cluster configuration.
+ * - node pool is kept unchanged if the connection information has not
+ *   changed. However its index position in node pool changes according
+ *   to the alphabetical order of the node name in new configuration.
   *
   * Backend sessions are responsible to reconnect to the pooler to update
   * their agent with newest connection information.
   *
- * The session invocating connection information reload is reconnected
- * and uploaded automatically after database pool reload. Other server
- * sessions are signaled to reconnect to pooler and update their
- * connection information separately.
+ * The session that triggered the connection metadata reload reconnects
+ * automatically after the reload. Other server sessions are signaled
+ * to reconnect to pooler and update their connection info separately.
   *
   * During reload process done internally on pooler, pooler is locked
   * to forbid new connection requests.
+ *
+ * XXX Where does the locking happen?
+ * XXX Where do we signal the other sessions?
   */
  static void
  reload_database_pools(PoolAgent *agent)
@@ -2023,26 +2459,29 @@ reload_database_pools(PoolAgent *agent)
         elog(DEBUG1, "Reloading database pools");
  
         /*
-        * Release node connections if any held. It is not guaranteed client session
-        * does the same so don't ever try to return them to pool and reuse
+        * Release node connections if any held. It is not guaranteed client
+        * session does the same so we don't ever try to return them to pool
+        * for reuse, and instead just close them.
          */
         agent_release_connections(agent, true);
  
         /* Forget previously allocated node info */
         MemoryContextReset(agent->mcxt);
  
-       /* and allocate new */
+       /* And allocate a blank copy. */
         PgxcNodeGetOids(&agent->coord_conn_oids, &agent->dn_conn_oids,
-                                       &agent->num_coord_connections, &agent->num_dn_connections, false);
+                                       &agent->num_coord_connections, &agent->num_dn_connections,
+                                       false);
  
         agent->coord_connections = (PGXCNodePoolSlot **)
                         palloc0(agent->num_coord_connections * sizeof(PGXCNodePoolSlot *));
+
         agent->dn_connections = (PGXCNodePoolSlot **)
                         palloc0(agent->num_dn_connections * sizeof(PGXCNodePoolSlot *));
  
         /*
-        * Scan the list and destroy any altered pool. They will be recreated
-        * upon subsequent connection acquisition.
+        * Scan the list of database pools and destroy any altered pool. The
+        * pools will be recreated upon subsequent connection acquisition.
          */
         databasePool = databasePools;
         while (databasePool)
@@ -2074,18 +2513,19 @@ reload_database_pools(PoolAgent *agent)
  
  /*
   * refresh_database_pools
- *             refresh information for all database pools
+ *             Refresh information for all database pools.
+ *
+ * Connection information refresh applies to all database pools (not
+ * just the one associated with a the current pool agent).
   *
- * Connection information refresh concerns all the database pools.
   * A database pool is refreshed as follows for each remote node:
   *
   * - node pool is deleted if its port or host information is changed.
   *   Subsequently all its connections are dropped.
   *
- * If any other type of activity is found, we error out.
- *
- * XXX I don't see any cases that would error out. Isn't the comment
- * simply obsolete?
+ * If any other type of activity is found (e.g. removed or deleted node)
+ * we error out (and return POOL_REFRESH_FAILED). In case of success we
+ * return POOL_REFRESH_SUCCESS.
   */
  static int
  refresh_database_pools(PoolAgent *agent)
@@ -2117,7 +2557,7 @@ refresh_database_pools(PoolAgent *agent)
  
         /*
          * Scan the list and destroy any altered pool. They will be recreated
-        * upon subsequent connection acquisition.
+        * automatically upon subsequent connection acquisition.
          */
         databasePool = databasePools;
         while (res == POOL_REFRESH_SUCCESS && databasePool)
@@ -2132,7 +2572,10 @@ refresh_database_pools(PoolAgent *agent)
  
                         /*
                          * Since we re-checked the numbers above, we should not get
-                        * the case of an ADDED or a DELETED node here..
+                        * the case of an ADDED or a DELETED node here.
+                        *
+                        * Newly added nodes are detected indirectly (same node count
+                        * and no deleted nodes means no added nodes either).
                          */
                         if (connstr_chk == NULL)
                         {
@@ -2145,10 +2588,10 @@ refresh_database_pools(PoolAgent *agent)
                         if (strcmp(connstr_chk, nodePool->connstr))
                         {
                                 elog(LOG, "Found an altered node (%u)", nodePool->nodeoid);
+
                                 /*
-                                * Node has been altered. First remove
-                                * all references to this node from ALL the
-                                * agents before destroying it..
+                                * Node has been altered. First remove all references to
+                                * this node from ALL the agents before destroying it.
                                  */
                                 if (!remove_all_agent_references(nodePool->nodeoid))
                                 {
@@ -2156,6 +2599,7 @@ refresh_database_pools(PoolAgent *agent)
                                         break;
                                 }
  
+                               /* And now destroy the node pool. */
                                 destroy_node_pool(nodePool);
                                 hash_search(databasePool->nodePools, &nodePool->nodeoid,
                                                         HASH_REMOVE, NULL);
@@ -2167,9 +2611,17 @@ refresh_database_pools(PoolAgent *agent)
  
                 databasePool = databasePool->next;
         }
+
         return res;
  }
  
+/*
+ * remove_all_agent_references
+ *       Remove all references to a specified node from all PoolAgents.
+ *
+ * XXX This is yet another place unnecesserily complicated by keeping
+ * datanodes and coordinators separate.
+ */
  static bool
  remove_all_agent_references(Oid nodeoid)
  {
@@ -2177,8 +2629,7 @@ remove_all_agent_references(Oid nodeoid)
         bool res = true;
  
         /*
-        * Identify if it's a coordinator or datanode first
-        * and get its index
+        * Identify if it's a coordinator or datanode first and get its index.
          */
         for (i = 1; i <= agentCount; i++)
         {
@@ -2228,14 +2679,20 @@ remove_all_agent_references(Oid nodeoid)
  }
  
  /*
- * Find pool for specified database and username in the list
+ * find_database_pool
+ *       Find a DatabasePool for specified database/username combination.
+ *
+ * Returns a pointer to the database pool if it exists, NULL otherwise.
   */
  static DatabasePool *
-find_database_pool(const char *database, const char *user_name, const char *pgoptions)
+find_database_pool(const char *database, const char *user_name,
+                                  const char *pgoptions)
  {
         DatabasePool *databasePool;
  
-       /* Scan the list */
+       Assert(database && user_name && pgoptions);
+
+       /* scan the list */
         databasePool = databasePools;
         while (databasePool)
         {
@@ -2243,14 +2700,21 @@ find_database_pool(const char *database, const char *user_name, const char *pgop
                         strcmp(user_name, databasePool->user_name) == 0 &&
                         strcmp(pgoptions, databasePool->pgoptions) == 0)
                         break;
+
                 databasePool = databasePool->next;
         }
+
         return databasePool;
  }
  
  
  /*
- * Remove pool for specified database from the list
+ * remove_database_pool
+ *       Remove database pool for database/username combination from the list.
+ *
+ * Only removes the pool from the global list, but does not destroy it.
+ * This allows doing additional maintenance on the database pool (e.g.
+ * destroy all the node pools, etc.)
   */
  static DatabasePool *
  remove_database_pool(const char *database, const char *user_name)
@@ -2258,21 +2722,24 @@ remove_database_pool(const char *database, const char *user_name)
         DatabasePool *databasePool,
                            *prev;
  
+       Assert(database && user_name);
+
         /* Scan the list */
         databasePool = databasePools;
         prev = NULL;
         while (databasePool)
         {
  
-               /* if match break the loop and return */
+               /* if the pool matches, break the loop */
                 if (strcmp(database, databasePool->database) == 0 &&
                         strcmp(user_name, databasePool->user_name) == 0)
                         break;
+
                 prev = databasePool;
                 databasePool = databasePool->next;
         }
  
-       /* if found */
+       /* if found a matching pool, remove it from the list */
         if (databasePool)
         {
  
@@ -2285,11 +2752,29 @@ remove_database_pool(const char *database, const char *user_name)
  
                 databasePool->next = NULL;
         }
+       else
+               elog(LOG, "database pool for %s/%s not found",
+                        database, user_name);
+
+
         return databasePool;
  }
  
  /*
- * Acquire connection
+ * acquire_connection
+ *             Acquire connection to a given node from a specified pool.
+ *
+ * The node connection is acquired in one of two ways:
+ *
+ * (a) By reusing a connection already available in the connection pool.
+ *
+ * (b) By opening a fresh connection (when freeSize==0).
+ *
+ * Returns a PGXCNodePoolSlot pointer in case of success, NULL when the
+ * connection can't be obtained.
+ *
+ * Also updates node health information in the shared memory, both in
+ * case of success (healthy) or failure (unhealthy).
   */
  static PGXCNodePoolSlot *
  acquire_connection(DatabasePool *dbPool, Oid node)
@@ -2298,21 +2783,25 @@ acquire_connection(DatabasePool *dbPool, Oid node)
         PGXCNodePoolSlot   *slot;
  
         Assert(dbPool);
+       Assert(OidIsValid(node));
  
-       nodePool = (PGXCNodePool *) hash_search(dbPool->nodePools, &node, HASH_FIND,
-                                                                                       NULL);
+       /* see if we have pool for the node */
+       nodePool = (PGXCNodePool *) hash_search(dbPool->nodePools, &node,
+                                                                                       HASH_FIND, NULL);
  
         /*
-        * When a Coordinator pool is initialized by a Coordinator Postmaster,
-        * it has a NULL size and is below minimum size that is 1
-        * This is to avoid problems of connections between Coordinators
-        * when creating or dropping Databases.
+        * If there are no free connections in the node pool, grow it.
+        *
+        * Coordinator pools initialized by a coordinator postmaster are
+        * initially empty. This is to avoid problems of connections between
+        * coordinators when creating or dropping databases.
          */
         if (nodePool == NULL || nodePool->freeSize == 0)
                 nodePool = grow_pool(dbPool, node);
  
         slot = NULL;
-       /* Check available connections */
+
+       /* check available connections */
         while (nodePool && nodePool->freeSize > 0)
         {
                 int                     poll_result;
@@ -2323,14 +2812,26 @@ acquire_connection(DatabasePool *dbPool, Oid node)
                 if (PQsocket((PGconn *) slot->conn) > 0)
                 {
                         /*
-                        * Make sure connection is ok, destroy connection slot if there is a
-                        * problem.
+                        * Check if the connection is ok, destroy the connection
+                        * slot if there is a problem.
+                        *
+                        * XXX Not sure how expensive this is, but perhaps we should
+                        * check the connections differently (not in the hot path
+                        * when requesting the connection, when every instruction
+                        * makes a difference). This seems particularly pointless
+                        * when the connection was just opened by grow_pool().
+                        *
+                        * XXX Perhaps we can do this only when the connection is
+                        * old enough (e.g. using slot->released)?
                          */
                         poll_result = pqReadReady((PGconn *) slot->conn);
  
+                       /* ok, no data - we have a working connection */
                         if (poll_result == 0)
-                               break;          /* ok, no data */
-                       else if (poll_result < 0)
+                               break;
+
+                       /* something went wrong - retry, if possible */
+                       if (poll_result < 0)
                         {
                                 if (errno == EAGAIN || errno == EINTR)
                                         goto retry;
@@ -2346,6 +2847,7 @@ acquire_connection(DatabasePool *dbPool, Oid node)
  
                 /* Decrement current max pool size */
                 (nodePool->size)--;
+
                 /* Ensure we are not below minimum size */
                 nodePool = grow_pool(dbPool, node);
         }
@@ -2355,8 +2857,8 @@ acquire_connection(DatabasePool *dbPool, Oid node)
                 elog(WARNING, "can not connect to node %u", node);
  
                 /*
-                * before returning, also update the shared health
-                * status field to indicate that this node is down
+                * Before returning, update the node health status in shared
+                * memory to indicate this node is down.
                  */
                 if (!PgxcNodeUpdateHealth(node, false))
                         elog(WARNING, "Could not update health status of node %u", node);
@@ -2364,6 +2866,10 @@ acquire_connection(DatabasePool *dbPool, Oid node)
                         elog(WARNING, "Health map updated to reflect DOWN node (%u)", node);
         }
         else
+               /*
+                * XXX Is this necessary? Isn't this just another source of latency
+                * in the connection-acquisition path?
+                */
                 PgxcNodeUpdateHealth(node, true);
  
         return slot;
@@ -2371,7 +2877,13 @@ acquire_connection(DatabasePool *dbPool, Oid node)
  
  
  /*
- * release connection from specified pool and slot
+ * release_connection
+ *       Return a connection to a pool, or close it entirely.
+ *
+ * Release a connection - either return it back to the database pool
+ * (or more precisely to the node pool in that database pool), or force
+ * closing it (necessary for example when the session fails and we are
+ * not sure whether the connection is in consistent state).
   */
  static void
  release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot,
@@ -2381,40 +2893,61 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot,
  
         Assert(dbPool);
         Assert(slot);
+       Assert(OidIsValid(node));
+
+       nodePool = (PGXCNodePool *) hash_search(dbPool->nodePools, &node,
+                                                                                       HASH_FIND, NULL);
  
-       nodePool = (PGXCNodePool *) hash_search(dbPool->nodePools, &node, HASH_FIND,
-                                                                                       NULL);
+       /*
+        * When the node pool does not exist, the node was probably either
+        * dropped or altered. In both cases the connection is no longer
+        * valid, so just close it.
+        */
         if (nodePool == NULL)
         {
-               /*
-                * The node may be altered or dropped.
-                * In any case the slot is no longer valid.
-                */
+               elog(WARNING, "Node pool (%d) does not exist anymore, closing connection",
+                       node);
+
                 destroy_slot(slot);
                 return;
         }
  
-       /* return or discard */
+       /*
+        * The node pool exists, but we've been asked to forcefully close
+        * the connection, so do as asked.
+        */
         if (!force_destroy)
         {
-               /* Insert the slot into the array and increase pool size */
-               nodePool->slot[(nodePool->freeSize)++] = slot;
-               slot->released = time(NULL);
-       }
-       else
-       {
-               elog(DEBUG1, "Cleaning up connection from pool %s, closing", nodePool->connstr);
+               elog(DEBUG1, "Cleaning up connection from pool %s (node %d), closing",
+                       nodePool->connstr, node);
+
                 destroy_slot(slot);
+
                 /* Decrement pool size */
                 (nodePool->size)--;
+
                 /* Ensure we are not below minimum size */
                 grow_pool(dbPool, node);
+
+               return;
         }
+
+       /*
+        * Everything peachy, so just insert the connection (slot) into the
+        * array and increase the number of free connections in the pool.
+        * Also note the timestamp when the connection was released.
+        */
+       nodePool->slot[(nodePool->freeSize)++] = slot;
+       slot->released = time(NULL);
  }
  
  
  /*
- * Increase database pool size, create new if does not exist
+ * grow_pool
+ *       Increase size of a pool for a particular node if needed.
+ *
+ * If the node pool (for the specified node) does not exist, it will be
+ * created automatically.
   */
  static PGXCNodePool *
  grow_pool(DatabasePool *dbPool, Oid node)
@@ -2425,10 +2958,18 @@ grow_pool(DatabasePool *dbPool, Oid node)
         bool                    found;
  
         Assert(dbPool);
+       Assert(OidIsValid(node));
  
+       /* lookup node pool, create it if needed */
         nodePool = (PGXCNodePool *) hash_search(dbPool->nodePools, &node,
                                                                                         HASH_ENTER, &found);
+
+       /*
+        * XXX Aren't we calling this even when the connstr already exists?
+        * Seems a bit wasteful, I guess.
+        */
         nodePool->connstr = build_node_conn_str(node, dbPool);
+
         if (!nodePool->connstr)
         {
                 ereport(ERROR,
@@ -2436,6 +2977,10 @@ grow_pool(DatabasePool *dbPool, Oid node)
                                  errmsg("could not build connection string for node %u", node)));
         }
  
+       /*
+        * XXX Shouldn't this really be called right after the hash_search
+        * (and before we do the build_node_conn_str)?
+        */
         if (!found)
         {
                 nodePool->slot = (PGXCNodePoolSlot **) palloc0(MaxPoolSize * sizeof(PGXCNodePoolSlot *));
@@ -2449,6 +2994,11 @@ grow_pool(DatabasePool *dbPool, Oid node)
                 nodePool->size = 0;
         }
  
+       /*
+        * If there are no free connections, try to create one. But do not
+        * exceed MaxPoolSize, i.e. the maximum number of connections in
+        * a node pool.
+        */
         while (nodePool->freeSize == 0 && nodePool->size < MaxPoolSize)
         {
                 PGXCNodePoolSlot *slot;
@@ -2475,16 +3025,22 @@ grow_pool(DatabasePool *dbPool, Oid node)
                                                   " connection error (%s)",
                                                   nodePool->connstr,
                                                   PQerrorMessage((PGconn*) slot->conn))));
+
                         destroy_slot(slot);
+
                         /*
-                        * If we failed to connect probably number of connections on the
-                        * target node reached max_connections. Try and release idle
-                        * connections and try again.
-                        * We do not want to enter endless loop here and run maintenance
-                        * procedure only once.
-                        * It is not safe to run the maintenance procedure if no connections
-                        * from that pool currently in use - the node pool may be destroyed
-                        * in that case.
+                        * If we failed to connect, probably number of connections on
+                        * the target node reached max_connections. Release idle from
+                        * this node, and retry.
+                        *
+                        * We do not want to enter endless loop here, so we only try
+                        * releasing idle connections once.
+                        *
+                        * It is not safe to run the maintenance from a pool with no
+                        * active connections, as the maintenance might kill the pool.
+                        *
+                        * XXX Maybe temporarily marking the pool, so that it does not
+                        * get removed (pinned=true) would do the trick?
                          */
                         if (tryagain && nodePool->size > nodePool->freeSize)
                         {
@@ -2497,24 +3053,34 @@ grow_pool(DatabasePool *dbPool, Oid node)
  
                 slot->xc_cancelConn = (NODE_CANCEL *) PQgetCancel((PGconn *)slot->conn);
                 slot->released = time(NULL);
+
+               /*
+                * No need to compare the oldest_idle here, as every existing
+                * idle connection is automatically older than the new one. Only
+                * if there are no other idle connections this one is the oldest.
+                */
                 if (dbPool->oldest_idle == (time_t) 0)
                         dbPool->oldest_idle = slot->released;
  
-               /* Insert at the end of the pool */
+               /* Insert the new slot to the last place in the node pool. */
                 nodePool->slot[(nodePool->freeSize)++] = slot;
  
-               /* Increase count of pool size */
+               /* Increase the size of the node pool. */
                 (nodePool->size)++;
-               elog(DEBUG1, "Pooler: increased pool size to %d for pool %s",
+
+               elog(DEBUG1, "Pooler: increased pool size to %d for pool %s (%u)",
                          nodePool->size,
-                        nodePool->connstr);
+                        nodePool->connstr,
+                        node);
         }
+
         return nodePool;
  }
  
  
  /*
- * Destroy pool slot
+ * destroy_slot
+ *       Destroy a connection slot (free cancel info and the slot itself).
   */
  static void
  destroy_slot(PGXCNodePoolSlot *slot)
@@ -2529,7 +3095,10 @@ destroy_slot(PGXCNodePoolSlot *slot)
  
  
  /*
- * Destroy node pool
+ * destroy_node_pool
+ *       Close any remaining connections to the node and destroy the slots.
+ *
+ * XXX This does not release the node_pool itself. Not sure if correct.
   */
  static void
  destroy_node_pool(PGXCNodePool *node_pool)
@@ -2546,6 +3115,7 @@ destroy_node_pool(PGXCNodePool *node_pool)
          */
         elog(DEBUG1, "About to destroy node pool %s, current size is %d, %d connections are in use",
                  node_pool->connstr, node_pool->freeSize, node_pool->size - node_pool->freeSize);
+
         if (node_pool->connstr)
                 pfree(node_pool->connstr);
  
@@ -2553,13 +3123,21 @@ destroy_node_pool(PGXCNodePool *node_pool)
         {
                 for (i = 0; i < node_pool->freeSize; i++)
                         destroy_slot(node_pool->slot[i]);
+
                 pfree(node_pool->slot);
         }
  }
  
  
  /*
- * Main handling loop
+ * PoolerLoop
+ *       Main handling loop of the pool manager.
+ *
+ * Has three main responsibilities:
+ * 
+ * - triggering regular pool maintenance
+ * - responding to postmaster events (e.g. shutdown)
+ * - forwarding messages to pool agents (which do handle them)
   */
  static void
  PoolerLoop(void)
@@ -2725,7 +3303,7 @@ PoolerLoop(void)
                         /*
                          * Agent may be removed from the array while processing
                          * and trailing items are shifted, so scroll downward
-                        * to avoid problem
+                        * to avoid problems.
                          */
                         for (i = agentCount - 1; agentCount > 0 && i >= 0; i--)
                         {
@@ -2737,6 +3315,7 @@ PoolerLoop(void)
                                         agent_handle_input(agent, &input_message);
                         }
  
+                       /* New session without an existing agent. */
                         if (pool_fd[0].revents & POLLIN)
                                 agent_create();
                 }
@@ -2751,9 +3330,18 @@ PoolerLoop(void)
  }
  
  /*
- * Clean Connection in all Database Pools for given Datanode and Coordinator list
+ * clean_connection
+ *       Clean connections for specified nodes in matching database pool.
+ *
+ * The function closes all unused connections to nodes specified in the
+ * node_discard list, in all database pools for the dbname/username
+ * combination. There may be multiple matching pools, with different
+ * pgoptions values.
+ *
+ * XXX The code handles NULL values in database/username, but not sure
+ * if that's really needed?
   */
-int
+static int
  clean_connection(List *node_discard, const char *database, const char *user_name)
  {
         DatabasePool *databasePool;
@@ -2766,7 +3354,7 @@ clean_connection(List *node_discard, const char *database, const char *user_name
                 ListCell *lc;
  
                 if ((database && strcmp(database, databasePool->database)) ||
-                               (user_name && strcmp(user_name, databasePool->user_name)))
+                       (user_name && strcmp(user_name, databasePool->user_name)))
                 {
                         /* The pool does not match to request, skip */
                         databasePool = databasePool->next;
@@ -2789,12 +3377,12 @@ clean_connection(List *node_discard, const char *database, const char *user_name
                                 /* Check if connections are in use */
                                 if (nodePool->freeSize < nodePool->size)
                                 {
-                                       elog(WARNING, "Pool of Database %s is using Datanode %u connections",
+                                       elog(WARNING, "Pool of database %s is using node %u connections",
                                                                 databasePool->database, node);
                                         res = CLEAN_CONNECTION_NOT_COMPLETED;
                                 }
  
-                               /* Destroy connections currently in Node Pool */
+                               /* Destroy unused connections in this Node Pool */
                                 if (nodePool->slot)
                                 {
                                         int i;
@@ -2806,6 +3394,7 @@ clean_connection(List *node_discard, const char *database, const char *user_name
                         }
                 }
  
+               /* XXX Can there be multiple database pools? */
                 databasePool = databasePool->next;
         }
  
@@ -2815,11 +3404,14 @@ clean_connection(List *node_discard, const char *database, const char *user_name
  }
  
  /*
- * Take a Lock on Pooler.
- * Abort PIDs registered with the agents for the given database.
- * Send back to client list of PIDs signaled to watch them.
+ * abort_pids
+ *       Aborts backends associated with agents for a database/user.
+ *
+ * Ignores the current backend (otherwise it might cancel itself), and
+ * returns an array of PIDs that were actually signalled, so that the
+ * client can watch them. Number of the PIDs is passed in 'len'.
   */
-int *
+static int *
  abort_pids(int *len, int pid, const char *database, const char *user_name)
  {
         int *pids = NULL;
@@ -2858,7 +3450,7 @@ abort_pids(int *len, int pid, const char *database, const char *user_name)
  }
  
  /*
- *
+ * Request shutdown of the pooler.
   */
  static void
  pooler_die(SIGNAL_ARGS)
@@ -2868,7 +3460,7 @@ pooler_die(SIGNAL_ARGS)
  
  
  /*
- *
+ * Request quick shutdown of the pooler.
   */
  static void
  pooler_quickdie(SIGNAL_ARGS)
@@ -2877,7 +3469,9 @@ pooler_quickdie(SIGNAL_ARGS)
         exit(2);
  }
  
-
+/*
+ * Note that the pooler received SIGHUP signal.
+ */
  static void
  pooler_sighup(SIGNAL_ARGS)
  {
@@ -2885,8 +3479,13 @@ pooler_sighup(SIGNAL_ARGS)
  }
  
  /*
- * Given node identifier, dbname and user name build connection string.
- * Get node connection details from the shared memory node table
+ * build_node_conn_str
+ *       Construct a connection string for the specified node.
+ *
+ * Given node OID and pool (which includes dbname and username strings),
+ * build the node connection string.
+ *
+ * May return NULL if the node got deleted, for example.
   */
  static char *
  build_node_conn_str(Oid node, DatabasePool *dbPool)
@@ -2914,10 +3513,13 @@ build_node_conn_str(Oid node, DatabasePool *dbPool)
  }
  
  /*
- * Check all pooled connections, and close which have been released more then
- * PooledConnKeepAlive seconds ago.
- * Return true if shrink operation closed all the connections and pool can be
- * ddestroyed, false if there are still connections or pool is in use.
+ * shrink_pool
+ *       Close connections unused for more than PooledConnKeepAlive seconds.
+ *
+ * Returns true if shrink operation closed all the connections and the
+ * whole database pool can be destroyed, false if there are still open
+ * connections (in at least one node pool) or if the pool is in use
+ * (that is, if there are pool agents still referencing this pool).
   */
  static bool
  shrink_pool(DatabasePool *pool)
@@ -2991,8 +3593,13 @@ shrink_pool(DatabasePool *pool)
  
  
  /*
- * Scan connection pools and release connections which are idle for long.
- * If pool gets empty after releasing connections it is destroyed.
+ * pools_maintenance
+ *       Perform regular maintenance of the connection pools.
+ *
+ * Scan connection pools and release connections which are idle for too
+ * long (longer than PoolConnKeepAlive). If the node pool gets empty
+ * after releasing idle connections it is destroyed (but only if not
+ * used by any pool agent).
   */
  static void
  pools_maintenance(void)
diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h

index 7ad15c7c6a367856363e3c36e69bb67b212d743e..13b52e802c772589049c3951fcc0958eab259c90 100644 (file)
--- a/src/include/pgxc/pgxcnode.h
+++ b/src/include/pgxc/pgxcnode.h
@@ -175,11 +175,9 @@ extern int pgxc_node_send_timestamp(PGXCNodeHandle * handle, TimestampTz timesta
  extern bool    pgxc_node_receive(const int conn_count,
                                   PGXCNodeHandle ** connections, struct timeval * timeout);
  extern int     pgxc_node_read_data(PGXCNodeHandle * conn, bool close_if_error);
-extern int     pgxc_node_is_data_enqueued(PGXCNodeHandle *conn);
  
  extern int     send_some(PGXCNodeHandle * handle, int len);
  extern int     pgxc_node_flush(PGXCNodeHandle *handle);
-extern void    pgxc_node_flush_read(PGXCNodeHandle *handle);
  
  extern char get_message(PGXCNodeHandle *conn, int *len, char **msg);
  
@@ -202,4 +200,8 @@ extern bool PgxcNodeDiffBackendHandles(List **nodes_alter,
                            List **nodes_delete, List **nodes_add);
  extern void PgxcNodeRefreshBackendHandlesShmem(List *nodes_alter);
  extern void HandlePoolerMessages(void);
+
+/* Check health of nodes in the connection pool. */
+extern void PoolPingNodeRecheck(Oid nodeoid);
+
  #endif /* PGXCNODE_H */
diff --git a/src/include/pgxc/poolmgr.h b/src/include/pgxc/poolmgr.h

index 47a54c67b2ac2b92c42792ab55f133f2200d0d30..3c2d1f4eb2db7c5591858b29984f9a3e718a6b0e 100644 (file)
--- a/src/include/pgxc/poolmgr.h
+++ b/src/include/pgxc/poolmgr.h
@@ -1,15 +1,24 @@
  /*-------------------------------------------------------------------------
   *
   * poolmgr.h
- *
- *       Definitions for the Datanode connection pool.
+ *       Definitions for the built-in Postgres-XL connection pool.
   *
   *
   * Portions Copyright (c) 2012-2014, TransLattice, Inc.
   * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
   * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
   *
- * src/include/pgxc/poolmgr.h
+ *
+ * XXX Some function take list of nodes, others accept array + nitems.
+ * We should make this more consistent.
+ *
+ * XXX PoolPingNodes is defined on a number of places, including some .c
+ * files. We should define it on one place (pgxcnode.h?) and then include
+ * the header wherever needed.
+ *
+ *
+ * IDENTIFICATION
+ *       src/include/pgxc/poolmgr.h
   *
   *-------------------------------------------------------------------------
   */
@@ -26,7 +35,14 @@
  
  #define MAX_IDLE_TIME 60
  
-/* Connection pool entry */
+/*
+ * One connection in the pool (to datanode or coordinator).
+ *
+ * Essentially a PGconn+PGcancel, so that we can talk to the remote node
+ * and also forward a cancel request if needed.
+ *
+ * XXX rename to PooledConnection.
+ */
  typedef struct
  {
         time_t          released;
@@ -34,33 +50,55 @@ typedef struct
         NODE_CANCEL     *xc_cancelConn;
  } PGXCNodePoolSlot;
  
-/* Pool of connections to specified pgxc node */
+/*
+ * Pool of open connections to single node (datanode or coordinator).
+ *
+ * All the connections share the same connection string, and are tracked
+ * in a simple array of connections.
+ *
+ * XXX rename to NodePool.
+ * XXX not sure if "size" means "valid entries" or "maximum entries".
+ * XXX use FLEXIBLE_ARRAY_MEMBER
+ * XXX or maybe use simple lists of available/free connections instead?
+ */
  typedef struct
  {
-       Oid                     nodeoid;        /* Node Oid related to this pool */
-       char       *connstr;
+       Oid                     nodeoid;        /* node Oid related to this pool */
+       char       *connstr;    /* connection string for all the connections */
         int                     freeSize;       /* available connections */
-       int                     size;           /* total pool size */
+       int                     size;           /* total pool size (available slots) */
+
+       /* array of open connections (with freeSize available connections) */
         PGXCNodePoolSlot **slot;
  } PGXCNodePool;
  
-/* All pools for specified database */
+/*
+ * A group of per-node connection pools (PGXCNodePool), for a particular
+ * database/user combination. We have one PGXCNodePool for each remote
+ * node (datanode or coordinator).
+ *
+ * If there are multiple such combinations (e.g. when there are multiple
+ * users accessing the same database), there will be multiple DatabasePool
+ * entries, organized in a linked list.
+ */
  typedef struct databasepool
  {
         char       *database;
         char       *user_name;
         char       *pgoptions;          /* Connection options */
-       HTAB       *nodePools;          /* Hashtable of PGXCNodePool, one entry for each
-                                                                * Coordinator or DataNode */
+       HTAB       *nodePools;          /* hashtable, one entry per remote node */
         MemoryContext mcxt;
         struct databasepool *next;      /* Reference to next to organize linked list */
         time_t          oldest_idle;
  } DatabasePool;
  
  /*
- * Agent of client session (Pool Manager side)
- * Acts as a session manager, grouping connections together
- * and managing session parameters
+ * Agent, managing a single client session on PoolManager side.
+ *
+ * Is responsible for:
+ *
+ * - tracking which connections are assigned to the session
+ * - managing parameters (GUCs) set in the session
   */
  typedef struct
  {
@@ -74,20 +112,17 @@ typedef struct
         int                             num_coord_connections;
         Oid                        *dn_conn_oids;               /* one for each Datanode */
         Oid                        *coord_conn_oids;    /* one for each Coordinator */
-       PGXCNodePoolSlot **dn_connections; /* one for each Datanode */
+       PGXCNodePoolSlot **dn_connections;      /* one for each Datanode */
         PGXCNodePoolSlot **coord_connections; /* one for each Coordinator */
  } PoolAgent;
+
  /*
- * Helper to poll for all pooler sockets
+ * Configuration parameters (GUCs).
   */
-typedef struct pollfd Pollfd;
-
-
  extern int     PoolConnKeepAlive;
  extern int     PoolMaintenanceTimeout;
  extern int     MaxPoolSize;
  extern int     PoolerPort;
-
  extern bool PersistentConnections;
  
  /* Status inquiry functions */
@@ -97,53 +132,48 @@ extern bool IsPGXCPoolerProcess(void);
  /* Initialize internal structures */
  extern int     PoolManagerInit(void);
  
-/* Destroy internal structures */
-extern int     PoolManagerDestroy(void);
-
  /*
- * Gracefully close connection to the PoolManager
+ * Gracefully close the PoolManager connection.
   */
  extern void PoolManagerDisconnect(void);
-extern char *session_options(void);
  
  /*
- * Reconnect to pool manager
- * This simply does a disconnection followed by a reconnection.
+ * Returns list of options to be propagated to the remote node(s).
   */
-extern void PoolManagerReconnect(void);
+extern char *session_options(void);
  
-/* Get pooled connections */
+/* Get pooled connections to specified nodes */
  extern int *PoolManagerGetConnections(List *datanodelist, List *coordlist,
                 int **pids);
  
-/* Clean pool connections */
-extern void PoolManagerCleanConnection(List *datanodelist, List *coordlist, char *dbname, char *username);
+/* Clean connections for the specified nodes (for dbname/user). */
+extern void PoolManagerCleanConnection(List *datanodelist, List *coordlist,
+               char *dbname, char *username);
  
-/* Check consistency of connection information cached in pooler with catalogs */
+/* Check that connections cached in the connection poole match catalogs. */
  extern bool PoolManagerCheckConnectionInfo(void);
  
-/* Reload connection data in pooler and drop all the existing connections of pooler */
+/* Reload connection data in pooler (and close all existing connections). */
  extern void PoolManagerReloadConnectionInfo(void);
  
-/* Refresh connection data in pooler and drop connections of altered nodes in pooler */
+/* Reload connection data in pooler and close connections to modified nodes). */
  extern int PoolManagerRefreshConnectionInfo(void);
  
-/* Send Abort signal to transactions being run */
-extern int     PoolManagerAbortTransactions(char *dbname, char *username, int **proc_pids);
-
-/* Return connections back to the pool, for both Coordinator and Datanode connections */
+/* Return all connections (for the session) back to the pool. */
  extern void PoolManagerReleaseConnections(bool destroy);
  
-/* Cancel a running query on Datanodes as well as on other Coordinators */
-extern void PoolManagerCancelQuery(int dn_count, int* dn_list, int co_count, int* co_list);
+/* Send "abort transaction" signal to transactions being run */
+extern int     PoolManagerAbortTransactions(char *dbname, char *username,
+               int **proc_pids);
  
-/* Lock/unlock pool manager */
-extern void PoolManagerLock(bool is_lock);
+/* Cancel a running query on all participating nodes (pg_cancel_backend). */
+extern void PoolManagerCancelQuery(int dn_count, int* dn_list,
+                                                                  int co_count, int* co_list);
  
-/* Do pool health check activity */
+/* Check health of nodes in the connection pool. */
  extern void PoolPingNodes(void);
-extern void PoolPingNodeRecheck(Oid nodeoid);
  
  extern bool check_persistent_connections(bool *newval, void **extra,
                 GucSource source);
+
  #endif
author	Tomas Vondra <[email protected]>
	Sun, 22 Oct 2017 13:00:06 +0000 (15:00 +0200)
committer	Tomas Vondra <[email protected]>
	Sat, 4 Nov 2017 16:19:06 +0000 (17:19 +0100)
src/backend/pgxc/pool/pgxcnode.c		patch \| blob \| blame \| history
src/backend/pgxc/pool/poolmgr.c		patch \| blob \| blame \| history
src/include/pgxc/pgxcnode.h		patch \| blob \| blame \| history
src/include/pgxc/poolmgr.h		patch \| blob \| blame \| history