From 29374ecac2cee5b09d404dc72a6a19a4384213dd Mon Sep 17 00:00:00 2001 From: leo Date: Wed, 5 Jul 2023 14:19:25 +0800 Subject: [PATCH 1/9] This commit adds UnionStore table for CBDB, we need to create extension unionstore to enable this feature. We can create UnionStore table using syntax: create table xxx using union_store. We separate WAL records related to UnionStore table from other WAL records, and send those records to UnionStore through UnionStore's XLog system. UnionStore constructs pages to serve CBDB's page read requests. The core changes of CBDB: Add hook XLogInsert_hook for plugins to support more WAL managers. Add UnionStore XLog system. Backends write WAL records into ring buffer, and WAL Proposer sends the WAL records to WAL Service of UnionStore. Add storage manager hook smgr_hook for plugins to add more storage managers. Add t_cid to some XLog headers, this is to ensure consistency of transaction. Add hook ReadBuffer_hook for plugins to support more buffer read procedures. Add hook index_create_hook for plugins to support more index create procedures. Add hook index_build_hook for plugins to support more index build procedures. Add hook Startup_hook for plugins to do additional startup work. Add hook relation_size_hook for plugins to calculate relation size. Add hook NewSegRelfilenode_assign_hook for plugins to assign relfilenode. Add hook start_bgworkers_hook for plugins to start background workers. Other parts of the code come from neon postgres. 
Design Doc: https://hashdata.feishu.cn/docx/QFSidS6jpoeGIjxgD5Pc2QFdnZe --- configure | 84 +++++++++++++++ gpcontrib/gp_replica_check/gp_replica_check.c | 10 +- src/backend/access/aocs/aocsam_handler.c | 6 +- .../access/appendonly/appendonlyam_handler.c | 6 +- src/backend/access/gin/ginfast.c | 10 +- src/backend/access/gin/ginxlog.c | 25 +++-- src/backend/access/heap/heapam.c | 29 +++-- src/backend/access/heap/heapam_handler.c | 7 +- src/backend/access/transam/clog.c | 9 ++ src/backend/access/transam/distributedlog.c | 9 ++ src/backend/access/transam/varsup.c | 6 ++ src/backend/access/transam/xlog.c | 28 ++++- src/backend/access/transam/xloginsert.c | 62 +++++++++++ src/backend/access/transam/xlogutils.c | 26 ++++- src/backend/catalog/heap.c | 3 +- src/backend/catalog/index.c | 75 ++++++++++++- src/backend/catalog/storage.c | 12 +-- src/backend/commands/tablecmds.c | 4 +- src/backend/main/main.c | 48 +++++++++ src/backend/postmaster/postmaster.c | 14 +++ src/backend/storage/buffer/bufmgr.c | 22 ++-- src/backend/storage/buffer/localbuf.c | 10 +- src/backend/storage/smgr/md.c | 8 +- src/backend/storage/smgr/smgr.c | 100 +++++++++--------- src/backend/tcop/postgres.c | 26 +++-- src/backend/utils/adt/dbsize.c | 9 ++ src/backend/utils/cache/relcache.c | 4 +- src/backend/utils/mmgr/slab.c | 6 +- src/bin/psql/describe.c | 1 - src/include/access/clog.h | 5 +- src/include/access/distributedlog.h | 5 +- src/include/access/heapam_xlog.h | 5 + src/include/access/slru.h | 1 - src/include/access/transam.h | 4 + src/include/access/xlog.h | 5 + src/include/access/xloginsert.h | 11 ++ src/include/access/xlogutils.h | 6 ++ src/include/catalog/index.h | 43 ++++++++ src/include/catalog/storage.h | 3 +- src/include/miscadmin.h | 1 + src/include/postmaster/bgworker_internals.h | 5 + src/include/replication/syncrep.h | 1 - src/include/storage/buf_internals.h | 2 +- src/include/storage/bufmgr.h | 6 ++ src/include/storage/smgr.h | 50 ++++++++- src/include/utils/rel.h | 6 +- 
.../expected/heap_checksum_corruption.out | 2 +- src/test/heap_checksum/input/setup.source | 4 +- src/test/heap_checksum/output/setup.source | 2 +- .../sql/heap_checksum_corruption.sql | 2 +- 50 files changed, 684 insertions(+), 144 deletions(-) diff --git a/configure b/configure index f9129d18b63..6ca6679155d 100755 --- a/configure +++ b/configure @@ -738,6 +738,7 @@ with_libxml with_uuid with_readline with_systemd +with_libseccomp with_selinux with_ldap with_krb_srvnam @@ -910,6 +911,7 @@ with_bsd_auth with_ldap with_bonjour with_selinux +with_libseccomp with_systemd with_readline with_libedit_preferred @@ -1642,6 +1644,7 @@ Optional Packages: --with-ldap build with LDAP support --with-bonjour build with Bonjour support --with-selinux build with SELinux support + --with-libseccomp build with libseccomp support --with-systemd build with systemd support --without-readline do not use GNU Readline nor BSD Libedit for editing --with-libedit-preferred @@ -9558,6 +9561,37 @@ fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_selinux" >&5 $as_echo "$with_selinux" >&6; } +# +# libseccomp +# +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to build with libseccomp support" >&5 +$as_echo_n "checking whether to build with libseccomp support... " >&6; } + + + +# Check whether --with-libseccomp was given. +if test "${with_libseccomp+set}" = set; then : + withval=$with_libseccomp; + case $withval in + yes) + : + ;; + no) + : + ;; + *) + as_fn_error $? "no argument expected for --with-libseccomp option" "$LINENO" 5 + ;; + esac + +else + with_libseccomp=no + +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_libseccomp" >&5 +$as_echo "$with_libseccomp" >&6; } + # # Systemd # @@ -15533,6 +15567,56 @@ fi fi +if test "$with_libseccomp" = yes ; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for seccomp_init in -lseccomp" >&5 +$as_echo_n "checking for seccomp_init in -lseccomp... 
" >&6; } +if ${ac_cv_lib_seccomp_seccomp_init+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lseccomp $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char seccomp_init (); +int +main () +{ +return seccomp_init (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_seccomp_seccomp_init=yes +else + ac_cv_lib_seccomp_seccomp_init=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_seccomp_seccomp_init" >&5 +$as_echo "$ac_cv_lib_seccomp_seccomp_init" >&6; } +if test "x$ac_cv_lib_seccomp_seccomp_init" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_LIBSECCOMP 1 +_ACEOF + + LIBS="-lseccomp $LIBS" + +else + as_fn_error $? 
"library 'libseccomp' is required for Seccomp BPF support" "$LINENO" 5 +fi + +fi + # for contrib/uuid-ossp if test "$with_uuid" = bsd ; then # On BSD, the UUID functions are in libc diff --git a/gpcontrib/gp_replica_check/gp_replica_check.c b/gpcontrib/gp_replica_check/gp_replica_check.c index e1c1cadc15b..cee2b0b7001 100644 --- a/gpcontrib/gp_replica_check/gp_replica_check.c +++ b/gpcontrib/gp_replica_check/gp_replica_check.c @@ -57,7 +57,7 @@ extern Datum gp_replica_check(PG_FUNCTION_ARGS); typedef struct RelfilenodeEntry { - Oid relfilenode; + RelFileNodeId relfilenode; Oid relam; int relkind; char relname[NAMEDATALEN]; @@ -514,9 +514,9 @@ get_relfilenode_map() HASHCTL relfilenodectl; int hash_flags; MemSet(&relfilenodectl, 0, sizeof(relfilenodectl)); - relfilenodectl.keysize = sizeof(Oid); + relfilenodectl.keysize = sizeof(RelFileNodeId); relfilenodectl.entrysize = sizeof(RelfilenodeEntry); - relfilenodectl.hash = oid_hash; + relfilenodectl.hash = tag_hash; hash_flags = (HASH_ELEM | HASH_FUNCTION); relfilenodemap = hash_create("relfilenode map", 50000, &relfilenodectl, hash_flags); @@ -539,7 +539,7 @@ get_relfilenode_map() continue; RelfilenodeEntry *rentry; - Oid rnode; + RelFileNodeId rnode; /* Its relmapped relation, need to fetch the mapping from relmap file */ if (classtuple->relfilenode == InvalidOid) rnode = RelationMapOidToFilenode(classtuple->oid, @@ -564,7 +564,7 @@ get_relfilenode_entry(char *relfilenode, HTAB *relfilenode_map) { bool found; - Oid rnode = DatumGetObjectId(DirectFunctionCall1(oidin, CStringGetDatum(relfilenode))); + RelFileNodeId rnode = DatumGetObjectId(DirectFunctionCall1(int8in, CStringGetDatum(relfilenode))); RelfilenodeEntry *rentry = hash_search(relfilenode_map, (void *)&rnode, HASH_FIND, &found); if (found) diff --git a/src/backend/access/aocs/aocsam_handler.c b/src/backend/access/aocs/aocsam_handler.c index 2d389f8c9d3..20119ef0841 100644 --- a/src/backend/access/aocs/aocsam_handler.c +++ 
b/src/backend/access/aocs/aocsam_handler.c @@ -995,7 +995,7 @@ aoco_relation_set_new_filenode(Relation rel, * * Segment files will be created when / if needed. */ - srel = RelationCreateStorage(*newrnode, persistence, SMGR_AO); + srel = RelationCreateStorage(*newrnode, persistence, SMGR_AO, rel); /* * If required, set up an init fork for an unlogged table so that it can @@ -1061,7 +1061,7 @@ aoco_relation_copy_data(Relation rel, const RelFileNode *newrnode) * Use the "AO-specific" (non-shared buffers backed storage) SMGR * implementation */ - dstrel = smgropen(*newrnode, rel->rd_backend, SMGR_AO); + dstrel = smgropen(*newrnode, rel->rd_backend, SMGR_AO, rel); RelationOpenSmgr(rel); /* @@ -1071,7 +1071,7 @@ aoco_relation_copy_data(Relation rel, const RelFileNode *newrnode) * NOTE: any conflict in relfilenode value will be caught in * RelationCreateStorage(). */ - RelationCreateStorage(*newrnode, rel->rd_rel->relpersistence, SMGR_AO); + RelationCreateStorage(*newrnode, rel->rd_rel->relpersistence, SMGR_AO, rel); copy_append_only_data(rel->rd_node, *newrnode, rel->rd_backend, rel->rd_rel->relpersistence); diff --git a/src/backend/access/appendonly/appendonlyam_handler.c b/src/backend/access/appendonly/appendonlyam_handler.c index bce0a00db2e..12dce83d488 100644 --- a/src/backend/access/appendonly/appendonlyam_handler.c +++ b/src/backend/access/appendonly/appendonlyam_handler.c @@ -896,7 +896,7 @@ appendonly_relation_set_new_filenode(Relation rel, * * Segment files will be created when / if needed. 
*/ - srel = RelationCreateStorage(*newrnode, persistence, SMGR_AO); + srel = RelationCreateStorage(*newrnode, persistence, SMGR_AO, rel); /* * If required, set up an init fork for an unlogged table so that it can @@ -951,7 +951,7 @@ appendonly_relation_copy_data(Relation rel, const RelFileNode *newrnode) * Use the "AO-specific" (non-shared buffers backed storage) SMGR * implementation */ - dstrel = smgropen(*newrnode, rel->rd_backend, SMGR_AO); + dstrel = smgropen(*newrnode, rel->rd_backend, SMGR_AO, rel); RelationOpenSmgr(rel); /* @@ -961,7 +961,7 @@ appendonly_relation_copy_data(Relation rel, const RelFileNode *newrnode) * NOTE: any conflict in relfilenode value will be caught in * RelationCreateStorage(). */ - RelationCreateStorage(*newrnode, rel->rd_rel->relpersistence, SMGR_AO); + RelationCreateStorage(*newrnode, rel->rd_rel->relpersistence, SMGR_AO, rel); copy_append_only_data(rel->rd_node, *newrnode, rel->rd_backend, rel->rd_rel->relpersistence); diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c index e0d99409461..a6c0a739b75 100644 --- a/src/backend/access/gin/ginfast.c +++ b/src/backend/access/gin/ginfast.c @@ -285,8 +285,6 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) memset(&sublist, 0, sizeof(GinMetaPageData)); makeSublist(index, collector->tuples, collector->ntuples, &sublist); - if (needWal) - XLogBeginInsert(); /* * metapage was unlocked, see above @@ -307,6 +305,9 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) metadata->nPendingPages = sublist.nPendingPages; metadata->nPendingHeapTuples = sublist.nPendingHeapTuples; + + if (needWal) + XLogBeginInsert(); } else { @@ -335,7 +336,10 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) metadata->nPendingHeapTuples += sublist.nPendingHeapTuples; if (needWal) - XLogRegisterBuffer(1, buffer, REGBUF_STANDARD); + { + XLogBeginInsert(); + XLogRegisterBuffer(1, buffer, REGBUF_STANDARD); + } } } else 
diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index 4073fd5132c..722ff991613 100644 --- a/src/backend/access/gin/ginxlog.c +++ b/src/backend/access/gin/ginxlog.c @@ -407,6 +407,7 @@ ginRedoSplit(XLogReaderState *record) rootbuf; bool isLeaf = (data->flags & GIN_INSERT_ISLEAF) != 0; bool isRoot = (data->flags & GIN_SPLIT_ROOT) != 0; + XLogRedoAction action; /* * First clear incomplete-split flag on child page if this finishes a @@ -415,21 +416,27 @@ ginRedoSplit(XLogReaderState *record) if (!isLeaf) ginRedoClearIncompleteSplit(record, 3); - if (XLogReadBufferForRedo(record, 0, &lbuffer) != BLK_RESTORED) - elog(ERROR, "GIN split record did not contain a full-page image of left page"); + action = XLogReadBufferForRedo(record, 0, &lbuffer); + if (action != BLK_RESTORED && action != BLK_DONE) + elog(ERROR, "GIN split record did not contain a full-page image of left page"); - if (XLogReadBufferForRedo(record, 1, &rbuffer) != BLK_RESTORED) - elog(ERROR, "GIN split record did not contain a full-page image of right page"); + action = XLogReadBufferForRedo(record, 1, &rbuffer); + if (action != BLK_RESTORED && action != BLK_DONE) + elog(ERROR, "GIN split record did not contain a full-page image of right page"); if (isRoot) { - if (XLogReadBufferForRedo(record, 2, &rootbuf) != BLK_RESTORED) - elog(ERROR, "GIN split record did not contain a full-page image of root page"); - UnlockReleaseBuffer(rootbuf); + action = XLogReadBufferForRedo(record, 2, &rootbuf); + if (action != BLK_RESTORED && action != BLK_DONE) + elog(ERROR, "GIN split record did not contain a full-page image of root page"); + if (rootbuf != InvalidBuffer) + UnlockReleaseBuffer(rootbuf); } - UnlockReleaseBuffer(rbuffer); - UnlockReleaseBuffer(lbuffer); + if (rbuffer != InvalidBuffer) + UnlockReleaseBuffer(rbuffer); + if (lbuffer != InvalidBuffer) + UnlockReleaseBuffer(lbuffer); } /* diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 
006652ade75..9d5fa4d2237 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -2279,6 +2279,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, xlhdr.t_infomask2 = heaptup->t_data->t_infomask2; xlhdr.t_infomask = heaptup->t_data->t_infomask; xlhdr.t_hoff = heaptup->t_data->t_hoff; + xlhdr.t_cid = HeapTupleHeaderGetRawCommandId(heaptup->t_data); /* * note we mark xlhdr as belonging to buffer; if XLogInsert decides to @@ -2608,6 +2609,7 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, tuphdr->t_infomask2 = heaptup->t_data->t_infomask2; tuphdr->t_infomask = heaptup->t_data->t_infomask; tuphdr->t_hoff = heaptup->t_data->t_hoff; + tuphdr->t_cid = HeapTupleHeaderGetRawCommandId(heaptup->t_data); /* write bitmap [+ padding] [+ oid] + data */ datalen = heaptup->t_len - SizeofHeapTupleHeader; @@ -3133,6 +3135,7 @@ heap_delete(Relation relation, ItemPointer tid, tp.t_data->t_infomask2); xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self); xlrec.xmax = new_xmax; + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(tp.t_data); if (old_key_tuple != NULL) { @@ -3154,6 +3157,7 @@ heap_delete(Relation relation, ItemPointer tid, { xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2; xlhdr.t_infomask = old_key_tuple->t_data->t_infomask; + xlhdr.t_cid = HeapTupleHeaderGetRawCommandId(old_key_tuple->t_data); xlhdr.t_hoff = old_key_tuple->t_data->t_hoff; XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader); @@ -3881,6 +3885,8 @@ heap_update_internal(Relation relation, ItemPointer otid, HeapTuple newtup, oldtup.t_data->t_infomask2); xlrec.flags = cleared_all_frozen ? 
XLH_LOCK_ALL_FROZEN_CLEARED : 0; + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(oldtup.t_data); + XLogRegisterData((char *) &xlrec, SizeOfHeapLock); recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK); PageSetLSN(page, recptr); @@ -5081,10 +5087,11 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, xlrec.infobits_set = compute_infobits(new_infomask, tuple->t_data->t_infomask2); xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0; + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(tuple->t_data); + XLogRegisterData((char *) &xlrec, SizeOfHeapLock); /* we don't decode row locks atm, so no need to log the origin */ - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK); PageSetLSN(page, recptr); @@ -6136,7 +6143,6 @@ heap_abort_speculative(Relation relation, ItemPointer tid) XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); /* No replica identity & replication origin logged */ - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE); PageSetLSN(page, recptr); @@ -8318,6 +8324,7 @@ log_heap_update(Relation reln, Buffer oldbuf, /* Prepare WAL data for the new page */ xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self); xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data); + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(newtup->t_data); bufflags = REGBUF_STANDARD; if (init) @@ -8355,6 +8362,8 @@ log_heap_update(Relation reln, Buffer oldbuf, xlhdr.t_infomask2 = newtup->t_data->t_infomask2; xlhdr.t_infomask = newtup->t_data->t_infomask; xlhdr.t_hoff = newtup->t_data->t_hoff; + xlhdr.t_cid = HeapTupleHeaderGetRawCommandId(newtup->t_data); + Assert(SizeofHeapTupleHeader + prefixlen + suffixlen <= newtup->t_len); /* @@ -8396,6 +8405,7 @@ log_heap_update(Relation reln, Buffer oldbuf, xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2; xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask; xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff; + xlhdr_idx.t_cid = HeapTupleHeaderGetRawCommandId(old_key_tuple->t_data); XLogRegisterData((char *) &xlhdr_idx, 
SizeOfHeapHeader); @@ -9036,7 +9046,8 @@ heap_xlog_delete(XLogReaderState *record) HeapTupleHeaderSetXmax(htup, xlrec->xmax); else HeapTupleHeaderSetXmin(htup, InvalidTransactionId); - HeapTupleHeaderSetCmax(htup, FirstCommandId, false); + + HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); /* Mark the page as a candidate for pruning */ PageSetPrunable(page, XLogRecGetXid(record)); @@ -9137,7 +9148,7 @@ heap_xlog_insert(XLogReaderState *record) htup->t_infomask = xlhdr.t_infomask; htup->t_hoff = xlhdr.t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, FirstCommandId); + HeapTupleHeaderSetCmin(htup, xlhdr.t_cid); htup->t_ctid = target_tid; if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum, @@ -9280,7 +9291,7 @@ heap_xlog_multi_insert(XLogReaderState *record) htup->t_infomask = xlhdr->t_infomask; htup->t_hoff = xlhdr->t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, FirstCommandId); + HeapTupleHeaderSetCmin(htup, xlhdr->t_cid); ItemPointerSetBlockNumber(&htup->t_ctid, blkno); ItemPointerSetOffsetNumber(&htup->t_ctid, offnum); @@ -9420,7 +9431,8 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask, &htup->t_infomask2); HeapTupleHeaderSetXmax(htup, xlrec->old_xmax); - HeapTupleHeaderSetCmax(htup, FirstCommandId, false); + HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); + /* Set forward chain link in t_ctid */ htup->t_ctid = newtid; @@ -9553,7 +9565,7 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) htup->t_hoff = xlhdr.t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, FirstCommandId); + HeapTupleHeaderSetCmin(htup, xlhdr.t_cid); HeapTupleHeaderSetXmax(htup, xlrec->new_xmax); /* Make sure there is no forward chain link in t_ctid */ htup->t_ctid = newtid; @@ -9694,7 +9706,8 @@ heap_xlog_lock(XLogReaderState *record) offnum); } 
HeapTupleHeaderSetXmax(htup, xlrec->locking_xid); - HeapTupleHeaderSetCmax(htup, FirstCommandId, false); + HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); + PageSetLSN(page, lsn); MarkBufferDirty(buffer); } diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 0e8fc153e99..48c19366a5f 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -605,7 +605,7 @@ heapam_relation_set_new_filenode(Relation rel, */ *minmulti = GetOldestMultiXactId(); - srel = RelationCreateStorage(*newrnode, persistence, SMGR_MD); + srel = RelationCreateStorage(*newrnode, persistence, SMGR_MD, rel); /* * If required, set up an init fork for an unlogged table so that it can @@ -643,7 +643,8 @@ heapam_relation_copy_data(Relation rel, const RelFileNode *newrnode) { SMgrRelation dstrel; - dstrel = smgropen(*newrnode, rel->rd_backend, SMGR_MD); + dstrel = smgropen(*newrnode, rel->rd_backend, SMGR_MD, rel); + RelationOpenSmgr(rel); /* @@ -661,7 +662,7 @@ heapam_relation_copy_data(Relation rel, const RelFileNode *newrnode) * NOTE: any conflict in relfilenode value will be caught in * RelationCreateStorage(). 
*/ - RelationCreateStorage(*newrnode, rel->rd_rel->relpersistence, SMGR_MD); + RelationCreateStorage(*newrnode, rel->rd_rel->relpersistence, SMGR_MD, rel); /* copy main fork */ RelationCopyStorage(rel->rd_smgr, dstrel, MAIN_FORKNUM, diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index ad955c2824d..86e61aee5a2 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -1138,3 +1138,12 @@ clogsyncfiletag(const FileTag *ftag, char *path) { return SlruSyncFileTag(XactCtl, ftag, path); } + +/* + * Get clog control data + */ +SlruCtl +CLOG_Ctl(void) +{ + return XactCtl; +} \ No newline at end of file diff --git a/src/backend/access/transam/distributedlog.c b/src/backend/access/transam/distributedlog.c index 46e86ecb75a..24e1dd5d026 100644 --- a/src/backend/access/transam/distributedlog.c +++ b/src/backend/access/transam/distributedlog.c @@ -1083,3 +1083,12 @@ DistributedLog_syncfiletag(const FileTag *ftag, char *path) { return SlruSyncFileTag(DistributedLogCtl, ftag, path); } + +/* + * Get distributed log control data + */ +SlruCtl +DistributedLog_Ctl(void) +{ + return DistributedLogCtl; +} \ No newline at end of file diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index 9b75b3e4eea..90788509f8a 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -24,6 +24,7 @@ #include "postmaster/autovacuum.h" #include "storage/pmsignal.h" #include "storage/proc.h" +#include "cdb/cdbutil.h" #include "utils/guc.h" #include "utils/syscache.h" @@ -40,6 +41,8 @@ VariableCache ShmemVariableCache = NULL; int xid_stop_limit; int xid_warn_limit; +NewSegRelfilenode_assign_hook_type NewSegRelfilenode_assign_hook = NULL; + /* * Allocate the next FullTransactionId for a new transaction or * subtransaction. 
@@ -715,6 +718,9 @@ GetNewSegRelfilenodeUnderLock(void) ShmemVariableCache->relfilenodeCount = VAR_OID_PREFETCH; } + if (NewSegRelfilenode_assign_hook) + return (*NewSegRelfilenode_assign_hook) (); + result = ShmemVariableCache->nextRelfilenode; (ShmemVariableCache->nextRelfilenode)++; diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 7b3622bf509..84d45963106 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -343,6 +343,8 @@ bool wal_receiver_create_temp_slot = false; /* are we currently in standby mode? */ bool StandbyMode = false; +Startup_hook_type Startup_hook = NULL; + /* * if recoveryStopsBefore/After returns true, it saves information of the stop * point here @@ -8413,6 +8415,18 @@ StartupXLOG(void) */ InRecovery = false; + /* + * Hook for plugins to do additional startup works. + * + * Allow to write any WALs in hook. + */ + if (Startup_hook) + { + LocalSetXLogInsertAllowed(); + (*Startup_hook) (); + LocalXLogInsertAllowed = -1; + } + /* * If we are a standby with contentid -1 and undergoing promotion, * update ourselves as the new master in catalog. This does not @@ -11106,9 +11120,19 @@ xlog_redo(XLogReaderState *record) for (uint8 block_id = 0; block_id <= record->max_block_id; block_id++) { Buffer buffer; + XLogRedoAction result; + + result = XLogReadBufferForRedo(record, block_id, &buffer); + if (result == BLK_DONE && !IsUnderPostmaster) + { + /* + * In the special WAL process, blocks that are being ignored + * return BLK_DONE. Accept that. 
+ */ + } + else if (result != BLK_RESTORED) + elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block"); - if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED) - elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block"); UnlockReleaseBuffer(buffer); } } diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 9cd6bf438d0..3f486ae750f 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -65,6 +65,9 @@ typedef struct char compressed_page[BLCKSZ]; } registered_buffer; +/* hook for new XLogInsert method */ +XLogInsert_hook_type XLogInsert_hook = NULL; + static registered_buffer *registered_buffers; static int max_registered_buffers; /* allocated size */ static int max_registered_block_id = 0; /* highest block_id + 1 currently @@ -469,6 +472,9 @@ XLogInsert_Internal(RmgrId rmid, uint8 info, TransactionId headerXid) return EndPos; } + if (XLogInsert_hook) + return (*XLogInsert_hook) (rmid, info, headerXid, curinsert_flags, (void *)XLogRecordAssemble); + do { XLogRecPtr RedoRecPtr; @@ -1287,3 +1293,59 @@ InitXLogInsert(void) hdr_scratch = MemoryContextAllocZero(xloginsert_cxt, HEADER_SCRATCH_SIZE); } + +/* + * Get RelFileNode/ForkNumber/BlockNumber of XLog register block if any. + */ +bool +GetXLogRegisterBufferTagIfAny(RelFileNode *rnode, ForkNumber *forknum, BlockNumber *blkno) +{ + for (int i = 0; i < max_registered_block_id; i++) + { + if (GetXLogRegisterBuffer(i, rnode, forknum, blkno, NULL)) + return true; + } + + return false; +} + +int +GetNumXLogRegisterBuffers(void) +{ + return max_registered_block_id; +} + +/* + * Caller should make sure the block_id is valid(block_id < max_registered_block_id). 
+ */ +bool +GetXLogRegisterBuffer(int block_id, RelFileNode *rnode, ForkNumber *forknum, BlockNumber *blkno, Page *page) +{ + if (registered_buffers[block_id].in_use) + { + if (rnode) + *rnode = registered_buffers[block_id].rnode; + if (forknum) + *forknum = registered_buffers[block_id].forkno; + if (blkno) + *blkno = registered_buffers[block_id].block; + if (page) + *page = registered_buffers[block_id].page; + return true; + } + + return false; +} + +/* + * Get register rdata. + */ +char * +GetXLogRegisterRdata(int rdata_index) +{ + if (rdata_index >= num_rdatas) + elog(ERROR, "invalid rdata index"); + + return rdatas[rdata_index].data; +} + diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index 7f58c54a92b..8d43566c4c1 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -32,10 +32,11 @@ #include "utils/hsearch.h" #include "utils/rel.h" - /* GUC variable */ bool ignore_invalid_pages = false; +bool (*redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id); + /* * During XLOG replay, we may see XLOG records for incremental updates of * pages that no longer exist, because their relation was later dropped or @@ -347,6 +348,27 @@ XLogReadBufferForRedoExtended(XLogReaderState *record, elog(PANIC, "failed to locate backup block with ID %d", block_id); } + /* + * If a WAL redo function calls XLogReadBufferForRedoExtended() for a page that has a full-page + * image, it always succeeds. However, if redo process is only concerned about replaying changes + * to a singe page, so replaying any changes for other pages is a waste of cycles. We have modified + * XLogReadBufferForRedoExtended() to return BLK_DONE for all other pages, to avoid the overhead. 
+ */ + if (redo_read_buffer_filter && redo_read_buffer_filter(record, block_id)) + { + if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) + { + *buf = ReadBufferWithoutRelcache(rnode, forknum, + blkno, mode, NULL); + return BLK_DONE; + } + else + { + *buf = InvalidBuffer; + return BLK_DONE; + } + } + /* * Make sure that if the block is marked with WILL_INIT, the caller is * going to initialize it. And vice versa. @@ -451,7 +473,7 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, * Open the relation at smgr level. Relations using shared buffers need * the default SMGR implementation. */ - smgr = smgropen(rnode, InvalidBackendId, SMGR_MD); + smgr = smgropen(rnode, InvalidBackendId, SMGR_MD, NULL); /* * Create the target file if it doesn't already exist. This lets us cope diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index 4b6f11597d1..85a44e27b21 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -478,7 +478,7 @@ heap_create(const char *relname, case RELKIND_INDEX: case RELKIND_SEQUENCE: - RelationCreateStorage(rel->rd_node, relpersistence, SMGR_MD); + RelationCreateStorage(rel->rd_node, relpersistence, SMGR_MD, rel); break; case RELKIND_RELATION: @@ -1612,6 +1612,7 @@ heap_create_with_catalog(const char *relname, else relacl = NULL; + /* * Create the relcache entry (mostly dummy at this point) and the physical * disk file. 
(If we fail further down, it's the smgr's responsibility to diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 715a3ddf4b7..493a07a617c 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -101,6 +101,10 @@ /* Potentially set by pg_upgrade_support functions */ Oid binary_upgrade_next_index_pg_class_oid = InvalidOid; +index_create_hook_type index_create_hook = NULL; + +ambuild_function index_build_hook = NULL; + /* * Pointer-free representation of variables used when reindexing system * catalogs; we use this to propagate those values to parallel workers. @@ -764,6 +768,69 @@ index_create(Relation heapRelation, bool allow_system_table_mods, bool is_internal, Oid *constraintId) +{ + if (index_create_hook) + return (*index_create_hook) (heapRelation, + indexRelationName, + indexRelationId, + parentIndexRelid, + parentConstraintId, + relFileNode, + indexInfo, + indexColNames, + accessMethodObjectId, + tableSpaceId, + collationObjectId, + classObjectId, + coloptions, + reloptions, + flags, + constr_flags, + allow_system_table_mods, + is_internal, + constraintId); + else + return index_create_internal(heapRelation, + indexRelationName, + indexRelationId, + parentIndexRelid, + parentConstraintId, + relFileNode, + indexInfo, + indexColNames, + accessMethodObjectId, + tableSpaceId, + collationObjectId, + classObjectId, + coloptions, + reloptions, + flags, + constr_flags, + allow_system_table_mods, + is_internal, + constraintId); +} + +Oid +index_create_internal(Relation heapRelation, + const char *indexRelationName, + Oid indexRelationId, + Oid parentIndexRelid, + Oid parentConstraintId, + Oid relFileNode, + IndexInfo *indexInfo, + List *indexColNames, + Oid accessMethodObjectId, + Oid tableSpaceId, + Oid *collationObjectId, + Oid *classObjectId, + int16 *coloptions, + Datum reloptions, + bits16 flags, + bits16 constr_flags, + bool allow_system_table_mods, + bool is_internal, + Oid *constraintId) { Oid heapRelationId = 
RelationGetRelid(heapRelation); Relation pg_class; @@ -986,6 +1053,7 @@ index_create(Relation heapRelation, Assert(relminmxid == InvalidMultiXactId); Assert(indexRelationId == RelationGetRelid(indexRelation)); + /* * Obtain exclusive lock on it. Although no other transactions can see it * until we commit, this prevents deadlock-risk complaints from lock @@ -3108,8 +3176,11 @@ index_build(Relation heapRelation, /* * Call the access method's build procedure */ - stats = indexRelation->rd_indam->ambuild(heapRelation, indexRelation, - indexInfo); + if (index_build_hook) + stats = index_build_hook(heapRelation, indexRelation, indexInfo); + else + stats = indexRelation->rd_indam->ambuild(heapRelation, indexRelation, + indexInfo); Assert(PointerIsValid(stats)); /* diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index 9e24ac4599e..49f477549cd 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -116,7 +116,7 @@ AddPendingSync(const RelFileNode *rnode) * transaction aborts later on, the storage will be destroyed. */ SMgrRelation -RelationCreateStorage(RelFileNode rnode, char relpersistence, SMgrImpl smgr_which) +RelationCreateStorage(RelFileNode rnode, char relpersistence, SMgrImpl smgr_which, Relation rel) { PendingRelDelete *pending; SMgrRelation srel; @@ -144,7 +144,7 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence, SMgrImpl smgr_whic return NULL; /* placate compiler */ } - srel = smgropen(rnode, backend, smgr_which); + srel = smgropen(rnode, backend, smgr_which, rel); smgrcreate(srel, MAIN_FORKNUM, false); if (needs_wal) @@ -662,7 +662,7 @@ smgrDoPendingDeletes(bool isCommit) srel = smgropen(pending->relnode.node, pending->relnode.isTempRelation ? 
TempRelBackendId : InvalidBackendId, - pending->relnode.smgr_which); + pending->relnode.smgr_which, NULL); /* allocate the initial array, or extend it, if needed */ if (maxrels == 0) @@ -743,7 +743,7 @@ smgrDoPendingSyncs(bool isCommit, bool isParallelWorker) BlockNumber total_blocks = 0; SMgrRelation srel; - srel = smgropen(pendingsync->rnode, InvalidBackendId, SMGR_MD); + srel = smgropen(pendingsync->rnode, InvalidBackendId, SMGR_MD, NULL); /* * We emit newpage WAL records for smaller relations. @@ -966,7 +966,7 @@ smgr_redo(XLogReaderState *record) xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record); SMgrRelation reln; - reln = smgropen(xlrec->rnode, InvalidBackendId, xlrec->impl); + reln = smgropen(xlrec->rnode, InvalidBackendId, xlrec->impl, NULL); smgrcreate(reln, xlrec->forkNum, true); } else if (info == XLOG_SMGR_TRUNCATE) @@ -984,7 +984,7 @@ smgr_redo(XLogReaderState *record) * for AO takes a different code path, it does not involve emitting * SMGR_TRUNCATE WAL record. */ - reln = smgropen(xlrec->rnode, InvalidBackendId, SMGR_MD); + reln = smgropen(xlrec->rnode, InvalidBackendId, SMGR_MD, NULL); /* * Forcibly create relation if it doesn't exist (which suggests that diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 76faa1f343d..5dbda592e53 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -16169,7 +16169,7 @@ index_copy_data(Relation rel, RelFileNode newrnode) SMgrImpl smgr_which = RelationIsAppendOptimized(rel) ? SMGR_AO : SMGR_MD; - dstrel = smgropen(newrnode, rel->rd_backend, smgr_which); + dstrel = smgropen(newrnode, rel->rd_backend, smgr_which, rel); RelationOpenSmgr(rel); @@ -16188,7 +16188,7 @@ index_copy_data(Relation rel, RelFileNode newrnode) * NOTE: any conflict in relfilenode value will be caught in * RelationCreateStorage(). 
*/ - RelationCreateStorage(newrnode, rel->rd_rel->relpersistence, smgr_which); + RelationCreateStorage(newrnode, rel->rd_rel->relpersistence, smgr_which, rel); /* copy main fork */ RelationCopyStorage(rel->rd_smgr, dstrel, MAIN_FORKNUM, diff --git a/src/backend/main/main.c b/src/backend/main/main.c index c018a695eb8..5c8f609d61c 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -52,6 +52,52 @@ static void init_locale(const char *categoryname, int category, const char *loca static void help(const char *progname); static void check_root(const char *progname); +typedef int (*MainFunc) (int argc, char *argv[]); + +static int +CallExtMain(int argc, char *argv[], bool load_config) +{ + MainFunc main_func; + char *library_name; + char *main_func_name; + + if (argc <= 3) + { + elog(LOG, "library_name and main_func_name are both needed."); + exit(1); + } + + library_name = argv[2]; + main_func_name = argv[3]; + + /* + * Perform just enough initialization that we can load external libraries + */ + InitStandaloneProcess(argv[0]); + + SetProcessingMode(InitProcessing); + + /* + * Set default values for command-line options. + */ + InitializeGUCOptions(); + + /* Acquire configuration parameters */ + if (load_config && !SelectConfigFiles(NULL, progname)) + exit(1); + + /* + * Imitate we are early in bootstrap loading shared_preload_libraries; + * neon extension sets PGC_POSTMASTER gucs requiring this. + */ + process_shared_preload_libraries_in_progress = true; + + main_func = load_external_function(library_name, main_func_name, true, NULL); + + process_shared_preload_libraries_in_progress = false; + + return main_func(argc, argv); +} /* * Any Postgres server process begins execution here. 
@@ -217,6 +263,8 @@ main(int argc, char *argv[]) PostgresMain(argc, argv, NULL, /* no dbname */ strdup(get_user_name_or_exit(progname))); /* does not return */ + else if (argc > 1 && strcmp(argv[1], "--ext-main") == 0) + CallExtMain(argc, argv, false); else PostmasterMain(argc, argv); /* does not return */ abort(); /* should not get here */ diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index fff6213b465..07e759f9678 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -278,6 +278,9 @@ char *bonjour_name; bool restart_after_crash = true; bool remove_temp_files_after_crash = true; +/* Hook for plugins to start background workers */ +start_bgworkers_hook_type start_bgworkers_hook = NULL; + /* * PIDs of special child processes; 0 when not running. When adding a new PID * to the list, remember to add the process title to GetServerProcessTitle() @@ -534,6 +537,7 @@ static int CountChildren(int target); static bool assign_backendlist_entry(RegisteredBgWorker *rw); static void maybe_start_bgworkers(void); static bool CreateOptsFile(int argc, char *argv[], char *fullprogname); +static bool do_start_bgworker(RegisteredBgWorker *rw); static pid_t StartChildProcess(AuxProcType type); static void StartAutovacuumWorker(void); static void MaybeStartWalReceiver(void); @@ -4534,6 +4538,11 @@ PostmasterStateMachine(void) pmState = PM_STARTUP; /* crash recovery started, reset SIGKILL flag */ AbortStartTime = 0; + + if (start_bgworkers_hook) + { + (*start_bgworkers_hook) (FatalError, pmState, do_start_bgworker); + } } } @@ -6576,6 +6585,11 @@ maybe_start_bgworkers(void) TimestampTz now = 0; slist_mutable_iter iter; + if (start_bgworkers_hook) + { + (*start_bgworkers_hook) (FatalError, pmState, do_start_bgworker); + } + /* * During crash recovery, we have no need to be called until the state * transition out of recovery. 
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index e4deb326e7d..6321999e97d 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -129,6 +129,8 @@ typedef struct CkptTsStatus int index; } CkptTsStatus; +ReadBuffer_hook_type ReadBuffer_hook = NULL; + /* * Type for array used to sort SMgrRelations * @@ -875,7 +877,7 @@ ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum, * Use default SMGR implementation when opening a relation backed by * shared buffers */ - SMgrRelation smgr = smgropen(rnode, InvalidBackendId, 0); + SMgrRelation smgr = smgropen(rnode, InvalidBackendId, 0, NULL); Assert(InRecovery); @@ -898,6 +900,12 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, Block bufBlock; bool found; bool isExtend; + + if (ReadBuffer_hook) + { + return ReadBuffer_hook(smgr, relpersistence, forkNum, blockNum, mode, strategy, hit); + } + /* * Temp tables in Cloudberry use shared buffers so that backends executing * multiple slices of the same query can share them. @@ -935,7 +943,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, if (isLocalBuf) { - bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found); + bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found, InvalidBuffer); if (found) pgBufferUsage.local_blks_hit++; else if (isExtend) @@ -2996,7 +3004,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln) bool istemp = (buf_state_unlocked & BM_TEMP) != 0; reln = smgropen(buf->tag.rnode, - istemp ? TempRelBackendId : InvalidBackendId, 0); + istemp ? TempRelBackendId : InvalidBackendId, 0, NULL); } TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum, @@ -3034,8 +3042,10 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln) * disastrous system-wide consequences. To make sure that can't happen, * skip the flush if the buffer isn't permanent. 
*/ - if (buf_state & BM_PERMANENT) - XLogFlush(recptr); + if ((buf_state & BM_PERMANENT) && smgr_is_heap_relation(reln)) + { + XLogFlush(recptr); + } /* * Now it's safe to write buffer to disk. Note that no one else should @@ -5122,7 +5132,7 @@ IssuePendingWritebacks(WritebackContext *context) i += ahead; /* and finally tell the kernel to write the data to storage */ - reln = smgropen(tag.rnode, InvalidBackendId, 0); + reln = smgropen(tag.rnode, InvalidBackendId, 0, NULL); smgrwriteback(reln, tag.forkNum, tag.blockNum, nblocks); } diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 56afad01085..13f3afe292d 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -107,7 +107,7 @@ PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, */ BufferDesc * LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, - bool *foundPtr) + bool *foundPtr, Buffer non_evited_buffer) { BufferTag newTag; /* identity of requested block */ LocalBufferLookupEnt *hresult; @@ -189,6 +189,12 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, if (LocalRefCount[b] == 0) { + if (-b - 1 == non_evited_buffer) + { + /* Prevent eviction of the buffer with needed page */ + continue; + } + buf_state = pg_atomic_read_u32(&bufHdr->state); if (BUF_STATE_GET_USAGECOUNT(buf_state) > 0) @@ -222,7 +228,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, Page localpage = (char *) LocalBufHdrGetBlock(bufHdr); /* Find smgr relation for buffer */ - oreln = smgropen(bufHdr->tag.rnode, MyBackendId, 0); + oreln = smgropen(bufHdr->tag.rnode, MyBackendId, 0, NULL); // GPDB_93_MERGE_FIXME: is this TODO comment still relevant? // UNDONE: Unfortunately, I think we write temp relations to the mirror... 
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 94561251785..0cfb9bae55f 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -560,7 +560,7 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior) mdfd->mdfd_vfd = fd; mdfd->mdfd_segno = 0; - Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE)); + Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE) || reln->smgr_which == SMGR_AO); return mdfd; } @@ -1151,7 +1151,7 @@ DropRelationFiles(RelFileNodePendingDelete *delrels, int ndelrels, bool isRedo) SMgrRelation srel = smgropen(delrels[i].node, delrels[i].isTempRelation ? TempRelBackendId : InvalidBackendId, - delrels[i].smgr_which); + delrels[i].smgr_which, NULL); if (isRedo) { @@ -1434,7 +1434,7 @@ _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) int mdsyncfiletag(const FileTag *ftag, char *path) { - SMgrRelation reln = smgropen(ftag->rnode, InvalidBackendId, 0); + SMgrRelation reln = smgropen(ftag->rnode, InvalidBackendId, 0, NULL); File file; bool need_to_close; int result, @@ -1476,7 +1476,7 @@ mdsyncfiletag(const FileTag *ftag, char *path) int aosyncfiletag(const FileTag *ftag, char *path) { - SMgrRelation reln = smgropen(ftag->rnode, InvalidBackendId, 1); + SMgrRelation reln = smgropen(ftag->rnode, InvalidBackendId, 1, NULL); char *p; /* Provide the path for informational messages. 
*/ diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 67aa6a7d582..d9d05944ed2 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -20,12 +20,14 @@ #include "postgres.h" #include "access/aomd.h" +#include "access/relation.h" #include "access/xact.h" #include "access/xlogutils.h" #include "catalog/catalog.h" #include "catalog/indexing.h" +#include "commands/defrem.h" +#include "postmaster/autovacuum.h" #include "postmaster/postmaster.h" - #include "access/xlog.h" #include "lib/ilist.h" #include "storage/bufmgr.h" @@ -35,6 +37,7 @@ #include "utils/faultinjector.h" #include "utils/hsearch.h" #include "utils/inval.h" +#include "utils/relfilenodemap.h" /* * Hook for plugins to collect statistics from storage functions @@ -46,32 +49,10 @@ file_extend_hook_type file_extend_hook = NULL; file_truncate_hook_type file_truncate_hook = NULL; file_unlink_hook_type file_unlink_hook = NULL; -typedef struct f_smgr -{ - void (*smgr_init) (void); /* may be NULL */ - void (*smgr_shutdown) (void); /* may be NULL */ - void (*smgr_open) (SMgrRelation reln); - void (*smgr_close) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_create) (SMgrRelation reln, ForkNumber forknum, - bool isRedo); - bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_unlink) (RelFileNodeBackend rnode, ForkNumber forknum, - bool isRedo); - void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); - bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum); - void (*smgr_read) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer); - void (*smgr_write) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); - void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks); - BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber 
forknum); - void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, - BlockNumber nblocks); - void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); -} f_smgr; +/* Hook for plugins to get control in smgr */ +smgr_init_hook_type smgr_init_hook = NULL; +smgr_hook_type smgr_hook = NULL; +smgr_shutdown_hook_type smgr_shutdown_hook = NULL; static const f_smgr smgrsw[] = { /* magnetic disk */ @@ -151,6 +132,9 @@ smgrinit(void) smgrsw[i].smgr_init(); } + if (smgr_init_hook) + (*smgr_init_hook)(); + /* register the shutdown proc */ on_proc_exit(smgrshutdown, 0); } @@ -176,7 +160,7 @@ smgrshutdown(int code, Datum arg) * This does not attempt to actually open the underlying file. */ SMgrRelation -smgropen(RelFileNode rnode, BackendId backend, SMgrImpl which) +smgropen(RelFileNode rnode, BackendId backend, SMgrImpl which, Relation rel) { RelFileNodeBackend brnode; SMgrRelation reln; @@ -215,11 +199,19 @@ smgropen(RelFileNode rnode, BackendId backend, SMgrImpl which) reln->smgr_cached_nblocks[i] = InvalidBlockNumber; reln->smgr_which = which; /* GPDB add SMGR_AO*/ - /* implementation-specific initialization */ - smgrsw[reln->smgr_which].smgr_open(reln); - /* it has no owner yet */ dlist_push_tail(&unowned_relns, &reln->node); + reln->smgr = &smgrsw[reln->smgr_which]; + + /* + * hook for other storage managers. 
+ */ + if (smgr_hook) + (*smgr_hook) (reln, backend, which, rel); + + Assert(reln->smgr); + + (*reln->smgr).smgr_open(reln); } return reln; @@ -283,7 +275,7 @@ smgrclearowner(SMgrRelation *owner, SMgrRelation reln) bool smgrexists(SMgrRelation reln, ForkNumber forknum) { - return smgrsw[reln->smgr_which].smgr_exists(reln, forknum); + return (*reln->smgr).smgr_exists(reln, forknum); } /* @@ -296,7 +288,7 @@ smgrclose(SMgrRelation reln) ForkNumber forknum; for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - smgrsw[reln->smgr_which].smgr_close(reln, forknum); + (*reln->smgr).smgr_close(reln, forknum); owner = reln->smgr_owner; @@ -369,7 +361,8 @@ smgrclosenode(RelFileNodeBackend rnode) void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) { - smgrsw[reln->smgr_which].smgr_create(reln, forknum, isRedo); + (*reln->smgr).smgr_create(reln, forknum, isRedo); + if (file_create_hook) (*file_create_hook)(reln->smgr_rnode); } @@ -415,12 +408,10 @@ smgrdosyncall(SMgrRelation *rels, int nrels) */ for (i = 0; i < nrels; i++) { - int which = rels[i]->smgr_which; - for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) { - if (smgrsw[which].smgr_exists(rels[i], forknum)) - smgrsw[which].smgr_immedsync(rels[i], forknum); + if ((*rels[i]->smgr).smgr_exists(rels[i], forknum)) + (*rels[i]->smgr).smgr_immedsync(rels[i], forknum); } } } @@ -459,13 +450,11 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo) for (i = 0; i < nrels; i++) { RelFileNodeBackend rnode = rels[i]->smgr_rnode; - int which = rels[i]->smgr_which; rnodes[i] = rnode; - /* Close the forks at smgr level */ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - smgrsw[which].smgr_close(rels[i], forknum); + (*rels[i]->smgr).smgr_close(rels[i], forknum); } /* @@ -494,10 +483,8 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo) for (i = 0; i < nrels; i++) { - int which = rels[i]->smgr_which; - for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - smgrsw[which].smgr_unlink(rnodes[i], 
forknum, isRedo); + (*rels[i]->smgr).smgr_unlink(rnodes[i], forknum, isRedo); } if (file_unlink_hook) @@ -522,8 +509,9 @@ void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync) { - smgrsw[reln->smgr_which].smgr_extend(reln, forknum, blocknum, + (*reln->smgr).smgr_extend(reln, forknum, blocknum, buffer, skipFsync); + /* * Normally we expect this to increase nblocks by one, but if the cached * value isn't as expected, just invalidate it so the next call asks the @@ -548,7 +536,7 @@ smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) { - return smgrsw[reln->smgr_which].smgr_prefetch(reln, forknum, blocknum); + return (*reln->smgr).smgr_prefetch(reln, forknum, blocknum); } /* @@ -563,7 +551,7 @@ void smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) { - smgrsw[reln->smgr_which].smgr_read(reln, forknum, blocknum, buffer); + (*reln->smgr).smgr_read(reln, forknum, blocknum, buffer); } /* @@ -585,7 +573,7 @@ void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync) { - smgrsw[reln->smgr_which].smgr_write(reln, forknum, blocknum, + (*reln->smgr).smgr_write(reln, forknum, blocknum, buffer, skipFsync); } @@ -598,7 +586,7 @@ void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks) { - smgrsw[reln->smgr_which].smgr_writeback(reln, forknum, blocknum, + (*reln->smgr).smgr_writeback(reln, forknum, blocknum, nblocks); } @@ -616,7 +604,7 @@ smgrnblocks(SMgrRelation reln, ForkNumber forknum) if (result != InvalidBlockNumber) return result; - result = smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum); + result = (*reln->smgr).smgr_nblocks(reln, forknum); reln->smgr_cached_nblocks[forknum] = result; @@ -682,7 +670,7 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb /* Make the 
cached size is invalid if we encounter an error. */ reln->smgr_cached_nblocks[forknum[i]] = InvalidBlockNumber; - smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i], nblocks[i]); + (*reln->smgr).smgr_truncate(reln, forknum[i], nblocks[i]); /* * We might as well update the local smgr_cached_nblocks values. The @@ -723,7 +711,15 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb void smgrimmedsync(SMgrRelation reln, ForkNumber forknum) { - smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum); + (*reln->smgr).smgr_immedsync(reln, forknum); +} +/* + * Report whether the relation is served by the built-in md storage manager (smgrsw[SMGR_MD]), which this code treats as a "heap" relation. + */ +bool +smgr_is_heap_relation(SMgrRelation reln) +{ + return (reln->smgr == &smgrsw[SMGR_MD]); } /* diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index c8b8e77c814..5e8235f1247 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -1250,22 +1250,20 @@ exec_mpp_query(const char *query_string, } } - if (log_statement != LOGSTMT_NONE) { - /* - * TODO need to log SELECT INTO as DDL - */ - if (log_statement == LOGSTMT_ALL || - (plan->utilityStmt && log_statement == LOGSTMT_DDL) || - (plan && log_statement >= LOGSTMT_MOD)) - - { - ereport(LOG, (errmsg("statement: %s", query_string) - )); - was_logged = true; - } - + /* + * TODO need to log SELECT INTO as DDL + */ + if (log_statement == LOGSTMT_ALL || + (plan->utilityStmt && log_statement == LOGSTMT_DDL) || + (plan && log_statement >= LOGSTMT_MOD)) + + { + ereport(LOG, (errmsg("statement: %s", query_string) + )); + was_logged = true; + } } /* diff --git a/src/backend/utils/adt/dbsize.c b/src/backend/utils/adt/dbsize.c index 36f40637b95..7c0605b0c64 100644 --- a/src/backend/utils/adt/dbsize.c +++ b/src/backend/utils/adt/dbsize.c @@ -54,6 +54,9 @@ static int64 calculate_total_relation_size(Relation rel); +/* Hook for plugins to calculate relation size */ +relation_size_hook_type relation_size_hook = NULL; + /** * Some functions are peculiar in that they do
their own dispatching. * They do not work on entry db since we do not support dispatching @@ -444,6 +447,12 @@ calculate_relation_size(Relation rel, ForkNumber forknum) char pathname[MAXPGPATH]; unsigned int segcount = 0; + /* + * TODO: For non-heap relations, use table_relation_size instead. + */ + if (relation_size_hook) + return (*relation_size_hook) (rel, forknum); + /* Call into the tableam api for AO/AOCO relations */ if (RelationIsAppendOptimized(rel)) return table_relation_size(rel, forknum); diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index f8dd5c30368..29c484835d7 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -3861,6 +3861,7 @@ RelationSetNewRelfilenode(Relation relation, char persistence) */ RelationDropStorage(relation); + /* * Create storage for the main fork of the new relfilenode. If it's a * table-like object, call into the table AM to do so, which'll also @@ -3881,7 +3882,8 @@ RelationSetNewRelfilenode(Relation relation, char persistence) SMgrRelation srel; srel = RelationCreateStorage(newrnode, persistence, - 0 /* default storage implementation */); + 0 /* default storage implementation */, + relation); smgrclose(srel); } break; diff --git a/src/backend/utils/mmgr/slab.c b/src/backend/utils/mmgr/slab.c index c58bf923323..834cab81fa8 100644 --- a/src/backend/utils/mmgr/slab.c +++ b/src/backend/utils/mmgr/slab.c @@ -152,7 +152,11 @@ static const MemoryContextMethods SlabMethods = { SlabDelete, SlabGetChunkSpace, SlabIsEmpty, - SlabStats + SlabStats, + NULL, + NULL, + NULL, + NULL #ifdef MEMORY_CONTEXT_CHECKING ,SlabCheck #endif diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index f66e4535b8e..db54657efcd 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -4997,7 +4997,6 @@ listTables(const char *tabtypes, const char *pattern, bool verbose, bool showSys appendPQExpBuffer(&buf, " WHEN %d THEN '%s'", AO_ROW_TABLE_AM_OID, 
gettext_noop("append only")); appendPQExpBuffer(&buf, " WHEN %d THEN '%s'", AO_COLUMN_TABLE_AM_OID, gettext_noop("append only columnar")); appendPQExpBuffer(&buf, " WHEN %d THEN '%s'", BTREE_AM_OID, gettext_noop("btree")); - appendPQExpBuffer(&buf, " END as \"%s\"\n", gettext_noop("Storage")); } else diff --git a/src/include/access/clog.h b/src/include/access/clog.h index a6342606bad..d3b3c0e8209 100644 --- a/src/include/access/clog.h +++ b/src/include/access/clog.h @@ -36,6 +36,9 @@ typedef struct xl_clog_truncate Oid oldestXactDb; } xl_clog_truncate; +struct SlruCtlData; +typedef struct SlruCtlData *SlruCtl; + extern void TransactionIdSetTreeStatus(TransactionId xid, int nsubxids, TransactionId *subxids, XidStatus status, XLogRecPtr lsn); extern XidStatus TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn); @@ -63,5 +66,5 @@ extern int clogsyncfiletag(const FileTag *ftag, char *path); extern void clog_redo(XLogReaderState *record); extern void clog_desc(StringInfo buf, XLogReaderState *record); extern const char *clog_identify(uint8 info); - +extern SlruCtl CLOG_Ctl(void); #endif /* CLOG_H */ diff --git a/src/include/access/distributedlog.h b/src/include/access/distributedlog.h index de643afdae0..7d7f630d941 100644 --- a/src/include/access/distributedlog.h +++ b/src/include/access/distributedlog.h @@ -42,6 +42,9 @@ typedef struct DistributedLogEntry } DistributedLogEntry; +struct SlruCtlData; +typedef struct SlruCtlData *SlruCtl; + extern void DistributedLog_SetCommittedTree(TransactionId xid, int nxids, TransactionId *xids, DistributedTransactionId distribXid, bool isRedo); @@ -80,5 +83,5 @@ extern void DistributedLog_GetDistributedXid( DistributedTransactionId *distribXid); extern int DistributedLog_syncfiletag(const FileTag *ftag, char *path); - +extern SlruCtl DistributedLog_Ctl(void); #endif /* DISTRIBUTEDLOG_H */ diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index e6b11a8cb9e..b1d72c00e15 100644 --- 
a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -108,6 +108,7 @@ typedef struct xl_heap_delete { TransactionId xmax; /* xmax of the deleted tuple */ OffsetNumber offnum; /* deleted tuple's offset */ + uint32 t_cid; /* current command id */ uint8 infobits_set; /* infomask bits */ uint8 flags; } xl_heap_delete; @@ -145,6 +146,7 @@ typedef struct xl_heap_header { uint16 t_infomask2; uint16 t_infomask; + uint32 t_cid; /* current command id */ uint8 t_hoff; } xl_heap_header; @@ -186,6 +188,7 @@ typedef struct xl_multi_insert_tuple uint16 datalen; /* size of tuple data that follows */ uint16 t_infomask2; uint16 t_infomask; + uint32 t_cid; /* current command id */ uint8 t_hoff; /* TUPLE DATA FOLLOWS AT END OF STRUCT */ } xl_multi_insert_tuple; @@ -215,6 +218,7 @@ typedef struct xl_heap_update OffsetNumber old_offnum; /* old tuple's offset */ uint8 old_infobits_set; /* infomask bits to set on old tuple */ uint8 flags; + uint32 t_cid; /* current command id */ TransactionId new_xmax; /* xmax of the new tuple */ OffsetNumber new_offnum; /* new tuple's offset */ @@ -279,6 +283,7 @@ typedef struct xl_heap_lock { TransactionId locking_xid; /* might be a MultiXactId not xid */ OffsetNumber offnum; /* locked tuple's offset on page */ + uint32 t_cid; /* current command id */ int8 infobits_set; /* infomask and infomask2 bits to set */ uint8 flags; /* XLH_LOCK_* flag bits */ } xl_heap_lock; diff --git a/src/include/access/slru.h b/src/include/access/slru.h index 0edc8c3ee8c..88653ae7e66 100644 --- a/src/include/access/slru.h +++ b/src/include/access/slru.h @@ -171,5 +171,4 @@ extern bool SlruScanDirCbReportPresence(SlruCtl ctl, char *filename, int segpage, void *data); extern bool SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int segpage, void *data); - #endif /* SLRU_H */ diff --git a/src/include/access/transam.h b/src/include/access/transam.h index 44162f99161..bdf9bc856b5 100644 --- a/src/include/access/transam.h +++ 
b/src/include/access/transam.h @@ -303,6 +303,10 @@ extern int xid_warn_limit; /* GPDB-specific */ extern bool gp_pause_on_restore_point_replay; +/* hook for plugins to assign new relfilenode */ +typedef RelFileNodeId (*NewSegRelfilenode_assign_hook_type)(void); +extern PGDLLIMPORT NewSegRelfilenode_assign_hook_type NewSegRelfilenode_assign_hook; + /* * prototypes for functions in transam/transam.c */ diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 4702785d046..d52bde5e235 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -160,6 +160,11 @@ extern bool StandbyMode; /* tde feature enable or not */ extern int FileEncryptionEnabled; + +/* Hook for plugins to do some startup job */ +typedef void (*Startup_hook_type) (void); +extern PGDLLIMPORT Startup_hook_type Startup_hook; + /* Archive modes */ typedef enum ArchiveMode { diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h index 2b951a409b5..a78796c93e1 100644 --- a/src/include/access/xloginsert.h +++ b/src/include/access/xloginsert.h @@ -38,6 +38,17 @@ #define REGBUF_KEEP_DATA 0x10 /* include data even if a full-page image * is taken */ +typedef void * (*RecordAssembleFunc)(RmgrId rmid, uint8 info, + XLogRecPtr RedoRecPtr, bool doPageWrites, + XLogRecPtr *fpw_lsn, TransactionId headerXid, int *num_fpi); +typedef XLogRecPtr (*XLogInsert_hook_type)(RmgrId rmid, uint8 info, TransactionId headerXid, uint8 curinsert_flags, RecordAssembleFunc recordAssembleFunc); +extern PGDLLIMPORT XLogInsert_hook_type XLogInsert_hook; + +extern bool GetXLogRegisterBufferTagIfAny(RelFileNode *rnode, ForkNumber *forknum, BlockNumber *blkno); +extern int GetNumXLogRegisterBuffers(void); +extern bool GetXLogRegisterBuffer(int block_id, RelFileNode *rnode, ForkNumber *forknum, BlockNumber *blkno, char **page); +extern char *GetXLogRegisterRdata(int rdata_index); + /* prototypes for public functions in xloginsert.c: */ extern void XLogBeginInsert(void); extern void 
XLogSetRecordFlags(uint8 flags); diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h index d749456554b..170d7e90786 100644 --- a/src/include/access/xlogutils.h +++ b/src/include/access/xlogutils.h @@ -33,6 +33,12 @@ typedef enum * need to be replayed) */ } XLogRedoAction; +/* + * Returns true if we shouldn't do REDO on that block in record indicated by + * block_id; false otherwise. + */ +extern bool (*redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id); + extern XLogRedoAction XLogReadBufferForRedo(XLogReaderState *record, uint8 buffer_id, Buffer *buf); extern Buffer XLogInitBufferForRedo(XLogReaderState *record, uint8 block_id); diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 66b832ddaaa..e13c88e14df 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -14,6 +14,7 @@ #ifndef INDEX_H #define INDEX_H +#include "access/amapi.h" #include "access/relscan.h" /* Relation, Snapshot */ #include "catalog/objectaddress.h" #include "executor/tuptable.h" /* TupTableSlot */ @@ -72,6 +73,27 @@ extern void index_check_primary_key(Relation heapRel, #define INDEX_CREATE_PARTITIONED (1 << 5) #define INDEX_CREATE_INVALID (1 << 6) +typedef Oid (*index_create_hook_type)(Relation heapRelation, + const char *indexRelationName, + Oid indexRelationId, + Oid parentIndexRelid, + Oid parentConstraintId, + Oid relFileNode, + IndexInfo *indexInfo, + List *indexColNames, + Oid accessMethodObjectId, + Oid tableSpaceId, + Oid *collationObjectId, + Oid *classObjectId, + int16 *coloptions, + Datum reloptions, + bits16 flags, + bits16 constr_flags, + bool allow_system_table_mods, + bool is_internal, + Oid *constraintId); +extern PGDLLIMPORT index_create_hook_type index_create_hook; + extern Oid index_create(Relation heapRelation, const char *indexRelationName, Oid indexRelationId, @@ -91,6 +113,27 @@ extern Oid index_create(Relation heapRelation, bool allow_system_table_mods, bool is_internal, Oid 
*constraintId); +extern Oid index_create_internal(Relation heapRelation, + const char *indexRelationName, + Oid indexRelationId, + Oid parentIndexRelid, + Oid parentConstraintId, + Oid relFileNode, + IndexInfo *indexInfo, + List *indexColNames, + Oid accessMethodObjectId, + Oid tableSpaceId, + Oid *collationObjectId, + Oid *classObjectId, + int16 *coloptions, + Datum reloptions, + bits16 flags, + bits16 constr_flags, + bool allow_system_table_mods, + bool is_internal, + Oid *constraintId); + +extern PGDLLIMPORT ambuild_function index_build_hook; #define INDEX_CONSTR_CREATE_MARK_AS_PRIMARY (1 << 0) #define INDEX_CONSTR_CREATE_DEFERRABLE (1 << 1) diff --git a/src/include/catalog/storage.h b/src/include/catalog/storage.h index e0128900d6c..ca71c8ded7d 100644 --- a/src/include/catalog/storage.h +++ b/src/include/catalog/storage.h @@ -24,7 +24,8 @@ extern int wal_skip_threshold; extern SMgrRelation RelationCreateStorage(RelFileNode rnode, char relpersistence, - SMgrImpl smgr_which); + SMgrImpl smgr_which, + Relation rel); extern void RelationDropStorage(Relation rel); extern void RelationPreserveStorage(RelFileNode rnode, bool atCommit); extern void RelationPreTruncate(Relation rel); diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 2fe29dd3067..62215c3a44d 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -613,4 +613,5 @@ extern void GpRecoveryFromError(void); dispatch_nest_level = 0; \ } while(0) + #endif /* MISCADMIN_H */ diff --git a/src/include/postmaster/bgworker_internals.h b/src/include/postmaster/bgworker_internals.h index fc7706314be..6f3edb3b160 100644 --- a/src/include/postmaster/bgworker_internals.h +++ b/src/include/postmaster/bgworker_internals.h @@ -42,6 +42,11 @@ typedef struct RegisteredBgWorker slist_node rw_lnode; /* list link */ } RegisteredBgWorker; +/* Hook for plugins to start background workers */ +typedef bool (*start_bgworker_func) (RegisteredBgWorker *rw); +typedef void (*start_bgworkers_hook_type) (bool 
FatalError, int pmState, start_bgworker_func startBgworkerFunc); +extern PGDLLIMPORT start_bgworkers_hook_type start_bgworkers_hook; + extern slist_head BackgroundWorkerList; extern Size BackgroundWorkerShmemSize(void); diff --git a/src/include/replication/syncrep.h b/src/include/replication/syncrep.h index 0e36a86b16c..f88796389ab 100644 --- a/src/include/replication/syncrep.h +++ b/src/include/replication/syncrep.h @@ -119,5 +119,4 @@ extern int syncrep_yylex(void); extern void syncrep_yyerror(const char *str); extern void syncrep_scanner_init(const char *query_string); extern void syncrep_scanner_finish(void); - #endif /* _SYNCREP_H */ diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 51a7e503b2c..cfedfa85080 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -343,7 +343,7 @@ extern PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum); extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, - BlockNumber blockNum, bool *foundPtr); + BlockNumber blockNum, bool *foundPtr, Buffer non_evited_buffer); extern void MarkLocalBufferDirty(Buffer buffer); extern void DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum, BlockNumber firstDelBlock); diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index a3790f76ee6..7b1f98b933d 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -18,6 +18,7 @@ #include "storage/buf.h" #include "storage/bufpage.h" #include "storage/relfilenode.h" +#include "storage/smgr.h" #include "utils/relcache.h" #include "utils/snapmgr.h" @@ -168,6 +169,11 @@ extern PGDLLIMPORT int32 *LocalRefCount; */ #define BufferGetPage(buffer) ((Page)BufferGetBlock(buffer)) +typedef Buffer (*ReadBuffer_hook_type)(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, + BlockNumber blockNum, ReadBufferMode mode, + BufferAccessStrategy strategy, 
bool *hit); +extern PGDLLIMPORT ReadBuffer_hook_type ReadBuffer_hook; + /* * prototypes for functions in bufmgr.c */ diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index c0b8f4236c5..f49b3b0e49f 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -20,6 +20,7 @@ #include "storage/block.h" #include "storage/relfilenode.h" #include "storage/dbdirnode.h" +#include "utils/relcache.h" typedef enum SMgrImplementation { @@ -27,6 +28,8 @@ typedef enum SMgrImplementation SMGR_AO = 1 } SMgrImpl; +struct f_smgr; + /* * smgr.c maintains a table of SMgrRelation objects, which are essentially * cached file handles. An SMgrRelation is created (if not already present) @@ -64,6 +67,11 @@ typedef struct SMgrRelationData /* additional public fields may someday exist here */ + /* copy of pg_class.relpersistence, or 0 if not known */ + char smgr_relpersistence; + /* pointer to storage manager */ + const struct f_smgr *smgr; + /* * Fields below here are intended to be private to smgr.c and its * submodules. Do not touch them from elsewhere. @@ -86,9 +94,49 @@ typedef SMgrRelationData *SMgrRelation; #define SmgrIsTemp(smgr) \ RelFileNodeBackendIsTemp((smgr)->smgr_rnode) +/* + * Redefinition of storage manager here to make it accessible by other plugins(Union Store), + * and we can introduce more storage managers by smgr_hook. 
+ */ +typedef struct f_smgr +{ + void (*smgr_init) (void); /* may be NULL */ + void (*smgr_shutdown) (void); /* may be NULL */ + void (*smgr_open) (SMgrRelation reln); + void (*smgr_close) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_create) (SMgrRelation reln, ForkNumber forknum, + bool isRedo); + bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_unlink) (RelFileNodeBackend rnode, ForkNumber forknum, + bool isRedo); + void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); + bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); + void (*smgr_read) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer); + void (*smgr_write) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); + void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); + BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); + void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); +} f_smgr; + +typedef void (*smgr_init_hook_type) (void); +typedef void (*smgr_hook_type) (SMgrRelation reln, BackendId backend, SMgrImpl which, Relation rel); +typedef void (*smgr_shutdown_hook_type) (void); +extern PGDLLIMPORT smgr_init_hook_type smgr_init_hook; +extern PGDLLIMPORT smgr_hook_type smgr_hook; +extern PGDLLIMPORT smgr_shutdown_hook_type smgr_shutdown_hook; + +extern bool smgr_is_heap_relation(SMgrRelation reln); + extern void smgrinit(void); extern SMgrRelation smgropen(RelFileNode rnode, BackendId backend, - SMgrImpl smgr_which); + SMgrImpl smgr_which, Relation rel); extern bool smgrexists(SMgrRelation reln, ForkNumber forknum); extern void smgrsetowner(SMgrRelation *owner, SMgrRelation reln); extern void smgrclearowner(SMgrRelation *owner, SMgrRelation 
reln); diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index 6c3757dab00..6ae800016a0 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -34,7 +34,6 @@ #include "catalog/pg_proc.h" - /* * LockRelId and LockInfo really belong to lmgr.h, but it's more convenient * to declare them here so we can have a LockInfoData field in a Relation. @@ -258,6 +257,9 @@ typedef struct RelationData struct PgStat_TableStatus *pgstat_info; /* statistics collection area */ } RelationData; +/* Hook for plugins to calculate relation size */ +typedef int64 (*relation_size_hook_type) (Relation rel, ForkNumber forknum); +extern PGDLLIMPORT relation_size_hook_type relation_size_hook; /* * ForeignKeyCacheInfo @@ -606,7 +608,7 @@ typedef struct ViewOptions smgrsetowner(&((relation)->rd_smgr), \ smgropen((relation)->rd_node, \ (relation)->rd_backend, \ - RelationIsAppendOptimized(relation)?SMGR_AO:SMGR_MD)); \ + RelationIsAppendOptimized(relation)?SMGR_AO:SMGR_MD, relation)); \ } while (0) /* diff --git a/src/test/heap_checksum/expected/heap_checksum_corruption.out b/src/test/heap_checksum/expected/heap_checksum_corruption.out index 176440af4ff..a3dc7bae6a7 100644 --- a/src/test/heap_checksum/expected/heap_checksum_corruption.out +++ b/src/test/heap_checksum/expected/heap_checksum_corruption.out @@ -87,7 +87,7 @@ $$ DECLARE tablespace Oid; database Oid; -relfile Oid; +relfile int8; result bool; BEGIN SELECT dattablespace, oid INTO tablespace, database FROM pg_database WHERE datname = current_database(); diff --git a/src/test/heap_checksum/input/setup.source b/src/test/heap_checksum/input/setup.source index c7bc929b9b8..b71ff1997b4 100644 --- a/src/test/heap_checksum/input/setup.source +++ b/src/test/heap_checksum/input/setup.source @@ -1,2 +1,2 @@ -CREATE OR REPLACE FUNCTION invalidate_buffers(Oid, Oid, Oid) RETURNS BOOL AS '@abs_builddir@/heap_checksum_helper@DLSUFFIX@', 'invalidate_buffers' -LANGUAGE C VOLATILE STRICT NO SQL; +CREATE OR REPLACE FUNCTION 
invalidate_buffers(Oid, Oid, int8) RETURNS BOOL AS '@abs_builddir@/heap_checksum_helper@DLSUFFIX@', 'invalidate_buffers' +LANGUAGE C VOLATILE STRICT NO SQL; \ No newline at end of file diff --git a/src/test/heap_checksum/output/setup.source b/src/test/heap_checksum/output/setup.source index c7bc929b9b8..011e6d42553 100644 --- a/src/test/heap_checksum/output/setup.source +++ b/src/test/heap_checksum/output/setup.source @@ -1,2 +1,2 @@ -CREATE OR REPLACE FUNCTION invalidate_buffers(Oid, Oid, Oid) RETURNS BOOL AS '@abs_builddir@/heap_checksum_helper@DLSUFFIX@', 'invalidate_buffers' +CREATE OR REPLACE FUNCTION invalidate_buffers(Oid, Oid, int8) RETURNS BOOL AS '@abs_builddir@/heap_checksum_helper@DLSUFFIX@', 'invalidate_buffers' LANGUAGE C VOLATILE STRICT NO SQL; diff --git a/src/test/heap_checksum/sql/heap_checksum_corruption.sql b/src/test/heap_checksum/sql/heap_checksum_corruption.sql index 7ad1aca25d0..7defc9a6150 100644 --- a/src/test/heap_checksum/sql/heap_checksum_corruption.sql +++ b/src/test/heap_checksum/sql/heap_checksum_corruption.sql @@ -98,7 +98,7 @@ $$ DECLARE tablespace Oid; database Oid; -relfile Oid; +relfile int8; result bool; BEGIN SELECT dattablespace, oid INTO tablespace, database FROM pg_database WHERE datname = current_database(); From 057f65e6f340fd71c4b30b5263c4395741da9fca Mon Sep 17 00:00:00 2001 From: leo Date: Fri, 7 Jul 2023 11:21:41 +0800 Subject: [PATCH 2/9] Fix compile error of assertion macro nested to if else condition This was introduced by 0798f4a687a, use brace for body of if else condition nested to assertion macro to avoid compile error. 
--- src/backend/catalog/index.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 493a07a617c..ba6411db0fe 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -3177,10 +3177,14 @@ index_build(Relation heapRelation, * Call the access method's build procedure */ if (index_build_hook) - stats = index_build_hook(heapRelation, indexRelation, indexInfo); + { + stats = index_build_hook(heapRelation, indexRelation, indexInfo); + } else - stats = indexRelation->rd_indam->ambuild(heapRelation, indexRelation, - indexInfo); + { + stats = indexRelation->rd_indam->ambuild(heapRelation, indexRelation, + indexInfo); + } Assert(PointerIsValid(stats)); /* From aaddb3b620737bb3f567fbb7595bd82d46e1a01f Mon Sep 17 00:00:00 2001 From: Zhang Mingli Date: Mon, 10 Jul 2023 20:09:35 +0800 Subject: [PATCH 3/9] Push locus down of same slice after eliding motion node. CBDB tries to elide the Motion node when creating a plan if the motion goes from a SingleQE locus to Entry. But that only adjusts the top node's locus. The locus does not affect how the plan is executed. However, when EXPLAIN shows the locus, the output is confusing, because the locus of a plan node is supposed to show where it executes. For a Motion node, its locus is the target locus of the motion. explain(costs off, locus) select distinct min(c1), max(c1) from t1; QUERY PLAN ------------------------------------------------------------ Unique Locus: Entry Group Key: (min(c1)), (max(c1)) -> Sort Locus: SingleQE Sort Key: (min(c1)), (max(c1)) -> Aggregate Locus: SingleQE -> Gather Motion 3:1 (slice1; segments: 3) Locus: SingleQE -> Seq Scan on t1 Locus: Hashed The real plan executes Unique, Sort and Aggregate on the QD after the Gather Motion. So the locus of every node down to the Gather Motion should be Entry. This commit pushes the MotionPath's locus down onto descendant subpaths of the same slice. 
explain(costs off, locus) select distinct min(f1), max(f1) from t1; QUERY PLAN ------------------------------------------------------------ Unique Locus: Entry Group Key: (min(f1)), (max(f1)) -> Sort Locus: Entry Sort Key: (min(f1)), (max(f1)) -> Aggregate Locus: Entry -> Gather Motion 3:1 (slice1; segments: 3) Locus: Entry -> Seq Scan on t1 Locus: Hashed Authored-by: Zhang Mingli avamingli@gmail.com --- src/backend/optimizer/plan/createplan.c | 48 +++++- src/test/regress/expected/gp_parallel.out | 177 ++++++++++++++++++++-- src/test/regress/sql/gp_parallel.sql | 49 +++++- 3 files changed, 255 insertions(+), 19 deletions(-) diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index b26ab32a54e..974577c0d5b 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -356,6 +356,7 @@ static Motion *cdbpathtoplan_create_motion_plan(PlannerInfo *root, static void append_initplan_for_function_scan(PlannerInfo *root, Path *best_path, Plan *plan); static bool contain_motion(PlannerInfo *root, Node *node); static bool contain_motion_walk(Node *node, contain_motion_walk_context *ctx); +static void push_locus_down_after_elide_motion(Plan* pplan); /* * create_plan @@ -3284,11 +3285,10 @@ create_motion_plan(PlannerInfo *root, CdbMotionPath *path) if (CdbPathLocus_IsEntry(path->path.locus) && CdbPathLocus_IsSingleQE(subpath->locus)) { - /* Push the MotionPath's locus down onto subpath. */ - subpath->locus = path->path.locus; - subplan = create_plan_recurse(root, subpath, CP_EXACT_TLIST); + push_locus_down_after_elide_motion(subplan); + return subplan; } @@ -9009,3 +9009,45 @@ contain_motion_walk(Node *node, contain_motion_walk_context *ctx) return plan_tree_walker((Node *) node, contain_motion_walk, ctx, true); } + +/* + * Push locus down onto descendant subpaths of the same slice + * after eliding Motion. + * The plan's parent must be Entry locus at the first call. 
+ */ +static void +push_locus_down_after_elide_motion(Plan* plan) +{ + while(plan && (CdbLocusType_SingleQE == plan->locustype)) + { + plan->locustype = CdbLocusType_Entry; + switch (nodeTag(plan)) + { + case T_Motion: + /* Push down within the same slice. */ + return; + case T_Append: + { + List* subplans = NIL; + ListCell* cell; + subplans = ((Append*)(plan))->appendplans; + foreach(cell, subplans) + { + push_locus_down_after_elide_motion(lfirst(cell)); + } + break; + } + case T_SubqueryScan: + /* We haven't elided Subquery yet. */ + plan = ((SubqueryScan *)(plan))->subplan; + break; + case T_NestLoop: + case T_MergeJoin: + case T_HashJoin: + push_locus_down_after_elide_motion(plan->righttree); + /* FALLTHROUGH */ + default: + plan = plan->lefttree; + } + } +} \ No newline at end of file diff --git a/src/test/regress/expected/gp_parallel.out b/src/test/regress/expected/gp_parallel.out index f23b6f03eb2..306c1119bb3 100644 --- a/src/test/regress/expected/gp_parallel.out +++ b/src/test/regress/expected/gp_parallel.out @@ -110,7 +110,7 @@ explain(locus, costs off) select count(*) from ao1, ao2 where ao1.x = ao2.x; Finalize Aggregate Locus: Entry -> Gather Motion 12:1 (slice1; segments: 12) - Locus: SingleQE + Locus: Entry -> Partial Aggregate Locus: HashedWorkers Parallel Workers: 4 @@ -1328,15 +1328,15 @@ explain (locus, costs off) select a from rt1 union all select count(*) as a from Append Locus: Entry -> Gather Motion 1:1 (slice1; segments: 1) - Locus: SingleQE + Locus: Entry -> Subquery Scan on "*SELECT* 1" Locus: SegmentGeneral -> Seq Scan on rt1 Locus: SegmentGeneral -> Finalize Aggregate - Locus: SingleQE + Locus: Entry -> Gather Motion 3:1 (slice2; segments: 3) - Locus: SingleQE + Locus: Entry -> Partial Aggregate Locus: Hashed -> Seq Scan on sq1 @@ -1391,18 +1391,26 @@ insert into t1 select i, i+1 from generate_series(1, 100000) i; analyze t1; set local optimizer = off; set local enable_parallel = on; -explain(costs off) select * from t1 order by c2 asc 
limit 3 offset 5; +explain(costs off, locus) select * from t1 order by c2 asc limit 3 offset 5; QUERY PLAN ------------------------------------------------- Limit + Locus: Entry -> Gather Motion 6:1 (slice1; segments: 6) + Locus: Entry Merge Key: c2 -> Limit + Locus: HashedWorkers + Parallel Workers: 2 -> Sort + Locus: HashedWorkers + Parallel Workers: 2 Sort Key: c2 -> Parallel Seq Scan on t1 + Locus: HashedWorkers + Parallel Workers: 2 Optimizer: Postgres query optimizer -(8 rows) +(16 rows) select * from t1 order by c2 asc limit 3 offset 5; c1 | c2 @@ -1414,18 +1422,23 @@ select * from t1 order by c2 asc limit 3 offset 5; -- non-parallel results set local enable_parallel = off; -explain(costs off) select * from t1 order by c2 asc limit 3 offset 5; +explain(costs off, locus) select * from t1 order by c2 asc limit 3 offset 5; QUERY PLAN ------------------------------------------------ Limit + Locus: Entry -> Gather Motion 3:1 (slice1; segments: 3) + Locus: Entry Merge Key: c2 -> Limit + Locus: Hashed -> Sort + Locus: Hashed Sort Key: c2 -> Seq Scan on t1 + Locus: Hashed Optimizer: Postgres query optimizer -(8 rows) +(13 rows) select * from t1 order by c2 asc limit 3 offset 5; c1 | c2 @@ -1446,17 +1459,23 @@ analyze t1; set local optimizer = off; set local gp_enable_multiphase_limit = off; set local enable_parallel = on; -explain(costs off) select * from t1 order by c2 asc limit 3 offset 5; +explain(costs off, locus) select * from t1 order by c2 asc limit 3 offset 5; QUERY PLAN ------------------------------------------------ Limit + Locus: Entry -> Gather Motion 6:1 (slice1; segments: 6) + Locus: Entry Merge Key: c2 -> Sort + Locus: HashedWorkers + Parallel Workers: 2 Sort Key: c2 -> Parallel Seq Scan on t1 + Locus: HashedWorkers + Parallel Workers: 2 Optimizer: Postgres query optimizer -(7 rows) +(13 rows) select * from t1 order by c2 asc limit 3 offset 5; c1 | c2 @@ -1468,17 +1487,21 @@ select * from t1 order by c2 asc limit 3 offset 5; -- non-parallel results 
set local enable_parallel = off; -explain(costs off) select * from t1 order by c2 asc limit 3 offset 5; +explain(costs off, locus) select * from t1 order by c2 asc limit 3 offset 5; QUERY PLAN ------------------------------------------------ Limit + Locus: Entry -> Gather Motion 3:1 (slice1; segments: 3) + Locus: Entry Merge Key: c2 -> Sort + Locus: Hashed Sort Key: c2 -> Seq Scan on t1 + Locus: Hashed Optimizer: Postgres query optimizer -(7 rows) +(11 rows) select * from t1 order by c2 asc limit 3 offset 5; c1 | c2 @@ -1542,6 +1565,136 @@ select count(*) from aocs; alter table aocs reset (parallel_workers); abort; +-- +-- Test locus after eliding mtion node. +-- +begin; +create table t1(c1 int) distributed by (c1); +insert into t1 values(11), (12); +analyze t1; +explain(costs off, locus) select distinct min(c1), max(c1) from t1; + QUERY PLAN +------------------------------------------------------------ + Unique + Locus: Entry + Group Key: (min(c1)), (max(c1)) + -> Sort + Locus: Entry + Sort Key: (min(c1)), (max(c1)) + -> Aggregate + Locus: Entry + -> Gather Motion 3:1 (slice1; segments: 3) + Locus: Entry + -> Seq Scan on t1 + Locus: Hashed + Optimizer: Postgres query optimizer +(13 rows) + +abort; +begin; +create table t1(id int) distributed by (id); +create index on t1(id); +insert into t1 values(generate_series(1, 100)); +analyze t1; +set enable_seqscan =off; +explain (locus, costs off) +select * from + (select count(id) from t1 where id > 10) ss + right join (values (1),(2),(3)) v(x) on true; + QUERY PLAN +--------------------------------------------------------------------- + Nested Loop Left Join + Locus: Entry + -> Values Scan on "*VALUES*" + Locus: General + -> Materialize + Locus: Entry + -> Finalize Aggregate + Locus: Entry + -> Gather Motion 3:1 (slice1; segments: 3) + Locus: Entry + -> Partial Aggregate + Locus: Hashed + -> Index Only Scan using t1_id_idx on t1 + Locus: Hashed + Index Cond: (id > 10) + Optimizer: Postgres query optimizer +(16 rows) + 
+abort; +begin; +create table pagg_tab (a int, b int, c text, d int) partition by list(c); +create table pagg_tab_p1 partition of pagg_tab for values in ('0000', '0001', '0002', '0003', '0004'); +create table pagg_tab_p2 partition of pagg_tab for values in ('0005', '0006', '0007', '0008'); +create table pagg_tab_p3 partition of pagg_tab for values in ('0009', '0010', '0011'); +insert into pagg_tab select i % 20, i % 30, to_char(i % 12, 'FM0000'), i % 30 from generate_series(0, 2999) i; +analyze pagg_tab; +set local enable_partitionwise_aggregate to true; +set local enable_partitionwise_join to true; +set local enable_incremental_sort to off; +set local enable_hashagg to false; +set local enable_parallel = off; +explain (costs off, locus) +select c, sum(a), avg(b), count(*) from pagg_tab group by 1 having avg(d) < 15 order by 1, 2, 3; + QUERY PLAN +------------------------------------------------------------------------ + Sort + Locus: Entry + Sort Key: pagg_tab.c, (sum(pagg_tab.a)), (avg(pagg_tab.b)) + -> Append + Locus: Entry + -> Finalize GroupAggregate + Locus: Entry + Group Key: pagg_tab.c + Filter: (avg(pagg_tab.d) < '15'::numeric) + -> Gather Motion 3:1 (slice1; segments: 3) + Locus: Entry + Merge Key: pagg_tab.c + -> Partial GroupAggregate + Locus: Hashed + Group Key: pagg_tab.c + -> Sort + Locus: Hashed + Sort Key: pagg_tab.c + -> Seq Scan on pagg_tab_p1 pagg_tab + Locus: Hashed + -> Finalize GroupAggregate + Locus: Entry + Group Key: pagg_tab_1.c + Filter: (avg(pagg_tab_1.d) < '15'::numeric) + -> Gather Motion 3:1 (slice2; segments: 3) + Locus: Entry + Merge Key: pagg_tab_1.c + -> Partial GroupAggregate + Locus: Hashed + Group Key: pagg_tab_1.c + -> Sort + Locus: Hashed + Sort Key: pagg_tab_1.c + -> Seq Scan on pagg_tab_p2 pagg_tab_1 + Locus: Hashed + -> Finalize GroupAggregate + Locus: Entry + Group Key: pagg_tab_2.c + Filter: (avg(pagg_tab_2.d) < '15'::numeric) + -> Gather Motion 3:1 (slice3; segments: 3) + Locus: Entry + Merge Key: pagg_tab_2.c + -> 
Partial GroupAggregate + Locus: Hashed + Group Key: pagg_tab_2.c + -> Sort + Locus: Hashed + Sort Key: pagg_tab_2.c + -> Seq Scan on pagg_tab_p3 pagg_tab_2 + Locus: Hashed + Optimizer: Postgres query optimizer +(51 rows) + +abort; +-- +-- End of Test locus after eliding mtion node. +-- -- start_ignore drop schema test_parallel cascade; -- end_ignore diff --git a/src/test/regress/sql/gp_parallel.sql b/src/test/regress/sql/gp_parallel.sql index cade5a7a8df..6b68b31ebae 100644 --- a/src/test/regress/sql/gp_parallel.sql +++ b/src/test/regress/sql/gp_parallel.sql @@ -416,11 +416,11 @@ insert into t1 select i, i+1 from generate_series(1, 100000) i; analyze t1; set local optimizer = off; set local enable_parallel = on; -explain(costs off) select * from t1 order by c2 asc limit 3 offset 5; +explain(costs off, locus) select * from t1 order by c2 asc limit 3 offset 5; select * from t1 order by c2 asc limit 3 offset 5; -- non-parallel results set local enable_parallel = off; -explain(costs off) select * from t1 order by c2 asc limit 3 offset 5; +explain(costs off, locus) select * from t1 order by c2 asc limit 3 offset 5; select * from t1 order by c2 asc limit 3 offset 5; abort; @@ -434,11 +434,11 @@ analyze t1; set local optimizer = off; set local gp_enable_multiphase_limit = off; set local enable_parallel = on; -explain(costs off) select * from t1 order by c2 asc limit 3 offset 5; +explain(costs off, locus) select * from t1 order by c2 asc limit 3 offset 5; select * from t1 order by c2 asc limit 3 offset 5; -- non-parallel results set local enable_parallel = off; -explain(costs off) select * from t1 order by c2 asc limit 3 offset 5; +explain(costs off, locus) select * from t1 order by c2 asc limit 3 offset 5; select * from t1 order by c2 asc limit 3 offset 5; abort; -- @@ -463,6 +463,47 @@ select count(*) from aocs; alter table aocs reset (parallel_workers); abort; +-- +-- Test locus after eliding mtion node. 
+-- +begin; +create table t1(c1 int) distributed by (c1); +insert into t1 values(11), (12); +analyze t1; +explain(costs off, locus) select distinct min(c1), max(c1) from t1; +abort; + +begin; +create table t1(id int) distributed by (id); +create index on t1(id); +insert into t1 values(generate_series(1, 100)); +analyze t1; +set enable_seqscan =off; +explain (locus, costs off) +select * from + (select count(id) from t1 where id > 10) ss + right join (values (1),(2),(3)) v(x) on true; +abort; + +begin; +create table pagg_tab (a int, b int, c text, d int) partition by list(c); +create table pagg_tab_p1 partition of pagg_tab for values in ('0000', '0001', '0002', '0003', '0004'); +create table pagg_tab_p2 partition of pagg_tab for values in ('0005', '0006', '0007', '0008'); +create table pagg_tab_p3 partition of pagg_tab for values in ('0009', '0010', '0011'); +insert into pagg_tab select i % 20, i % 30, to_char(i % 12, 'FM0000'), i % 30 from generate_series(0, 2999) i; +analyze pagg_tab; +set local enable_partitionwise_aggregate to true; +set local enable_partitionwise_join to true; +set local enable_incremental_sort to off; +set local enable_hashagg to false; +set local enable_parallel = off; +explain (costs off, locus) +select c, sum(a), avg(b), count(*) from pagg_tab group by 1 having avg(d) < 15 order by 1, 2, 3; +abort; +-- +-- End of Test locus after eliding mtion node. +-- + -- start_ignore drop schema test_parallel cascade; -- end_ignore From 0154be1ebd4a73193c010b5bf2e5d69f4a54c1e6 Mon Sep 17 00:00:00 2001 From: zhaoxi Date: Fri, 7 Jul 2023 10:56:14 +0800 Subject: [PATCH 4/9] Fix warning for AOCS when targetlist or qual is null There would be a memory detected write warning if targetlist or qual is null for AOCS table. Enlarge columns array size to natts + 1 to avoid that as heap tables. 
A SQL statement like the one below triggers it: SET default_table_access_method=ao_column; create temp table nocolumns(); select exists(select * from nocolumns); --- src/backend/access/aocs/aocsam_handler.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/access/aocs/aocsam_handler.c b/src/backend/access/aocs/aocsam_handler.c index 20119ef0841..5e20b685d20 100644 --- a/src/backend/access/aocs/aocsam_handler.c +++ b/src/backend/access/aocs/aocsam_handler.c @@ -469,7 +469,7 @@ aoco_beginscan_extractcolumns(Relation rel, Snapshot snapshot, ParallelTableScan bool *cols; bool found = false; - cols = palloc0(natts * sizeof(*cols)); + cols = palloc0((natts + 1) * sizeof(*cols)); found |= extractcolumns_from_node((Node *)targetlist, cols, natts); found |= extractcolumns_from_node((Node *)qual, cols, natts); From afe937af1151a19ca31d4e5f59eac54c7be27d88 Mon Sep 17 00:00:00 2001 From: yangjianghua Date: Thu, 13 Jul 2023 10:47:09 +0800 Subject: [PATCH 5/9] BugFix: initialize columnScanInfo.relationTupleDesc for sample scan During a sample scan on an AOCS table, the relationTupleDesc in aocs_rescan is not set. That causes a failure of the assertion that scan->columnScanInfo.relationTupleDesc is not null. This commit sets that relationTupleDesc if it is null. 
--- src/backend/access/aocs/aocsam.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/backend/access/aocs/aocsam.c b/src/backend/access/aocs/aocsam.c index 124ce44b4d3..870cfbe4b9e 100644 --- a/src/backend/access/aocs/aocsam.c +++ b/src/backend/access/aocs/aocsam.c @@ -635,6 +635,11 @@ void aocs_rescan(AOCSScanDesc scan) { close_cur_scan_seg(scan); + if (scan->columnScanInfo.relationTupleDesc == NULL) + { + scan->columnScanInfo.relationTupleDesc = RelationGetDescr(scan->rs_base.rs_rd); + PinTupleDesc(scan->columnScanInfo.relationTupleDesc); + } if (scan->columnScanInfo.ds) close_ds_read(scan->columnScanInfo.ds, scan->columnScanInfo.relationTupleDesc->natts); initscan_with_colinfo(scan); From 73a9b9ac33afd4a8d3376ccff64936d955952ec4 Mon Sep 17 00:00:00 2001 From: Zhang Mingli Date: Thu, 13 Jul 2023 10:52:14 +0800 Subject: [PATCH 6/9] Ignore temp files generated by regression test. Some temp files are not removed after the regression tests run. Untracked files: (use "git add ..." to include in what will be committed) src/test/regress/data/minirepro_q.sql src/test/regress/data/part_ext.tbl src/test/regress/expected/external_table_union_all.out src/test/regress/expected/external_table_union_all_optimizer.out src/test/regress/sql/external_table_union_all.sql Clean them all up to avoid git diffs after the regression tests have passed. 
Authored-by: Zhang Mingli avamingli@gmail.com --- src/test/regress/data/.gitignore | 1 + src/test/regress/expected/.gitignore | 2 ++ src/test/regress/expected/minirepro.out | 1 + src/test/regress/sql/.gitignore | 1 + src/test/regress/sql/minirepro.sql | 2 +- 5 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/test/regress/data/.gitignore b/src/test/regress/data/.gitignore index 82b0e51ecda..701071c0e0b 100644 --- a/src/test/regress/data/.gitignore +++ b/src/test/regress/data/.gitignore @@ -1 +1,2 @@ wet_region.out +part_ext.tbl diff --git a/src/test/regress/expected/.gitignore b/src/test/regress/expected/.gitignore index 119808b57ff..00baf22e72a 100644 --- a/src/test/regress/expected/.gitignore +++ b/src/test/regress/expected/.gitignore @@ -57,3 +57,5 @@ /trigger_sets_oid.out /workfile_mgr_test.out /external_table_persistent_error_log.out +/external_table_union_all.out +/external_table_union_all_optimizer.out diff --git a/src/test/regress/expected/minirepro.out b/src/test/regress/expected/minirepro.out index ac16b68b8e9..32fb6aab6da 100644 --- a/src/test/regress/expected/minirepro.out +++ b/src/test/regress/expected/minirepro.out @@ -340,3 +340,4 @@ from pg_statistic where starelid='pg_tablespace'::regclass; 5 | f | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | | | | | | | | | | (5 rows) +\! 
rm data/minirepro_q.sql diff --git a/src/test/regress/sql/.gitignore b/src/test/regress/sql/.gitignore index 5b39b7c04ce..f485f8dc2d8 100644 --- a/src/test/regress/sql/.gitignore +++ b/src/test/regress/sql/.gitignore @@ -56,3 +56,4 @@ /upgrade.sql /workfile_mgr_test.sql /external_table_persistent_error_log.sql +/external_table_union_all.sql diff --git a/src/test/regress/sql/minirepro.sql b/src/test/regress/sql/minirepro.sql index 4ad98309353..c41c0299d01 100644 --- a/src/test/regress/sql/minirepro.sql +++ b/src/test/regress/sql/minirepro.sql @@ -197,4 +197,4 @@ select stavalues5 from pg_statistic where starelid='pg_tablespace'::regclass; - +\! rm data/minirepro_q.sql From 29a8773e4079cb02351a347ded8d09c0c1307627 Mon Sep 17 00:00:00 2001 From: zhoujiaqi Date: Thu, 13 Jul 2023 19:43:10 +0800 Subject: [PATCH 7/9] Feature: refactor interconnect module Split interconnect code as a module into contrib/interconnect A new interconnect interface in ml_ipc.h The new interconnect interface is decoupled from the motion layer The new interconnect interface makes it easier to add an interconnect implementation * ic_proxy_bgworker.c/ic_proxy_bgworker.h still remain in cbdb --- GNUmakefile.in | 2 + configure | 41 + configure.ac | 11 + contrib/Makefile | 1 + contrib/README | 4 + contrib/interconnect/.gitignore | 1 + contrib/interconnect/Makefile | 45 + contrib/interconnect/Makefile.interconnect.in | 5 + contrib/interconnect/README.md | 204 +++ contrib/interconnect/ic_common.c | 547 ++++++++ contrib/interconnect/ic_common.h | 143 ++ contrib/interconnect/ic_internal.h | 317 +++++ contrib/interconnect/ic_modules.c | 162 +++ contrib/interconnect/ic_modules.h | 24 + .../interconnect/proxy}/README.ic-proxy.md | 0 .../interconnect/proxy}/ic_proxy.h | 2 +- .../interconnect/proxy}/ic_proxy_addr.c | 0 .../interconnect/proxy}/ic_proxy_addr.h | 0 .../interconnect/proxy}/ic_proxy_backend.c | 10 +- .../interconnect/proxy}/ic_proxy_backend.h | 2 +- .../interconnect/proxy}/ic_proxy_client.c | 0 
.../interconnect/proxy}/ic_proxy_iobuf.c | 0 .../interconnect/proxy}/ic_proxy_iobuf.h | 0 .../interconnect/proxy}/ic_proxy_key.c | 0 .../interconnect/proxy}/ic_proxy_key.h | 0 .../interconnect/proxy}/ic_proxy_main.c | 0 .../interconnect/proxy}/ic_proxy_packet.c | 0 .../interconnect/proxy}/ic_proxy_packet.h | 0 .../interconnect/proxy}/ic_proxy_peer.c | 0 .../interconnect/proxy}/ic_proxy_pkt_cache.c | 0 .../interconnect/proxy}/ic_proxy_pkt_cache.h | 0 .../interconnect/proxy}/ic_proxy_router.c | 0 .../interconnect/proxy}/ic_proxy_router.h | 0 .../interconnect/proxy}/ic_proxy_server.h | 0 .../interconnect/tcp}/ic_tcp.c | 796 +++++++---- contrib/interconnect/tcp/ic_tcp.h | 92 ++ .../interconnect/udp/ic_faultinjection.h | 11 +- .../interconnect/udp}/ic_udpifc.c | 1210 +++++++++++------ contrib/interconnect/udp/ic_udpifc.h | 214 +++ gpAux/gpdemo/demo_cluster.sh | 1 - src/Makefile | 1 + src/Makefile.global.in | 2 +- src/backend/cdb/cdbutil.c | 19 +- src/backend/cdb/dispatcher/cdbgang.c | 14 +- src/backend/cdb/motion/Makefile | 18 +- src/backend/cdb/motion/cdbmotion.c | 111 +- src/backend/cdb/motion/ic_common.c | 878 ------------ src/backend/cdb/motion/ic_proxy_bgworker.c | 13 +- src/backend/cdb/motion/tupser.c | 4 +- src/backend/executor/execMain.c | 4 +- src/backend/executor/execUtils.c | 6 +- src/backend/executor/nodeSubplan.c | 8 +- src/backend/storage/ipc/ipc.c | 6 +- src/backend/tcop/dest.c | 7 +- src/backend/utils/init/miscinit.c | 6 + src/backend/utils/misc/guc_gp.c | 2 +- src/include/cdb/cdbinterconnect.h | 377 +---- src/include/cdb/cdbmotion.h | 3 +- src/include/cdb/ml_ipc.h | 510 ++++--- src/include/cdb/tupser.h | 5 +- src/include/pg_config.h.in | 3 + src/test/regress/expected/ic.out | 2 +- src/test/regress/expected/ic_1.out | 2 +- src/test/regress/regress_gp.c | 2 +- src/test/regress/sql/ic.sql | 2 +- 65 files changed, 3566 insertions(+), 2284 deletions(-) create mode 100644 contrib/interconnect/.gitignore create mode 100644 contrib/interconnect/Makefile create 
mode 100644 contrib/interconnect/Makefile.interconnect.in create mode 100644 contrib/interconnect/README.md create mode 100644 contrib/interconnect/ic_common.c create mode 100644 contrib/interconnect/ic_common.h create mode 100644 contrib/interconnect/ic_internal.h create mode 100644 contrib/interconnect/ic_modules.c create mode 100644 contrib/interconnect/ic_modules.h rename {src/backend/cdb/motion => contrib/interconnect/proxy}/README.ic-proxy.md (100%) rename {src/backend/cdb/motion => contrib/interconnect/proxy}/ic_proxy.h (98%) rename {src/backend/cdb/motion => contrib/interconnect/proxy}/ic_proxy_addr.c (100%) rename {src/backend/cdb/motion => contrib/interconnect/proxy}/ic_proxy_addr.h (100%) rename {src/backend/cdb/motion => contrib/interconnect/proxy}/ic_proxy_backend.c (98%) rename {src/backend/cdb/motion => contrib/interconnect/proxy}/ic_proxy_backend.h (97%) rename {src/backend/cdb/motion => contrib/interconnect/proxy}/ic_proxy_client.c (100%) rename {src/backend/cdb/motion => contrib/interconnect/proxy}/ic_proxy_iobuf.c (100%) rename {src/backend/cdb/motion => contrib/interconnect/proxy}/ic_proxy_iobuf.h (100%) rename {src/backend/cdb/motion => contrib/interconnect/proxy}/ic_proxy_key.c (100%) rename {src/backend/cdb/motion => contrib/interconnect/proxy}/ic_proxy_key.h (100%) rename {src/backend/cdb/motion => contrib/interconnect/proxy}/ic_proxy_main.c (100%) rename {src/backend/cdb/motion => contrib/interconnect/proxy}/ic_proxy_packet.c (100%) rename {src/backend/cdb/motion => contrib/interconnect/proxy}/ic_proxy_packet.h (100%) rename {src/backend/cdb/motion => contrib/interconnect/proxy}/ic_proxy_peer.c (100%) rename {src/backend/cdb/motion => contrib/interconnect/proxy}/ic_proxy_pkt_cache.c (100%) rename {src/backend/cdb/motion => contrib/interconnect/proxy}/ic_proxy_pkt_cache.h (100%) rename {src/backend/cdb/motion => contrib/interconnect/proxy}/ic_proxy_router.c (100%) rename {src/backend/cdb/motion => 
contrib/interconnect/proxy}/ic_proxy_router.h (100%) rename {src/backend/cdb/motion => contrib/interconnect/proxy}/ic_proxy_server.h (100%) rename {src/backend/cdb/motion => contrib/interconnect/tcp}/ic_tcp.c (79%) create mode 100644 contrib/interconnect/tcp/ic_tcp.h rename src/include/cdb/cdbicudpfaultinjection.h => contrib/interconnect/udp/ic_faultinjection.h (99%) rename {src/backend/cdb/motion => contrib/interconnect/udp}/ic_udpifc.c (85%) create mode 100644 contrib/interconnect/udp/ic_udpifc.h delete mode 100644 src/backend/cdb/motion/ic_common.c diff --git a/GNUmakefile.in b/GNUmakefile.in index 86b58a4557e..d01ebbe76f8 100644 --- a/GNUmakefile.in +++ b/GNUmakefile.in @@ -20,6 +20,7 @@ all: $(MAKE) -C contrib/extprotocol all $(MAKE) -C contrib/dblink all $(MAKE) -C contrib/indexscan all + $(MAKE) -C contrib/interconnect all $(MAKE) -C contrib/pageinspect all # needed by src/test/isolation $(MAKE) -C contrib/hstore all $(MAKE) -C contrib/pgcrypto all @@ -62,6 +63,7 @@ install: $(MAKE) -C contrib/extprotocol $@ $(MAKE) -C contrib/dblink $@ $(MAKE) -C contrib/indexscan $@ + $(MAKE) -C contrib/interconnect $@ $(MAKE) -C contrib/pageinspect $@ # needed by src/test/isolation $(MAKE) -C contrib/hstore $@ $(MAKE) -C contrib/pgcrypto $@ diff --git a/configure b/configure index 6ca6679155d..02f3ca04d69 100755 --- a/configure +++ b/configure @@ -753,6 +753,7 @@ ICU_CFLAGS with_icu enable_thread_safety INCLUDES +enable_preload_ic_module enable_ic_proxy enable_external_fts HAVE_CXX14 @@ -897,6 +898,7 @@ enable_mapreduce enable_gpcloud enable_external_fts enable_ic_proxy +enable_preload_ic_module enable_thread_safety with_icu with_tcl @@ -1607,6 +1609,8 @@ Optional Features: --enable-external-fts enable external fts support --enable-ic-proxy enable interconnect proxy mode (requires libuv library) + --disable-preload-ic-module + disable preload interconnect module --disable-thread-safety disable thread-safety in client libraries --enable-openssl-redirect enable redirect 
openssl interface to internal @@ -8953,6 +8957,40 @@ fi +# +# ic-module preload +# + + +# Check whether --enable-preload-ic-module was given. +if test "${enable_preload_ic_module+set}" = set; then : + enableval=$enable_preload_ic_module; + case $enableval in + yes) + +$as_echo "#define ENABLE_PRELOAD_IC_MODULE 1" >>confdefs.h + + ;; + no) + : + ;; + *) + as_fn_error $? "no argument expected for --enable-preload-ic-module option" "$LINENO" 5 + ;; + esac + +else + enable_preload_ic_module=yes + +$as_echo "#define ENABLE_PRELOAD_IC_MODULE 1" >>confdefs.h + +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: checking whether to build with preload ic module ... $enable_preload_ic_module" >&5 +$as_echo "checking whether to build with preload ic module ... $enable_preload_ic_module" >&6; } + + # # Include directories # @@ -14149,6 +14187,8 @@ else fi fi +ac_config_files="$ac_config_files contrib/interconnect/Makefile.interconnect" + if test "$enable_spinlocks" = yes; then @@ -23593,6 +23633,7 @@ for ac_config_target in $ac_config_targets do case $ac_config_target in "src/backend/port/tas.s") CONFIG_LINKS="$CONFIG_LINKS src/backend/port/tas.s:src/backend/port/tas/${tas_file}" ;; + "contrib/interconnect/Makefile.interconnect") CONFIG_FILES="$CONFIG_FILES contrib/interconnect/Makefile.interconnect" ;; "GNUmakefile") CONFIG_FILES="$CONFIG_FILES GNUmakefile" ;; "src/Makefile.global") CONFIG_FILES="$CONFIG_FILES src/Makefile.global" ;; "src/include/catalog/gp_version_at_initdb.dat") CONFIG_FILES="$CONFIG_FILES src/include/catalog/gp_version_at_initdb.dat" ;; diff --git a/configure.ac b/configure.ac index 9264a7a917a..2899d25d4e0 100644 --- a/configure.ac +++ b/configure.ac @@ -890,6 +890,16 @@ PGAC_ARG_BOOL(enable, ic-proxy, no, [Define to 1 to build with ic-proxy support (--enable-ic-proxy)])]) AC_SUBST(enable_ic_proxy) +# +# ic-module preload +# +PGAC_ARG_BOOL(enable, preload-ic-module, yes, + [disable preload interconnect module], + [AC_DEFINE(ENABLE_PRELOAD_IC_MODULE, 
1, + [Define to 1 to preload with ic module])]) +AC_MSG_RESULT([checking whether to build with preload ic module ... $enable_preload_ic_module]) +AC_SUBST(enable_preload_ic_module) + # # Include directories # @@ -1505,6 +1515,7 @@ if test "$enable_ic_proxy" = yes; then AC_CHECK_LIB(uv, uv_default_loop, [], [AC_MSG_ERROR([libuv library not found, it is required by --enable-ic-proxy.])]) fi +AC_CONFIG_FILES([contrib/interconnect/Makefile.interconnect]) if test "$enable_spinlocks" = yes; then AC_DEFINE(HAVE_SPINLOCKS, 1, [Define to 1 if you have spinlocks.]) diff --git a/contrib/Makefile b/contrib/Makefile index 402304212e7..c73b0ece22a 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -27,6 +27,7 @@ SUBDIRS = \ hstore \ intagg \ intarray \ + interconnect \ isn \ ltree \ oid2name \ diff --git a/contrib/README b/contrib/README index 6d849a8d3fb..bcf6a1d9785 100644 --- a/contrib/README +++ b/contrib/README @@ -99,6 +99,10 @@ intarray - Index support for arrays of int4, using GiST by Teodor Sigaev and Oleg Bartunov +interconnect - + Module for interconnect + Split from src/backend/cdb/motion/ as a independency module + isn - PostgreSQL type extensions for ISBN, ISSN, ISMN, EAN13 product numbers by Germán Méndez Bravo (Kronuz) diff --git a/contrib/interconnect/.gitignore b/contrib/interconnect/.gitignore new file mode 100644 index 00000000000..00d763bd558 --- /dev/null +++ b/contrib/interconnect/.gitignore @@ -0,0 +1 @@ +/Makefile.interconnect \ No newline at end of file diff --git a/contrib/interconnect/Makefile b/contrib/interconnect/Makefile new file mode 100644 index 00000000000..31489fa9148 --- /dev/null +++ b/contrib/interconnect/Makefile @@ -0,0 +1,45 @@ +# contrib/interconnect/Makefile +# Makefile.interconnect will be generate when src/Makefile.global generated +# Used to get $(enable_ic_proxy) +top_builddir = ../.. 
+include $(top_builddir)/contrib/interconnect/Makefile.interconnect + +MODULE_big = interconnect +PGFILEDESC = "interconnect - inter connection module" + +OBJS = \ + $(WIN32RES) \ + ic_common.o \ + tcp/ic_tcp.o \ + udp/ic_udpifc.o \ + ic_modules.o + +ifeq ($(enable_ic_proxy), yes) +# server +OBJS += proxy/ic_proxy_main.o +OBJS += proxy/ic_proxy_client.o +OBJS += proxy/ic_proxy_peer.o +OBJS += proxy/ic_proxy_router.o + +# backend +OBJS += proxy/ic_proxy_backend.o + +# utils +OBJS += proxy/ic_proxy_addr.o +OBJS += proxy/ic_proxy_key.o +OBJS += proxy/ic_proxy_packet.o +OBJS += proxy/ic_proxy_pkt_cache.o +OBJS += proxy/ic_proxy_iobuf.o +SHLIB_LINK += $(filter -luv, $(LIBS)) +endif # enable_ic_proxy + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/interconnect +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif \ No newline at end of file diff --git a/contrib/interconnect/Makefile.interconnect.in b/contrib/interconnect/Makefile.interconnect.in new file mode 100644 index 00000000000..4e54d312817 --- /dev/null +++ b/contrib/interconnect/Makefile.interconnect.in @@ -0,0 +1,5 @@ +# -*-makefile-*- +# contrib/interconnect/Makefile.interconnect.in +# @configure_input@ + +enable_ic_proxy = @enable_ic_proxy@ \ No newline at end of file diff --git a/contrib/interconnect/README.md b/contrib/interconnect/README.md new file mode 100644 index 00000000000..3975cb7b832 --- /dev/null +++ b/contrib/interconnect/README.md @@ -0,0 +1,204 @@ +# Interconnect + +This subtree contains the interconnect module together with its tests and benchmarks, and it differs from the other subtrees inside {cbdb_src}/contrib. Other modules are not part of the core CloudBerry system, but the interconnect module was split from the `cdb module`, so it **must be preloaded with CloudBerry**; otherwise the CloudBerry system will not work properly.
+ +**The interconnect module will be preloaded by default as a library.** When the compile option `--disable-preload-ic-module` is turned on, then the interconnect module will not be preloaded, then users need to add `interconnect` into guc `shared_preload_libraries`. + + +Benefits of Separating Interconnect Separately + +- Independent & decoupling, easier to add new interconnect types +- It is more convenient to **test & debug & benchmark**. + + +# Intercontect interfaces and data structure + +The main interface name is `MotionIPCLayer` defined in `ml_ipc.h`, there are already three implementations: +- tcp +- udpifc +- proxy + +The specific method can refer to the notes. Here is a diagram to describe the specific timing of the interface function being called. + +``` + + + + + + + + ┌─────────────┐ ┌─────────────────────┐ ┌──────────────────────┐ ┌─────────────┐ + │ cdb init │────────▶│ InitMotionLayerIPC │ │ InitMotionLayerIPC │◀─────│ cdb init │ + └─────────────┘ └─────────────────────┘ └──────────────────────┘ └─────────────┘ + │ │ + │ ┌──────────┐ │ + │ │ registe │ │ + │ │request(if│ │ + ▼ │ have) │ ▼ + ┌─────────────┐ ┌────────────────────┐ │ │ ┌───────────────────────┐ ┌─────────────┐ + │ motion exec │────────▶│ SetupInterconnect │◀───┴──────────┴──│ SetupInterconnect │◀────│ motion exec │ + └─────────────┘ └────────────────────┘ └───────────────────────┘ └─────────────┘ + │ │ + │ │ + ▼ ▼ + ┌────────────────────┐ ┌──────────────────────┐ + │SendTupleChunkToAMS │ │RecvTupleChunkFromAny/│ + └────────────────────┘ │ RecvTupleChunkFrom │ + ┌───────────────────────────┐ │ ┌─────────────────┐└──────────────────────┘ + │ GetTransportDirectBuffer │───┐ ┌───────────────┐ │ │ TupleChunkList │ │ ┌───────────────┐ + └───────────────────────────┘ │ │ internal call │──────▶ │ └─────────────────┘ │◀──────│ internal call │ + ▲ │ └───────────────┘ ▼ │ ▼ └───────────────┘ + │ │ ┌────────────────────┐ ▼ ┌──────────────────────┐ ┌──────────────────┐ ┌───────────────────┐ + │ │ 
┌─────────────│ SendChunk │─────────────────▶│ RecvTupleChunk │─────▶│ udp recv buffer │────▶│ DirectPutRxBuffer │ + ┌────────────────────┐ │ ▼ └────────────────────┘ └──────────────────────┘ └──────────────────┘ └───────────────────┘ + │ direct access │ │ ┌──────────────┐ │ ┌────────────┐ │ + │ reduce memcpy │ ├───▶│ buffer Pool │ │ │ stop or │ │ + └────────────────────┘ │ └──────────────┘ ◀─────────────│ interrupt │─────────────▶ + │ │ ▲ │ └────────────┘ │ + │ │ │ │ │ │ + ▼ │ │ ┌────────────────────┐ │ ▼ ┌──────────────────┐ + ┌───────────────────────────┐ │ └──────▲──────│ SendEOS │ │ ┌──────────────────────┐ │motion exec finish│ + │ PutTransportDirectBuffer │───┘ ┌──────────┘ └────────────────────┘ │ │ TeardownInterconnect │◀───────────────│ or interrupt │ + └───────────────────────────┘ │ ▲ │ └──────────────────────┘ └──────────────────┘ + ┌──────────┐ │ │ │ + │ flush │ │ │ │ + └──────────┘ │ │ ▼ + ┌────────────────────┐ │ ┌──────────────────────┐ ┌─────────────┐ + │ SendStopMessage │ │ │CleanUpMotionLayerIPC │◀───────────────│ cdb cleanup │ + └────────────────────┘ │ └──────────────────────┘ └─────────────┘ + │ + │ + ┌──────────────────┐ │ + │motion exec finish│ ┌────────────────────┐ │ + │ or interrupt │───────────────▶│TeardownInterconnect│◀─────────┘ + └──────────────────┘ └────────────────────┘ + │ + ▼ + ┌─────────────┐ ┌─────────────────────┐ + │ cdb cleanup │──────────────────▶│CleanUpMotionLayerIPC│ + └─────────────┘ └─────────────────────┘ +``` + + +Notice: + +- The ones starting with capital letters in the figure are methods in the interface. +- Notes starting with lowercase letters in the figure. + +Interconnect contains three main data structures + +- **ChunkTransportState**: Generated by `EState`, one-to-one correspondence with `EState` object. + - Records most of the global information, such as remote connection information. + - Contains a set of **ChunkTransportStateEntry**. 
+- **ChunkTransportStateEntry**: Generated by `SliceTable`, one-to-one correspondence with `motionId`. + - Different interconnect implementations have different **ChunkTransportStateEntry** objects; obtain the subclass object through `CONTAINER_OF`. + - This structure manages the multiple connections of a single motion node (identified by `motionId`). It is distinguished as incoming or outgoing. + - Contains a set of **MotionConn** +- **MotionConn** + - Different interconnect implementations have different **MotionConn** objects; obtain the subclass object through `CONTAINER_OF`. + - In this structure, a specific point-to-point connection is established. + + +# How to implement an interconnect type + +Here I assume an optimization scenario: **in the interconnect layer, support domain sockets: use a domain socket on the local machine, and use the udp implementation across machines** + +Here are two pieces of pseudo-code that achieve this. + +Solution 1: call the interfaces of `udpifc` from the `domain socket interface` that we will define. + +``` +MotionIPCLayer domain_udp_ipc_layer = { + .ic_type = INTERCONNECT_TYPE_DOMAIN_UDP, // new ic type + .GetMaxTupleChunkSize = GetMaxTupleChunkSizeUDP, // the udp header is bigger than the tcp one, so use the udp max size + .GetListenPort = GetListenPortDomainUDP, // needs to return a combined port; the return type is `int32`, which can hold both.
+ + + /** + * InitMotionIPCLayerDomainUDP() { + * // init domain socket ipc layer + * + * InitMotionIPCLayerUDP(); // also init udp ipc layer + * } + * + */ + .InitMotionLayerIPC = InitMotionIPCLayerDomainUDP, + + /** + * SetupInterconnectDomainUDP() { + * SetupInterconnectUDP(); // setup udp ipc layer before setup domain socket ipc layer + * // after SetupInterconnectUDP, `ChunkTransportState` have been inited + * // and some of `MotionConnUDP` have been init + * + * SetupInterconnectDomainSocket(ChunkTransportState obj); + * // in SetupInterconnectDomainSocket should create some of object `MotionConnDomianUDP` + * // which contains `MotionConnUDP` and replace it in `ChunkTransportStateEntry->conns` + * + * // after this call, we can make sure that each `MotionConnDomianUDP` will use domain socket or udp? + * } + * + */ + .SetupInterconnect = SetupInterconnectDomainUDP, + + /** + * SendChunkDomainUDP() { + * // `MotionConnDomianUDP` use domain socket or udp send tuple + * } + */ + .SendTupleChunkToAMS = SendChunkDomainUDP, + + /** + * RecvTupleChunkDomainUDP() { + * // `MotionConnDomianUDP` use domain socket or udp recv tuple + * } + */ + .RecvTupleChunk = RecvTupleChunkDomainUDP, + + + ... other interfaces +} +``` + +Solution 2: Coupling the logic in `udpifc`, just like the current `proxy` implementation. 
+ +``` +// still used udp interface +// and add some struct inside `MotionConnUDP` + `ChunkTransportStateEntryUDP` +MotionIPCLayer udpifc_ipc_layer = { + .ic_type = INTERCONNECT_TYPE_DOMAINUDP, + .GetMaxTupleChunkSize = GetMaxTupleChunkSizeUDP, + .GetListenPort = GetListenPortUDP, + .InitMotionLayerIPC = InitMotionIPCLayerUDP, + .SetupInterconnect = SetupInterconnectUDP, + .SendChunk = SendChunkUDPIFC, + .RecvTupleChunk = RecvTupleChunkUDPIFC, +} + +// do some hook like ic_proxy implements +InitMotionIPCLayerUDP/SetupInterconnect() { + +// origin logic +#ifdef ENABLE_DOMAIN_SOCKET +// init/setup domain socket +#endif + +} + +SendChunk/RecvTupleChunk() { +// inited MotionConnUDP will make sure used domain socket or udp +#ifdef ENABLE_DOMAIN_SOCKET + if (conn->ShouldUseDomainSocket()) { + // send with domain socket + return; + } +#endif + + // origin udp logic +} + +``` + + + + diff --git a/contrib/interconnect/ic_common.c b/contrib/interconnect/ic_common.c new file mode 100644 index 00000000000..8ed72146f11 --- /dev/null +++ b/contrib/interconnect/ic_common.c @@ -0,0 +1,547 @@ +/*------------------------------------------------------------------------- + * ic_common.c + * Interconnect code shared between UDP, and TCP IPC Layers. 
+ * + * Portions Copyright (c) 2023-, Cloudberry + * + * + * IDENTIFICATION + * contrib/interconnect/ic_common.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "ic_common.h" +#include "ic_modules.h" +#include "common/ip.h" +#include "nodes/execnodes.h" /* ExecSlice, SliceTable */ +#include "miscadmin.h" +#include "libpq/libpq-be.h" +#include "utils/builtins.h" +#include "utils/memutils.h" + +#include "cdb/ml_ipc.h" +#include "cdb/cdbvars.h" +#include "cdb/cdbdisp.h" + +#include +#include +#include +#include + +static interconnect_handle_t * open_interconnect_handles; +static bool interconnect_resowner_callback_registered; + +/*========================================================================= + * VISIBLE FUNCTIONS + */ + +/* See ml_ipc.h */ +bool +SendTupleChunkToAMS(ChunkTransportState * transportStates, + int16 motNodeID, + int16 targetRoute, + TupleChunkListItem tcItem) +{ + int i, + recount = 0; + ChunkTransportStateEntry *pEntry = NULL; + MotionConn *conn; + TupleChunkListItem currItem; + + if (!transportStates) + elog(FATAL, "SendTupleChunkToAMS: no transport-states."); + if (!transportStates->activated) + elog(FATAL, "SendTupleChunkToAMS: transport states inactive"); + + /* check em' */ + ML_CHECK_FOR_INTERRUPTS(transportStates->teardownActive); + +#ifdef AMS_VERBOSE_LOGGING + elog(DEBUG3, "sendtuplechunktoams: calling get_transport_state" + "w/transportStates %p transportState->size %d motnodeid %d route %d", + transportStates, transportStates->size, motNodeID, targetRoute); +#endif + + getChunkTransportState(transportStates, motNodeID, &pEntry); + + /* + * tcItem can actually be a chain of tcItems. we need to send out all of + * them. 
+ */ + for (currItem = tcItem; currItem != NULL; currItem = currItem->p_next) + { +#ifdef AMS_VERBOSE_LOGGING + elog(DEBUG5, "SendTupleChunkToAMS: chunk length %d", currItem->chunk_length); +#endif + + if (targetRoute == BROADCAST_SEGIDX) + { + doBroadcast(transportStates, pEntry, currItem, &recount); + } + else + { + if (targetRoute < 0 || targetRoute >= pEntry->numConns) + { + elog(FATAL, "SendTupleChunkToAMS: targetRoute is %d, must be between 0 and %d .", + targetRoute, pEntry->numConns); + } + /* handle pt-to-pt message. Primary */ + getMotionConn(pEntry, targetRoute, &conn); + + /* only send to interested connections */ + if (conn->stillActive) + { + CurrentMotionIPCLayer->SendChunk(transportStates, pEntry, conn, currItem, motNodeID); + if (!conn->stillActive) + recount = 1; + } + /* in 4.0 logical mirror xmit eliminated. */ + } + } + + if (recount == 0) + return true; + + /* if we don't have any connections active, return false */ + for (i = 0; i < pEntry->numConns; i++) + { + getMotionConn(pEntry, i, &conn); + if (conn->stillActive) + break; + } + + /* if we found an active connection we're not done */ + return (i < pEntry->numConns); +} + +/* + * The fetches a direct pointer into our transmit buffers, along with + * an indication as to how much data can be safely shoved into the + * buffer (started at the pointed location). + * + * This works a lot like SendTupleChunkToAMS(). 
+ */ +void +GetTransportDirectBuffer(ChunkTransportState * transportStates, + int16 motNodeID, + int16 targetRoute, + struct directTransportBuffer *b) +{ + ChunkTransportStateEntry *pEntry = NULL; + MotionConn *conn; + + if (!transportStates) + { + elog(FATAL, "GetTransportDirectBuffer: no transport states"); + } + else if (!transportStates->activated) + { + elog(FATAL, "GetTransportDirectBuffer: inactive transport states"); + } + else if (targetRoute == BROADCAST_SEGIDX) + { + elog(FATAL, "GetTransportDirectBuffer: can't direct-transport to broadcast"); + } + + Assert(b != NULL); + + do + { + getChunkTransportState(transportStates, motNodeID, &pEntry); + + /* handle pt-to-pt message. Primary */ + getMotionConn(pEntry, targetRoute, &conn); + + /* only send to interested connections */ + if (!conn->stillActive) + { + break; + } + + b->pri = conn->pBuff + conn->msgSize; + b->prilen = Gp_max_packet_size - conn->msgSize; + + /* got buffer. */ + return; + } + while (0); + + /* buffer is missing ? */ + + b->pri = NULL; + b->prilen = 0; + + return; +} + +/* + * The fetches a direct pointer into our transmit buffers, along with + * an indication as to how much data can be safely shoved into the + * buffer (started at the pointed location). + * + * This works a lot like SendTupleChunkToAMS(). + */ +void +PutTransportDirectBuffer(ChunkTransportState * transportStates, + int16 motNodeID, + int16 targetRoute, int length) +{ + ChunkTransportStateEntry *pEntry = NULL; + MotionConn *conn; + + if (!transportStates) + { + elog(FATAL, "PutTransportDirectBuffer: no transport states"); + } + else if (!transportStates->activated) + { + elog(FATAL, "PutTransportDirectBuffer: inactive transport states"); + } + else if (targetRoute == BROADCAST_SEGIDX) + { + elog(FATAL, "PutTransportDirectBuffer: can't direct-transport to broadcast"); + } + + getChunkTransportState(transportStates, motNodeID, &pEntry); + + /* handle pt-to-pt message. 
Primary */ + getMotionConn(pEntry, targetRoute, &conn); + /* only send to interested connections */ + if (conn->stillActive) + { + conn->msgSize += length; + conn->tupleCount++; + } + + /* put buffer. */ + return; +} + +/*========================================================================= + * HELPER FUNCTIONS + */ + + +/* Function createChunkTransportState() is used to create a ChunkTransportState struct and + * place it in the hashtab hashtable based on the motNodeID. + * + * PARAMETERS + * + * motNodeID - motion node ID for this ChunkTransportState. + * + * numConns - number of primary connections for this motion node. + * All are incoming if this is a receiving motion node. + * All are outgoing if this is a sending motion node. + * + * RETURNS + * An empty and initialized ChunkTransportState struct for the given motion node. If + * a ChuckTransportState struct is already registered for the motNodeID an ERROR is + * thrown. + */ +ChunkTransportStateEntry * +createChunkTransportState(ChunkTransportState * transportStates, + ExecSlice * sendSlice, + ExecSlice * recvSlice, + int numConns, + size_t chunk_trans_state_entry_size) +{ + ChunkTransportStateEntry *pEntry; + int motNodeID; + int i; + + Assert(recvSlice->sliceIndex >= 0); + Assert(sendSlice->sliceIndex > 0); + + motNodeID = sendSlice->sliceIndex; + if (motNodeID > transportStates->size) + { + /* increase size of our table */ + ChunkTransportStateEntry *newTable; + size_t old_states_pos = transportStates->size * chunk_trans_state_entry_size; + + newTable = repalloc(transportStates->states, motNodeID * chunk_trans_state_entry_size); + transportStates->states = newTable; + /* zero-out the new piece at the end */ + MemSet(((uint8 *) transportStates->states) + old_states_pos, 0, (motNodeID - transportStates->size) * chunk_trans_state_entry_size); + transportStates->size = motNodeID; + } + + getChunkTransportStateNoValid(transportStates, motNodeID, &pEntry); + + if (pEntry->valid) + { + MotionConn *conn = 
NULL; + + getMotionConn(pEntry, 0, &conn); + ereport(ERROR, + (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), + errmsg("interconnect error: A HTAB entry for motion node %d already exists", + motNodeID), + errdetail("conns %p numConns %d first sock %d", + pEntry->conns, pEntry->numConns, + conn != NULL ? conn->sockfd : -2))); + } + + pEntry->valid = true; + + pEntry->motNodeId = motNodeID; + pEntry->numConns = numConns; + pEntry->scanStart = 0; + pEntry->sendSlice = sendSlice; + pEntry->recvSlice = recvSlice; + + allocMotionConns(pEntry); + + for (i = 0; i < pEntry->numConns; i++) + { + MotionConn *conn = NULL; + MotionConnKey motion_conn_key; + MotionConnSentRecordTypmodEnt *motion_conn_ent; + + getMotionConn(pEntry, i, &conn); + + /* Initialize MotionConn entry. */ + conn->state = mcsNull; + conn->sockfd = -1; + conn->msgSize = 0; + conn->tupleCount = 0; + conn->stillActive = false; + conn->stopRequested = false; + conn->cdbProc = NULL; + conn->remapper = NULL; + + motion_conn_key.mot_node_id = motNodeID; + motion_conn_key.conn_index = i; + + motion_conn_ent = (MotionConnSentRecordTypmodEnt *) hash_search(transportStates->conn_sent_record_typmod, + &motion_conn_key, HASH_ENTER, NULL); + motion_conn_ent->sent_record_typmod = 0; + } + + return pEntry; +} + +/* Function removeChunkTransportState() is used to remove a ChunkTransportState struct from + * the hashtab hashtable. + * + * This should only be called after createChunkTransportState(). + * + * PARAMETERS + * + * motNodeID - motion node ID to lookup the ChunkTransportState. + * pIncIdx - parent slice idx in child slice. If not multiplexed, should be 1. + * + * RETURNS + * The ChunkTransportState that was removed from the hashtab hashtable. 
+ */ +ChunkTransportStateEntry * +removeChunkTransportState(ChunkTransportState * transportStates, + int16 motNodeID) +{ + ChunkTransportStateEntry *pEntry = NULL; + + if (motNodeID > transportStates->size) + { + ereport(ERROR, + (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), + errmsg("interconnect error: Unexpected Motion Node Id: %d", + motNodeID), + errdetail("During remove. (size %d)", transportStates->size))); + } + + + getChunkTransportStateNoValid(transportStates, motNodeID, &pEntry); + + if (!pEntry->valid) + { + ereport(ERROR, + (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), + errmsg("interconnect error: Unexpected Motion Node Id: %d", + motNodeID), + errdetail("During remove. State not valid"))); + } + else + { + pEntry->valid = false; + } + + MPP_FD_ZERO(&pEntry->readSet); + + return pEntry; +} + +/* + * checkForCancelFromQD + * Check for cancel from QD. + * + * Should be called only inside the dispatcher + */ +void +checkForCancelFromQD(ChunkTransportState * pTransportStates) +{ + Assert(Gp_role == GP_ROLE_DISPATCH); + Assert(pTransportStates); + Assert(pTransportStates->estate); + + if (cdbdisp_checkForCancel(pTransportStates->estate->dispatcherState)) + { + ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), + errmsg(CDB_MOTION_LOST_CONTACT_STRING))); + /* not reached */ + } +} + +/* + * format_sockaddr + * Format a sockaddr to a human readable string + * + * This function must be kept threadsafe, elog/ereport/palloc etc are not + * allowed within this function. 
+ */ +char * +format_sockaddr(struct sockaddr_storage *sa, char *buf, size_t len) +{ + int ret; + char remote_host[NI_MAXHOST]; + char remote_port[NI_MAXSERV]; + + ret = pg_getnameinfo_all(sa, sizeof(struct sockaddr_storage), + remote_host, sizeof(remote_host), + remote_port, sizeof(remote_port), + NI_NUMERICHOST | NI_NUMERICSERV); + + if (ret != 0) + snprintf(buf, len, "?host?:?port?"); + else + { +#ifdef HAVE_IPV6 + if (sa->ss_family == AF_INET6) + snprintf(buf, len, "[%s]:%s", remote_host, remote_port); + else +#endif + snprintf(buf, len, "%s:%s", remote_host, remote_port); + } + + return buf; +} + +void +destroy_interconnect_handle(interconnect_handle_t * h) +{ + h->interconnect_context = NULL; + /* unlink from linked list first */ + if (h->prev) + h->prev->next = h->next; + else + open_interconnect_handles = h->next; + if (h->next) + h->next->prev = h->prev; + + pfree(h); + + if (open_interconnect_handles == NULL) + MemoryContextReset(InterconnectContext); +} + +static void +cleanup_interconnect_handle(interconnect_handle_t * h) +{ + if (h->interconnect_context == NULL) + { + destroy_interconnect_handle(h); + return; + } + h->teardown_cb(h->interconnect_context, true); +} + +static void +interconnect_abort_callback(ResourceReleasePhase phase, + bool isCommit, + bool isTopLevel, + void *arg) +{ + interconnect_handle_t *curr; + interconnect_handle_t *next; + + if (phase != RESOURCE_RELEASE_AFTER_LOCKS) + return; + + next = open_interconnect_handles; + while (next) + { + curr = next; + next = curr->next; + + if (curr->owner == CurrentResourceOwner) + { + if (isCommit) + elog(WARNING, "interconnect reference leak: %p still referenced", curr); + + cleanup_interconnect_handle(curr); + } + } +} + + +interconnect_handle_t * +allocate_interconnect_handle(TeardownInterconnectCallBack callback) +{ + interconnect_handle_t *h; + + if (InterconnectContext == NULL) + InterconnectContext = AllocSetContextCreate(TopMemoryContext, + "Interconnect Context", + 
ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + + h = MemoryContextAllocZero(InterconnectContext, sizeof(interconnect_handle_t)); + + h->teardown_cb = callback; + h->owner = CurrentResourceOwner; + h->next = open_interconnect_handles; + h->prev = NULL; + if (open_interconnect_handles) + open_interconnect_handles->prev = h; + open_interconnect_handles = h; + + if (!interconnect_resowner_callback_registered) + { + RegisterResourceReleaseCallback(interconnect_abort_callback, NULL); + interconnect_resowner_callback_registered = true; + } + return h; +} + +interconnect_handle_t * +find_interconnect_handle(ChunkTransportState * icContext) +{ + interconnect_handle_t *head = open_interconnect_handles; + + while (head != NULL) + { + if (head->interconnect_context == icContext) + return head; + head = head->next; + } + return NULL; +} + + +TupleRemapper * +GetMotionConnTupleRemapper(ChunkTransportState * transportStates, + int16 motNodeID, + int16 targetRoute) +{ + ChunkTransportStateEntry *pEntry = NULL; + MotionConn *conn = NULL; + + getChunkTransportState(transportStates, motNodeID, &pEntry); + getMotionConn(pEntry, targetRoute, &conn); + Assert(conn); + + return conn->remapper; +} diff --git a/contrib/interconnect/ic_common.h b/contrib/interconnect/ic_common.h new file mode 100644 index 00000000000..32fb9be025a --- /dev/null +++ b/contrib/interconnect/ic_common.h @@ -0,0 +1,143 @@ +/*------------------------------------------------------------------------- + * ic_common.c + * Interconnect code shared between UDP, and TCP IPC Layers. 
+ * + * Portions Copyright (c) 2023-, Cloudberry + * + * + * IDENTIFICATION + * contrib/interconnect/ic_common.c + * + *------------------------------------------------------------------------- + */ + +#ifndef IC_COMMON_H +#define IC_COMMON_H + +#include "postgres.h" + +#include "ic_internal.h" +#include "common/ip.h" +#include "nodes/execnodes.h" /* ExecSlice, SliceTable */ +#include "cdb/cdbvars.h" +#include "cdb/cdbdisp.h" +#include "executor/execdesc.h" +#include "utils/memutils.h" + +typedef void (*TeardownInterconnectCallBack)(ChunkTransportState *transportStates, bool hasErrors); + +/*========================================================================= + * STRUCTS + */ +typedef struct interconnect_handle_t +{ + ChunkTransportState *interconnect_context; /* Interconnect state */ + + // callback for interconnect been abort + TeardownInterconnectCallBack teardown_cb; + + ResourceOwner owner; /* owner of this handle */ + struct interconnect_handle_t *next; + struct interconnect_handle_t *prev; +} interconnect_handle_t; + +/*========================================================================= + * GLOBAL STATE VARIABLES + */ + +/* Socket file descriptor for the listener. 
*/ +extern int TCP_listenerFd; +extern int UDP_listenerFd; + +/*========================================================================= + * Resource manager + */ + +void destroy_interconnect_handle(interconnect_handle_t *h); + +interconnect_handle_t * allocate_interconnect_handle(TeardownInterconnectCallBack callback); + +interconnect_handle_t * find_interconnect_handle(ChunkTransportState *icContext); + +/*========================================================================= + * Common method in IPC layer + */ + +extern char * +format_sockaddr(struct sockaddr_storage *sa, char *buf, size_t len); + +extern bool SendTupleChunkToAMS(ChunkTransportState *transportStates, + int16 motNodeID, + int16 targetRoute, + TupleChunkListItem tcItem); + +extern ChunkTransportStateEntry *createChunkTransportState(ChunkTransportState *transportStates, + ExecSlice *sendSlice, + ExecSlice *recvSlice, + int numConns, + size_t chunk_trans_state_entry_size); + +extern ChunkTransportStateEntry *removeChunkTransportState(ChunkTransportState *transportStates, + int16 motNodeID); + + +#define ML_CHECK_FOR_INTERRUPTS(teardownActive) \ + do {if (!teardownActive && InterruptPending) CHECK_FOR_INTERRUPTS(); } while (0) + +/* doBroadcast() is used to send a TupleChunk to all recipients. + * + * PARAMETERS + * mlStates - motion-layer state ptr. + * transportStates - IC-instance ptr. + * pChunkEntry - ChunkTransportState context that contains everything we need to send. + * tcItem - TupleChunk to send. + */ +#define doBroadcast(transportStates, pChunkEntry, tcItem, inactiveCountPtr) \ + do { \ + MotionConn *conn; \ + int *p_inactive = inactiveCountPtr; \ + int i, index, inactive = 0; \ + /* add our tcItem to each of the outgoing buffers. 
*/ \ + index = Max(0, GpIdentity.segindex); /* entry-db has -1 */ \ + for (i = 0; i < pChunkEntry->numConns; i++, index++) \ + { \ + if (index >= pChunkEntry->numConns) \ + index = 0; \ + getMotionConn(pChunkEntry, index, &conn);\ + /* only send to still interested receivers. */ \ + if (conn->stillActive) \ + { \ + CurrentMotionIPCLayer->SendChunk(transportStates, pChunkEntry, conn, tcItem, pChunkEntry->motNodeId); \ + if (!conn->stillActive) \ + inactive++; \ + } \ + } \ + if (p_inactive != NULL) \ + *p_inactive = (inactive ? 1 : 0); \ + } while (0) + +/* + * checkForCancelFromQD + * Check for cancel from QD. + * + * Should be called only inside the dispatcher + */ +extern void +checkForCancelFromQD(ChunkTransportState *pTransportStates); + +extern void +GetTransportDirectBuffer(ChunkTransportState *transportStates, + int16 motNodeID, + int16 targetRoute, + struct directTransportBuffer *b); + +extern void PutTransportDirectBuffer(ChunkTransportState *transportStates, + int16 motNodeID, + int16 targetRoute, int length); + +extern TupleRemapper * GetMotionConnTupleRemapper(ChunkTransportState *transportStates, + int16 motNodeID, + int16 targetRoute); + + +#endif \ No newline at end of file diff --git a/contrib/interconnect/ic_internal.h b/contrib/interconnect/ic_internal.h new file mode 100644 index 00000000000..c09d89e9697 --- /dev/null +++ b/contrib/interconnect/ic_internal.h @@ -0,0 +1,317 @@ +/*------------------------------------------------------------------------- + * interconnect_internal.h + * Motion IPC Layer. 
+ * + * Portions Copyright (c) 2023-, Cloudberry inc + * + * + * IDENTIFICATION + * contrib/interconnect/interconnect_internal.h + * + *------------------------------------------------------------------------- + */ +#ifndef INTER_CONNECT_INTERNAL_H +#define INTER_CONNECT_INTERNAL_H + +#include "tcp/ic_tcp.h" +#include "udp/ic_udpifc.h" +#include "cdb/cdbinterconnect.h" + +#define CONTAINER_OF(ptr, type, member) \ + ({ \ + const typeof( ((type *)0)->member ) *__member_ptr = (ptr); \ + (type *)( (char *)__member_ptr - offsetof(type,member) ); \ + }) + +typedef enum MotionConnState +{ + mcsNull, + mcsAccepted, + mcsSetupOutgoingConnection, + mcsConnecting, + mcsRecvRegMsg, + mcsSendRegMsg, + mcsStarted, + mcsEosSent +} MotionConnState; + +/* + * Structure used for keeping track of a pt-to-pt connection between two + * Cdb Entities (either QE or QD). + */ +typedef struct MotionConn +{ + /* socket file descriptor. */ + int sockfd; + + /* pointer to the data buffer. */ + uint8 *pBuff; + + /* size of the message in the buffer, if any. */ + int32 msgSize; + + /* position of message inside of buffer, "cursor" pointer */ + uint8 *msgPos; + + /* + * recv bytes: we can have more than one message/message fragment in recv + * queue at once + */ + int32 recvBytes; + + int tupleCount; + + /* + * false means 1) received a stop message and has handled it. 2) received + * EOS message or sent out EOS message 3) received a QueryFinishPending + * notify and has handled it. + */ + bool stillActive; + + /* + * used both by motion sender and motion receiver + * + * sender: true means receiver don't need to consume tuples any more, + * sender is also responsible to send stop message to its senders. + * + * receiver: true means have sent out a stop message to its senders. The + * stop message might be lost, stopRequested can also tell sender that no + * more data needed in the ack message. 
+ */ + bool stopRequested; + + MotionConnState state; + + struct CdbProcess *cdbProc; + int remoteContentId; + char remoteHostAndPort[128]; /* Numeric IP addresses should never + * be longer than about 50 chars, but + * play it safe */ + + /* + * used by the receiver. + * + * all the remap information. + */ + TupleRemapper *remapper; +} MotionConn; + +typedef struct MotionConnUDP +{ + struct MotionConn mConn; + + /* send side queue for packets to be sent */ + ICBufferList sndQueue; + int capacity; + + /* seq already sent */ + uint32 sentSeq; + + /* ack of this seq and packets with smaller seqs have been received */ + uint32 receivedAckSeq; + + /* packets with this seq or smaller seqs have been consumed */ + uint32 consumedSeq; + + uint64 rtt; + uint64 dev; + uint64 deadlockCheckBeginTime; + + ICBuffer *curBuff; + + /* + * send side unacked packet queue. Since it is often accessed together + * with the unack queue ring, it is protected by the unack queue ring lock. + */ + ICBufferList unackQueue; + + uint16 route; + + struct icpkthdr conn_info; + + struct sockaddr_storage peer; /* Allow for IPv4 or IPv6 */ + socklen_t peer_len; /* And remember the actual length */ + + /* a queue of maximum length Gp_interconnect_queue_depth */ + int pkt_q_capacity; /* max capacity of the queue */ + int pkt_q_size; /* number of packets in the queue */ + int pkt_q_head; + int pkt_q_tail; + uint8 **pkt_q; + + uint64 stat_total_ack_time; + uint64 stat_count_acks; + uint64 stat_max_ack_time; + uint64 stat_min_ack_time; + uint64 stat_count_resent; + uint64 stat_max_resent; + uint64 stat_count_dropped; +} MotionConnUDP; + +typedef struct MotionConnTCP +{ + struct MotionConn mConn; + + uint64 wakeup_ms; + + char localHostAndPort[128]; +} MotionConnTCP; + +/* + * Used to organize all of the information for a given motion node. 
+ */ +typedef struct ChunkTransportStateEntry +{ + int motNodeId; + bool valid; + + /* Connection array */ + MotionConn *conns; + int numConns; + + int scanStart; + + /* + * used for receiving. to select() from a set of interesting MotionConns + * to see when data is ready to be read. When the incoming connections + * are established, read interest is turned on. It is turned off when an + * EOS (End of Stream) message is read. + */ + mpp_fd_set readSet; + + /* slice table entries */ + struct ExecSlice *sendSlice; + struct ExecSlice *recvSlice; + +} ChunkTransportStateEntry; + +typedef struct ChunkTransportStateEntryTCP +{ + ChunkTransportStateEntry entry; + + /* highest file descriptor in the readSet. */ + int highReadSock; +} ChunkTransportStateEntryTCP; + +typedef struct ChunkTransportStateEntryUDP +{ + ChunkTransportStateEntry entry; + + /* setup info */ + int txfd; + int txfd_family; + unsigned short txport; + + bool sendingEos; + + /* Statistics info for this motion on the interconnect level */ + uint64 stat_total_ack_time; + uint64 stat_count_acks; + uint64 stat_max_ack_time; + uint64 stat_min_ack_time; + uint64 stat_count_resent; + uint64 stat_max_resent; + uint64 stat_count_dropped; +} ChunkTransportStateEntryUDP; + +#define GetMotionConn(pEntry, MotionConnType, offset, cconn) \ + Assert((pEntry) != NULL); \ + if (offset >= 0 && offset < (pEntry)->numConns) { \ + *(cconn) = &((MotionConnType *)(pEntry)->conns)[offset].mConn;\ + } else { \ + ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), \ + errmsg("Interconnect Error: Unexpected Motion conn offset: %ld (size %d). 
This means" \ + " a motion conn that wasn't setup is requesting interconnect" \ + " resources.", (long int)(offset), (pEntry)->numConns))); \ + } + +#define getMotionConn(pEntry, offset, cconn) \ + do { \ + if (CurrentMotionIPCLayer->ic_type == INTERCONNECT_TYPE_TCP || \ + CurrentMotionIPCLayer->ic_type == INTERCONNECT_TYPE_PROXY) { \ + GetMotionConn(pEntry, MotionConnTCP, offset, cconn) \ + } else if (CurrentMotionIPCLayer->ic_type == INTERCONNECT_TYPE_UDPIFC) { \ + GetMotionConn(pEntry, MotionConnUDP, offset, cconn) \ + } else { \ + Assert(false); \ + } \ + Assert(*(cconn)); \ + } while (0) + +#define allocMotionConns(pEntry) \ + do { \ + Assert((pEntry) != NULL); \ + if (CurrentMotionIPCLayer->ic_type == INTERCONNECT_TYPE_TCP || \ + CurrentMotionIPCLayer->ic_type == INTERCONNECT_TYPE_PROXY) { \ + (pEntry)->conns = palloc0((pEntry)->numConns * sizeof(MotionConnTCP)); \ + } else if (CurrentMotionIPCLayer->ic_type == INTERCONNECT_TYPE_UDPIFC) { \ + (pEntry)->conns = palloc0((pEntry)->numConns * sizeof(MotionConnUDP)); \ + } else { \ + Assert(false); \ + } \ + } while (0) +/* + * Always use getChunkTransportState/getChunkTransportStateNoValid to look up a ChunkTransportStateEntry, + * because ChunkTransportState->states is an array of interconnect-type-specific entries (TCP or UDP sized). + */ +#define GetChunkTransportState(transportState, ChunkTransportStateEntryType, motNodeID, ppEntry) \ + Assert((transportState) != NULL); \ + if ((motNodeID) > 0 && \ + (transportState) && \ + (motNodeID) <= (transportState)->size && \ + ((ChunkTransportStateEntryType *)(transportState)->states)[(motNodeID)-1].entry.motNodeId == (motNodeID) && \ + ((ChunkTransportStateEntryType *)(transportState)->states)[(motNodeID)-1].entry.valid) \ + { \ + *(ppEntry) = &((ChunkTransportStateEntryType *)(transportState)->states)[(motNodeID) - 1].entry; \ + } \ + else \ + { \ + ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), \ + errmsg("Interconnect Error: Unexpected Motion Node Id: %d (size %d). 
This means" \ + " a motion node that wasn't setup is requesting interconnect" \ + " resources.", (motNodeID), (transportState)->size))); \ + /* not reached */ \ + } + +#define getChunkTransportState(transportState, motNodeID, ppEntry) \ + do { \ + if (CurrentMotionIPCLayer->ic_type == INTERCONNECT_TYPE_TCP || \ + CurrentMotionIPCLayer->ic_type == INTERCONNECT_TYPE_PROXY) { \ + GetChunkTransportState(transportState, ChunkTransportStateEntryTCP,motNodeID, ppEntry) \ + } else if (CurrentMotionIPCLayer->ic_type == INTERCONNECT_TYPE_UDPIFC) { \ + GetChunkTransportState(transportState, ChunkTransportStateEntryUDP,motNodeID, ppEntry) \ + } else { \ + Assert(false); \ + }\ + } while (0) + +#define GetChunkTransportStateNoValid(transportState, ChunkTransportStateEntryType, motNodeID, ppEntry) \ + Assert((transportState) != NULL); \ + if ((motNodeID) > 0 && \ + (transportState) && \ + (motNodeID) <= (transportState)->size) \ + { \ + *(ppEntry) = &((ChunkTransportStateEntryType *)(transportState)->states)[(motNodeID) - 1].entry; \ + } \ + else \ + { \ + ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), \ + errmsg("Interconnect Error: Unexpected Motion Node Id: %d (size %d). 
This means" \ + " a motion node that wasn't setup is requesting interconnect" \ + " resources.", (motNodeID), (transportState)->size))); \ + /* not reached */ \ + } + +#define getChunkTransportStateNoValid(transportState, motNodeID, ppEntry) \ + do { \ + if (CurrentMotionIPCLayer->ic_type == INTERCONNECT_TYPE_TCP || \ + CurrentMotionIPCLayer->ic_type == INTERCONNECT_TYPE_PROXY) { \ + GetChunkTransportStateNoValid(transportState, ChunkTransportStateEntryTCP,motNodeID, ppEntry) \ + } else if (CurrentMotionIPCLayer->ic_type == INTERCONNECT_TYPE_UDPIFC) { \ + GetChunkTransportStateNoValid(transportState, ChunkTransportStateEntryUDP,motNodeID, ppEntry) \ + } else { \ + Assert(false); \ + } \ + } while (0) + +#endif // INTER_CONNECT_INTERNAL_H diff --git a/contrib/interconnect/ic_modules.c b/contrib/interconnect/ic_modules.c new file mode 100644 index 00000000000..8a1c95a0cc8 --- /dev/null +++ b/contrib/interconnect/ic_modules.c @@ -0,0 +1,162 @@ +/*------------------------------------------------------------------------- + * interconnect.c + * Interconnect code shared between UDP, and TCP IPC Layers. 
+ * + * Portions Copyright (c) 2023-, Cloudberry + * + * + * IDENTIFICATION + * contrib/interconnect/interconnect.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "ic_modules.h" +#include "ic_internal.h" +#include "ic_common.h" +#include "tcp/ic_tcp.h" +#include "udp/ic_udpifc.h" +#include "proxy/ic_proxy_server.h" + +PG_MODULE_MAGIC; + +MotionIPCLayer tcp_ipc_layer = { + .ic_type = INTERCONNECT_TYPE_TCP, + + .GetMaxTupleChunkSize = GetMaxTupleChunkSizeTCP, + .GetListenPort = GetListenPortTCP, + + .InitMotionLayerIPC = InitMotionIPCLayerTCP, + .CleanUpMotionLayerIPC = CleanUpMotionIPCLayerTCP, + .WaitInterconnectQuit = WaitInterconnectQuitTCP, + .SetupInterconnect = SetupInterconnectTCP, + .TeardownInterconnect = TeardownInterconnectTCP, + + .SendTupleChunkToAMS = SendTupleChunkToAMS, + .SendChunk = SendChunkTCP, + .SendEOS = SendEOSTCP, + .SendStopMessage = SendStopMessageTCP, + + .RecvTupleChunkFromAny = RecvTupleChunkFromAnyTCP, + .RecvTupleChunkFrom = RecvTupleChunkFromTCP, + .RecvTupleChunk = RecvTupleChunkTCP, + + .DirectPutRxBuffer = NULL, + + .DeregisterReadInterest = DeregisterReadInterestTCP, + .GetActiveMotionConns = NULL, + + .GetTransportDirectBuffer = GetTransportDirectBuffer, + .PutTransportDirectBuffer = PutTransportDirectBuffer, +#ifdef ENABLE_IC_PROXY + .IcProxyServiceMain = ic_proxy_server_main, +#else + .IcProxyServiceMain = NULL, +#endif + + .GetMotionConnTupleRemapper = GetMotionConnTupleRemapper, +}; + +MotionIPCLayer proxy_ipc_layer = { + .ic_type = INTERCONNECT_TYPE_PROXY, + + .GetMaxTupleChunkSize = GetMaxTupleChunkSizeTCP, + .GetListenPort = GetListenPortTCP, + + .InitMotionLayerIPC = InitMotionIPCLayerTCP, + .CleanUpMotionLayerIPC = CleanUpMotionIPCLayerTCP, + .WaitInterconnectQuit = WaitInterconnectQuitTCP, + .SetupInterconnect = SetupInterconnectTCP, + .TeardownInterconnect = TeardownInterconnectTCP, + + .SendTupleChunkToAMS = SendTupleChunkToAMS, + .SendChunk 
= SendChunkTCP, + .SendEOS = SendEOSTCP, + .SendStopMessage = SendStopMessageTCP, + + .RecvTupleChunkFromAny = RecvTupleChunkFromAnyTCP, + .RecvTupleChunkFrom = RecvTupleChunkFromTCP, + .RecvTupleChunk = RecvTupleChunkTCP, + + .DirectPutRxBuffer = NULL, + + .DeregisterReadInterest = DeregisterReadInterestTCP, + .GetActiveMotionConns = NULL, + + .GetTransportDirectBuffer = GetTransportDirectBuffer, + .PutTransportDirectBuffer = PutTransportDirectBuffer, +#ifdef ENABLE_IC_PROXY + .IcProxyServiceMain = ic_proxy_server_main, +#else + .IcProxyServiceMain = NULL, +#endif + + .GetMotionConnTupleRemapper = GetMotionConnTupleRemapper, +}; + + +MotionIPCLayer udpifc_ipc_layer = { + .ic_type = INTERCONNECT_TYPE_UDPIFC, + + .GetMaxTupleChunkSize = GetMaxTupleChunkSizeUDP, + .GetListenPort = GetListenPortUDP, + + .InitMotionLayerIPC = InitMotionIPCLayerUDP, + .CleanUpMotionLayerIPC = CleanUpMotionLayerIPCUDP, + .WaitInterconnectQuit = WaitInterconnectQuitUDPIFC, + .SetupInterconnect = SetupInterconnectUDP, + .TeardownInterconnect = TeardownInterconnectUDP, + + .SendTupleChunkToAMS = SendTupleChunkToAMS, + .SendChunk = SendChunkUDPIFC, + .SendEOS = SendEOSUDPIFC, + .SendStopMessage = SendStopMessageUDPIFC, + + .RecvTupleChunkFromAny = RecvTupleChunkFromAnyUDPIFC, + .RecvTupleChunkFrom = RecvTupleChunkFromUDPIFC, + .RecvTupleChunk = RecvTupleChunkUDPIFC, + + .DirectPutRxBuffer = MlPutRxBufferIFC, + + .DeregisterReadInterest = DeregisterReadInterestUDP, + .GetActiveMotionConns = GetActiveMotionConnsUDPIFC, + + .GetTransportDirectBuffer = GetTransportDirectBuffer, + .PutTransportDirectBuffer = PutTransportDirectBuffer, +#ifdef ENABLE_IC_PROXY + .IcProxyServiceMain = ic_proxy_server_main, +#else + .IcProxyServiceMain = NULL, +#endif + + .GetMotionConnTupleRemapper = GetMotionConnTupleRemapper, +}; + +void +_PG_init(void) +{ + if (!process_shared_preload_libraries_in_progress) { /* refuse to load outside shared_preload_libraries processing */ + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not load interconnect outside process 
shared preload"))); + } + + switch(Gp_interconnect_type) { /* bind CurrentMotionIPCLayer to the configured interconnect type */ + case INTERCONNECT_TYPE_TCP: + CurrentMotionIPCLayer = &tcp_ipc_layer; + break; + case INTERCONNECT_TYPE_UDPIFC: + CurrentMotionIPCLayer = &udpifc_ipc_layer; + break; + case INTERCONNECT_TYPE_PROXY: + CurrentMotionIPCLayer = &proxy_ipc_layer; + break; + default: /* no IPC layer is defined for any other value */ + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("could not decide interconnect type"))); + } + +} \ No newline at end of file diff --git a/contrib/interconnect/ic_modules.h b/contrib/interconnect/ic_modules.h new file mode 100644 index 00000000000..6096acf695a --- /dev/null +++ b/contrib/interconnect/ic_modules.h @@ -0,0 +1,24 @@ +/*------------------------------------------------------------------------- + * ic_modules.h + * Motion IPC Layer. + * + * Portions Copyright (c) 2023-, Cloudberry inc + * + * + * IDENTIFICATION + * contrib/interconnect/ic_modules.h + * + *------------------------------------------------------------------------- + */ +#ifndef INTER_CONNECT_H +#define INTER_CONNECT_H + +#include "cdb/ml_ipc.h" + +extern MotionIPCLayer tcp_ipc_layer; +extern MotionIPCLayer proxy_ipc_layer; +extern MotionIPCLayer udpifc_ipc_layer; + +extern void _PG_init(void); + +#endif // INTER_CONNECT_H \ No newline at end of file diff --git a/src/backend/cdb/motion/README.ic-proxy.md b/contrib/interconnect/proxy/README.ic-proxy.md similarity index 100% rename from src/backend/cdb/motion/README.ic-proxy.md rename to contrib/interconnect/proxy/README.ic-proxy.md diff --git a/src/backend/cdb/motion/ic_proxy.h b/contrib/interconnect/proxy/ic_proxy.h similarity index 98% rename from src/backend/cdb/motion/ic_proxy.h rename to contrib/interconnect/proxy/ic_proxy.h index 0c1284f0602..e7da702caf3 100644 --- a/src/backend/cdb/motion/ic_proxy.h +++ b/contrib/interconnect/proxy/ic_proxy.h @@ -14,7 +14,7 @@ #include "postgres.h" -#include "cdb/cdbinterconnect.h" +#include "ic_internal.h" #include "cdb/cdbvars.h" #include "nodes/pg_list.h" #include 
"postmaster/postmaster.h" diff --git a/src/backend/cdb/motion/ic_proxy_addr.c b/contrib/interconnect/proxy/ic_proxy_addr.c similarity index 100% rename from src/backend/cdb/motion/ic_proxy_addr.c rename to contrib/interconnect/proxy/ic_proxy_addr.c diff --git a/src/backend/cdb/motion/ic_proxy_addr.h b/contrib/interconnect/proxy/ic_proxy_addr.h similarity index 100% rename from src/backend/cdb/motion/ic_proxy_addr.h rename to contrib/interconnect/proxy/ic_proxy_addr.h diff --git a/src/backend/cdb/motion/ic_proxy_backend.c b/contrib/interconnect/proxy/ic_proxy_backend.c similarity index 98% rename from src/backend/cdb/motion/ic_proxy_backend.c rename to contrib/interconnect/proxy/ic_proxy_backend.c index ebbf908d62a..7514096dbb6 100644 --- a/src/backend/cdb/motion/ic_proxy_backend.c +++ b/contrib/interconnect/proxy/ic_proxy_backend.c @@ -35,6 +35,7 @@ #include "cdb/ml_ipc.h" #include "executor/execdesc.h" +#include "ic_common.h" #include "ic_proxy.h" #include "ic_proxy_backend.h" #include "ic_proxy_packet.h" @@ -190,11 +191,14 @@ ic_proxy_backend_on_read_hello_ack(uv_stream_t *stream, ssize_t nread, const uv_ /* ic_tcp compatitble code to modify ChunkTransportStateEntry for receiver */ if (!backend->isSender) { - ChunkTransportStateEntry *pEntry; + ChunkTransportStateEntry *pChunkEntry; + ChunkTransportStateEntryTCP *pEntry; - pEntry = ic_proxy_backend_get_pentry(backend); + pChunkEntry = ic_proxy_backend_get_pentry(backend); + pEntry = CONTAINER_OF(pChunkEntry, ChunkTransportStateEntryTCP, entry); + Assert(pEntry); - MPP_FD_SET(backend->conn->sockfd, &pEntry->readSet); + MPP_FD_SET(backend->conn->sockfd, (&pEntry->entry.readSet)); if (backend->conn->sockfd > pEntry->highReadSock) pEntry->highReadSock = backend->conn->sockfd; } diff --git a/src/backend/cdb/motion/ic_proxy_backend.h b/contrib/interconnect/proxy/ic_proxy_backend.h similarity index 97% rename from src/backend/cdb/motion/ic_proxy_backend.h rename to contrib/interconnect/proxy/ic_proxy_backend.h index 
f5fc3261825..d03273c448c 100644 --- a/src/backend/cdb/motion/ic_proxy_backend.h +++ b/contrib/interconnect/proxy/ic_proxy_backend.h @@ -14,7 +14,7 @@ #include "postgres.h" -#include "cdb/cdbinterconnect.h" +#include "ic_internal.h" #include diff --git a/src/backend/cdb/motion/ic_proxy_client.c b/contrib/interconnect/proxy/ic_proxy_client.c similarity index 100% rename from src/backend/cdb/motion/ic_proxy_client.c rename to contrib/interconnect/proxy/ic_proxy_client.c diff --git a/src/backend/cdb/motion/ic_proxy_iobuf.c b/contrib/interconnect/proxy/ic_proxy_iobuf.c similarity index 100% rename from src/backend/cdb/motion/ic_proxy_iobuf.c rename to contrib/interconnect/proxy/ic_proxy_iobuf.c diff --git a/src/backend/cdb/motion/ic_proxy_iobuf.h b/contrib/interconnect/proxy/ic_proxy_iobuf.h similarity index 100% rename from src/backend/cdb/motion/ic_proxy_iobuf.h rename to contrib/interconnect/proxy/ic_proxy_iobuf.h diff --git a/src/backend/cdb/motion/ic_proxy_key.c b/contrib/interconnect/proxy/ic_proxy_key.c similarity index 100% rename from src/backend/cdb/motion/ic_proxy_key.c rename to contrib/interconnect/proxy/ic_proxy_key.c diff --git a/src/backend/cdb/motion/ic_proxy_key.h b/contrib/interconnect/proxy/ic_proxy_key.h similarity index 100% rename from src/backend/cdb/motion/ic_proxy_key.h rename to contrib/interconnect/proxy/ic_proxy_key.h diff --git a/src/backend/cdb/motion/ic_proxy_main.c b/contrib/interconnect/proxy/ic_proxy_main.c similarity index 100% rename from src/backend/cdb/motion/ic_proxy_main.c rename to contrib/interconnect/proxy/ic_proxy_main.c diff --git a/src/backend/cdb/motion/ic_proxy_packet.c b/contrib/interconnect/proxy/ic_proxy_packet.c similarity index 100% rename from src/backend/cdb/motion/ic_proxy_packet.c rename to contrib/interconnect/proxy/ic_proxy_packet.c diff --git a/src/backend/cdb/motion/ic_proxy_packet.h b/contrib/interconnect/proxy/ic_proxy_packet.h similarity index 100% rename from src/backend/cdb/motion/ic_proxy_packet.h 
rename to contrib/interconnect/proxy/ic_proxy_packet.h diff --git a/src/backend/cdb/motion/ic_proxy_peer.c b/contrib/interconnect/proxy/ic_proxy_peer.c similarity index 100% rename from src/backend/cdb/motion/ic_proxy_peer.c rename to contrib/interconnect/proxy/ic_proxy_peer.c diff --git a/src/backend/cdb/motion/ic_proxy_pkt_cache.c b/contrib/interconnect/proxy/ic_proxy_pkt_cache.c similarity index 100% rename from src/backend/cdb/motion/ic_proxy_pkt_cache.c rename to contrib/interconnect/proxy/ic_proxy_pkt_cache.c diff --git a/src/backend/cdb/motion/ic_proxy_pkt_cache.h b/contrib/interconnect/proxy/ic_proxy_pkt_cache.h similarity index 100% rename from src/backend/cdb/motion/ic_proxy_pkt_cache.h rename to contrib/interconnect/proxy/ic_proxy_pkt_cache.h diff --git a/src/backend/cdb/motion/ic_proxy_router.c b/contrib/interconnect/proxy/ic_proxy_router.c similarity index 100% rename from src/backend/cdb/motion/ic_proxy_router.c rename to contrib/interconnect/proxy/ic_proxy_router.c diff --git a/src/backend/cdb/motion/ic_proxy_router.h b/contrib/interconnect/proxy/ic_proxy_router.h similarity index 100% rename from src/backend/cdb/motion/ic_proxy_router.h rename to contrib/interconnect/proxy/ic_proxy_router.h diff --git a/src/backend/cdb/motion/ic_proxy_server.h b/contrib/interconnect/proxy/ic_proxy_server.h similarity index 100% rename from src/backend/cdb/motion/ic_proxy_server.h rename to contrib/interconnect/proxy/ic_proxy_server.h diff --git a/src/backend/cdb/motion/ic_tcp.c b/contrib/interconnect/tcp/ic_tcp.c similarity index 79% rename from src/backend/cdb/motion/ic_tcp.c rename to contrib/interconnect/tcp/ic_tcp.c index 3fd58315874..990ff74c2a6 100644 --- a/src/backend/cdb/motion/ic_tcp.c +++ b/contrib/interconnect/tcp/ic_tcp.c @@ -14,6 +14,9 @@ #include "postgres.h" +#include "ic_tcp.h" +#include "ic_internal.h" +#include "ic_common.h" #include "common/ip.h" #include "nodes/execnodes.h" /* ExecSlice, SliceTable */ #include "nodes/pg_list.h" @@ -30,8 +33,8 @@ 
#include "cdb/cdbdisp.h" #ifdef ENABLE_IC_PROXY -#include "ic_proxy_backend.h" -#endif /* ENABLE_IC_PROXY */ +#include "proxy/ic_proxy_backend.h" +#endif /* ENABLE_IC_PROXY */ #include #include @@ -51,12 +54,12 @@ typedef struct GpMonotonicTime { struct timeval beginTime; struct timeval endTime; -} GpMonotonicTime; +} GpMonotonicTime; -static void gp_set_monotonic_begin_time(GpMonotonicTime *time); -static void gp_get_monotonic_time(GpMonotonicTime *time); -static inline uint64 gp_get_elapsed_ms(GpMonotonicTime *time); -static inline uint64 gp_get_elapsed_us(GpMonotonicTime *time); +static void gp_set_monotonic_begin_time(GpMonotonicTime * time); +static void gp_get_monotonic_time(GpMonotonicTime * time); +static inline uint64 gp_get_elapsed_ms(GpMonotonicTime * time); +static inline uint64 gp_get_elapsed_us(GpMonotonicTime * time); static inline int timeCmp(struct timeval *t1, struct timeval *t2); /* @@ -68,68 +71,49 @@ static inline int timeCmp(struct timeval *t1, struct timeval *t2); #define CONNECT_AGGRESSIVERETRY_MS 500 /* listener backlog is calculated at listener-creation time */ -int listenerBacklog = 128; +extern int listenerBacklog; /* our timeout value for select() and other socket operations. 
*/ static struct timeval tval; -static inline MotionConn * -getMotionConn(ChunkTransportStateEntry *pEntry, int iConn) -{ - Assert(pEntry); - Assert(pEntry->conns); - Assert(iConn < pEntry->numConns); - - return pEntry->conns + iConn; -} - -static ChunkTransportStateEntry *startOutgoingConnections(ChunkTransportState *transportStates, - ExecSlice *sendSlice, - int *pOutgoingCount); - -static void format_fd_set(StringInfo buf, int nfds, mpp_fd_set *fds, char *pfx, char *sfx); -static void setupOutgoingConnection(ChunkTransportState *transportStates, - ChunkTransportStateEntry *pEntry, MotionConn *conn); -static void updateOutgoingConnection(ChunkTransportState *transportStates, - ChunkTransportStateEntry *pEntry, MotionConn *conn, int errnoSave); -static void sendRegisterMessage(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntry, MotionConn *conn); -static bool readRegisterMessage(ChunkTransportState *transportStates, - MotionConn *conn); -static MotionConn *acceptIncomingConnection(void); +/* TCP listen fd */ +int TCP_listenerFd; -static void flushInterconnectListenerBacklog(void); - -static void waitOnOutbound(ChunkTransportStateEntry *pEntry); +/* TCP listen port */ +int32 tcp_listener_port; -static TupleChunkListItem RecvTupleChunkFromAnyTCP(ChunkTransportState *transportStates, - int16 motNodeID, - int16 *srcRoute); -static TupleChunkListItem RecvTupleChunkFromTCP(ChunkTransportState *transportStates, - int16 motNodeID, - int16 srcRoute); +static ChunkTransportStateEntry * startOutgoingConnections(ChunkTransportState * transportStates, + ExecSlice * sendSlice, + int *pOutgoingCount); -static void SendEosTCP(ChunkTransportState *transportStates, - int motNodeID, TupleChunkListItem tcItem); +static void format_fd_set(StringInfo buf, int nfds, mpp_fd_set * fds, char *pfx, char *sfx); +static void setupOutgoingConnection(ChunkTransportState * transportStates, + ChunkTransportStateEntry * pEntry, MotionConn * conn); +static void 
updateOutgoingConnection(ChunkTransportState * transportStates, + ChunkTransportStateEntry * pEntry, MotionConn * conn, int errnoSave); +static void sendRegisterMessage(ChunkTransportState * transportStates, ChunkTransportStateEntry * pEntry, MotionConn * conn); +static bool readRegisterMessage(ChunkTransportState * transportStates, + MotionConn * conn); +static MotionConn * acceptIncomingConnection(void); -static bool SendChunkTCP(ChunkTransportState *transportStates, - ChunkTransportStateEntry *pEntry, MotionConn *conn, TupleChunkListItem tcItem, int16 motionId); +static void flushInterconnectListenerBacklog(void); -static bool flushBuffer(ChunkTransportState *transportStates, - ChunkTransportStateEntry *pEntry, MotionConn *conn, int16 motionId); +static void waitOnOutbound(ChunkTransportStateEntry * pEntry); -static void doSendStopMessageTCP(ChunkTransportState *transportStates, int16 motNodeID); +static bool flushBuffer(ChunkTransportState * transportStates, + ChunkTransportStateEntry * pEntry, MotionConn * conn, int16 motionId); #ifdef AMS_VERBOSE_LOGGING -static void dumpEntryConnections(int elevel, ChunkTransportStateEntry *pEntry); -static void print_connection(ChunkTransportState *transportStates, int fd, const char *msg); +static void dumpEntryConnections(int elevel, ChunkTransportStateEntry * pEntry); +static void print_connection(ChunkTransportState * transportStates, int fd, const char *msg); #endif /* * setupTCPListeningSocket */ static void -setupTCPListeningSocket(int backlog, int *listenerSocketFd, uint16 *listenerPort) +setupTCPListeningSocket(int backlog, int *listenerSocketFd, int32 *listenerPort) { int errnoSave; int fd = -1; @@ -156,15 +140,16 @@ setupTCPListeningSocket(int backlog, int *listenerSocketFd, uint16 *listenerPort hints.ai_family = AF_UNSPEC; /* Allow IPv4 or IPv6 */ hints.ai_socktype = SOCK_STREAM; /* Two-way, out of band connection */ hints.ai_flags = AI_PASSIVE; /* For wildcard IP address */ - hints.ai_protocol = 0; /* Any 
protocol - TCP implied for network use due to SOCK_STREAM */ + hints.ai_protocol = 0; /* Any protocol - TCP implied for network use + * due to SOCK_STREAM */ /* - * We set interconnect_address on the primary to the local address of the connection from QD - * to the primary, which is the primary's ADDRESS from gp_segment_configuration, - * used for interconnection. - * However it's wrong on the master. Because the connection from the client to the master may - * have different IP addresses as its destination, which is very likely not the master's - * ADDRESS in gp_segment_configuration. + * We set interconnect_address on the primary to the local address of the + * connection from QD to the primary, which is the primary's ADDRESS from + * gp_segment_configuration, used for interconnection. However it's wrong + * on the master. Because the connection from the client to the master may + * have different IP addresses as its destination, which is very likely + * not the master's ADDRESS in gp_segment_configuration. */ if (interconnect_address) { @@ -293,7 +278,7 @@ setupTCPListeningSocket(int backlog, int *listenerSocketFd, uint16 *listenerPort * Initialize TCP specific comms. */ void -InitMotionTCP(int *listenerSocketFd, uint16 *listenerPort) +InitMotionTCP(int *listenerSocketFd, int32 *listenerPort) { tval.tv_sec = 0; tval.tv_usec = 500000; @@ -303,12 +288,27 @@ InitMotionTCP(int *listenerSocketFd, uint16 *listenerPort) return; } -/* cleanup any TCP-specific comms info */ void -CleanupMotionTCP(void) +InitMotionIPCLayerTCP(void) { - /* nothing to do. */ - return; + InitMotionTCP(&TCP_listenerFd, &tcp_listener_port); + + elog(DEBUG1, "Interconnect listening on tcp port %d ", tcp_listener_port); +} + +void +CleanUpMotionIPCLayerTCP(void) +{ + if (gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG) + elog(DEBUG3, "Cleaning Up Motion Layer IPC..."); + + /* close down the Interconnect listener socket. 
*/ + if (TCP_listenerFd >= 0) + closesocket(TCP_listenerFd); + + /* be safe and reset global state variables. */ + tcp_listener_port = 0; + TCP_listenerFd = -1; } /* Function readPacket() is used to read in the next packet from the given @@ -328,9 +328,8 @@ CleanupMotionTCP(void) * conn - MotionConn to read the packet from. * */ -/* static inline void */ -void -readPacket(MotionConn *conn, ChunkTransportState *transportStates) +static inline void +readPacket(MotionConn * conn, ChunkTransportState * transportStates) { int n, bytesRead = conn->recvBytes; @@ -499,8 +498,8 @@ flushIncomingData(int fd) * Initialized ChunkTransportState for the Sending Motion Node Id. */ static ChunkTransportStateEntry * -startOutgoingConnections(ChunkTransportState *transportStates, - ExecSlice *sendSlice, +startOutgoingConnections(ChunkTransportState * transportStates, + ExecSlice * sendSlice, int *pOutgoingCount) { ChunkTransportStateEntry *pEntry; @@ -508,6 +507,7 @@ startOutgoingConnections(ChunkTransportState *transportStates, ListCell *cell; ExecSlice *recvSlice; CdbProcess *cdbProc; + size_t conn_index = 0; *pOutgoingCount = 0; @@ -529,25 +529,26 @@ startOutgoingConnections(ChunkTransportState *transportStates, pEntry = createChunkTransportState(transportStates, sendSlice, recvSlice, - list_length(recvSlice->primaryProcesses)); + list_length(recvSlice->primaryProcesses), + sizeof(ChunkTransportStateEntryTCP)); /* * Setup a MotionConn entry for each of our outbound connections. Request * a connection to each receiving backend's listening port. 
*/ - conn = pEntry->conns; foreach(cell, recvSlice->primaryProcesses) { cdbProc = (CdbProcess *) lfirst(cell); if (cdbProc) { + getMotionConn(pEntry, conn_index, &conn); conn->cdbProc = cdbProc; conn->pBuff = palloc(Gp_max_packet_size); conn->state = mcsSetupOutgoingConnection; (*pOutgoingCount)++; } - conn++; + conn_index++; } return pEntry; @@ -557,7 +558,7 @@ startOutgoingConnections(ChunkTransportState *transportStates, /* * setupOutgoingConnection * - * Called by SetupInterconnect when conn->state == mcsSetupOutgoingConnection. + * Called by SetupTCPInterconnect when conn->state == mcsSetupOutgoingConnection. * * On return, state is: * mcsSetupOutgoingConnection if failed and caller should retry. @@ -566,9 +567,10 @@ startOutgoingConnections(ChunkTransportState *transportStates, * mcsSendRegMsg or mcsStarted if connect() completed successfully. */ static void -setupOutgoingConnection(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntry, MotionConn *conn) +setupOutgoingConnection(ChunkTransportState * transportStates, ChunkTransportStateEntry * pEntry, MotionConn * conn) { - CdbProcess *cdbProc = conn->cdbProc; + CdbProcess *cdbProc = NULL; + MotionConnTCP *tcp_conn = NULL; int n; @@ -578,10 +580,11 @@ setupOutgoingConnection(ChunkTransportState *transportStates, ChunkTransportStat struct addrinfo *addrs = NULL; struct addrinfo hint; + tcp_conn = CONTAINER_OF(conn, MotionConnTCP, mConn); Assert(conn->cdbProc); Assert(conn->state == mcsSetupOutgoingConnection); - - conn->wakeup_ms = 0; + cdbProc = conn->cdbProc; + tcp_conn->wakeup_ms = 0; conn->remoteContentId = cdbProc->contentid; /* @@ -605,12 +608,13 @@ setupOutgoingConnection(ChunkTransportState *transportStates, ChunkTransportStat } #ifdef ENABLE_IC_PROXY - if (Gp_interconnect_type == INTERCONNECT_TYPE_PROXY) + if (CurrentMotionIPCLayer->ic_type == INTERCONNECT_TYPE_PROXY) { - /* + /* * Using libuv pipe to register backend to proxy. 
* ic_proxy_backend_connect only appends the connect request into - * connection queue and waits for the libuv_run_loop to handle the queue. + * connection queue and waits for the libuv_run_loop to handle the + * queue. */ ic_proxy_backend_connect(transportStates->proxyContext, pEntry, conn, true); @@ -626,7 +630,7 @@ setupOutgoingConnection(ChunkTransportState *transportStates, ChunkTransportStat conn->remoteContentId = conn->cdbProc->contentid; return; } -#endif /* ENABLE_IC_PROXY */ +#endif /* ENABLE_IC_PROXY */ /* Initialize hint structure */ MemSet(&hint, 0, sizeof(hint)); @@ -731,7 +735,7 @@ setupOutgoingConnection(ChunkTransportState *transportStates, ChunkTransportStat * Called when connect() succeeds or fails. */ static void -updateOutgoingConnection(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntry, MotionConn *conn, int errnoSave) +updateOutgoingConnection(ChunkTransportState * transportStates, ChunkTransportStateEntry * pEntry, MotionConn * conn, int errnoSave) { socklen_t sizeoferrno = sizeof(errnoSave); @@ -777,7 +781,7 @@ updateOutgoingConnection(ChunkTransportState *transportStates, ChunkTransportSta * pEntry - ChunkTransportState. * conn - MotionConn to send message out on. * - * Called by SetupInterconnect when conn->state == mcsSetupOutgoingConnection. + * Called by SetupTCPInterconnect when conn->state == mcsSetupOutgoingConnection. * * On return, state is: * mcsSendRegMsg if registration message has not been completely sent. @@ -786,11 +790,14 @@ updateOutgoingConnection(ChunkTransportState *transportStates, ChunkTransportSta * sending data. 
*/ static void -sendRegisterMessage(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntry, MotionConn *conn) +sendRegisterMessage(ChunkTransportState * transportStates, ChunkTransportStateEntry * pEntry, MotionConn * conn) { int bytesToSend; int bytesSent; - SliceTable *sliceTbl = transportStates->sliceTable; + MotionConnTCP *tcp_conn = NULL; + SliceTable *sliceTbl = transportStates->sliceTable; + + tcp_conn = CONTAINER_OF(conn, MotionConnTCP, mConn); if (conn->state != mcsSendRegMsg) { @@ -812,8 +819,8 @@ sendRegisterMessage(ChunkTransportState *transportStates, ChunkTransportStateEnt errdetail("getsockname sockfd=%d remote=%s: %m", conn->sockfd, conn->remoteHostAndPort))); } - format_sockaddr(&localAddr, conn->localHostAndPort, - sizeof(conn->localHostAndPort)); + format_sockaddr(&localAddr, tcp_conn->localHostAndPort, + sizeof(tcp_conn->localHostAndPort)); if (gp_log_interconnect >= GPVARS_VERBOSITY_VERBOSE) ereport(LOG, @@ -824,7 +831,7 @@ sendRegisterMessage(ChunkTransportState *transportStates, ChunkTransportStateEnt conn->cdbProc->pid, GpIdentity.segindex, pEntry->sendSlice->sliceIndex, - conn->localHostAndPort, + tcp_conn->localHostAndPort, conn->sockfd))); regMsg->msgBytes = sizeof(*regMsg); @@ -832,7 +839,7 @@ sendRegisterMessage(ChunkTransportState *transportStates, ChunkTransportStateEnt regMsg->sendSliceIndex = pEntry->sendSlice->sliceIndex; regMsg->srcContentId = GpIdentity.segindex; - regMsg->srcListenerPort = Gp_listener_port & 0x0ffff; + regMsg->srcListenerPort = GetListenPortTCP(); regMsg->srcPid = MyProcPid; regMsg->srcSessionId = gp_session_id; regMsg->srcCommandCount = sliceTbl->ic_instance_id; @@ -866,7 +873,7 @@ sendRegisterMessage(ChunkTransportState *transportStates, ChunkTransportStateEnt errdetail("write pid=%d sockfd=%d local=%s: %m", conn->cdbProc->pid, conn->sockfd, - conn->localHostAndPort))); + tcp_conn->localHostAndPort))); } } @@ -890,8 +897,8 @@ sendRegisterMessage(ChunkTransportState *transportStates, 
ChunkTransportStateEnt * when socket becomes read-ready. */ static bool -readRegisterMessage(ChunkTransportState *transportStates, - MotionConn *conn) +readRegisterMessage(ChunkTransportState * transportStates, + MotionConn * conn) { int bytesToReceive; int bytesReceived; @@ -899,10 +906,14 @@ readRegisterMessage(ChunkTransportState *transportStates, RegisterMessage *regMsg; RegisterMessage msg; MotionConn *newConn; - ChunkTransportStateEntry *pEntry = NULL; + ChunkTransportStateEntry *pChunkEntry = NULL; + ChunkTransportStateEntryTCP *pEntry = NULL; CdbProcess *cdbproc = NULL; - ListCell *lc; - SliceTable *sliceTbl = transportStates->sliceTable; + ListCell *lc; + SliceTable *sliceTbl = transportStates->sliceTable; + MotionConnTCP *tcp_conn = NULL; + + tcp_conn = CONTAINER_OF(conn, MotionConnTCP, mConn); /* Get ready to receive the Register message. */ if (conn->state != mcsRecvRegMsg) @@ -944,7 +955,7 @@ readRegisterMessage(ChunkTransportState *transportStates, conn->remoteHostAndPort), errdetail("read sockfd=%d local=%s: %m", conn->sockfd, - conn->localHostAndPort))); + tcp_conn->localHostAndPort))); } } @@ -971,7 +982,7 @@ readRegisterMessage(ChunkTransportState *transportStates, conn->remoteHostAndPort), errdetail("msgBytes=%d expected=%d sockfd=%d local=%s", msg.msgBytes, (int) sizeof(*regMsg), - conn->sockfd, conn->localHostAndPort))); + conn->sockfd, tcp_conn->localHostAndPort))); } /* get rid of old connections first */ @@ -981,7 +992,7 @@ readRegisterMessage(ChunkTransportState *transportStates, /* * This is an old connection, which can be safely ignored. We get this * kind of stuff for cases in which one gang participating in the - * interconnect exited a query before calling SetupInterconnect(). + * interconnect exited a query before calling SetupTCPInterconnect(). * Later queries wind up receiving their registration messages. 
*/ elog(LOG, "Received invalid, old registration message: " @@ -1018,12 +1029,13 @@ readRegisterMessage(ChunkTransportState *transportStates, * Find state info for the specified Motion node. The sender's slice * number equals the motion node id. */ - getChunkTransportState(transportStates, msg.sendSliceIndex, &pEntry); + getChunkTransportState(transportStates, msg.sendSliceIndex, &pChunkEntry); + pEntry = CONTAINER_OF(pChunkEntry, ChunkTransportStateEntryTCP, entry); Assert(pEntry); - foreach_with_count(lc, pEntry->sendSlice->primaryProcesses, iconn) + foreach_with_count(lc, pEntry->entry.sendSlice->primaryProcesses, iconn) { - cdbproc = (CdbProcess *)lfirst(lc); + cdbproc = (CdbProcess *) lfirst(lc); if (!cdbproc) continue; @@ -1034,7 +1046,7 @@ readRegisterMessage(ChunkTransportState *transportStates, break; } - if (iconn == list_length(pEntry->sendSlice->primaryProcesses)) + if (iconn == list_length(pEntry->entry.sendSlice->primaryProcesses)) { ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), @@ -1049,7 +1061,7 @@ readRegisterMessage(ChunkTransportState *transportStates, * Allocate MotionConn slot corresponding to sender's position in the * sending slice's CdbProc list. 
*/ - newConn = getMotionConn(pEntry, iconn); + getMotionConn(pChunkEntry, iconn, &newConn); if (newConn->sockfd != -1 || newConn->state != mcsNull) @@ -1093,13 +1105,13 @@ readRegisterMessage(ChunkTransportState *transportStates, newConn->msgSize = 0; newConn->stillActive = true; - MPP_FD_SET(newConn->sockfd, &pEntry->readSet); + MPP_FD_SET(newConn->sockfd, &pEntry->entry.readSet); if (newConn->sockfd > pEntry->highReadSock) pEntry->highReadSock = newConn->sockfd; #ifdef AMS_VERBOSE_LOGGING - dumpEntryConnections(DEBUG4, pEntry); + dumpEntryConnections(DEBUG4, &pEntry->entry); #endif /* we've completed registration of this connection */ @@ -1135,6 +1147,7 @@ acceptIncomingConnection(void) MotionConn *conn; struct sockaddr_storage remoteAddr; struct sockaddr_storage localAddr; + MotionConnTCP *tcp_conn = NULL; /* * Accept a connection. @@ -1168,7 +1181,7 @@ acceptIncomingConnection(void) ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), errmsg("interconnect error on listener port %d", - Gp_listener_port), + tcp_listener_port), errdetail("accept sockfd=%d: %m", TCP_listenerFd))); break; /* not reached */ case ENOMEM: @@ -1179,7 +1192,7 @@ acceptIncomingConnection(void) ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), errmsg("interconnect error on listener port %d", - Gp_listener_port), + tcp_listener_port), errdetail("accept sockfd=%d: %m", TCP_listenerFd))); break; /* not reached */ default: @@ -1187,7 +1200,7 @@ acceptIncomingConnection(void) ereport(LOG, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), errmsg("interconnect connection request not completed on listener port %d", - Gp_listener_port), + tcp_listener_port), errdetail("accept sockfd=%d: %m", TCP_listenerFd))); } /* switch (errno) */ } /* loop until success or EWOULDBLOCK */ @@ -1195,7 +1208,7 @@ acceptIncomingConnection(void) /* * Create a MotionConn object to hold the connection state. 
*/ - conn = palloc0(sizeof(MotionConn)); + conn = palloc0(sizeof(MotionConnTCP)); conn->sockfd = newsockfd; conn->pBuff = palloc(Gp_max_packet_size); conn->msgSize = 0; @@ -1206,6 +1219,8 @@ acceptIncomingConnection(void) conn->state = mcsAccepted; conn->remoteContentId = -2; + tcp_conn = CONTAINER_OF(conn, MotionConnTCP, mConn); + /* Save remote and local host:port strings for error messages. */ format_sockaddr(&remoteAddr, conn->remoteHostAndPort, sizeof(conn->remoteHostAndPort)); @@ -1218,8 +1233,8 @@ acceptIncomingConnection(void) errdetail("getsockname sockfd=%d remote=%s: %m", newsockfd, conn->remoteHostAndPort))); } - format_sockaddr(&localAddr, conn->localHostAndPort, - sizeof(conn->localHostAndPort)); + format_sockaddr(&localAddr, tcp_conn->localHostAndPort, + sizeof(tcp_conn->localHostAndPort)); /* make socket non-blocking */ if (!pg_set_noblock(newsockfd)) @@ -1229,19 +1244,18 @@ acceptIncomingConnection(void) errmsg("interconnect error after accepting connection"), errdetail("fcntl(O_NONBLOCK) sockfd=%d remote=%s local=%s: %m", newsockfd, conn->remoteHostAndPort, - conn->localHostAndPort))); + tcp_conn->localHostAndPort))); } if (gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG) elog(DEBUG4, "Interconnect got incoming connection " "from remote=%s to local=%s sockfd=%d", - conn->remoteHostAndPort, conn->localHostAndPort, newsockfd); + conn->remoteHostAndPort, tcp_conn->localHostAndPort, newsockfd); return conn; } /* acceptIncomingConnection */ -/* See ml_ipc.h */ -void +static inline void SetupTCPInterconnect(EState *estate) { int i, @@ -1251,6 +1265,7 @@ SetupTCPInterconnect(EState *estate) ExecSlice *mySlice; ExecSlice *aSlice; MotionConn *conn; + MotionConnTCP *tcp_conn; SliceTable *sliceTable = estate->es_sliceTable; int incoming_count = 0; int outgoing_count = 0; @@ -1265,6 +1280,7 @@ SetupTCPInterconnect(EState *estate) /* we can have at most one of these. 
*/ ChunkTransportStateEntry *sendingChunkTransportState = NULL; ChunkTransportState *interconnect_context; + HASHCTL conn_sent_record_typmod_ctl; SIMPLE_FAULT_INJECTOR("interconnect_setup_palloc"); interconnect_context = palloc0(sizeof(ChunkTransportState)); @@ -1272,7 +1288,7 @@ SetupTCPInterconnect(EState *estate) /* initialize state variables */ interconnect_context->estate = estate; interconnect_context->size = CTS_INITIAL_SIZE; - interconnect_context->states = palloc0(CTS_INITIAL_SIZE * sizeof(ChunkTransportStateEntry)); + interconnect_context->states = palloc0(CTS_INITIAL_SIZE * sizeof(ChunkTransportStateEntryTCP)); interconnect_context->teardownActive = false; interconnect_context->activated = false; @@ -1281,15 +1297,16 @@ SetupTCPInterconnect(EState *estate) interconnect_context->sliceTable = copyObject(sliceTable); interconnect_context->sliceId = sliceTable->localSlice; - interconnect_context->RecvTupleChunkFrom = RecvTupleChunkFromTCP; - interconnect_context->RecvTupleChunkFromAny = RecvTupleChunkFromAnyTCP; - interconnect_context->SendEos = SendEosTCP; - interconnect_context->SendChunk = SendChunkTCP; - interconnect_context->doSendStopMessage = doSendStopMessageTCP; + conn_sent_record_typmod_ctl.keysize = sizeof(MotionConnKey); + conn_sent_record_typmod_ctl.entrysize = sizeof(MotionConnSentRecordTypmodEnt); + conn_sent_record_typmod_ctl.hcxt = CurrentMemoryContext; + + interconnect_context->conn_sent_record_typmod = hash_create( + "MotionConn sent record typmod mapping", 128, &conn_sent_record_typmod_ctl, HASH_CONTEXT | HASH_ELEM | HASH_BLOBS); #ifdef ENABLE_IC_PROXY ic_proxy_backend_init_context(interconnect_context); -#endif /* ENABLE_IC_PROXY */ +#endif /* ENABLE_IC_PROXY */ mySlice = &interconnect_context->sliceTable->slices[sliceTable->localSlice]; @@ -1317,7 +1334,7 @@ SetupTCPInterconnect(EState *estate) */ totalNumProcs = list_length(aSlice->primaryProcesses); - pEntry = createChunkTransportState(interconnect_context, aSlice, mySlice, 
totalNumProcs); + pEntry = createChunkTransportState(interconnect_context, aSlice, mySlice, totalNumProcs, sizeof(ChunkTransportStateEntryTCP)); for (i = 0; i < totalNumProcs; i++) { @@ -1328,23 +1345,23 @@ SetupTCPInterconnect(EState *estate) expectedTotalIncoming++; #ifdef ENABLE_IC_PROXY - if (Gp_interconnect_type == INTERCONNECT_TYPE_PROXY) + if (CurrentMotionIPCLayer->ic_type == INTERCONNECT_TYPE_PROXY) { - conn = &pEntry->conns[i]; + getMotionConn(pEntry, i, &conn); conn->cdbProc = list_nth(aSlice->primaryProcesses, i); if (conn->cdbProc) { incoming_count++; - /* + /* * Using libuv pipe to register backend to proxy. - * ic_proxy_backend_connect only appends the connect request - * into connection queue and waits for the libuv_run_loop to - * handle the queue. + * ic_proxy_backend_connect only appends the connect + * request into connection queue and waits for the + * libuv_run_loop to handle the queue. */ ic_proxy_backend_connect(interconnect_context->proxyContext, - pEntry, conn, false /* isSender */); + pEntry, conn, false /* isSender */ ); conn->pBuff = palloc(Gp_max_packet_size); conn->recvBytes = 0; @@ -1359,7 +1376,7 @@ SetupTCPInterconnect(EState *estate) conn->remapper = CreateTupleRemapper(); } } -#endif /* ENABLE_IC_PROXY */ +#endif /* ENABLE_IC_PROXY */ } } @@ -1375,43 +1392,44 @@ SetupTCPInterconnect(EState *estate) sendingChunkTransportState = startOutgoingConnections(interconnect_context, mySlice, &expectedTotalOutgoing); #ifdef ENABLE_IC_PROXY - if (Gp_interconnect_type == INTERCONNECT_TYPE_PROXY) + if (CurrentMotionIPCLayer->ic_type == INTERCONNECT_TYPE_PROXY) { for (i = 0; i < expectedTotalOutgoing; i++) { - conn = &sendingChunkTransportState->conns[i]; + getMotionConn(sendingChunkTransportState, i, &conn); setupOutgoingConnection(interconnect_context, sendingChunkTransportState, conn); } outgoing_count = expectedTotalOutgoing; } + /* * Before ic_proxy_backend_run_loop, we have already gone though all the - * incoming and outgoing connections 
and append them into the connect queue. - * ic_proxy_backend_run_loop will trigger the uv_loop and begin to handle - * the connect event in parallel and asynchronous way. + * incoming and outgoing connections and append them into the connect + * queue. ic_proxy_backend_run_loop will trigger the uv_loop and begin to + * handle the connect event in parallel and asynchronous way. * * Note that the domain socket fds are binded to libuv pipe handle, but we * still depends on ic_tcp code to send/recv interconnect data based on * these fds and close these fds in teardown function. As a result, we - * should not touch the libuv pipe handles until ic_tcp close all the fds in - * teardown function. In future, we should retire the ic_tcp code in ic_proxy - * backend and use libuv to handle connection setup, data transfer and - * teardown in a unified way. + * should not touch the libuv pipe handles until ic_tcp close all the fds + * in teardown function. In future, we should retire the ic_tcp code in + * ic_proxy backend and use libuv to handle connection setup, data + * transfer and teardown in a unified way. */ ic_proxy_backend_run_loop(interconnect_context->proxyContext); -#endif /* ENABLE_IC_PROXY */ +#endif /* ENABLE_IC_PROXY */ if (expectedTotalIncoming > listenerBacklog) ereport(WARNING, (errmsg("SetupTCPInterconnect: too many expected incoming connections(%d), Interconnect setup might possibly fail", expectedTotalIncoming), errhint("Try enlarging the gp_interconnect_tcp_listener_backlog GUC value and OS net.core.somaxconn parameter"))); if (gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG) - ereport(DEBUG1, (errmsg("SetupInterconnect will activate " + ereport(DEBUG1, (errmsg("SetupTCPInterconnect will activate " "%d incoming, %d outgoing routes. 
" "Listening on port=%d sockfd=%d.", expectedTotalIncoming, expectedTotalOutgoing, - Gp_listener_port, TCP_listenerFd))); + tcp_listener_port, TCP_listenerFd))); /* * Loop until all connections are completed or time limit is exceeded. @@ -1467,13 +1485,13 @@ SetupTCPInterconnect(EState *estate) for (i = 0; i < n; i++) { index = i; - - conn = &sendingChunkTransportState->conns[index]; + getMotionConn(sendingChunkTransportState, index, &conn); + tcp_conn = CONTAINER_OF(conn, MotionConnTCP, mConn); /* Time to cancel incomplete connect() and retry? */ if (conn->state == mcsConnecting && - conn->wakeup_ms > 0 && - conn->wakeup_ms <= elapsed_ms + 20) + tcp_conn->wakeup_ms > 0 && + tcp_conn->wakeup_ms <= elapsed_ms + 20) { ereport(LOG, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), errmsg("Interconnect timeout: Connection " @@ -1482,31 +1500,31 @@ SetupTCPInterconnect(EState *estate) "ms " UINT64_FORMAT " elapsed. Will retry.", conn->remoteContentId, conn->remoteHostAndPort, - conn->localHostAndPort, - conn->wakeup_ms, (elapsed_ms + 20)))); + tcp_conn->localHostAndPort, + tcp_conn->wakeup_ms, (elapsed_ms + 20)))); conn->state = mcsSetupOutgoingConnection; } /* Time to connect? */ if (conn->state == mcsSetupOutgoingConnection && - conn->wakeup_ms <= elapsed_ms + 20) + tcp_conn->wakeup_ms <= elapsed_ms + 20) { setupOutgoingConnection(interconnect_context, sendingChunkTransportState, conn); switch (conn->state) { case mcsSetupOutgoingConnection: /* Retry failed connection after awhile. */ - conn->wakeup_ms = (iteration - 1) * 1000 + elapsed_ms; + tcp_conn->wakeup_ms = (iteration - 1) * 1000 + elapsed_ms; break; case mcsConnecting: /* Set time limit for connect() to complete. 
*/ if (interconnect_context->aggressiveRetry) - conn->wakeup_ms = CONNECT_AGGRESSIVERETRY_MS + elapsed_ms; + tcp_conn->wakeup_ms = CONNECT_AGGRESSIVERETRY_MS + elapsed_ms; else - conn->wakeup_ms = CONNECT_RETRY_MS + elapsed_ms; + tcp_conn->wakeup_ms = CONNECT_RETRY_MS + elapsed_ms; break; default: - conn->wakeup_ms = 0; + tcp_conn->wakeup_ms = 0; break; } } @@ -1544,8 +1562,8 @@ SetupTCPInterconnect(EState *estate) elog(FATAL, "SetupTCPInterconnect: bad connection state"); } - if (conn->wakeup_ms > 0) - timeout_ms = Min(timeout_ms, conn->wakeup_ms - elapsed_ms); + if (tcp_conn->wakeup_ms > 0) + timeout_ms = Min(timeout_ms, tcp_conn->wakeup_ms - elapsed_ms); } /* loop to set up outgoing connections */ /* Break out of select() loop if completed all connections. */ @@ -1591,7 +1609,7 @@ SetupTCPInterconnect(EState *estate) { if (gp_log_interconnect >= GPVARS_VERBOSITY_VERBOSE && (timeout_ms > 0 || iteration > 2)) - ereport(LOG, (errmsg("SetupInterconnect+" UINT64_FORMAT + ereport(LOG, (errmsg("SetupTCPInterconnect+" UINT64_FORMAT "ms: pause " UINT64_FORMAT "ms " "outgoing_fail=%d iteration=%d", elapsed_ms, timeout_ms, @@ -1601,7 +1619,7 @@ SetupTCPInterconnect(EState *estate) /* Shouldn't be in this loop unless we have some work to do. */ if (outgoing_fail_count <= 0) { - elog(FATAL, "SetupInterconnect: invalid outgoing count"); + elog(FATAL, "SetupTCPInterconnect: invalid outgoing count"); } /* Wait until earliest wakeup time or overall timeout. */ @@ -1642,7 +1660,7 @@ SetupTCPInterconnect(EState *estate) elapsed_ms = gp_get_elapsed_ms(&startTime); - ereport(DEBUG1, (errmsg("SetupInterconnect+" UINT64_FORMAT + ereport(DEBUG1, (errmsg("SetupTCPInterconnect+" UINT64_FORMAT "ms: select() " "Interest: %s. timeout=" UINT64_FORMAT "ms " "outgoing_fail=%d iteration=%d", @@ -1683,7 +1701,7 @@ SetupTCPInterconnect(EState *estate) } else appendStringInfoString(&logbuf, n < 0 ? 
"error" : "timeout"); - ereport(elevel, (errmsg("SetupInterconnect+" UINT64_FORMAT "ms: select() %s", + ereport(elevel, (errmsg("SetupTCPInterconnect+" UINT64_FORMAT "ms: select() %s", elapsed_ms, logbuf.data))); pfree(logbuf.data); MemSet(&logbuf, 0, sizeof(logbuf)); @@ -1712,13 +1730,15 @@ SetupTCPInterconnect(EState *estate) * are left over -- better to just process them here. */ cell = list_head(interconnect_context->incompleteConns); - foreach (cell, interconnect_context->incompleteConns) + foreach(cell, interconnect_context->incompleteConns) { - if (n <= 0) { + if (n <= 0) + { break; } conn = (MotionConn *) lfirst(cell); + tcp_conn = CONTAINER_OF(conn, MotionConnTCP, mConn); if (MPP_FD_ISSET(conn->sockfd, &rset)) { @@ -1739,7 +1759,7 @@ SetupTCPInterconnect(EState *estate) if (conn->pBuff) pfree(conn->pBuff); /* Free temporary MotionConn storage. */ - pfree(conn); + pfree(tcp_conn); } } } @@ -1773,7 +1793,9 @@ SetupTCPInterconnect(EState *estate) outgoing_count < expectedTotalOutgoing && i < sendingChunkTransportState->numConns) { /* loop to check outgoing connections */ - conn = &sendingChunkTransportState->conns[i++]; + getMotionConn(sendingChunkTransportState, i, &conn); + i++; + tcp_conn = CONTAINER_OF(conn, MotionConnTCP, mConn); switch (conn->state) { case mcsConnecting: @@ -1787,18 +1809,18 @@ SetupTCPInterconnect(EState *estate) { case mcsSetupOutgoingConnection: /* Failed. Wait awhile before retrying. */ - conn->wakeup_ms = (iteration - 1) * 1000 + elapsed_ms; + tcp_conn->wakeup_ms = (iteration - 1) * 1000 + elapsed_ms; break; case mcsSendRegMsg: /* Connected, but reg msg not fully sent. */ - conn->wakeup_ms = 0; + tcp_conn->wakeup_ms = 0; break; case mcsStarted: /* Connected, sent reg msg, ready to rock. 
*/ outgoing_count++; break; default: - elog(FATAL, "SetupInterconnect: bad outgoing state"); + elog(FATAL, "SetupTCPInterconnect: bad outgoing state"); } } break; @@ -1822,7 +1844,7 @@ SetupTCPInterconnect(EState *estate) /* By now we have dealt with all the events reported by select(). */ if (n != 0) - elog(FATAL, "SetupInterconnect: extra select events."); + elog(FATAL, "SetupTCPInterconnect: extra select events."); } /* select() loop */ /* @@ -1844,6 +1866,7 @@ SetupTCPInterconnect(EState *estate) foreach(cell, interconnect_context->incompleteConns) { conn = (MotionConn *) lfirst(cell); + tcp_conn = CONTAINER_OF(conn, MotionConnTCP, mConn); if (conn->sockfd != -1) { @@ -1855,7 +1878,7 @@ SetupTCPInterconnect(EState *estate) if (conn->pBuff) pfree(conn->pBuff); - pfree(conn); + pfree(tcp_conn); interconnect_context->incompleteConns = foreach_delete_current(interconnect_context->incompleteConns, cell); } @@ -1868,28 +1891,54 @@ SetupTCPInterconnect(EState *estate) elapsed_ms = gp_get_elapsed_ms(&startTime); if (gp_log_interconnect >= GPVARS_VERBOSITY_VERBOSE || elapsed_ms >= 0.1 * 1000 * interconnect_setup_timeout) - elog(LOG, "SetupInterconnect+" UINT64_FORMAT "ms: Activated %d incoming, " + elog(LOG, "SetupTCPInterconnect+" UINT64_FORMAT "ms: Activated %d incoming, " "%d outgoing routes.", elapsed_ms, incoming_count, outgoing_count); } estate->interconnect_context = interconnect_context; estate->es_interconnect_is_setup = true; -} /* SetupInterconnect */ +} /* SetupTCPInterconnect */ + +void +SetupInterconnectTCP(EState *estate) +{ + interconnect_handle_t *h; + MemoryContext oldContext; + + if (estate->interconnect_context) + { + elog(ERROR, "SetupInterconnectTCP: already initialized."); + } + else if (!estate->es_sliceTable) + { + elog(ERROR, "SetupInterconnectTCP: no slice table ?"); + } + + h = allocate_interconnect_handle(TeardownInterconnectTCP); + + Assert(InterconnectContext != NULL); + oldContext = MemoryContextSwitchTo(InterconnectContext); + + 
SetupTCPInterconnect(estate); + + MemoryContextSwitchTo(oldContext); + h->interconnect_context = estate->interconnect_context; +} /* TeardownInterconnect() function is used to cleanup interconnect resources that - * were allocated during SetupInterconnect(). This function should ALWAYS be - * called after SetupInterconnect to avoid leaking resources (like sockets) - * even if SetupInterconnect did not complete correctly. As a result, this - * function must complete successfully even if SetupInterconnect didn't. + * were allocated during SetupInterconnectTCP(). This function should ALWAYS be + * called after SetupInterconnectTCP to avoid leaking resources (like sockets) + * even if SetupInterconnectTCP did not complete correctly. As a result, this + * function must complete successfully even if SetupInterconnectTCP didn't. * - * SetupInterconnect() always gets called under the ExecutorState MemoryContext. + * SetupInterconnectTCP() always gets called under the ExecutorState MemoryContext. * This context is destroyed at the end of the query and all memory that gets * allocated under it is free'd. We don't have have to worry about pfree() but * we definitely have to worry about socket resources. */ void -TeardownTCPInterconnect(ChunkTransportState *transportStates, bool hasErrors) +TeardownTCPInterconnect(ChunkTransportState * transportStates, bool hasErrors) { ListCell *cell; ChunkTransportStateEntry *pEntry = NULL; @@ -1912,7 +1961,7 @@ TeardownTCPInterconnect(ChunkTransportState *transportStates, bool hasErrors) mySlice = &transportStates->sliceTable->slices[transportStates->sliceId]; - /* Log the start of TeardownInterconnect. */ + /* Log the start of TeardownTCPInterconnect. 
*/ if (gp_log_interconnect >= GPVARS_VERBOSITY_TERSE) { int elevel = 0; @@ -2009,7 +2058,7 @@ TeardownTCPInterconnect(ChunkTransportState *transportStates, bool hasErrors) for (i = 0; i < pEntry->numConns; i++) { - conn = pEntry->conns + i; + getMotionConn(pEntry, i, &conn); if (conn->sockfd >= 0) shutdown(conn->sockfd, SHUT_WR); @@ -2046,7 +2095,7 @@ TeardownTCPInterconnect(ChunkTransportState *transportStates, bool hasErrors) */ for (i = 0; i < pEntry->numConns; i++) { - conn = pEntry->conns + i; + getMotionConn(pEntry, i, &conn); if (conn->sockfd >= 0) { @@ -2081,23 +2130,23 @@ TeardownTCPInterconnect(ChunkTransportState *transportStates, bool hasErrors) /* * On a normal teardown routine, sender has sent an EOS packet and * disabled further send operations on phase 1. sender can't close the - * connection immediately because EOS packet or data packets within the - * kernel sending buffer may be lost on some platform if sender close the - * connection totally. + * connection immediately because EOS packet or data packets within + * the kernel sending buffer may be lost on some platform if sender + * close the connection totally. * * The correct way is sender blocks on the connection until receivers - * get the EOS packets and close the peer, then it's safe for sender to - * close the connection totally. + * get the EOS packets and close the peer, then it's safe for sender + * to close the connection totally. * - * If some errors are happening, senders can skip this step to avoid hung - * issues, QD will take care of the error handling. + * If some errors are happening, senders can skip this step to avoid + * hung issues, QD will take care of the error handling. 
*/ if (!hasErrors) waitOnOutbound(pEntry); for (i = 0; i < pEntry->numConns; i++) { - conn = pEntry->conns + i; + getMotionConn(pEntry, i, &conn); if (conn->sockfd >= 0) { @@ -2121,10 +2170,13 @@ TeardownTCPInterconnect(ChunkTransportState *transportStates, bool hasErrors) #ifdef ENABLE_IC_PROXY ic_proxy_backend_close_context(transportStates); -#endif /* ENABLE_IC_PROXY */ +#endif /* ENABLE_IC_PROXY */ if (transportStates->states != NULL) pfree(transportStates->states); + if (transportStates->conn_sent_record_typmod) + hash_destroy(transportStates->conn_sent_record_typmod); + pfree(transportStates); if (hasErrors) @@ -2132,38 +2184,54 @@ TeardownTCPInterconnect(ChunkTransportState *transportStates, bool hasErrors) #ifdef AMS_VERBOSE_LOGGING if (gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG) - elog(DEBUG4, "TeardownInterconnect successful"); + elog(DEBUG4, "TeardownTCPInterconnect successful"); #endif } +void +TeardownInterconnectTCP(ChunkTransportState * transportStates, + bool hasErrors) +{ + /* TODO: should pass interconnect_handle_t as arg? */ + interconnect_handle_t *h = find_interconnect_handle(transportStates); + + TeardownTCPInterconnect(transportStates, hasErrors); + + if (h != NULL) { + destroy_interconnect_handle(h); + } +} + #ifdef AMS_VERBOSE_LOGGING void -dumpEntryConnections(int elevel, ChunkTransportStateEntry *pEntry) +dumpEntryConnections(int elevel, ChunkTransportStateEntry * pEntry) { int i; - MotionConn *conn; + MotionConn *mConn; + MotionConnTCP *conn; for (i = 0; i < pEntry->numConns; i++) { - conn = &pEntry->conns[i]; - if (conn->sockfd == -1 && - conn->state == mcsNull) + getMotionConn(pEntry, i, &mConn); + conn = CONTAINER_OF(mConn, MotionConnTCP, mConn); + if (conn->mConn.sockfd == -1 && + conn->mConn.state == mcsNull) elog(elevel, "... motNodeId=%d conns[%d]: not connected", pEntry->motNodeId, i); else elog(elevel, "... 
motNodeId=%d conns[%d]: " "%d pid=%d sockfd=%d remote=%s local=%s", pEntry->motNodeId, i, - conn->remoteContentId, - conn->cdbProc ? conn->cdbProc->pid : 0, - conn->sockfd, - conn->remoteHostAndPort, + conn->mConn.remoteContentId, + conn->mConn.cdbProc ? conn->mConn.cdbProc->pid : 0, + conn->mConn.sockfd, + conn->mConn.remoteHostAndPort, conn->localHostAndPort); } } static void -print_connection(ChunkTransportState *transportStates, int fd, const char *msg) +print_connection(ChunkTransportState * transportStates, int fd, const char *msg) { struct sockaddr_in local, remote; @@ -2191,7 +2259,7 @@ print_connection(ChunkTransportState *transportStates, int fd, const char *msg) #endif static void -format_fd_set(StringInfo buf, int nfds, mpp_fd_set *fds, char *pfx, char *sfx) +format_fd_set(StringInfo buf, int nfds, mpp_fd_set * fds, char *pfx, char *sfx) { int i; bool first = true; @@ -2331,7 +2399,7 @@ flushInterconnectListenerBacklog(void) * QueryCancelPending */ static void -waitOnOutbound(ChunkTransportStateEntry *pEntry) +waitOnOutbound(ChunkTransportStateEntry * pEntry) { MotionConn *conn; @@ -2347,7 +2415,7 @@ waitOnOutbound(ChunkTransportStateEntry *pEntry) for (i = 0; i < pEntry->numConns; i++) { - conn = pEntry->conns + i; + getMotionConn(pEntry, i, &conn); if (conn->sockfd >= 0) { @@ -2399,7 +2467,7 @@ waitOnOutbound(ChunkTransportStateEntry *pEntry) for (i = 0; i < pEntry->numConns; i++) { - conn = pEntry->conns + i; + getMotionConn(pEntry, i, &conn); if (conn->sockfd >= 0 && MPP_FD_ISSET(conn->sockfd, &curset)) { @@ -2409,7 +2477,7 @@ waitOnOutbound(ChunkTransportStateEntry *pEntry) /* ready to read. */ count = recv(conn->sockfd, &buf, sizeof(buf), 0); - if (count == 0 || count == 1) /* done ! */ + if (count == 0 || count == 1) /* done ! 
*/ { /* got a stop message */ AssertImply(count == 1, buf == 'S'); @@ -2421,7 +2489,7 @@ waitOnOutbound(ChunkTransportStateEntry *pEntry) } else if (count < 0 && (errno == EAGAIN || errno == EINTR)) continue; - + /* * Something unexpected, but probably not horrible warn and * return @@ -2438,8 +2506,54 @@ waitOnOutbound(ChunkTransportStateEntry *pEntry) return; } -static void -doSendStopMessageTCP(ChunkTransportState *transportStates, int16 motNodeID) +void +DeregisterReadInterestTCP(ChunkTransportState * transportStates, + int motNodeID, + int srcRoute, + const char *reason) +{ + ChunkTransportStateEntry *pEntry = NULL; + MotionConn *conn; + + if (!transportStates) + { + elog(FATAL, "DeregisterReadInterest: no transport states"); + } + + if (!transportStates->activated) + return; + + getChunkTransportState(transportStates, motNodeID, &pEntry); + getMotionConn(pEntry, srcRoute, &conn); + + if (gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG) + { + elog(DEBUG3, "Interconnect finished receiving " + "from seg%d slice%d %s pid=%d sockfd=%d; %s", + conn->remoteContentId, + pEntry->sendSlice->sliceIndex, + conn->remoteHostAndPort, + conn->cdbProc->pid, + conn->sockfd, + reason); + } + + /* + * we also mark the connection as "done." The way synchronization works is + * strange. 
On QDs the "teardown" doesn't get called until all segments + * are finished, which means that we need some way for the QEs to know + * that Teardown should complete, otherwise we deadlock the entire query + * (QEs wait in their Teardown calls, while the QD waits for them to + * finish) + */ + shutdown(conn->sockfd, SHUT_WR); + + MPP_FD_CLR(conn->sockfd, &pEntry->readSet); + return; +} + +void +SendStopMessageTCP(ChunkTransportState * transportStates, int16 motNodeID) { ChunkTransportStateEntry *pEntry = NULL; MotionConn *conn; @@ -2459,7 +2573,7 @@ doSendStopMessageTCP(ChunkTransportState *transportStates, int16 motNodeID) */ for (i = 0; i < pEntry->numConns; i++) { - conn = pEntry->conns + i; + getMotionConn(pEntry, i, &conn); if (conn->sockfd >= 0 && MPP_FD_ISSET(conn->sockfd, &pEntry->readSet)) @@ -2486,13 +2600,142 @@ doSendStopMessageTCP(ChunkTransportState *transportStates, int16 motNodeID) } } /* CRITICAL TO AVOID DEADLOCK */ - DeregisterReadInterest(transportStates, motNodeID, i, - "no more input needed"); + DeregisterReadInterestTCP(transportStates, motNodeID, i, + "no more input needed"); } } -static TupleChunkListItem -RecvTupleChunkFromTCP(ChunkTransportState *transportStates, +TupleChunkListItem +RecvTupleChunkTCP(MotionConn * conn, ChunkTransportState * transportStates) +{ + TupleChunkListItem tcItem; + TupleChunkListItem firstTcItem = NULL; + TupleChunkListItem lastTcItem = NULL; + uint32 tcSize; + int bytesProcessed = 0; + + + /* read the packet in from the network. */ + readPacket(conn, transportStates); + + /* go through and form us some TupleChunks. 
*/ + bytesProcessed = PACKET_HEADER_SIZE; + +#ifdef AMS_VERBOSE_LOGGING + elog(DEBUG5, "recvtuple chunk recv bytes %d msgsize %d conn->pBuff %p conn->msgPos: %p", + conn->recvBytes, conn->msgSize, conn->pBuff, conn->msgPos); +#endif + + while (bytesProcessed != conn->msgSize) + { + if (conn->msgSize - bytesProcessed < TUPLE_CHUNK_HEADER_SIZE) + { + elog(LOG, "Interconnect parse details(TCP/PROXY): pkt->len %d", + *((int32 *) conn->pBuff)); + + ereport(ERROR, + (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), + errmsg("interconnect error parsing message: insufficient data received"), + errdetail("conn->msgSize %d bytesProcessed %d < chunk-header %d", + conn->msgSize, bytesProcessed, TUPLE_CHUNK_HEADER_SIZE))); + } + + tcSize = TUPLE_CHUNK_HEADER_SIZE + (*(uint16 *) (conn->msgPos + bytesProcessed)); + + /* sanity check */ + if (tcSize > Gp_max_packet_size) + { + /* + * see MPP-720: it is possible that our message got messed up by a + * cancellation ? + */ + ML_CHECK_FOR_INTERRUPTS(transportStates->teardownActive); + + /* + * MPP-4010: add some extra debugging. + */ + if (lastTcItem != NULL) + elog(LOG, "Interconnect error parsing message: last item length %d inplace %p", lastTcItem->chunk_length, lastTcItem->inplace); + else + elog(LOG, "Interconnect error parsing message: no last item"); + + elog(LOG, "Interconnect parse details(TCP/PROXY): pkt->len %d", + *((int32 *) conn->pBuff)); + + ereport(ERROR, + (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), + errmsg("interconnect error parsing message"), + errdetail("tcSize %d > max %d header %d processed %d/%d from %p", + tcSize, Gp_max_packet_size, + TUPLE_CHUNK_HEADER_SIZE, bytesProcessed, + conn->msgSize, conn->msgPos))); + } + + + /* + * we only check for interrupts here when we don't have a guaranteed + * full-message + */ + + if (tcSize >= conn->msgSize) + { + /* + * see MPP-720: it is possible that our message got messed up by a + * cancellation ? 
+ */ + ML_CHECK_FOR_INTERRUPTS(transportStates->teardownActive); + + elog(LOG, "Interconnect parse details(TCP/PROXY): pkt->len %d", + *((int32 *) conn->pBuff)); + + ereport(ERROR, + (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), + errmsg("interconnect error parsing message"), + errdetail("tcSize %d >= conn->msgSize %d", + tcSize, conn->msgSize))); + } + Assert(tcSize < conn->msgSize); + + /* + * We store the data inplace, and handle any necessary copying later + * on + */ + tcItem = (TupleChunkListItem) palloc(sizeof(TupleChunkListItemData)); + + tcItem->p_next = NULL; + tcItem->chunk_length = tcSize; + tcItem->inplace = (char *) (conn->msgPos + bytesProcessed); + + bytesProcessed += tcSize; + + if (firstTcItem == NULL) + { + firstTcItem = tcItem; + lastTcItem = tcItem; + } + else + { + lastTcItem->p_next = tcItem; + lastTcItem = tcItem; + } + } + + conn->recvBytes -= conn->msgSize; + if (conn->recvBytes != 0) + { +#ifdef AMS_VERBOSE_LOGGING + elog(DEBUG5, "residual message %d bytes", conn->recvBytes); +#endif + conn->msgPos += conn->msgSize; + } + + conn->msgSize = 0; + + return firstTcItem; +} + +TupleChunkListItem +RecvTupleChunkFromTCP(ChunkTransportState * transportStates, int16 motNodeID, int16 srcRoute) { @@ -2503,36 +2746,39 @@ RecvTupleChunkFromTCP(ChunkTransportState *transportStates, ML_CHECK_FOR_INTERRUPTS(transportStates->teardownActive); #ifdef AMS_VERBOSE_LOGGING - elog(DEBUG5, "RecvTupleChunkFrom(motNodID=%d, srcRoute=%d)", motNodeID, srcRoute); + elog(DEBUG5, "RecvTupleChunkFromTCP(motNodID=%d, srcRoute=%d)", motNodeID, srcRoute); #endif getChunkTransportState(transportStates, motNodeID, &pEntry); - conn = pEntry->conns + srcRoute; + getMotionConn(pEntry, srcRoute, &conn); - return RecvTupleChunk(conn, transportStates); + return RecvTupleChunkTCP(conn, transportStates); } -static TupleChunkListItem -RecvTupleChunkFromAnyTCP(ChunkTransportState *transportStates, +TupleChunkListItem +RecvTupleChunkFromAnyTCP(ChunkTransportState * transportStates, int16 
motNodeID, int16 *srcRoute) { - ChunkTransportStateEntry *pEntry = NULL; + ChunkTransportStateEntry *pChunkEntry = NULL; + ChunkTransportStateEntryTCP *pEntry = NULL; TupleChunkListItem tcItem; - MotionConn *conn; + MotionConn *conn; mpp_fd_set rset; int n, i, index; bool skipSelect = false; - int nwaitfds = 0; - int *waitFds = NULL; + int nwaitfds = 0; + int *waitFds = NULL; #ifdef AMS_VERBOSE_LOGGING - elog(DEBUG5, "RecvTupleChunkFromAny(motNodeId=%d)", motNodeID); + elog(DEBUG5, "RecvTupleChunkFromAnyTCP(motNodeId=%d)", motNodeID); #endif - getChunkTransportState(transportStates, motNodeID, &pEntry); + getChunkTransportState(transportStates, motNodeID, &pChunkEntry); + pEntry = CONTAINER_OF(pChunkEntry, ChunkTransportStateEntryTCP, entry); + Assert(pEntry); int retry = 0; @@ -2547,21 +2793,21 @@ RecvTupleChunkFromAnyTCP(ChunkTransportState *transportStates, } struct timeval timeout = tval; - int nfds = pEntry->highReadSock; - + int nfds = pEntry->highReadSock; + /* make sure we check for these. 
*/ ML_CHECK_FOR_INTERRUPTS(transportStates->teardownActive); - memcpy(&rset, &pEntry->readSet, sizeof(mpp_fd_set)); + memcpy(&rset, &pEntry->entry.readSet, sizeof(mpp_fd_set)); /* * since we may have data in a local buffer, we may be able to * short-circuit the select() call (and if we don't do this we may * wait when we have data ready, since it has already been read) */ - for (i = 0; i < pEntry->numConns; i++) + for (i = 0; i < pEntry->entry.numConns; i++) { - conn = pEntry->conns + i; + getMotionConn(&pEntry->entry, i, &conn); if (conn->sockfd >= 0 && MPP_FD_ISSET(conn->sockfd, &rset) && @@ -2596,8 +2842,17 @@ RecvTupleChunkFromAnyTCP(ChunkTransportState *transportStates, } - // GPDB_12_MERGE_FIXME: should use WaitEventSetWait() instead of select() - // follow the routine in ic_udpifc.c + /* + * GPDB_12_MERGE_FIXME: should use WaitEventSetWait() instead of + * select() + */ + /* follow the routine in ic_udpifc.c */ + /** + * Note: ic-proxy won't receive any tuples if ic_proxy_addrs is empty, + * so double-check that the proxy addresses have been configured. + * + * See README.ic-proxy.md for more information. + */ n = select(nfds + 1, (fd_set *) &rset, NULL, NULL, &timeout); if (n < 0) { @@ -2610,7 +2865,8 @@ RecvTupleChunkFromAnyTCP(ChunkTransportState *transportStates, } else if (n > 0 && nwaitfds > 0) { - bool need_check = false; + bool need_check = false; + for (i = 0; i < nwaitfds; i++) if (MPP_FD_ISSET(waitFds[i], &rset)) { @@ -2627,7 +2883,7 @@ RecvTupleChunkFromAnyTCP(ChunkTransportState *transportStates, pfree(waitFds); #ifdef AMS_VERBOSE_LOGGING - elog(DEBUG5, "RecvTupleChunkFromAny() select() returned %d ready sockets", n); + elog(DEBUG5, "RecvTupleChunkFromAnyTCP() select() returned %d ready sockets", n); #endif } while (n < 1); @@ -2635,21 +2891,24 @@ RecvTupleChunkFromAnyTCP(ChunkTransportState *transportStates, * We scan the file descriptors starting from where we left off in the * last call (don't continually poll the first when others may be 
ready!). */ - index = pEntry->scanStart; - for (i = 0; i < pEntry->numConns; i++, index++) + index = pEntry->entry.scanStart; + for (i = 0; i < pEntry->entry.numConns; i++, index++) { /* - * avoid division ? index = ((scanStart + i) % pEntry->numConns); + * avoid division ? index = ((scanStart + i) % + * pEntry->entry.numConns); */ - if (index >= pEntry->numConns) + if (index >= pEntry->entry.numConns) + { index = 0; + } - conn = pEntry->conns + index; + getMotionConn(&pEntry->entry, index, &conn); #ifdef AMS_VERBOSE_LOGGING if (!conn->stillActive) { - elog(LOG, "RecvTupleChunkFromAny: trying to read on inactive socket %d", conn->sockfd); + elog(LOG, "RecvTupleChunkFromAnyTCP: trying to read on inactive socket %d", conn->sockfd); } #endif @@ -2657,9 +2916,9 @@ RecvTupleChunkFromAnyTCP(ChunkTransportState *transportStates, MPP_FD_ISSET(conn->sockfd, &rset)) { #ifdef AMS_VERBOSE_LOGGING - elog(DEBUG5, "RecvTupleChunkFromAny() (fd %d) %d/%d", conn->sockfd, motNodeID, index); + elog(DEBUG5, "RecvTupleChunkFromAnyTCP() (fd %d) %d/%d", conn->sockfd, motNodeID, index); #endif - tcItem = RecvTupleChunk(conn, transportStates); + tcItem = RecvTupleChunkTCP(conn, transportStates); *srcRoute = index; @@ -2667,7 +2926,7 @@ RecvTupleChunkFromAnyTCP(ChunkTransportState *transportStates, * advance start point (avoid doing division/modulus operation * here) */ - pEntry->scanStart = index + 1; + pEntry->entry.scanStart = index + 1; return tcItem; } @@ -2679,8 +2938,8 @@ RecvTupleChunkFromAnyTCP(ChunkTransportState *transportStates, } /* See ml_ipc.h */ -static void -SendEosTCP(ChunkTransportState *transportStates, +void +SendEOSTCP(ChunkTransportState * transportStates, int motNodeID, TupleChunkListItem tcItem) { @@ -2690,11 +2949,11 @@ SendEosTCP(ChunkTransportState *transportStates, if (!transportStates) { - elog(FATAL, "SendEosTCP: missing interconnect context."); + elog(FATAL, "SendEOSTCP: missing interconnect context."); } else if (!transportStates->activated && 
!transportStates->teardownActive) { - elog(FATAL, "SendEosTCP: context and teardown inactive."); + elog(FATAL, "SendEOSTCP: context and teardown inactive."); } /* check em' */ @@ -2715,13 +2974,13 @@ SendEosTCP(ChunkTransportState *transportStates, /* now flush all of the buffers. */ for (i = 0; i < pEntry->numConns; i++) { - conn = pEntry->conns + i; + getMotionConn(pEntry, i, &conn); if (conn->sockfd >= 0 && conn->state == mcsStarted) flushBuffer(transportStates, pEntry, conn, motNodeID); #ifdef AMS_VERBOSE_LOGGING - elog(DEBUG5, "SendEosTCP() Leaving"); + elog(DEBUG5, "SendEOSTCP() Leaving"); #endif } @@ -2729,8 +2988,8 @@ SendEosTCP(ChunkTransportState *transportStates, } static bool -flushBuffer(ChunkTransportState *transportStates, - ChunkTransportStateEntry *pEntry, MotionConn *conn, int16 motionId) +flushBuffer(ChunkTransportState * transportStates, + ChunkTransportStateEntry * pEntry, MotionConn * conn, int16 motionId) { char *sendptr; int n, @@ -2829,8 +3088,8 @@ flushBuffer(ChunkTransportState *transportStates, /* * as a sender... if there is something to read... it must - * mean its a StopSendingMessage or receiver has teared down - * the interconnect, we don't even bother to read it. + * mean it's a StopSendingMessage or the receiver has torn + * down the interconnect, so we don't even bother to read it. */ if (MPP_FD_ISSET(conn->sockfd, &rset) || transportStates->teardownActive) { @@ -2909,15 +3168,15 @@ flushBuffer(ChunkTransportState *transportStates, * tcItem - message to be sent. * motionId - Node Motion Id. 
*/ -static bool -SendChunkTCP(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntry, MotionConn *conn, TupleChunkListItem tcItem, int16 motionId) +bool +SendChunkTCP(ChunkTransportState * transportStates, ChunkTransportStateEntry * pEntry, MotionConn * conn, TupleChunkListItem tcItem, int16 motionId) { int length = tcItem->chunk_length; Assert(conn->msgSize > 0); #ifdef AMS_VERBOSE_LOGGING - elog(DEBUG5, "sendChunk: msgSize %d this chunk length %d", conn->msgSize, tcItem->chunk_length); + elog(DEBUG5, "SendChunkTCP: msgSize %d this chunk length %d", conn->msgSize, tcItem->chunk_length); #endif if (conn->msgSize + length > Gp_max_packet_size) @@ -2940,7 +3199,7 @@ SendChunkTCP(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEn * time. */ static void -gp_set_monotonic_begin_time(GpMonotonicTime *time) +gp_set_monotonic_begin_time(GpMonotonicTime * time) { time->beginTime.tv_sec = 0; time->beginTime.tv_usec = 0; @@ -2965,14 +3224,15 @@ gp_set_monotonic_begin_time(GpMonotonicTime *time) * calls. It is not for getting the system time. */ static void -gp_get_monotonic_time(GpMonotonicTime *time) +gp_get_monotonic_time(GpMonotonicTime * time) { struct timeval newTime; - int status; + int status; #if HAVE_LIBRT /* Use clock_gettime to return monotonic time value. */ struct timespec ts; + status = clock_gettime(CLOCK_MONOTONIC, &ts); newTime.tv_sec = ts.tv_sec; @@ -2981,7 +3241,7 @@ gp_get_monotonic_time(GpMonotonicTime *time) #else gettimeofday(&newTime, NULL); - status = 0; /* gettimeofday always succeeds. */ + status = 0; /* gettimeofday always succeeds. */ #endif @@ -3033,7 +3293,7 @@ timeCmp(struct timeval *t1, struct timeval *t2) * to the current time. 
*/ static inline uint64 -gp_get_elapsed_us(GpMonotonicTime *time) +gp_get_elapsed_us(GpMonotonicTime * time) { if (time->beginTime.tv_sec == 0 && time->beginTime.tv_usec == 0) @@ -3046,7 +3306,25 @@ gp_get_elapsed_us(GpMonotonicTime *time) } static inline uint64 -gp_get_elapsed_ms(GpMonotonicTime *time) +gp_get_elapsed_ms(GpMonotonicTime * time) { return gp_get_elapsed_us(time) / (USECS_PER_SECOND / MSECS_PER_SECOND); } + +int +GetMaxTupleChunkSizeTCP(void) +{ + return Gp_max_packet_size - PACKET_HEADER_SIZE - TUPLE_CHUNK_HEADER_SIZE; +} + +int32 +GetListenPortTCP(void) +{ + return tcp_listener_port; +} + +void +WaitInterconnectQuitTCP(void) +{ + /* do nothing */ +} diff --git a/contrib/interconnect/tcp/ic_tcp.h b/contrib/interconnect/tcp/ic_tcp.h new file mode 100644 index 00000000000..4072b35e6b5 --- /dev/null +++ b/contrib/interconnect/tcp/ic_tcp.h @@ -0,0 +1,92 @@ +/*------------------------------------------------------------------------- + * ic_tcp.h + * Motion IPC TCP implements. + * + * Portions Copyright (c) 2023-, Cloudberry inc + * + * + * IDENTIFICATION + * contrib/interconnect/tcp/ic_tcp.h + * + *------------------------------------------------------------------------- + */ +#ifndef IC_TCP_INTERFACE_H +#define IC_TCP_INTERFACE_H + +#include "postgres.h" + +#include "common/ip.h" +#include "nodes/execnodes.h" /* ExecSlice, SliceTable */ +#include "miscadmin.h" +#include "libpq/libpq-be.h" +#include "utils/builtins.h" +#include "utils/memutils.h" + +#include "cdb/ml_ipc.h" +#include "cdb/cdbvars.h" +#include "cdb/cdbdisp.h" + +#include +#include +#include +#include + + +/* + * Registration message + * + * Upon making a connection, the sender sends a registration message to + * identify itself to the receiver. A lot of the fields are just there + * for validity checking. 
+ */ +typedef struct RegisterMessage +{ + int32 msgBytes; + int32 recvSliceIndex; + int32 sendSliceIndex; + int32 srcContentId; + int32 srcListenerPort; + int32 srcPid; + int32 srcSessionId; + int32 srcCommandCount; +} RegisterMessage; + +extern int GetMaxTupleChunkSizeTCP(void); + +extern int32 GetListenPortTCP(void); + +extern void InitMotionTCP(int *listenerSocketFd, int32 *listenerPort); +extern void TeardownTCPInterconnect(ChunkTransportState * transportStates, bool hasErrors); + +extern void InitMotionIPCLayerTCP(void); +extern void CleanUpMotionIPCLayerTCP(void); + +extern void WaitInterconnectQuitTCP(void); + +extern void TeardownInterconnectTCP(ChunkTransportState * transportStates, + bool hasErrors); + +extern void SetupInterconnectTCP(EState *estate); + +extern void DeregisterReadInterestTCP(ChunkTransportState * transportStates, + int motNodeID, + int srcRoute, + const char *reason); + +extern bool SendChunkTCP(ChunkTransportState * transportStates, ChunkTransportStateEntry * pEntry, + MotionConn * conn, TupleChunkListItem tcItem, int16 motionId); +extern void SendEOSTCP(ChunkTransportState * transportStates, + int motNodeID, TupleChunkListItem tcItem); +extern void SendStopMessageTCP(ChunkTransportState * transportStates, int16 motNodeID); + +extern TupleChunkListItem RecvTupleChunkFromAnyTCP(ChunkTransportState * transportStates, + int16 motNodeID, + int16 *srcRoute); + +extern TupleChunkListItem RecvTupleChunkFromTCP(ChunkTransportState * transportStates, + int16 motNodeID, + int16 srcRoute); + +extern TupleChunkListItem RecvTupleChunkTCP(MotionConn * conn, ChunkTransportState * transportStates); + +#endif // IC_TCP_INTERFACE_H diff --git a/src/include/cdb/cdbicudpfaultinjection.h b/contrib/interconnect/udp/ic_faultinjection.h similarity index 99% rename from src/include/cdb/cdbicudpfaultinjection.h rename to contrib/interconnect/udp/ic_faultinjection.h index 7bea414b7ef..c3aa69a2dcc 100644 --- a/src/include/cdb/cdbicudpfaultinjection.h +++ 
b/contrib/interconnect/udp/ic_faultinjection.h @@ -1,6 +1,6 @@ /*------------------------------------------------------------------------- * - * cdbicudpfaultinjection.h + * ic_faultinjection.h * Fault injection code for UDP interconnect. * * Portions Copyright (c) 2005-2011, Greenplum Inc. @@ -9,13 +9,13 @@ * * * IDENTIFICATION - * src/include/cdb/cdbicudpfaultinjection.h + * contrib/interconnect/udp/ic_faultinjection.h * *------------------------------------------------------------------------- */ -#ifndef CDBICUDPFAULTINJECTION_H -#define CDBICUDPFAULTINJECTION_H +#ifndef IC_FAULTINJECTION_H +#define IC_FAULTINJECTION_H #ifdef HAVE_POLL_H #include @@ -619,8 +619,9 @@ testmode_pthread_create(const char *caller_name, pthread_t *thread, return pthread_create(thread, attr, start_routine, arg); } - +#ifdef ML_CHECK_FOR_INTERRUPTS #undef ML_CHECK_FOR_INTERRUPTS +#endif #undef sendto #undef recvfrom #undef poll diff --git a/src/backend/cdb/motion/ic_udpifc.c b/contrib/interconnect/udp/ic_udpifc.c similarity index 85% rename from src/backend/cdb/motion/ic_udpifc.c rename to contrib/interconnect/udp/ic_udpifc.c index 90a1e6abde7..a5cf7b7aaaa 100644 --- a/src/backend/cdb/motion/ic_udpifc.c +++ b/contrib/interconnect/udp/ic_udpifc.c @@ -23,6 +23,9 @@ #endif #include "postgres.h" +#include "ic_udpifc.h" +#include "ic_internal.h" +#include "ic_common.h" #include #include @@ -55,7 +58,7 @@ #include "cdb/cdbvars.h" #include "cdb/cdbdisp.h" #include "cdb/cdbdispatchresult.h" -#include "cdb/cdbicudpfaultinjection.h" +#include "ic_faultinjection.h" #ifdef WIN32 #define WIN32_LEAN_AND_MEAN @@ -438,7 +441,7 @@ struct ICGlobalControlInfo /* Used by main thread to ask the background thread to exit. */ pg_atomic_uint32 shutdown; - /* + /* * Used by ic thread in the QE to identify the current serving ic instance * and handle the mismatch packets. 
It is not used by QD because QD may have * cursors, QD may receive packets for open the cursors with lower instance @@ -576,7 +579,7 @@ struct UnackQueueRing static UnackQueueRing unack_queue_ring = {0, 0, 0}; static int ICSenderSocket = -1; -static uint16 ICSenderPort = 0; +static int32 ICSenderPort = 0; static int ICSenderFamily = 0; /* @@ -643,6 +646,12 @@ typedef struct ICStatistics /* Statistics for UDP interconnect. */ static ICStatistics ic_statistics; +/* UDP listen fd */ +int UDP_listenerFd; + +/* UDP listen port */ +int32 udp_listener_port; + /*========================================================================= * STATIC FUNCTIONS declarations */ @@ -667,12 +676,12 @@ static void SendDummyPacket(void); static void getSockAddr(struct sockaddr_storage *peer, socklen_t *peer_len, const char *listenerAddr, int listenerPort); static void setXmitSocketOptions(int txfd); static uint32 setSocketBufferSize(int fd, int type, int expectedSize, int leastSize); -static void setupUDPListeningSocket(int *listenerSocketFd, uint16 *listenerPort, int *txFamily); +static void setupUDPListeningSocket(int *listenerSocketFd, int32 *listenerPort, int *txFamily); static ChunkTransportStateEntry *startOutgoingUDPConnections(ChunkTransportState *transportStates, ExecSlice *sendSlice, int *pOutgoingCount); static void setupOutgoingUDPConnection(ChunkTransportState *transportStates, - ChunkTransportStateEntry *pEntry, MotionConn *conn); + ChunkTransportStateEntry *pChunkEntry, MotionConn *conn); /* Connection hash table functions. 
*/ static bool initConnHashTable(ConnHashTable *ht, MemoryContext ctx); @@ -703,35 +712,31 @@ static void icBufferListFree(ICBufferList *list); static inline ICBuffer *icBufferListAppend(ICBufferList *list, ICBuffer *buf); static void icBufferListReturn(ICBufferList *list, bool inExpirationQueue); +static inline void SetupUDPIFCInterconnect(EState *estate); +static inline void InitMotionUDPIFC(int *listenerSocketFd, int32 *listenerPort); static ChunkTransportState *SetupUDPIFCInterconnect_Internal(SliceTable *sliceTable); + +static inline void markUDPConnInactiveIFC(MotionConn *conn); +static inline void CleanupMotionUDPIFC(void); + static inline TupleChunkListItem RecvTupleChunkFromAnyUDPIFC_Internal(ChunkTransportState *transportStates, int16 motNodeID, int16 *srcRoute); static inline TupleChunkListItem RecvTupleChunkFromUDPIFC_Internal(ChunkTransportState *transportStates, int16 motNodeID, int16 srcRoute); + +static inline void +TeardownUDPIFCInterconnect(ChunkTransportState *transportStates, + bool hasErrors); static void TeardownUDPIFCInterconnect_Internal(ChunkTransportState *transportStates, bool hasErrors); static void freeDisorderedPackets(MotionConn *conn); static void prepareRxConnForRead(MotionConn *conn); -static TupleChunkListItem RecvTupleChunkFromAnyUDPIFC(ChunkTransportState *transportStates, - int16 motNodeID, - int16 *srcRoute); - -static TupleChunkListItem RecvTupleChunkFromUDPIFC(ChunkTransportState *transportStates, - int16 motNodeID, - int16 srcRoute); static TupleChunkListItem receiveChunksUDPIFC(ChunkTransportState *pTransportStates, ChunkTransportStateEntry *pEntry, int16 motNodeID, int16 *srcRoute, MotionConn *conn); - -static void SendEosUDPIFC(ChunkTransportState *transportStates, - int motNodeID, TupleChunkListItem tcItem); -static bool SendChunkUDPIFC(ChunkTransportState *transportStates, - ChunkTransportStateEntry *pEntry, MotionConn *conn, TupleChunkListItem tcItem, int16 motionId); - -static void 
doSendStopMessageUDPIFC(ChunkTransportState *transportStates, int16 motNodeID); static bool dispatcherAYT(void); static void checkQDConnectionAlive(void); @@ -740,8 +745,8 @@ static void *rxThreadFunc(void *arg); static bool handleMismatch(icpkthdr *pkt, struct sockaddr_storage *peer, int peer_len); static void handleAckedPacket(MotionConn *ackConn, ICBuffer *buf, uint64 now); -static bool handleAcks(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntry); -static void handleStopMsgs(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntry, int16 motionId); +static bool handleAcks(ChunkTransportState *transportStates, ChunkTransportStateEntry *pChunkEntry); +static void handleStopMsgs(ChunkTransportState *transportStates, ChunkTransportStateEntry *pChunkEntry, int16 motionId); static void handleDisorderPacket(MotionConn *conn, int pos, uint32 tailSeq, icpkthdr *pkt); static bool handleDataPacket(MotionConn *conn, icpkthdr *pkt, struct sockaddr_storage *peer, socklen_t *peerlen, AckSendParam *param, bool *wakeup_mainthread); static bool handleAckForDuplicatePkt(MotionConn *conn, icpkthdr *pkt); @@ -751,7 +756,7 @@ static inline void prepareXmit(MotionConn *conn); static inline void addCRC(icpkthdr *pkt); static inline bool checkCRC(icpkthdr *pkt); static void sendBuffers(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntry, MotionConn *conn); -static void sendOnce(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntry, ICBuffer *buf, MotionConn *conn); +static void sendOnce(ChunkTransportState *transportStates, ChunkTransportStateEntry *pChunkEntry, ICBuffer *buf, MotionConn *conn); static inline uint64 computeExpirationPeriod(MotionConn *conn, uint32 retry); static ICBuffer *getSndBuffer(MotionConn *conn); @@ -761,7 +766,7 @@ static void putIntoUnackQueueRing(UnackQueueRing *uqr, ICBuffer *buf, uint64 exp static void initUnackQueueRing(UnackQueueRing *uqr); static void 
checkExpiration(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntry, MotionConn *triggerConn, uint64 now); -static void checkDeadlock(ChunkTransportStateEntry *pEntry, MotionConn *conn); +static void checkDeadlock(ChunkTransportStateEntry *pChunkEntry, MotionConn *conn); static bool cacheFuturePacket(icpkthdr *pkt, struct sockaddr_storage *peer, int peer_len); static void cleanupStartupCache(void); @@ -771,7 +776,7 @@ static uint64 getCurrentTime(void); static void initMutex(pthread_mutex_t *mutex); static inline void logPkt(char *prefix, icpkthdr *pkt); -static void aggregateStatistics(ChunkTransportStateEntry *pEntry); +static void aggregateStatistics(ChunkTransportStateEntry *pChunkEntry); static inline bool pollAcks(ChunkTransportState *transportStates, int fd, int timeout); @@ -1158,7 +1163,7 @@ resetRxThreadError() * Setup udp listening socket. */ static void -setupUDPListeningSocket(int *listenerSocketFd, uint16 *listenerPort, int *txFamily) +setupUDPListeningSocket(int *listenerSocketFd, int32 *listenerPort, int *txFamily) { int errnoSave; int fd = -1; @@ -1402,8 +1407,8 @@ ic_reset_pthread_sigmasks(sigset_t *sigs) * InitMotionUDPIFC * Initialize UDP specific comms, and create rx-thread. 
*/ -void -InitMotionUDPIFC(int *listenerSocketFd, uint16 *listenerPort) +static inline void +InitMotionUDPIFC(int *listenerSocketFd, int32 *listenerPort) { int pthread_err; int txFamily = -1; @@ -1491,7 +1496,7 @@ InitMotionUDPIFC(int *listenerSocketFd, uint16 *listenerPort) ic_control_info.threadCreated = false; ereport(FATAL, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("InitMotionLayerIPC: failed to create thread"), + errmsg("InitMotionUDPIFC: failed to create thread"), errdetail("pthread_create() failed with err %d", pthread_err))); } @@ -1499,12 +1504,19 @@ InitMotionUDPIFC(int *listenerSocketFd, uint16 *listenerPort) return; } +void +InitMotionIPCLayerUDP(void) +{ + InitMotionUDPIFC(&UDP_listenerFd, &udp_listener_port); + + elog(DEBUG1, "Interconnect listening on udp port %d ", udp_listener_port); +} + /* * CleanupMotionUDPIFC * Clean up UDP specific stuff such as cursor ic hash table, thread etc. */ -void -CleanupMotionUDPIFC(void) +static inline void CleanupMotionUDPIFC(void) { elog(DEBUG2, "udp-ic: telling receiver thread to shutdown."); @@ -1557,10 +1569,25 @@ CleanupMotionUDPIFC(void) * introduce issues. */ if (icudp_malloc_times != 0) - elog(LOG, "WARNING: malloc times and free times do not match."); + elog(LOG, "WARNING: malloc times and free times do not match. remain alloc times: %ld", icudp_malloc_times); #endif } +void CleanUpMotionLayerIPCUDP(void) +{ + if (gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG) + elog(DEBUG3, "Cleaning Up Motion Layer IPC..."); + + CleanupMotionUDPIFC(); + + if (UDP_listenerFd >= 0) + closesocket(UDP_listenerFd); + + /* be safe and reset global state variables. */ + udp_listener_port = 0; + UDP_listenerFd = -1; +} + /* * initConnHashTable * Initialize a connection hash table. @@ -1600,12 +1627,15 @@ initConnHashTable(ConnHashTable *ht, MemoryContext cxt) * need to use CONN_HASH_MATCH() at all! 
*/ static bool -connAddHash(ConnHashTable *ht, MotionConn *conn) +connAddHash(ConnHashTable *ht, MotionConn *mConn) { uint32 hashcode; struct ConnHtabBin *bin, *newbin; MemoryContext old = NULL; + MotionConnUDP *conn = NULL; + + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); hashcode = CONN_HASH_VALUE(&conn->conn_info) % ht->size; @@ -1615,7 +1645,7 @@ connAddHash(ConnHashTable *ht, MotionConn *conn) */ for (bin = ht->table[hashcode]; bin != NULL; bin = bin->next) { - if (bin->conn == conn) + if (bin->conn == &conn->mConn) { elog(DEBUG5, "connAddHash(): duplicate ?! node %d route %d", conn->conn_info.motNodeId, conn->route); return true; /* false *only* indicates memory-alloc @@ -1635,7 +1665,7 @@ connAddHash(ConnHashTable *ht, MotionConn *conn) return false; } - newbin->conn = conn; + newbin->conn = &conn->mConn; newbin->next = ht->table[hashcode]; ht->table[hashcode] = newbin; @@ -1656,13 +1686,15 @@ connAddHash(ConnHashTable *ht, MotionConn *conn) * use CONN_HASH_MATCH() at all! */ static void -connDelHash(ConnHashTable *ht, MotionConn *conn) +connDelHash(ConnHashTable *ht, MotionConn *mConn) { uint32 hashcode; struct ConnHtabBin *c, *p, *trash; + MotionConnUDP *conn = NULL; + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); hashcode = CONN_HASH_VALUE(&conn->conn_info) % ht->size; c = ht->table[hashcode]; @@ -1672,7 +1704,7 @@ connDelHash(ConnHashTable *ht, MotionConn *conn) while (c != NULL) { /* found ? 
*/ - if (c->conn == conn) + if (c->conn == &conn->mConn) break; p = c; @@ -1720,18 +1752,20 @@ findConnByHeader(ConnHashTable *ht, icpkthdr *hdr) uint32 hashcode; struct ConnHtabBin *bin; MotionConn *ret = NULL; + MotionConnUDP *conn = NULL; hashcode = CONN_HASH_VALUE(hdr) % ht->size; for (bin = ht->table[hashcode]; bin != NULL; bin = bin->next) { - if (CONN_HASH_MATCH(&bin->conn->conn_info, hdr)) + conn = CONTAINER_OF(bin->conn, MotionConnUDP, mConn); + if (CONN_HASH_MATCH(&conn->conn_info, hdr)) { - ret = bin->conn; + ret = &conn->mConn; if (DEBUG5 >= log_min_messages) write_log("findConnByHeader: found. route %d state %d hashcode %d conn %p", - ret->route, ret->state, hashcode, ret); + conn->route, ret->state, hashcode, ret); return ret; } @@ -1822,8 +1856,12 @@ sendControlMessage(icpkthdr *pkt, int fd, struct sockaddr *addr, socklen_t peerL * Set the ack sending parameters. */ static inline void -setAckSendParam(AckSendParam *param, MotionConn *conn, int32 flags, uint32 seq, uint32 extraSeq) +setAckSendParam(AckSendParam *param, MotionConn *mConn, int32 flags, uint32 seq, uint32 extraSeq) { + MotionConnUDP *conn = NULL; + + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); + memcpy(&param->msg, (char *) &conn->conn_info, sizeof(icpkthdr)); param->msg.flags = flags; param->msg.seq = seq; @@ -1848,9 +1886,12 @@ sendAckWithParam(AckSendParam *param) * Send acknowledgment to sender. 
*/ static void -sendAck(MotionConn *conn, int32 flags, uint32 seq, uint32 extraSeq) +sendAck(MotionConn *mConn, int32 flags, uint32 seq, uint32 extraSeq) { icpkthdr msg; + MotionConnUDP *conn = NULL; + + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); memcpy(&msg, (char *) &conn->conn_info, sizeof(msg)); @@ -1877,9 +1918,12 @@ sendAck(MotionConn *conn, int32 flags, uint32 seq, uint32 extraSeq) * */ static void -sendDisorderAck(MotionConn *conn, uint32 seq, uint32 extraSeq, uint32 lostPktCnt) +sendDisorderAck(MotionConn *mConn, uint32 seq, uint32 extraSeq, uint32 lostPktCnt) { icpkthdr *disorderBuffer = rx_control_info.disorderBuffer; + MotionConnUDP *conn = NULL; + + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); memcpy(disorderBuffer, (char *) &conn->conn_info, sizeof(icpkthdr)); @@ -1891,7 +1935,7 @@ sendDisorderAck(MotionConn *conn, uint32 seq, uint32 extraSeq, uint32 lostPktCnt #ifdef AMS_VERBOSE_LOGGING if (!(conn->peer.ss_family == AF_INET || conn->peer.ss_family == AF_INET6)) { - write_log("UDP Interconnect bug (in sendDisorderAck): trying to send ack when we don't know where to send to %s", conn->remoteHostAndPort); + write_log("UDP Interconnect bug (in sendDisorderAck): trying to send ack when we don't know where to send to %s", conn->mConn.remoteHostAndPort); } #endif @@ -1907,9 +1951,12 @@ sendDisorderAck(MotionConn *conn, uint32 seq, uint32 extraSeq, uint32 lostPktCnt * the connection status (consumed seq, received seq ...). 
*/ static void -sendStatusQueryMessage(MotionConn *conn, int fd, uint32 seq) +sendStatusQueryMessage(MotionConn *mConn, int fd, uint32 seq) { icpkthdr msg; + MotionConnUDP *conn = NULL; + + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); memcpy(&msg, (char *) &conn->conn_info, sizeof(msg)); msg.flags = UDPIC_FLAGS_CAPACITY; @@ -1918,7 +1965,7 @@ sendStatusQueryMessage(MotionConn *conn, int fd, uint32 seq) msg.len = sizeof(msg); #ifdef TRANSFER_PROTOCOL_STATS - updateStats(TPE_ACK_PKT_QUERY, conn, &msg); + updateStats(TPE_ACK_PKT_QUERY, &conn->mConn, &msg); #endif sendControlMessage(&msg, fd, (struct sockaddr *) &conn->peer, conn->peer_len); @@ -1932,10 +1979,13 @@ sendStatusQueryMessage(MotionConn *conn, int fd, uint32 seq) * SHOULD BE CALLED WITH ic_control_info.lock *LOCKED* */ static void -putRxBufferAndSendAck(MotionConn *conn, AckSendParam *param) +putRxBufferAndSendAck(MotionConn *mConn, AckSendParam *param) { icpkthdr *buf; uint32 seq; + MotionConnUDP *conn = NULL; + + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); buf = (icpkthdr *) conn->pkt_q[conn->pkt_q_head]; if (buf == NULL) @@ -1947,16 +1997,20 @@ putRxBufferAndSendAck(MotionConn *conn, AckSendParam *param) seq = buf->seq; #ifdef AMS_VERBOSE_LOGGING - elog(LOG, "putRxBufferAndSendAck conn %p pkt [seq %d] for node %d route %d, [head seq] %d queue size %d, queue head %d queue tail %d", conn, seq, buf->motNodeId, conn->route, conn->conn_info.seq - conn->pkt_q_size, conn->pkt_q_size, conn->pkt_q_head, conn->pkt_q_tail); + elog(LOG, "putRxBufferAndSendAck conn %p pkt [seq %d] for node %d route %d, [head seq] %d queue size %d, queue head %d queue tail %d", + &conn->mConn, seq, buf->motNodeId, conn->route, conn->conn_info.seq - conn->pkt_q_size, + conn->pkt_q_size, conn->pkt_q_head, conn->pkt_q_tail); #endif conn->pkt_q[conn->pkt_q_head] = NULL; - conn->pBuff = NULL; + conn->mConn.pBuff = NULL; conn->pkt_q_head = (conn->pkt_q_head + 1) % conn->pkt_q_capacity; conn->pkt_q_size--; #ifdef 
AMS_VERBOSE_LOGGING - elog(LOG, "putRxBufferAndSendAck conn %p pkt [seq %d] for node %d route %d, [head seq] %d queue size %d, queue head %d queue tail %d", conn, seq, buf->motNodeId, conn->route, conn->conn_info.seq - conn->pkt_q_size, conn->pkt_q_size, conn->pkt_q_head, conn->pkt_q_tail); + elog(LOG, "putRxBufferAndSendAck conn %p pkt [seq %d] for node %d route %d, [head seq] %d queue size %d, queue head %d queue tail %d", + &conn->mConn, seq, buf->motNodeId, conn->route, conn->conn_info.seq - conn->pkt_q_size, + conn->pkt_q_size, conn->pkt_q_head, conn->pkt_q_tail); #endif putRxBufferToFreeList(&rx_buffer_pool, buf); @@ -1968,56 +2022,15 @@ putRxBufferAndSendAck(MotionConn *conn, AckSendParam *param) { if (param != NULL) { - setAckSendParam(param, conn, UDPIC_FLAGS_ACK | UDPIC_FLAGS_CAPACITY | conn->conn_info.flags, conn->conn_info.seq - 1, seq); + setAckSendParam(param, &conn->mConn, UDPIC_FLAGS_ACK | UDPIC_FLAGS_CAPACITY | conn->conn_info.flags, conn->conn_info.seq - 1, seq); } else { - sendAck(conn, UDPIC_FLAGS_ACK | UDPIC_FLAGS_CAPACITY | conn->conn_info.flags, conn->conn_info.seq - 1, seq); + sendAck(&conn->mConn, UDPIC_FLAGS_ACK | UDPIC_FLAGS_CAPACITY | conn->conn_info.flags, conn->conn_info.seq - 1, seq); } } } -/* - * MlPutRxBufferIFC - * - * The cdbmotion code has discarded our pointer to the motion-conn - * structure, but has enough info to fully specify it. 
- */ -void -MlPutRxBufferIFC(ChunkTransportState *transportStates, int motNodeID, int route) -{ - ChunkTransportStateEntry *pEntry = NULL; - MotionConn *conn = NULL; - AckSendParam param; - - getChunkTransportState(transportStates, motNodeID, &pEntry); - - conn = pEntry->conns + route; - - memset(&param, 0, sizeof(AckSendParam)); - - pthread_mutex_lock(&ic_control_info.lock); - - if (conn->pBuff != NULL) - { - putRxBufferAndSendAck(conn, &param); - } - else - { - pthread_mutex_unlock(&ic_control_info.lock); - elog(FATAL, "Interconnect error: tried to release a NULL buffer"); - } - - pthread_mutex_unlock(&ic_control_info.lock); - - /* - * real ack sending is after lock release to decrease the lock holding - * time. - */ - if (param.msg.len != 0) - sendAckWithParam(&param); -} - /* * getRxBuffer * Get a receive buffer. @@ -2524,8 +2537,11 @@ initUnackQueueRing(UnackQueueRing *uqr) * */ static inline uint64 -computeExpirationPeriod(MotionConn *conn, uint32 retry) +computeExpirationPeriod(MotionConn *mConn, uint32 retry) { + MotionConnUDP *conn = NULL; + + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); /* * In fault injection mode, we often use DEFAULT_RTT, because the * intentional large percent of packet/ack losses will make the RTT too @@ -2584,9 +2600,12 @@ cleanSndBufferPool(SendBufferPool *p) * Return NULL when no free buffer available. 
*/ static ICBuffer * -getSndBuffer(MotionConn *conn) +getSndBuffer(MotionConn *mConn) { ICBuffer *ret = NULL; + MotionConnUDP *conn = NULL; + + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); ic_statistics.totalBuffers += (icBufferListLength(&snd_buffer_pool.freeList) + snd_buffer_pool.maxCount - snd_buffer_pool.count); ic_statistics.bufferCountingTime++; @@ -2650,12 +2669,15 @@ startOutgoingUDPConnections(ChunkTransportState *transportStates, ExecSlice *sendSlice, int *pOutgoingCount) { - ChunkTransportStateEntry *pEntry; - MotionConn *conn; + ChunkTransportStateEntry *pChunkEntry; + ChunkTransportStateEntryUDP *pEntry; + MotionConn *mConn; + MotionConnUDP *conn; ListCell *cell; ExecSlice *recvSlice; CdbProcess *cdbProc; int i; + size_t index; *pOutgoingCount = 0; @@ -2665,27 +2687,31 @@ startOutgoingUDPConnections(ChunkTransportState *transportStates, elog(DEBUG1, "Interconnect seg%d slice%d setting up sending motion node", GpIdentity.segindex, sendSlice->sliceIndex); - pEntry = createChunkTransportState(transportStates, + pChunkEntry = createChunkTransportState(transportStates, sendSlice, recvSlice, - list_length(recvSlice->primaryProcesses)); - - Assert(pEntry && pEntry->valid); - - /* - * Setup a MotionConn entry for each of our outbound connections. Request - * a connection to each receiving backend's listening port. NB: Some - * mirrors could be down & have no CdbProcess entry. - */ - conn = pEntry->conns; + list_length(recvSlice->primaryProcesses), + sizeof(ChunkTransportStateEntryUDP)); + pEntry = CONTAINER_OF(pChunkEntry, ChunkTransportStateEntryUDP, entry); + Assert(pEntry); + Assert(pEntry && pEntry->entry.valid); i = 0; + index = 0; foreach(cell, recvSlice->primaryProcesses) { + /* + * Setup a MotionConn entry for each of our outbound connections. Request + * a connection to each receiving backend's listening port. NB: Some + * mirrors could be down & have no CdbProcess entry. 
+ */ + getMotionConn(&pEntry->entry, index, &mConn); + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); + cdbProc = (CdbProcess *) lfirst(cell); if (cdbProc) { - conn->cdbProc = cdbProc; + conn->mConn.cdbProc = cdbProc; icBufferListInit(&conn->sndQueue, ICBufferListType_Primary); icBufferListInit(&conn->unackQueue, ICBufferListType_Primary); conn->capacity = Gp_interconnect_queue_depth; @@ -2693,7 +2719,7 @@ startOutgoingUDPConnections(ChunkTransportState *transportStates, /* send buffer pool must be initialized before this. */ snd_buffer_pool.maxCount += Gp_interconnect_snd_queue_depth; snd_control_info.cwnd += 1; - conn->curBuff = getSndBuffer(conn); + conn->curBuff = getSndBuffer(&conn->mConn); /* should have at least one buffer for each connection */ Assert(conn->curBuff != NULL); @@ -2701,27 +2727,26 @@ startOutgoingUDPConnections(ChunkTransportState *transportStates, conn->rtt = DEFAULT_RTT; conn->dev = DEFAULT_DEV; conn->deadlockCheckBeginTime = 0; - conn->tupleCount = 0; - conn->msgSize = sizeof(conn->conn_info); + conn->mConn.tupleCount = 0; + conn->mConn.msgSize = sizeof(conn->conn_info); conn->sentSeq = 0; conn->receivedAckSeq = 0; conn->consumedSeq = 0; - conn->pBuff = (uint8 *) conn->curBuff->pkt; - conn->state = mcsSetupOutgoingConnection; + conn->mConn.pBuff = (uint8 *) conn->curBuff->pkt; + conn->mConn.state = mcsSetupOutgoingConnection; conn->route = i++; (*pOutgoingCount)++; } - conn++; + index++; } pEntry->txfd = ICSenderSocket; pEntry->txport = ICSenderPort; pEntry->txfd_family = ICSenderFamily; - return pEntry; - + return &pEntry->entry; } @@ -2789,24 +2814,30 @@ getSockAddr(struct sockaddr_storage *peer, socklen_t *peer_len, const char *list * Setup outgoing UDP connection. 
*/ void -setupOutgoingUDPConnection(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntry, MotionConn *conn) +setupOutgoingUDPConnection(ChunkTransportState *transportStates, ChunkTransportStateEntry *pChunkEntry, MotionConn *mConn) { - CdbProcess *cdbProc = conn->cdbProc; + ChunkTransportStateEntryUDP *pEntry; + CdbProcess *cdbProc = NULL; SliceTable *sliceTbl = transportStates->sliceTable; + MotionConnUDP *conn = NULL; - Assert(conn->state == mcsSetupOutgoingConnection); - Assert(conn->cdbProc); + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); + cdbProc = conn->mConn.cdbProc; + Assert(conn->mConn.state == mcsSetupOutgoingConnection); + Assert(conn->mConn.cdbProc); - conn->wakeup_ms = 0; - conn->remoteContentId = cdbProc->contentid; + pEntry = CONTAINER_OF(pChunkEntry, ChunkTransportStateEntryUDP, entry); + Assert(pEntry); + + conn->mConn.remoteContentId = cdbProc->contentid; conn->stat_min_ack_time = ~((uint64) 0); /* Save the information for the error message if getaddrinfo fails */ if (strchr(cdbProc->listenerAddr, ':') != 0) - snprintf(conn->remoteHostAndPort, sizeof(conn->remoteHostAndPort), + snprintf(conn->mConn.remoteHostAndPort, sizeof(conn->mConn.remoteHostAndPort), "[%s]:%d", cdbProc->listenerAddr, cdbProc->listenerPort); else - snprintf(conn->remoteHostAndPort, sizeof(conn->remoteHostAndPort), + snprintf(conn->mConn.remoteHostAndPort, sizeof(conn->mConn.remoteHostAndPort), "%s:%d", cdbProc->listenerAddr, cdbProc->listenerPort); /* @@ -2815,8 +2846,8 @@ setupOutgoingUDPConnection(ChunkTransportState *transportStates, ChunkTransportS getSockAddr(&conn->peer, &conn->peer_len, cdbProc->listenerAddr, cdbProc->listenerPort); /* Save the destination IP address */ - format_sockaddr(&conn->peer, conn->remoteHostAndPort, - sizeof(conn->remoteHostAndPort)); + format_sockaddr(&conn->peer, conn->mConn.remoteHostAndPort, + sizeof(conn->mConn.remoteHostAndPort)); Assert(conn->peer.ss_family == AF_INET || conn->peer.ss_family == AF_INET6); @@ 
-2898,44 +2929,44 @@ setupOutgoingUDPConnection(ChunkTransportState *transportStates, ChunkTransportS if (gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG) ereport(DEBUG1, (errmsg("Interconnect connecting to seg%d slice%d %s " "pid=%d sockfd=%d", - conn->remoteContentId, - pEntry->recvSlice->sliceIndex, - conn->remoteHostAndPort, - conn->cdbProc->pid, - conn->sockfd))); + conn->mConn.remoteContentId, + pEntry->entry.recvSlice->sliceIndex, + conn->mConn.remoteHostAndPort, + conn->mConn.cdbProc->pid, + conn->mConn.sockfd))); /* send connection request */ MemSet(&conn->conn_info, 0, sizeof(conn->conn_info)); conn->conn_info.len = 0; conn->conn_info.flags = 0; - conn->conn_info.motNodeId = pEntry->motNodeId; + conn->conn_info.motNodeId = pEntry->entry.motNodeId; - conn->conn_info.recvSliceIndex = pEntry->recvSlice->sliceIndex; - conn->conn_info.sendSliceIndex = pEntry->sendSlice->sliceIndex; + conn->conn_info.recvSliceIndex = pEntry->entry.recvSlice->sliceIndex; + conn->conn_info.sendSliceIndex = pEntry->entry.sendSlice->sliceIndex; conn->conn_info.srcContentId = GpIdentity.segindex; - conn->conn_info.dstContentId = conn->cdbProc->contentid; + conn->conn_info.dstContentId = conn->mConn.cdbProc->contentid; if (gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG) elog(DEBUG1, "setupOutgoingUDPConnection: node %d route %d srccontent %d dstcontent %d: %s", - pEntry->motNodeId, conn->route, GpIdentity.segindex, conn->cdbProc->contentid, conn->remoteHostAndPort); + pEntry->entry.motNodeId, conn->route, GpIdentity.segindex, conn->mConn.cdbProc->contentid, conn->mConn.remoteHostAndPort); - conn->conn_info.srcListenerPort = (Gp_listener_port >> 16) & 0x0ffff; + conn->conn_info.srcListenerPort = GetListenPortUDP(); conn->conn_info.srcPid = MyProcPid; - conn->conn_info.dstPid = conn->cdbProc->pid; - conn->conn_info.dstListenerPort = conn->cdbProc->listenerPort; + conn->conn_info.dstPid = conn->mConn.cdbProc->pid; + conn->conn_info.dstListenerPort = conn->mConn.cdbProc->listenerPort; 
conn->conn_info.sessionId = gp_session_id; conn->conn_info.icId = sliceTbl->ic_instance_id; - connAddHash(&ic_control_info.connHtab, conn); + connAddHash(&ic_control_info.connHtab, &conn->mConn); /* * No need to get the connection lock here, since background rx thread * will never access send connections. */ - conn->msgPos = NULL; - conn->msgSize = sizeof(conn->conn_info); - conn->stillActive = true; + conn->mConn.msgPos = NULL; + conn->mConn.msgSize = sizeof(conn->conn_info); + conn->mConn.stillActive = true; conn->conn_info.seq = 1; Assert(conn->peer.ss_family == AF_INET || conn->peer.ss_family == AF_INET6); @@ -2948,7 +2979,8 @@ setupOutgoingUDPConnection(ChunkTransportState *transportStates, ChunkTransportS static void handleCachedPackets(void) { - MotionConn *cachedConn = NULL; + MotionConn *cachedMotionConn = NULL; + MotionConnUDP *cachedConn = NULL; MotionConn *setupConn = NULL; ConnHtabBin *bin = NULL; icpkthdr *pkt = NULL; @@ -2963,8 +2995,10 @@ handleCachedPackets(void) while (bin) { - cachedConn = bin->conn, - setupConn = NULL; + + cachedMotionConn = bin->conn; + cachedConn = CONTAINER_OF(cachedMotionConn, MotionConnUDP, mConn); + setupConn = NULL; for (j = 0; j < cachedConn->pkt_q_size; j++) { @@ -2999,7 +3033,7 @@ handleCachedPackets(void) cachedConn->pkt_q[j] = NULL; } bin = bin->next; - connDelHash(&ic_control_info.startupCacheHtab, cachedConn); + connDelHash(&ic_control_info.startupCacheHtab, &cachedConn->mConn); /* * MPP-19981 free the cached connections; otherwise memory leak @@ -3023,7 +3057,8 @@ SetupUDPIFCInterconnect_Internal(SliceTable *sliceTable) ListCell *cell; ExecSlice *mySlice; ExecSlice *aSlice; - MotionConn *conn = NULL; + MotionConn *mConn = NULL; + MotionConnUDP *conn = NULL; int incoming_count = 0; int outgoing_count = 0; int expectedTotalIncoming = 0; @@ -3031,6 +3066,7 @@ SetupUDPIFCInterconnect_Internal(SliceTable *sliceTable) ChunkTransportStateEntry *sendingChunkTransportState = NULL; ChunkTransportState *interconnect_context; 
+ HASHCTL conn_sent_record_typmod_ctl; pthread_mutex_lock(&ic_control_info.lock); @@ -3053,12 +3089,16 @@ SetupUDPIFCInterconnect_Internal(SliceTable *sliceTable) ic_control_info.ic_instance_id = sliceTable->ic_instance_id; } + conn_sent_record_typmod_ctl.keysize = sizeof(MotionConnKey); + conn_sent_record_typmod_ctl.entrysize = sizeof(MotionConnSentRecordTypmodEnt); + conn_sent_record_typmod_ctl.hcxt = CurrentMemoryContext; + interconnect_context = palloc0(sizeof(ChunkTransportState)); /* initialize state variables */ Assert(interconnect_context->size == 0); interconnect_context->size = CTS_INITIAL_SIZE; - interconnect_context->states = palloc0(CTS_INITIAL_SIZE * sizeof(ChunkTransportStateEntry)); + interconnect_context->states = palloc0(CTS_INITIAL_SIZE * sizeof(ChunkTransportStateEntryUDP)); interconnect_context->networkTimeoutIsLogged = false; interconnect_context->teardownActive = false; @@ -3066,12 +3106,8 @@ SetupUDPIFCInterconnect_Internal(SliceTable *sliceTable) interconnect_context->incompleteConns = NIL; interconnect_context->sliceTable = copyObject(sliceTable); interconnect_context->sliceId = sliceTable->localSlice; - - interconnect_context->RecvTupleChunkFrom = RecvTupleChunkFromUDPIFC; - interconnect_context->RecvTupleChunkFromAny = RecvTupleChunkFromAnyUDPIFC; - interconnect_context->SendEos = SendEosUDPIFC; - interconnect_context->SendChunk = SendChunkUDPIFC; - interconnect_context->doSendStopMessage = doSendStopMessageUDPIFC; + interconnect_context->conn_sent_record_typmod = hash_create( + "MotionConn sent record typmod mapping", 128, &conn_sent_record_typmod_ctl, HASH_CONTEXT | HASH_ELEM | HASH_BLOBS); mySlice = &interconnect_context->sliceTable->slices[sliceTable->localSlice]; @@ -3123,17 +3159,17 @@ SetupUDPIFCInterconnect_Internal(SliceTable *sliceTable) elog(DEBUG1, "Setup recving connections: my slice %d, childId %d", mySlice->sliceIndex, childId); - pEntry = createChunkTransportState(interconnect_context, aSlice, mySlice, numProcs); - + 
pEntry = createChunkTransportState(interconnect_context, aSlice, mySlice, numProcs, sizeof(ChunkTransportStateEntryUDP)); Assert(pEntry); Assert(pEntry->valid); for (i = 0; i < pEntry->numConns; i++) { - conn = &pEntry->conns[i]; - conn->cdbProc = list_nth(aSlice->primaryProcesses, i); + getMotionConn(pEntry, i, &mConn); + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); + conn->mConn.cdbProc = list_nth(aSlice->primaryProcesses, i); - if (conn->cdbProc) + if (conn->mConn.cdbProc) { expectedTotalIncoming++; @@ -3158,8 +3194,8 @@ SetupUDPIFCInterconnect_Internal(SliceTable *sliceTable) conn->route = i; conn->conn_info.seq = 1; - conn->stillActive = true; - conn->remapper = CreateTupleRemapper(); + conn->mConn.stillActive = true; + conn->mConn.remapper = CreateTupleRemapper(); incoming_count++; @@ -3167,18 +3203,18 @@ SetupUDPIFCInterconnect_Internal(SliceTable *sliceTable) conn->conn_info.recvSliceIndex = mySlice->sliceIndex; conn->conn_info.sendSliceIndex = aSlice->sliceIndex; - conn->conn_info.srcContentId = conn->cdbProc->contentid; + conn->conn_info.srcContentId = conn->mConn.cdbProc->contentid; conn->conn_info.dstContentId = GpIdentity.segindex; - conn->conn_info.srcListenerPort = conn->cdbProc->listenerPort; - conn->conn_info.srcPid = conn->cdbProc->pid; + conn->conn_info.srcListenerPort = conn->mConn.cdbProc->listenerPort; + conn->conn_info.srcPid = conn->mConn.cdbProc->pid; conn->conn_info.dstPid = MyProcPid; - conn->conn_info.dstListenerPort = (Gp_listener_port >> 16) & 0x0ffff; + conn->conn_info.dstListenerPort = GetListenPortUDP(); conn->conn_info.sessionId = gp_session_id; conn->conn_info.icId = sliceTable->ic_instance_id; conn->conn_info.flags = UDPIC_FLAGS_RECEIVER_TO_SENDER; - connAddHash(&ic_control_info.connHtab, conn); + connAddHash(&ic_control_info.connHtab, &conn->mConn); } } } @@ -3202,11 +3238,12 @@ SetupUDPIFCInterconnect_Internal(SliceTable *sliceTable) for (i = 0; i < n; i++) { /* loop to set up outgoing connections */ - conn = 
&sendingChunkTransportState->conns[i]; + getMotionConn(sendingChunkTransportState, i, &mConn); + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); - if (conn->cdbProc) + if (conn->mConn.cdbProc) { - setupOutgoingUDPConnection(interconnect_context, sendingChunkTransportState, conn); + setupOutgoingUDPConnection(interconnect_context, sendingChunkTransportState, &conn->mConn); outgoing_count++; } } @@ -3230,7 +3267,7 @@ SetupUDPIFCInterconnect_Internal(SliceTable *sliceTable) "Listening on ports=%d/%d sockfd=%d.", incoming_count, expectedTotalIncoming, outgoing_count, expectedTotalOutgoing, sliceTable->ic_instance_id, - Gp_listener_port & 0x0ffff, (Gp_listener_port >> 16) & 0x0ffff, UDP_listenerFd))); + 0, GetListenPortUDP(), UDP_listenerFd))); /* * If there are packets cached by background thread, add them to the @@ -3250,7 +3287,7 @@ SetupUDPIFCInterconnect_Internal(SliceTable *sliceTable) * SetupUDPIFCInterconnect * setup UDP interconnect. */ -void +static inline void SetupUDPIFCInterconnect(EState *estate) { ChunkTransportState *icContext = NULL; @@ -3306,15 +3343,46 @@ SetupUDPIFCInterconnect(EState *estate) checkForCancelFromQD(icContext); } +void +SetupInterconnectUDP(EState *estate) +{ + interconnect_handle_t *h; + MemoryContext oldContext; + + if (estate->interconnect_context) + { + elog(ERROR, "SetupInterconnectUDP: already initialized."); + } + + if (!estate->es_sliceTable) + { + elog(ERROR, "SetupInterconnectUDP: no slice table ?"); + } + + h = allocate_interconnect_handle(TeardownInterconnectUDP); + + Assert(InterconnectContext != NULL); + oldContext = MemoryContextSwitchTo(InterconnectContext); + + SetupUDPIFCInterconnect(estate); + + MemoryContextSwitchTo(oldContext); + + h->interconnect_context = estate->interconnect_context; +} + /* * freeDisorderedPackets * Put the disordered packets into free buffer list. 
*/ static void -freeDisorderedPackets(MotionConn *conn) +freeDisorderedPackets(MotionConn *mConn) { int k; + MotionConnUDP *conn = NULL; + + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); if (conn->pkt_q == NULL) return; @@ -3326,7 +3394,9 @@ freeDisorderedPackets(MotionConn *conn) if (buf != NULL) { if (gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG) - elog(DEBUG1, "CLEAR Out-of-order PKT: conn %p pkt [seq %d] for node %d route %d, [head seq] %d queue size %d, queue head %d queue tail %d", conn, buf->seq, buf->motNodeId, conn->route, conn->conn_info.seq - conn->pkt_q_size, conn->pkt_q_size, conn->pkt_q_head, conn->pkt_q_tail); + elog(DEBUG1, "CLEAR Out-of-order PKT: conn %p pkt [seq %d] for node %d route %d, [head seq] %d queue size %d, queue head %d queue tail %d", + &conn->mConn, buf->seq, buf->motNodeId, conn->route, conn->conn_info.seq - conn->pkt_q_size, + conn->pkt_q_size, conn->pkt_q_head, conn->pkt_q_tail); /* return the buffer into the free list. */ putRxBufferToFreeList(&rx_buffer_pool, buf); @@ -3384,7 +3454,8 @@ TeardownUDPIFCInterconnect_Internal(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntry = NULL; int i; ExecSlice *mySlice; - MotionConn *conn; + MotionConn *mConn; + MotionConnUDP *conn = NULL; uint64 maxRtt = 0; double avgRtt = 0; @@ -3484,8 +3555,9 @@ TeardownUDPIFCInterconnect_Internal(ChunkTransportState *transportStates, { for (i = 0; i < pEntry->numConns; i++) { - conn = pEntry->conns + i; - if (conn->cdbProc == NULL) + getMotionConn(pEntry, i, &mConn); + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); + if (conn->mConn.cdbProc == NULL) continue; /* compute some statistics */ @@ -3495,7 +3567,7 @@ TeardownUDPIFCInterconnect_Internal(ChunkTransportState *transportStates, icBufferListReturn(&conn->sndQueue, false); icBufferListReturn(&conn->unackQueue, Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_CAPACITY ? 
false : true); - connDelHash(&ic_control_info.connHtab, conn); + connDelHash(&ic_control_info.connHtab, &conn->mConn); } avgRtt = avgRtt / pEntry->numConns; avgDev = avgDev / pEntry->numConns; @@ -3555,8 +3627,9 @@ TeardownUDPIFCInterconnect_Internal(ChunkTransportState *transportStates, */ for (i = 0; i < pEntry->numConns; i++) { - conn = pEntry->conns + i; - if (conn->cdbProc == NULL) + getMotionConn(pEntry, i, &mConn); + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); + if (conn->mConn.cdbProc == NULL) continue; /* out of memory has occurred, break out */ @@ -3565,7 +3638,7 @@ TeardownUDPIFCInterconnect_Internal(ChunkTransportState *transportStates, rx_buffer_pool.maxCount -= conn->pkt_q_capacity; - connDelHash(&ic_control_info.connHtab, conn); + connDelHash(&ic_control_info.connHtab, &conn->mConn); /* * putRxBufferAndSendAck() dequeues messages and moves @@ -3573,19 +3646,19 @@ TeardownUDPIFCInterconnect_Internal(ChunkTransportState *transportStates, */ while (conn->pkt_q_size > 0) { - putRxBufferAndSendAck(conn, NULL); + putRxBufferAndSendAck(&conn->mConn, NULL); } /* we also need to clear all the out-of-order packets */ - freeDisorderedPackets(conn); + freeDisorderedPackets(&conn->mConn); /* free up the packet queue */ pfree(conn->pkt_q); conn->pkt_q = NULL; /* free up the tuple remapper */ - if (conn->remapper) - DestroyTupleRemapper(conn->remapper); + if (conn->mConn.remapper) + DestroyTupleRemapper(conn->mConn.remapper); } pfree(pEntry->conns); pEntry->conns = NULL; @@ -3670,6 +3743,8 @@ TeardownUDPIFCInterconnect_Internal(ChunkTransportState *transportStates, pfree(transportStates->states); transportStates->states = NULL; } + if (transportStates->conn_sent_record_typmod) + hash_destroy(transportStates->conn_sent_record_typmod); pfree(transportStates); } @@ -3685,7 +3760,7 @@ TeardownUDPIFCInterconnect_Internal(ChunkTransportState *transportStates, * * This function is called to release the resources used by interconnect. 
*/ -void +static inline void TeardownUDPIFCInterconnect(ChunkTransportState *transportStates, bool hasErrors) { @@ -3703,6 +3778,18 @@ TeardownUDPIFCInterconnect(ChunkTransportState *transportStates, PG_END_TRY(); } +void TeardownInterconnectUDP(ChunkTransportState *transportStates, + bool hasErrors) +{ + /* TODO: should pass interconnect_handle_t as arg? */ + interconnect_handle_t *h = find_interconnect_handle(transportStates); + + TeardownUDPIFCInterconnect(transportStates, hasErrors); + + if (h != NULL) + destroy_interconnect_handle(h); +} + /* * prepareRxConnForRead * Prepare the receive connection for reading. @@ -3710,15 +3797,18 @@ TeardownUDPIFCInterconnect(ChunkTransportState *transportStates, * MUST BE CALLED WITH ic_control_info.lock LOCKED. */ static void -prepareRxConnForRead(MotionConn *conn) +prepareRxConnForRead(MotionConn *mConn) { + MotionConnUDP *conn = NULL; + + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); elog(DEBUG3, "In prepareRxConnForRead: conn %p, q_head %d q_tail %d q_size %d", conn, conn->pkt_q_head, conn->pkt_q_tail, conn->pkt_q_size); Assert(conn->pkt_q[conn->pkt_q_head] != NULL); - conn->pBuff = conn->pkt_q[conn->pkt_q_head]; - conn->msgPos = conn->pBuff; - conn->msgSize = ((icpkthdr *) conn->pBuff)->len; - conn->recvBytes = conn->msgSize; + conn->mConn.pBuff = conn->pkt_q[conn->pkt_q_head]; + conn->mConn.msgPos = conn->mConn.pBuff; + conn->mConn.msgSize = ((icpkthdr *) conn->mConn.pBuff)->len; + conn->mConn.recvBytes = conn->mConn.msgSize; } /* @@ -3729,7 +3819,7 @@ prepareRxConnForRead(MotionConn *conn) */ static TupleChunkListItem receiveChunksUDPIFC(ChunkTransportState *pTransportStates, ChunkTransportStateEntry *pEntry, - int16 motNodeID, int16 *srcRoute, MotionConn *conn) + int16 motNodeID, int16 *srcRoute, MotionConn *mConn) { int retries = 0; bool directed = false; @@ -3737,9 +3827,11 @@ receiveChunksUDPIFC(ChunkTransportState *pTransportStates, ChunkTransportStateEn int *waitFds = NULL; int nevent = 0; MotionConn *rxconn 
= NULL; + MotionConnUDP *udpRXconn = NULL; WaitEvent *rEvents = NULL; WaitEventSet *waitset = NULL; TupleChunkListItem tcItem = NULL; + MotionConnUDP *conn = NULL; #ifdef AMS_VERBOSE_LOGGING elog(DEBUG5, "receivechunksUDP: motnodeid %d", motNodeID); @@ -3748,8 +3840,9 @@ receiveChunksUDPIFC(ChunkTransportState *pTransportStates, ChunkTransportStateEn Assert(pTransportStates); Assert(pTransportStates->sliceTable); - if (conn != NULL) + if (mConn != NULL) { + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); directed = true; *srcRoute = conn->route; setMainThreadWaiting(&rx_control_info.mainWaitingState, motNodeID, conn->route, @@ -3790,7 +3883,7 @@ receiveChunksUDPIFC(ChunkTransportState *pTransportStates, ChunkTransportStateEn /* 1. Do we have data ready */ if (rx_control_info.mainWaitingState.reachRoute != ANY_ROUTE) { - rxconn = pEntry->conns + rx_control_info.mainWaitingState.reachRoute; + getMotionConn(pEntry, rx_control_info.mainWaitingState.reachRoute, &rxconn); prepareRxConnForRead(rxconn); @@ -3803,15 +3896,16 @@ receiveChunksUDPIFC(ChunkTransportState *pTransportStates, ChunkTransportStateEn if (rxconn != NULL) { Assert(rxconn->pBuff); + udpRXconn = CONTAINER_OF(rxconn, MotionConnUDP, mConn); pthread_mutex_unlock(&ic_control_info.lock); - elog(DEBUG2, "got data with length %d", rxconn->recvBytes); + elog(DEBUG2, "got data with length %d", udpRXconn->mConn.recvBytes); /* successfully read into this connection's buffer. 
*/ - tcItem = RecvTupleChunk(rxconn, pTransportStates); + tcItem = RecvTupleChunkUDPIFC(&udpRXconn->mConn, pTransportStates); if (!directed) - *srcRoute = rxconn->route; + *srcRoute = udpRXconn->route; FreeWaitEventSet(waitset); if (rEvents != NULL) @@ -3898,6 +3992,105 @@ receiveChunksUDPIFC(ChunkTransportState *pTransportStates, ChunkTransportStateEn return NULL; /* make GCC behave */ } +TupleChunkListItem +RecvTupleChunkUDPIFC(MotionConn *conn, ChunkTransportState *transportStates) +{ + TupleChunkListItem tcItem; + TupleChunkListItem firstTcItem = NULL; + TupleChunkListItem lastTcItem = NULL; + uint32 tcSize; + int bytesProcessed = 0; + + bytesProcessed = sizeof(struct icpkthdr); + +#ifdef AMS_VERBOSE_LOGGING + elog(DEBUG5, "recvtuple chunk recv bytes %d msgsize %d conn->pBuff %p conn->msgPos: %p", + conn->recvBytes, conn->msgSize, conn->pBuff, conn->msgPos); +#endif + + while (bytesProcessed != conn->msgSize) + { + if (conn->msgSize - bytesProcessed < TUPLE_CHUNK_HEADER_SIZE) + { + logChunkParseDetails(conn, transportStates->sliceTable->ic_instance_id); + + ereport(ERROR, + (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), + errmsg("interconnect error parsing message: insufficient data received"), + errdetail("conn->msgSize %d bytesProcessed %d < chunk-header %d", + conn->msgSize, bytesProcessed, TUPLE_CHUNK_HEADER_SIZE))); + } + + tcSize = TUPLE_CHUNK_HEADER_SIZE + (*(uint16 *) (conn->msgPos + bytesProcessed)); + + /* sanity check */ + if (tcSize > Gp_max_packet_size) + { + /* + * see MPP-720: it is possible that our message got messed up by a + * cancellation ? + */ + ML_CHECK_FOR_INTERRUPTS(transportStates->teardownActive); + + /* + * MPP-4010: add some extra debugging. 
+ */ + if (lastTcItem != NULL) + elog(LOG, "Interconnect error parsing message: last item length %d inplace %p", lastTcItem->chunk_length, lastTcItem->inplace); + else + elog(LOG, "Interconnect error parsing message: no last item"); + + logChunkParseDetails(conn, transportStates->sliceTable->ic_instance_id); + + ereport(ERROR, + (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), + errmsg("interconnect error parsing message"), + errdetail("tcSize %d > max %d header %d processed %d/%d from %p", + tcSize, Gp_max_packet_size, + TUPLE_CHUNK_HEADER_SIZE, bytesProcessed, + conn->msgSize, conn->msgPos))); + } + + Assert(tcSize < conn->msgSize); + + /* + * We store the data inplace, and handle any necessary copying later + * on + */ + tcItem = (TupleChunkListItem) palloc(sizeof(TupleChunkListItemData)); + + tcItem->p_next = NULL; + tcItem->chunk_length = tcSize; + tcItem->inplace = (char *) (conn->msgPos + bytesProcessed); + + bytesProcessed += tcSize; + + if (firstTcItem == NULL) + { + firstTcItem = tcItem; + lastTcItem = tcItem; + } + else + { + lastTcItem->p_next = tcItem; + lastTcItem = tcItem; + } + } + + conn->recvBytes -= conn->msgSize; + if (conn->recvBytes != 0) + { +#ifdef AMS_VERBOSE_LOGGING + elog(DEBUG5, "residual message %d bytes", conn->recvBytes); +#endif + conn->msgPos += conn->msgSize; + } + + conn->msgSize = 0; + + return firstTcItem; +} + /* * RecvTupleChunkFromAnyUDPIFC_Internal * Receive tuple chunks from any route (connections) @@ -3908,12 +4101,13 @@ RecvTupleChunkFromAnyUDPIFC_Internal(ChunkTransportState *transportStates, int16 *srcRoute) { ChunkTransportStateEntry *pEntry = NULL; - MotionConn *conn = NULL; + MotionConn *mConn = NULL; int i, index, activeCount = 0; TupleChunkListItem tcItem = NULL; bool found = false; + MotionConnUDP *conn = NULL; if (!transportStates) { @@ -3935,9 +4129,10 @@ RecvTupleChunkFromAnyUDPIFC_Internal(ChunkTransportState *transportStates, if (index >= pEntry->numConns) index = 0; - conn = pEntry->conns + index; + 
getMotionConn(pEntry, index, &mConn); + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); - if (conn->stillActive) + if (conn->mConn.stillActive) activeCount++; ic_statistics.totalRecvQueueSize += conn->pkt_q_size; @@ -3946,7 +4141,7 @@ RecvTupleChunkFromAnyUDPIFC_Internal(ChunkTransportState *transportStates, if (conn->pkt_q_size > 0) { found = true; - prepareRxConnForRead(conn); + prepareRxConnForRead(&conn->mConn); break; } } @@ -3955,7 +4150,7 @@ RecvTupleChunkFromAnyUDPIFC_Internal(ChunkTransportState *transportStates, { pthread_mutex_unlock(&ic_control_info.lock); - tcItem = RecvTupleChunk(conn, transportStates); + tcItem = RecvTupleChunkUDPIFC(&conn->mConn, transportStates); *srcRoute = conn->route; pEntry->scanStart = index + 1; return tcItem; @@ -3984,7 +4179,7 @@ RecvTupleChunkFromAnyUDPIFC_Internal(ChunkTransportState *transportStates, * RecvTupleChunkFromAnyUDPIFC * Receive tuple chunks from any route (connections) */ -static TupleChunkListItem +TupleChunkListItem RecvTupleChunkFromAnyUDPIFC(ChunkTransportState *transportStates, int16 motNodeID, int16 *srcRoute) @@ -4019,7 +4214,8 @@ RecvTupleChunkFromUDPIFC_Internal(ChunkTransportState *transportStates, int16 srcRoute) { ChunkTransportStateEntry *pEntry = NULL; - MotionConn *conn = NULL; + MotionConn *mConn = NULL; + MotionConnUDP *conn = NULL; int16 route; if (!transportStates) @@ -4043,10 +4239,11 @@ RecvTupleChunkFromUDPIFC_Internal(ChunkTransportState *transportStates, #endif getChunkTransportState(transportStates, motNodeID, &pEntry); - conn = pEntry->conns + srcRoute; + getMotionConn(pEntry, srcRoute, &mConn); + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); #ifdef AMS_VERBOSE_LOGGING - if (!conn->stillActive) + if (!conn->mConn.stillActive) { elog(LOG, "RecvTupleChunkFromUDPIFC(): connection inactive ?!"); } @@ -4054,7 +4251,7 @@ RecvTupleChunkFromUDPIFC_Internal(ChunkTransportState *transportStates, pthread_mutex_lock(&ic_control_info.lock); - if (!conn->stillActive) + if 
(!conn->mConn.stillActive) { pthread_mutex_unlock(&ic_control_info.lock); return NULL; @@ -4065,13 +4262,13 @@ RecvTupleChunkFromUDPIFC_Internal(ChunkTransportState *transportStates, if (conn->pkt_q[conn->pkt_q_head] != NULL) { - prepareRxConnForRead(conn); + prepareRxConnForRead(&conn->mConn); pthread_mutex_unlock(&ic_control_info.lock); TupleChunkListItem tcItem = NULL; - tcItem = RecvTupleChunk(conn, transportStates); + tcItem = RecvTupleChunkUDPIFC(&conn->mConn, transportStates); return tcItem; } @@ -4079,7 +4276,7 @@ RecvTupleChunkFromUDPIFC_Internal(ChunkTransportState *transportStates, /* no existing data, we've got to read a packet */ /* receiveChunksUDPIFC() releases ic_control_info.lock as a side-effect */ - TupleChunkListItem chunks = receiveChunksUDPIFC(transportStates, pEntry, motNodeID, &route, conn); + TupleChunkListItem chunks = receiveChunksUDPIFC(transportStates, pEntry, motNodeID, &route, &conn->mConn); return chunks; } @@ -4088,7 +4285,7 @@ RecvTupleChunkFromUDPIFC_Internal(ChunkTransportState *transportStates, * RecvTupleChunkFromUDPIFC * Receive tuple chunks from a specific route (connection) */ -static TupleChunkListItem +TupleChunkListItem RecvTupleChunkFromUDPIFC(ChunkTransportState *transportStates, int16 motNodeID, int16 srcRoute) @@ -4117,7 +4314,7 @@ RecvTupleChunkFromUDPIFC(ChunkTransportState *transportStates, * markUDPConnInactiveIFC * Mark the connection inactive. 
*/ -void +static inline void markUDPConnInactiveIFC(MotionConn *conn) { pthread_mutex_lock(&ic_control_info.lock); @@ -4127,13 +4324,59 @@ markUDPConnInactiveIFC(MotionConn *conn) return; } +void +DeregisterReadInterestUDP(ChunkTransportState *transportStates, + int motNodeID, + int srcRoute, + const char *reason) +{ + ChunkTransportStateEntry *pEntry = NULL; + MotionConn *conn; + + if (!transportStates) + { + elog(FATAL, "DeregisterReadInterestUDP: no transport states"); + } + + if (!transportStates->activated) + return; + + getChunkTransportState(transportStates, motNodeID, &pEntry); + getMotionConn(pEntry, srcRoute, &conn); + + if (gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG) + { + elog(DEBUG3, "Interconnect finished receiving " + "from seg%d slice%d %s pid=%d sockfd=%d; %s", + conn->remoteContentId, + pEntry->sendSlice->sliceIndex, + conn->remoteHostAndPort, + conn->cdbProc->pid, + conn->sockfd, + reason); + } + +#ifdef AMS_VERBOSE_LOGGING + elog(LOG, "deregisterReadInterest set stillactive = false for node %d route %d (%s)", motNodeID, srcRoute, reason); +#endif + markUDPConnInactiveIFC(conn); + + return; +} + + /* * aggregateStatistics * aggregate statistics. */ static void -aggregateStatistics(ChunkTransportStateEntry *pEntry) +aggregateStatistics(ChunkTransportStateEntry *pChunkEntry) { + ChunkTransportStateEntryUDP * pEntry = NULL; + + pEntry = CONTAINER_OF(pChunkEntry, ChunkTransportStateEntryUDP, entry); + Assert(pEntry); + /* * We first clear the stats, and then compute new stats by aggregating the * stats from each connection. 
@@ -4147,10 +4390,12 @@ aggregateStatistics(ChunkTransportStateEntry *pEntry) pEntry->stat_count_dropped = 0; int connNo; - - for (connNo = 0; connNo < pEntry->numConns; connNo++) + MotionConn *mConn = NULL; + MotionConnUDP *conn = NULL; + for (connNo = 0; connNo < pEntry->entry.numConns; connNo++) { - MotionConn *conn = &pEntry->conns[connNo]; + getMotionConn(&pEntry->entry, connNo, &mConn); + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); pEntry->stat_total_ack_time += conn->stat_total_ack_time; pEntry->stat_count_acks += conn->stat_count_acks; @@ -4214,11 +4459,16 @@ logPkt(char *prefix, icpkthdr *pkt) * packet is retransmitted. */ static void -handleAckedPacket(MotionConn *ackConn, ICBuffer *buf, uint64 now) +handleAckedPacket(MotionConn *ackMotionConn, ICBuffer *buf, uint64 now) { uint64 ackTime = 0; + bool bufIsHead = false; + MotionConnUDP *ackConn = NULL; + MotionConnUDP *bufConn = NULL; + + ackConn = CONTAINER_OF(ackMotionConn, MotionConnUDP, mConn); - bool bufIsHead = (&buf->primary == icBufferListFirst(&ackConn->unackQueue)); + bufIsHead = (&buf->primary == icBufferListFirst(&ackConn->unackQueue)); buf = icBufferListDelete(&ackConn->unackQueue, buf); @@ -4245,13 +4495,14 @@ handleAckedPacket(MotionConn *ackConn, ICBuffer *buf, uint64 now) if (buf->nRetry == 0) { - newRTT = buf->conn->rtt - (buf->conn->rtt >> RTT_SHIFT_COEFFICIENT) + (ackTime >> RTT_SHIFT_COEFFICIENT); + bufConn = CONTAINER_OF(buf->conn, MotionConnUDP, mConn); + newRTT = bufConn->rtt - (bufConn->rtt >> RTT_SHIFT_COEFFICIENT) + (ackTime >> RTT_SHIFT_COEFFICIENT); newRTT = Min(MAX_RTT, Max(newRTT, MIN_RTT)); - buf->conn->rtt = newRTT; + bufConn->rtt = newRTT; - newDEV = buf->conn->dev - (buf->conn->dev >> DEV_SHIFT_COEFFICIENT) + ((Max(ackTime, newRTT) - Min(ackTime, newRTT)) >> DEV_SHIFT_COEFFICIENT); + newDEV = bufConn->dev - (bufConn->dev >> DEV_SHIFT_COEFFICIENT) + ((Max(ackTime, newRTT) - Min(ackTime, newRTT)) >> DEV_SHIFT_COEFFICIENT); newDEV = Min(MAX_DEV, Max(newDEV, MIN_DEV)); - 
buf->conn->dev = newDEV; + bufConn->dev = newDEV; /* adjust the congestion control window. */ if (snd_control_info.cwnd < snd_control_info.ssthresh) @@ -4263,9 +4514,10 @@ handleAckedPacket(MotionConn *ackConn, ICBuffer *buf, uint64 now) } } - buf->conn->stat_total_ack_time += ackTime; - buf->conn->stat_max_ack_time = Max(ackTime, buf->conn->stat_max_ack_time); - buf->conn->stat_min_ack_time = Min(ackTime, buf->conn->stat_min_ack_time); + bufConn = CONTAINER_OF(buf->conn, MotionConnUDP, mConn); + bufConn->stat_total_ack_time += ackTime; + bufConn->stat_max_ack_time = Max(ackTime, bufConn->stat_max_ack_time); + bufConn->stat_min_ack_time = Min(ackTime, bufConn->stat_min_ack_time); /* * only change receivedAckSeq when it is the smallest pkt we sent and have @@ -4276,16 +4528,18 @@ handleAckedPacket(MotionConn *ackConn, ICBuffer *buf, uint64 now) /* The first packet acts like a connect setup packet */ if (buf->pkt->seq == 1) - ackConn->state = mcsStarted; + ackConn->mConn.state = mcsStarted; icBufferListAppend(&snd_buffer_pool.freeList, buf); #ifdef AMS_VERBOSE_LOGGING - write_log("REMOVEPKT %d from unack queue for route %d (retry %d) sndbufmaxcount %d sndbufcount %d sndbuffreelistlen %d, sntSeq %d consumedSeq %d recvAckSeq %d capacity %d, sndQ %d, unackQ %d", buf->pkt->seq, ackConn->route, buf->nRetry, snd_buffer_pool.maxCount, snd_buffer_pool.count, icBufferListLength(&snd_buffer_pool.freeList), buf->conn->sentSeq, buf->conn->consumedSeq, buf->conn->receivedAckSeq, buf->conn->capacity, icBufferListLength(&buf->conn->sndQueue), icBufferListLength(&buf->conn->unackQueue)); + write_log("REMOVEPKT %d from unack queue for route %d (retry %d) sndbufmaxcount %d sndbufcount %d sndbuffreelistlen %d, sntSeq %d consumedSeq %d recvAckSeq %d capacity %d, sndQ %d, unackQ %d", + buf->pkt->seq, ackConn->route, buf->nRetry, snd_buffer_pool.maxCount, snd_buffer_pool.count, icBufferListLength(&snd_buffer_pool.freeList), bufConn->sentSeq, + bufConn->consumedSeq, 
bufConn->receivedAckSeq, bufConn->capacity, icBufferListLength(&bufConn->sndQueue), icBufferListLength(&bufConn->unackQueue)); if (gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG) { - icBufferListLog(&buf->conn->unackQueue); - icBufferListLog(&buf->conn->sndQueue); + icBufferListLog(&bufConn->unackQueue); + icBufferListLog(&bufConn->sndQueue); } #endif } @@ -4297,11 +4551,12 @@ handleAckedPacket(MotionConn *ackConn, ICBuffer *buf, uint64 now) * if we receive a stop message, return true (caller will clean up). */ static bool -handleAcks(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntry) +handleAcks(ChunkTransportState *transportStates, ChunkTransportStateEntry *pChunkEntry) { - + ChunkTransportStateEntryUDP * pEntry = NULL; bool ret = false; - MotionConn *ackConn = NULL; + MotionConn *ackMotionConn = NULL; + MotionConnUDP *ackConn = NULL; int n; struct sockaddr_storage peer; @@ -4313,6 +4568,9 @@ handleAcks(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntr bool shouldSendBuffers = false; SliceTable *sliceTbl = transportStates->sliceTable; + pEntry = CONTAINER_OF(pChunkEntry, ChunkTransportStateEntryUDP, entry); + Assert(pEntry); + for (;;) { @@ -4325,7 +4583,7 @@ handleAcks(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntr { if (errno == EWOULDBLOCK) /* had nothing to read. */ { - aggregateStatistics(pEntry); + aggregateStatistics(&pEntry->entry); return ret; } @@ -4366,7 +4624,7 @@ handleAcks(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntr */ if (pkt->srcContentId == GpIdentity.segindex && pkt->srcPid == MyProcPid && - pkt->srcListenerPort == ((Gp_listener_port >> 16) & 0x0ffff) && + pkt->srcListenerPort == (GetListenPortUDP()) && pkt->sessionId == gp_session_id && pkt->icId == sliceTbl->ic_instance_id) { @@ -4375,13 +4633,14 @@ handleAcks(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntr * packet is for me. 
Note here we do not need to get a connection * lock here, since background rx thread only read the hash table. */ - ackConn = findConnByHeader(&ic_control_info.connHtab, pkt); + ackMotionConn = findConnByHeader(&ic_control_info.connHtab, pkt); - if (ackConn == NULL) + if (ackMotionConn == NULL) { elog(LOG, "Received ack for unknown connection (flags 0x%x)", pkt->flags); continue; } + ackConn = CONTAINER_OF(ackMotionConn, MotionConnUDP, mConn); ackConn->stat_count_acks++; ic_statistics.recvAckNum++; @@ -4419,7 +4678,7 @@ handleAcks(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntr if (DEBUG1 >= log_min_messages) write_log("GOTDUPACK [seq %d] from route %d; srcpid %d dstpid %d cmd %d flags 0x%x connseq %d", pkt->seq, ackConn->route, pkt->srcPid, pkt->dstPid, pkt->icId, pkt->flags, ackConn->conn_info.seq); - shouldSendBuffers |= (handleAckForDuplicatePkt(ackConn, pkt)); + shouldSendBuffers |= (handleAckForDuplicatePkt(&ackConn->mConn, pkt)); break; } else if (pkt->flags & UDPIC_FLAGS_DISORDER) @@ -4427,7 +4686,7 @@ handleAcks(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntr if (DEBUG1 >= log_min_messages) write_log("GOTDISORDER [seq %d] from route %d; srcpid %d dstpid %d cmd %d flags 0x%x connseq %d", pkt->seq, ackConn->route, pkt->srcPid, pkt->dstPid, pkt->icId, pkt->flags, ackConn->conn_info.seq); - shouldSendBuffers |= (handleAckForDisorderPkt(transportStates, pEntry, ackConn, pkt)); + shouldSendBuffers |= (handleAckForDisorderPkt(transportStates, &pEntry->entry, &ackConn->mConn, pkt)); break; } @@ -4444,12 +4703,12 @@ handleAcks(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntr } /* haven't gotten a stop request, maybe this is one ? 
*/ - if ((pkt->flags & UDPIC_FLAGS_STOP) && !ackConn->stopRequested && ackConn->stillActive) + if ((pkt->flags & UDPIC_FLAGS_STOP) && !ackConn->mConn.stopRequested && ackConn->mConn.stillActive) { #ifdef AMS_VERBOSE_LOGGING elog(LOG, "got ack with stop; srcpid %d dstpid %d cmd %d flags 0x%x pktseq %d connseq %d", pkt->srcPid, pkt->dstPid, pkt->icId, pkt->flags, pkt->seq, ackConn->conn_info.seq); #endif - ackConn->stopRequested = true; + ackConn->mConn.stopRequested = true; ackConn->conn_info.flags |= UDPIC_FLAGS_STOP; ret = true; /* continue to deal with acks */ @@ -4479,7 +4738,7 @@ handleAcks(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntr while (!icBufferListIsHead(&ackConn->unackQueue, link) && buf->pkt->seq <= pkt->seq) { next = link->next; - handleAckedPacket(ackConn, buf, now); + handleAckedPacket(&ackConn->mConn, buf, now); shouldSendBuffers = true; link = next; buf = GET_ICBUFFER_FROM_PRIMARY(link); @@ -4496,7 +4755,7 @@ handleAcks(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntr * in EOS sending logic and will not check stop message. */ if (shouldSendBuffers) - sendBuffers(transportStates, pEntry, ackConn); + sendBuffers(transportStates, &pEntry->entry, &ackConn->mConn); } else if (DEBUG1 >= log_min_messages) write_log("handleAck: not the ack we're looking for (flags 0x%x)...mot(%d) content(%d:%d) srcpid(%d:%d) dstpid(%d) srcport(%d:%d) dstport(%d) sess(%d:%d) cmd(%d:%d)", @@ -4504,7 +4763,7 @@ handleAcks(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntr pkt->srcContentId, GpIdentity.segindex, pkt->srcPid, MyProcPid, pkt->dstPid, - pkt->srcListenerPort, ((Gp_listener_port >> 16) & 0x0ffff), + pkt->srcListenerPort, (GetListenPortUDP()), pkt->dstListenerPort, pkt->sessionId, gp_session_id, pkt->icId, sliceTbl->ic_instance_id); @@ -4558,21 +4817,23 @@ checkCRC(icpkthdr *pkt) * Prepare connection for transmit. 
*/ static inline void -prepareXmit(MotionConn *conn) +prepareXmit(MotionConn *mConn) { - Assert(conn != NULL); + MotionConnUDP *conn = NULL; + Assert(mConn != NULL); + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); - conn->conn_info.len = conn->msgSize; + conn->conn_info.len = conn->mConn.msgSize; conn->conn_info.crc = 0; - memcpy(conn->pBuff, &conn->conn_info, sizeof(conn->conn_info)); + memcpy(conn->mConn.pBuff, &conn->conn_info, sizeof(conn->conn_info)); /* increase the sequence no */ conn->conn_info.seq++; if (gp_interconnect_full_crc) { - icpkthdr *pkt = (icpkthdr *) conn->pBuff; + icpkthdr *pkt = (icpkthdr *) conn->mConn.pBuff; addCRC(pkt); } @@ -4583,9 +4844,16 @@ prepareXmit(MotionConn *conn) * Send a packet. */ static void -sendOnce(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntry, ICBuffer *buf, MotionConn *conn) +sendOnce(ChunkTransportState *transportStates, ChunkTransportStateEntry *pChunkEntry, ICBuffer *buf, MotionConn *mConn) { int32 n; + ChunkTransportStateEntryUDP *pEntry = NULL; + MotionConnUDP *conn = NULL; + + pEntry = CONTAINER_OF(pChunkEntry, ChunkTransportStateEntryUDP, entry); + Assert(pEntry); + + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); #ifdef USE_ASSERT_CHECKING if (testmode_inject_fault(gp_udpic_dropxmit_percent)) @@ -4621,7 +4889,7 @@ sendOnce(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntry, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), errmsg("Interconnect error writing an outgoing packet: %m"), errdetail("error during sendto() for Remote Connection: contentId=%d at %s", - conn->remoteContentId, conn->remoteHostAndPort))); + conn->mConn.remoteContentId, conn->mConn.remoteHostAndPort))); return; } @@ -4629,8 +4897,8 @@ sendOnce(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntry, errmsg("Interconnect error writing an outgoing packet: %m"), errdetail("error during sendto() call (error:%d).\n" "For Remote Connection: contentId=%d at %s", - save_errno, 
conn->remoteContentId, - conn->remoteHostAndPort))); + save_errno, conn->mConn.remoteContentId, + conn->mConn.remoteHostAndPort))); /* not reached */ } @@ -4639,8 +4907,8 @@ sendOnce(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntry, if (DEBUG1 >= log_min_messages) write_log("Interconnect error writing an outgoing packet [seq %d]: short transmit (given %d sent %d) during sendto() call." "For Remote Connection: contentId=%d at %s", buf->pkt->seq, buf->pkt->len, n, - conn->remoteContentId, - conn->remoteHostAndPort); + conn->mConn.remoteContentId, + conn->mConn.remoteHostAndPort); #ifdef AMS_VERBOSE_LOGGING logPkt("PKT DETAILS ", buf->pkt); #endif @@ -4656,23 +4924,28 @@ sendOnce(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntry, * */ static void -handleStopMsgs(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntry, int16 motionId) +handleStopMsgs(ChunkTransportState *transportStates, ChunkTransportStateEntry *pChunkEntry, int16 motionId) { int i = 0; + ChunkTransportStateEntryUDP *pEntry = NULL; + MotionConn *mConn = NULL; + MotionConnUDP *conn = NULL; + + pEntry = CONTAINER_OF(pChunkEntry, ChunkTransportStateEntryUDP, entry); + Assert(pEntry); #ifdef AMS_VERBOSE_LOGGING elog(DEBUG3, "handleStopMsgs: node %d", motionId); #endif - while (i < pEntry->numConns) + while (i < pEntry->entry.numConns) { - MotionConn *conn = NULL; - - conn = pEntry->conns + i; + getMotionConn(&pEntry->entry, i, &mConn); + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); #ifdef AMS_VERBOSE_LOGGING elog(DEBUG3, "handleStopMsgs: node %d route %d %s %s", motionId, conn->route, - (conn->stillActive ? "active" : "NOT active"), (conn->stopRequested ? "stop requested" : "")); - elog(DEBUG3, "handleStopMsgs: node %d route %d msgSize %d", motionId, conn->route, conn->msgSize); + (conn->mConn.stillActive ? "active" : "NOT active"), (conn->mConn.stopRequested ? 
"stop requested" : "")); + elog(DEBUG3, "handleStopMsgs: node %d route %d msgSize %d", motionId, conn->route, conn->mConn.msgSize); #endif /* @@ -4681,22 +4954,22 @@ handleStopMsgs(ChunkTransportState *transportStates, ChunkTransportStateEntry *p * were sending) ... empty it first so the outbound buffer is empty * when we get here. */ - if (conn->stillActive && conn->stopRequested) + if (conn->mConn.stillActive && conn->mConn.stopRequested) { /* mark buffer empty */ - conn->tupleCount = 0; - conn->msgSize = sizeof(conn->conn_info); + conn->mConn.tupleCount = 0; + conn->mConn.msgSize = sizeof(conn->conn_info); /* now send our stop-ack EOS */ conn->conn_info.flags |= UDPIC_FLAGS_EOS; Assert(conn->curBuff != NULL); - conn->pBuff[conn->msgSize] = 'S'; - conn->msgSize += 1; + conn->mConn.pBuff[conn->mConn.msgSize] = 'S'; + conn->mConn.msgSize += 1; - prepareXmit(conn); + prepareXmit(&conn->mConn); /* now ready to actually send */ if (gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG) @@ -4709,23 +4982,23 @@ handleStopMsgs(ChunkTransportState *transportStates, ChunkTransportStateEntry *p icBufferListReturn(&conn->sndQueue, false); icBufferListReturn(&conn->unackQueue, Gp_interconnect_fc_method == INTERCONNECT_FC_METHOD_CAPACITY ? false : true); - conn->tupleCount = 0; - conn->msgSize = sizeof(conn->conn_info); + conn->mConn.tupleCount = 0; + conn->mConn.msgSize = sizeof(conn->conn_info); - conn->state = mcsEosSent; + conn->mConn.state = mcsEosSent; conn->curBuff = NULL; - conn->pBuff = NULL; - conn->stillActive = false; - conn->stopRequested = false; + conn->mConn.pBuff = NULL; + conn->mConn.stillActive = false; + conn->mConn.stopRequested = false; } i++; - if (i == pEntry->numConns) + if (i == pEntry->entry.numConns) { if (pollAcks(transportStates, pEntry->txfd, 0)) { - if (handleAcks(transportStates, pEntry)) + if (handleAcks(transportStates, &pEntry->entry)) { /* more stops found, loop again. 
*/ i = 0; @@ -4756,8 +5029,13 @@ handleStopMsgs(ChunkTransportState *transportStates, ChunkTransportStateEntry *p * the corresponding queue in the unack queue ring. */ static void -sendBuffers(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntry, MotionConn *conn) +sendBuffers(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntry, MotionConn *mConn) { + MotionConnUDP *conn = NULL; + MotionConnUDP *buffConn = NULL; + + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); + while (conn->capacity > 0 && icBufferListLength(&conn->sndQueue) > 0) { ICBuffer *buf = NULL; @@ -4768,7 +5046,7 @@ sendBuffers(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEnt break; /* for connection setup, we only allow one outstanding packet. */ - if (conn->state == mcsSetupOutgoingConnection && icBufferListLength(&conn->unackQueue) >= 1) + if (conn->mConn.state == mcsSetupOutgoingConnection && icBufferListLength(&conn->unackQueue) >= 1) break; buf = icBufferListPop(&conn->sndQueue); @@ -4778,7 +5056,7 @@ sendBuffers(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEnt buf->sentTime = now; buf->unackQueueRingSlot = -1; buf->nRetry = 0; - buf->conn = conn; + buf->conn = &conn->mConn; conn->capacity--; icBufferListAppend(&conn->unackQueue, buf); @@ -4807,14 +5085,14 @@ sendBuffers(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEnt updateStats(TPE_DATA_PKT_SEND, conn, buf->pkt); #endif - sendOnce(transportStates, pEntry, buf, conn); + sendOnce(transportStates, pEntry, buf, &conn->mConn); ic_statistics.sndPktNum++; #ifdef AMS_VERBOSE_LOGGING logPkt("SEND PKT DETAIL", buf->pkt); #endif - - buf->conn->sentSeq = buf->pkt->seq; + buffConn = CONTAINER_OF(buf->conn, MotionConnUDP, mConn); + buffConn->sentSeq = buf->pkt->seq; } } @@ -4845,12 +5123,15 @@ sendBuffers(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEnt * */ static void -handleDisorderPacket(MotionConn *conn, int pos, uint32 tailSeq, 
icpkthdr *pkt) +handleDisorderPacket(MotionConn *mConn, int pos, uint32 tailSeq, icpkthdr *pkt) { int start = 0; uint32 lostPktCnt = 0; uint32 *curSeq = (uint32 *) &rx_control_info.disorderBuffer[1]; uint32 maxSeqs = MAX_SEQS_IN_DISORDER_ACK; + MotionConnUDP *conn = NULL; + + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); #ifdef AMS_VERBOSE_LOGGING write_log("PROCESS_DISORDER PKT BEGIN:"); @@ -4879,7 +5160,7 @@ handleDisorderPacket(MotionConn *conn, int pos, uint32 tailSeq, icpkthdr *pkt) #endif /* when reaching here, cnt must not be 0 */ - sendDisorderAck(conn, pkt->seq, conn->conn_info.seq - 1, lostPktCnt); + sendDisorderAck(&conn->mConn, pkt->seq, conn->conn_info.seq - 1, lostPktCnt); } /* @@ -4890,10 +5171,9 @@ handleDisorderPacket(MotionConn *conn, int pos, uint32 tailSeq, icpkthdr *pkt) static bool handleAckForDisorderPkt(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntry, - MotionConn *conn, + MotionConn *mConn, icpkthdr *pkt) { - ICBufferLink *link = NULL; ICBuffer *buf = NULL; ICBufferLink *next = NULL; @@ -4903,6 +5183,9 @@ handleAckForDisorderPkt(ChunkTransportState *transportStates, static uint32 times = 0; static uint32 lastSeq = 0; bool shouldSendBuffers = false; + MotionConnUDP *conn = NULL; + + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); if (pkt->extraSeq != lastSeq) { @@ -4949,7 +5232,7 @@ handleAckForDisorderPkt(ChunkTransportState *transportStates, if (buf->pkt->seq == pkt->seq) { - handleAckedPacket(conn, buf, now); + handleAckedPacket(&conn->mConn, buf, now); shouldSendBuffers = true; break; } @@ -4988,7 +5271,7 @@ handleAckForDisorderPkt(ChunkTransportState *transportStates, /* remove packet already received. 
*/ next = link->next; - handleAckedPacket(conn, buf, now); + handleAckedPacket(&conn->mConn, buf, now); shouldSendBuffers = true; link = next; buf = GET_ICBUFFER_FROM_PRIMARY(link); @@ -5029,13 +5312,16 @@ handleAckForDisorderPkt(ChunkTransportState *transportStates, * */ static bool -handleAckForDuplicatePkt(MotionConn *conn, icpkthdr *pkt) +handleAckForDuplicatePkt(MotionConn *mConn, icpkthdr *pkt) { ICBufferLink *link = NULL; ICBuffer *buf = NULL; ICBufferLink *next = NULL; uint64 now = getCurrentTime(); bool shouldSendBuffers = false; + MotionConnUDP *conn = NULL; + + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); #ifdef AMS_VERBOSE_LOGGING write_log("RESEND the unacked buffers in the queue due to %s", pkt->len == 0 ? "PROCESS_START_RACE" : "DISORDER"); @@ -5055,7 +5341,7 @@ handleAckForDuplicatePkt(MotionConn *conn, icpkthdr *pkt) while (!icBufferListIsHead(&conn->unackQueue, link) && (buf->pkt->seq <= pkt->extraSeq)) { next = link->next; - handleAckedPacket(conn, buf, now); + handleAckedPacket(&conn->mConn, buf, now); shouldSendBuffers = true; link = next; buf = GET_ICBUFFER_FROM_PRIMARY(link); @@ -5067,7 +5353,7 @@ handleAckForDuplicatePkt(MotionConn *conn, icpkthdr *pkt) next = link->next; if (buf->pkt->seq == pkt->seq) { - handleAckedPacket(conn, buf, now); + handleAckedPacket(&conn->mConn, buf, now); shouldSendBuffers = true; break; } @@ -5151,6 +5437,7 @@ checkExpiration(ChunkTransportState *transportStates, /* check for expiration */ int count = 0; int retransmits = 0; + MotionConnUDP *currBuffConn = NULL; Assert(unack_queue_ring.currentTime != 0); while (now >= (unack_queue_ring.currentTime + TIMER_SPAN) && count++ < UNACK_QUEUE_RING_SLOTS_NUM) @@ -5172,17 +5459,19 @@ checkExpiration(ChunkTransportState *transportStates, sendOnce(transportStates, pEntry, curBuf, curBuf->conn); + currBuffConn = CONTAINER_OF(curBuf->conn, MotionConnUDP, mConn); + retransmits++; ic_statistics.retransmits++; - curBuf->conn->stat_count_resent++; - 
curBuf->conn->stat_max_resent = Max(curBuf->conn->stat_max_resent, - curBuf->conn->stat_count_resent); + currBuffConn->stat_count_resent++; + currBuffConn->stat_max_resent = Max(currBuffConn->stat_max_resent, + currBuffConn->stat_count_resent); checkNetworkTimeout(curBuf, now, &transportStates->networkTimeoutIsLogged); #ifdef AMS_VERBOSE_LOGGING write_log("RESEND pkt with seq %d (retry %d, rtt " UINT64_FORMAT ") to route %d", - curBuf->pkt->seq, curBuf->nRetry, curBuf->conn->rtt, curBuf->conn->route); + curBuf->pkt->seq, curBuf->nRetry, currBuffConn->rtt, currBuffConn->route); logPkt("RESEND PKT in checkExpiration", curBuf->pkt); #endif } @@ -5222,9 +5511,16 @@ checkExpiration(ChunkTransportState *transportStates, * */ static void -checkDeadlock(ChunkTransportStateEntry *pEntry, MotionConn *conn) +checkDeadlock(ChunkTransportStateEntry *pChunkEntry, MotionConn *mConn) { uint64 deadlockCheckTime; + ChunkTransportStateEntryUDP *pEntry = NULL; + MotionConnUDP *conn = NULL; + + pEntry = CONTAINER_OF(pChunkEntry, ChunkTransportStateEntryUDP, entry); + Assert(pEntry); + + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); if (icBufferListLength(&conn->unackQueue) == 0 && conn->capacity == 0 && icBufferListLength(&conn->sndQueue) > 0) { @@ -5248,7 +5544,7 @@ checkDeadlock(ChunkTransportStateEntry *pEntry, MotionConn *conn) if (((now - ic_control_info.lastDeadlockCheckTime) > deadlockCheckTime) && ((now - conn->deadlockCheckBeginTime) > deadlockCheckTime)) { - sendStatusQueryMessage(conn, pEntry->txfd, conn->conn_info.seq - 1); + sendStatusQueryMessage(&conn->mConn, pEntry->txfd, conn->conn_info.seq - 1); ic_control_info.lastDeadlockCheckTime = now; ic_statistics.statusQueryMsgNum++; @@ -5259,7 +5555,7 @@ checkDeadlock(ChunkTransportStateEntry *pEntry, MotionConn *conn) (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), errmsg("interconnect encountered a network error, please check your network"), errdetail("Did not get any response from %s (pid %d cid %d) in %d seconds.", - 
conn->remoteHostAndPort, + conn->mConn.remoteHostAndPort, conn->conn_info.dstPid, conn->conn_info.dstContentId, Gp_interconnect_transmit_timeout))); @@ -5316,8 +5612,12 @@ pollAcks(ChunkTransportState *transportStates, int fd, int timeout) * Update the retransmit statistics. */ static inline void -updateRetransmitStatistics(MotionConn *conn) +updateRetransmitStatistics(MotionConn *mConn) { + MotionConnUDP *conn = NULL; + + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); + ic_statistics.retransmits++; conn->stat_count_resent++; conn->stat_max_resent = Max(conn->stat_max_resent, conn->stat_count_resent); @@ -5331,9 +5631,12 @@ updateRetransmitStatistics(MotionConn *conn) static void checkExpirationCapacityFC(ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntry, - MotionConn *conn, + MotionConn *mConn, int timeout) { + MotionConnUDP *conn = NULL; + + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); if (icBufferListLength(&conn->unackQueue) == 0) return; @@ -5349,7 +5652,7 @@ checkExpirationCapacityFC(ChunkTransportState *transportStates, buf->nRetry++; ic_control_info.lastPacketSendTime = now; - updateRetransmitStatistics(conn); + updateRetransmitStatistics(&conn->mConn); checkNetworkTimeout(buf, now, &transportStates->networkTimeoutIsLogged); } } @@ -5416,8 +5719,11 @@ checkExceptions(ChunkTransportState *transportStates, * Compute timeout value in ms. */ static inline int -computeTimeout(MotionConn *conn, int retry) +computeTimeout(MotionConn *mConn, int retry) { + MotionConnUDP *conn = NULL; + + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); if (icBufferListLength(&conn->unackQueue) == 0) return TIMER_CHECKING_PERIOD; @@ -5444,32 +5750,38 @@ computeTimeout(MotionConn *conn, int retry) * tcItem - message to be sent. * motionId - Node Motion Id. 
*/ -static bool +bool SendChunkUDPIFC(ChunkTransportState *transportStates, - ChunkTransportStateEntry *pEntry, - MotionConn *conn, + ChunkTransportStateEntry *pChunkEntry, + MotionConn *mConn, TupleChunkListItem tcItem, int16 motionId) { - + ChunkTransportStateEntryUDP *pEntry = NULL; int length = tcItem->chunk_length; int retry = 0; bool doCheckExpiration = false; bool gotStops = false; + MotionConnUDP *conn = NULL; + + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); + + Assert(conn->mConn.msgSize > 0); - Assert(conn->msgSize > 0); + pEntry = CONTAINER_OF(pChunkEntry, ChunkTransportStateEntryUDP, entry); + Assert(pEntry); #ifdef AMS_VERBOSE_LOGGING - elog(DEBUG3, "sendChunk: msgSize %d this chunk length %d conn seq %d", - conn->msgSize, tcItem->chunk_length, conn->conn_info.seq); + elog(DEBUG3, "SendChunkUDPIFC: msgSize %d this chunk length %d conn seq %d", + conn->mConn.msgSize, tcItem->chunk_length, conn->conn_info.seq); #endif - if (conn->msgSize + length <= Gp_max_packet_size) + if (conn->mConn.msgSize + length <= Gp_max_packet_size) { - memcpy(conn->pBuff + conn->msgSize, tcItem->chunk_data, tcItem->chunk_length); - conn->msgSize += length; + memcpy(conn->mConn.pBuff + conn->mConn.msgSize, tcItem->chunk_data, tcItem->chunk_length); + conn->mConn.msgSize += length; - conn->tupleCount++; + conn->mConn.tupleCount++; return true; } @@ -5480,10 +5792,10 @@ SendChunkUDPIFC(ChunkTransportState *transportStates, /* try to send it */ - prepareXmit(conn); + prepareXmit(&conn->mConn); icBufferListAppend(&conn->sndQueue, conn->curBuff); - sendBuffers(transportStates, pEntry, conn); + sendBuffers(transportStates, &pEntry->entry, &conn->mConn); uint64 now = getCurrentTime(); @@ -5494,18 +5806,18 @@ SendChunkUDPIFC(ChunkTransportState *transportStates, /* get a new buffer */ conn->curBuff = NULL; - conn->pBuff = NULL; + conn->mConn.pBuff = NULL; ic_control_info.lastPacketSendTime = 0; conn->deadlockCheckBeginTime = now; - while (doCheckExpiration || (conn->curBuff = 
getSndBuffer(conn)) == NULL) + while (doCheckExpiration || (conn->curBuff = getSndBuffer(&conn->mConn)) == NULL) { - int timeout = (doCheckExpiration ? 0 : computeTimeout(conn, retry)); + int timeout = (doCheckExpiration ? 0 : computeTimeout(&conn->mConn, retry)); if (pollAcks(transportStates, pEntry->txfd, timeout)) { - if (handleAcks(transportStates, pEntry)) + if (handleAcks(transportStates, &pEntry->entry)) { /* * We make sure that we deal with the stop messages only after @@ -5516,47 +5828,49 @@ SendChunkUDPIFC(ChunkTransportState *transportStates, gotStops = true; } } - checkExceptions(transportStates, pEntry, conn, retry++, timeout); + checkExceptions(transportStates, &pEntry->entry, &conn->mConn, retry++, timeout); doCheckExpiration = false; } - conn->pBuff = (uint8 *) conn->curBuff->pkt; + conn->mConn.pBuff = (uint8 *) conn->curBuff->pkt; if (gotStops) { /* handling stop message will make some connection not active anymore */ - handleStopMsgs(transportStates, pEntry, motionId); - if (!conn->stillActive) + handleStopMsgs(transportStates, &pEntry->entry, motionId); + if (!conn->mConn.stillActive) return true; } /* reinitialize connection */ - conn->tupleCount = 0; - conn->msgSize = sizeof(conn->conn_info); + conn->mConn.tupleCount = 0; + conn->mConn.msgSize = sizeof(conn->conn_info); /* now we can copy the input to the new buffer */ - memcpy(conn->pBuff + conn->msgSize, tcItem->chunk_data, tcItem->chunk_length); - conn->msgSize += length; + memcpy(conn->mConn.pBuff + conn->mConn.msgSize, tcItem->chunk_data, tcItem->chunk_length); + conn->mConn.msgSize += length; - conn->tupleCount++; + conn->mConn.tupleCount++; return true; } /* - * SendEosUDPIFC + * SendEOSUDPIFC * broadcast eos messages to receivers. 
* * See ml_ipc.h * */ -static void -SendEosUDPIFC(ChunkTransportState *transportStates, +void +SendEOSUDPIFC(ChunkTransportState *transportStates, int motNodeID, TupleChunkListItem tcItem) { - ChunkTransportStateEntry *pEntry = NULL; - MotionConn *conn; + ChunkTransportStateEntry *pChunkEntry = NULL; + ChunkTransportStateEntryUDP *pEntry = NULL; + MotionConn *mConn; + MotionConnUDP *conn = NULL; int i = 0; int retry = 0; int activeCount = 0; @@ -5564,11 +5878,11 @@ SendEosUDPIFC(ChunkTransportState *transportStates, if (!transportStates) { - elog(FATAL, "SendEosUDPIFC: missing interconnect context."); + elog(FATAL, "SendEOSUDPIFC: missing interconnect context."); } else if (!transportStates->activated && !transportStates->teardownActive) { - elog(FATAL, "SendEosUDPIFC: context and teardown inactive."); + elog(FATAL, "SendEOSUDPIFC: context and teardown inactive."); } #ifdef AMS_VERBOSE_LOGGING elog(LOG, "entering seneosudp"); @@ -5577,47 +5891,51 @@ SendEosUDPIFC(ChunkTransportState *transportStates, /* check em' */ ML_CHECK_FOR_INTERRUPTS(transportStates->teardownActive); - getChunkTransportState(transportStates, motNodeID, &pEntry); + getChunkTransportState(transportStates, motNodeID, &pChunkEntry); + + pEntry = CONTAINER_OF(pChunkEntry, ChunkTransportStateEntryUDP, entry); + Assert(pEntry); if (gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG) elog(DEBUG1, "Interconnect seg%d slice%d sending end-of-stream to slice%d", - GpIdentity.segindex, motNodeID, pEntry->recvSlice->sliceIndex); + GpIdentity.segindex, motNodeID, pEntry->entry.recvSlice->sliceIndex); /* * we want to add our tcItem onto each of the outgoing buffers -- this is * guaranteed to leave things in a state where a flush is *required*. */ - doBroadcast(transportStates, pEntry, tcItem, NULL); + doBroadcast(transportStates, (&pEntry->entry), tcItem, NULL); pEntry->sendingEos = true; uint64 now = getCurrentTime(); /* now flush all of the buffers. 
*/ - for (i = 0; i < pEntry->numConns; i++) + for (i = 0; i < pEntry->entry.numConns; i++) { - conn = pEntry->conns + i; + getMotionConn(&pEntry->entry, i, &mConn); + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); - if (conn->stillActive) + if (conn->mConn.stillActive) { if (gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG) elog(DEBUG1, "sent eos to route %d tuplecount %d seq %d flags 0x%x stillActive %s icId %d %d", - conn->route, conn->tupleCount, conn->conn_info.seq, - conn->conn_info.flags, (conn->stillActive ? "true" : "false"), - conn->conn_info.icId, conn->msgSize); + conn->route, conn->mConn.tupleCount, conn->conn_info.seq, + conn->conn_info.flags, (conn->mConn.stillActive ? "true" : "false"), + conn->conn_info.icId, conn->mConn.msgSize); /* prepare this for transmit */ if (pEntry->sendingEos) conn->conn_info.flags |= UDPIC_FLAGS_EOS; - prepareXmit(conn); + prepareXmit(&conn->mConn); /* place it into the send queue */ icBufferListAppend(&conn->sndQueue, conn->curBuff); - sendBuffers(transportStates, pEntry, conn); + sendBuffers(transportStates, &pEntry->entry, &conn->mConn); - conn->tupleCount = 0; - conn->msgSize = sizeof(conn->conn_info); + conn->mConn.tupleCount = 0; + conn->mConn.msgSize = sizeof(conn->conn_info); conn->curBuff = NULL; conn->deadlockCheckBeginTime = now; @@ -5639,11 +5957,12 @@ SendEosUDPIFC(ChunkTransportState *transportStates, { activeCount = 0; - for (i = 0; i < pEntry->numConns; i++) + for (i = 0; i < pEntry->entry.numConns; i++) { - conn = pEntry->conns + i; + getMotionConn(&pEntry->entry, i, &mConn); + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); - if (conn->stillActive) + if (conn->mConn.stillActive) { retry = 0; ic_control_info.lastPacketSendTime = 0; @@ -5652,22 +5971,22 @@ SendEosUDPIFC(ChunkTransportState *transportStates, while (icBufferListLength(&conn->unackQueue) > 0 || icBufferListLength(&conn->sndQueue) > 0) { - timeout = computeTimeout(conn, retry); + timeout = computeTimeout(&conn->mConn, retry); if 
(pollAcks(transportStates, pEntry->txfd, timeout)) - handleAcks(transportStates, pEntry); + handleAcks(transportStates, &pEntry->entry); - checkExceptions(transportStates, pEntry, conn, retry++, timeout); + checkExceptions(transportStates, &pEntry->entry, &conn->mConn, retry++, timeout); if (retry >= MAX_TRY) break; } - if ((!conn->cdbProc) || (icBufferListLength(&conn->unackQueue) == 0 && + if ((!conn->mConn.cdbProc) || (icBufferListLength(&conn->unackQueue) == 0 && icBufferListLength(&conn->sndQueue) == 0)) { - conn->state = mcsEosSent; - conn->stillActive = false; + conn->mConn.state = mcsEosSent; + conn->mConn.stillActive = false; } else activeCount++; @@ -5676,18 +5995,15 @@ SendEosUDPIFC(ChunkTransportState *transportStates, } if (gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG) - elog(DEBUG1, "SendEosUDPIFC leaving, activeCount %d", activeCount); + elog(DEBUG1, "SendEOSUDPIFC leaving, activeCount %d", activeCount); } -/* - * doSendStopMessageUDPIFC - * Send stop messages to all senders. - */ -static void -doSendStopMessageUDPIFC(ChunkTransportState *transportStates, int16 motNodeID) +void +SendStopMessageUDPIFC(ChunkTransportState *transportStates, int16 motNodeID) { ChunkTransportStateEntry *pEntry = NULL; - MotionConn *conn = NULL; + MotionConn *mConn = NULL; + MotionConnUDP *conn = NULL; int i; if (!transportStates->activated) @@ -5707,13 +6023,14 @@ doSendStopMessageUDPIFC(ChunkTransportState *transportStates, int16 motNodeID) for (i = 0; i < pEntry->numConns; i++) { - conn = pEntry->conns + i; + getMotionConn(pEntry, i, &mConn); + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); /* * Note here, the stillActive flag of a connection may have been set * to false by markUDPConnInactiveIFC. 
*/ - if (conn->stillActive) + if (conn->mConn.stillActive) { if (conn->conn_info.flags & UDPIC_FLAGS_EOS) { @@ -5725,17 +6042,17 @@ doSendStopMessageUDPIFC(ChunkTransportState *transportStates, int16 motNodeID) elog(DEBUG1, "do sendstop: already have queued EOS packet, we're done. node %d route %d", motNodeID, i); - conn->stillActive = false; + conn->mConn.stillActive = false; /* need to drop the queues in the teardown function. */ while (conn->pkt_q_size > 0) { - putRxBufferAndSendAck(conn, NULL); + putRxBufferAndSendAck(&conn->mConn, NULL); } } else { - conn->stopRequested = true; + conn->mConn.stopRequested = true; conn->conn_info.flags |= UDPIC_FLAGS_STOP; /* @@ -5761,7 +6078,7 @@ doSendStopMessageUDPIFC(ChunkTransportState *transportStates, int16 motNodeID) { uint32 seq = conn->conn_info.seq > 0 ? conn->conn_info.seq - 1 : 0; - sendAck(conn, UDPIC_FLAGS_STOP | UDPIC_FLAGS_ACK | UDPIC_FLAGS_CAPACITY | conn->conn_info.flags, seq, seq); + sendAck(&conn->mConn, UDPIC_FLAGS_STOP | UDPIC_FLAGS_ACK | UDPIC_FLAGS_CAPACITY | conn->conn_info.flags, seq, seq); if (gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG) elog(DEBUG1, "sent stop message. node %d route %d seq %d", motNodeID, i, seq); @@ -5924,22 +6241,26 @@ putIntoUnackQueueRing(UnackQueueRing *uqr, ICBuffer *buf, uint64 expTime, uint64 * and the caller should wake up the main thread, after releasing the mutex. 
*/ static bool -handleDataPacket(MotionConn *conn, icpkthdr *pkt, struct sockaddr_storage *peer, socklen_t *peerlen, +handleDataPacket(MotionConn *mConn, icpkthdr *pkt, struct sockaddr_storage *peer, socklen_t *peerlen, AckSendParam *param, bool *wakeup_mainthread) { + MotionConnUDP *conn = NULL; + + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); if ((pkt->len == sizeof(icpkthdr)) && (pkt->flags & UDPIC_FLAGS_CAPACITY)) { if (DEBUG1 >= log_min_messages) - write_log("status queuy message received, seq %d, srcpid %d, dstpid %d, icid %d, sid %d", pkt->seq, pkt->srcPid, pkt->dstPid, pkt->icId, pkt->sessionId); + write_log("status queuy message received, seq %d, srcpid %d, dstpid %d, icid %d, sid %d", + pkt->seq, pkt->srcPid, pkt->dstPid, pkt->icId, pkt->sessionId); #ifdef AMS_VERBOSE_LOGGING logPkt("STATUS QUERY MESSAGE", pkt); #endif uint32 seq = conn->conn_info.seq > 0 ? conn->conn_info.seq - 1 : 0; - uint32 extraSeq = conn->stopRequested ? seq : conn->conn_info.extraSeq; + uint32 extraSeq = conn->mConn.stopRequested ? seq : conn->conn_info.extraSeq; - setAckSendParam(param, conn, UDPIC_FLAGS_CAPACITY | UDPIC_FLAGS_ACK | conn->conn_info.flags, seq, extraSeq); + setAckSendParam(param, &conn->mConn, UDPIC_FLAGS_CAPACITY | UDPIC_FLAGS_ACK | conn->conn_info.flags, seq, extraSeq); return false; } @@ -5982,7 +6303,7 @@ handleDataPacket(MotionConn *conn, icpkthdr *pkt, struct sockaddr_storage *peer, * * this is especially important after eliding setup is enabled. 
*/ - if (!conn->stopRequested && (pkt->flags & UDPIC_FLAGS_STOP)) + if (!conn->mConn.stopRequested && (pkt->flags & UDPIC_FLAGS_STOP)) { if (pkt->flags & UDPIC_FLAGS_EOS) { @@ -5991,7 +6312,7 @@ handleDataPacket(MotionConn *conn, icpkthdr *pkt, struct sockaddr_storage *peer, return false; } - if (conn->stopRequested && conn->stillActive) + if (conn->mConn.stopRequested && conn->mConn.stillActive) { if (gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG && DEBUG5 >= log_min_messages) write_log("rx_thread got packet on active connection marked stopRequested. " @@ -6010,14 +6331,14 @@ handleDataPacket(MotionConn *conn, icpkthdr *pkt, struct sockaddr_storage *peer, if (conn->conn_info.seq < pkt->seq) conn->conn_info.seq = pkt->seq; /* note here */ - setAckSendParam(param, conn, UDPIC_FLAGS_ACK | UDPIC_FLAGS_STOP | UDPIC_FLAGS_CAPACITY | conn->conn_info.flags, pkt->seq, pkt->seq); + setAckSendParam(param, &conn->mConn, UDPIC_FLAGS_ACK | UDPIC_FLAGS_STOP | UDPIC_FLAGS_CAPACITY | conn->conn_info.flags, pkt->seq, pkt->seq); /* we only update stillActive if eos has been sent by peer. */ if (pkt->flags & UDPIC_FLAGS_EOS) { if (DEBUG2 >= log_min_messages) write_log("stop requested and acknowledged by sending peer"); - conn->stillActive = false; + conn->mConn.stillActive = false; } return false; @@ -6031,13 +6352,13 @@ handleDataPacket(MotionConn *conn, icpkthdr *pkt, struct sockaddr_storage *peer, write_log("dropped ack ? 
ignored data packet w/ cmd %d conn->cmd %d node %d route %d seq %d expected %d flags 0x%x", pkt->icId, conn->conn_info.icId, pkt->motNodeId, conn->route, pkt->seq, conn->conn_info.seq, pkt->flags); - setAckSendParam(param, conn, UDPIC_FLAGS_ACK | UDPIC_FLAGS_CAPACITY | conn->conn_info.flags, conn->conn_info.seq - 1, conn->conn_info.extraSeq); + setAckSendParam(param, &conn->mConn, UDPIC_FLAGS_ACK | UDPIC_FLAGS_CAPACITY | conn->conn_info.flags, conn->conn_info.seq - 1, conn->conn_info.extraSeq); return false; } /* sequence number is correct */ - if (!conn->stillActive) + if (!conn->mConn.stillActive) { /* peer may have dropped ack */ if (gp_log_interconnect >= GPVARS_VERBOSITY_VERBOSE && @@ -6046,7 +6367,7 @@ handleDataPacket(MotionConn *conn, icpkthdr *pkt, struct sockaddr_storage *peer, pkt->motNodeId, conn->route, conn->conn_info.seq, pkt->seq); if (conn->conn_info.seq < pkt->seq) conn->conn_info.seq = pkt->seq; - setAckSendParam(param, conn, UDPIC_FLAGS_ACK | UDPIC_FLAGS_STOP | UDPIC_FLAGS_CAPACITY | conn->conn_info.flags, pkt->seq, pkt->seq); + setAckSendParam(param, &conn->mConn, UDPIC_FLAGS_ACK | UDPIC_FLAGS_STOP | UDPIC_FLAGS_CAPACITY | conn->conn_info.flags, pkt->seq, pkt->seq); return false; } @@ -6100,7 +6421,7 @@ handleDataPacket(MotionConn *conn, icpkthdr *pkt, struct sockaddr_storage *peer, } /* ack data packet */ - setAckSendParam(param, conn, UDPIC_FLAGS_CAPACITY | UDPIC_FLAGS_ACK | conn->conn_info.flags, conn->conn_info.seq - 1, conn->conn_info.extraSeq); + setAckSendParam(param, &conn->mConn, UDPIC_FLAGS_CAPACITY | UDPIC_FLAGS_ACK | conn->conn_info.flags, conn->conn_info.seq - 1, conn->conn_info.extraSeq); #ifdef AMS_VERBOSE_LOGGING write_log("SAVE conn %p pkt at QUEUE TAIL [seq %d] at pos [%d] for node %d route %d, [head seq] %d, queue size %d, queue head %d queue tail %d", conn, pkt->seq, pos, pkt->motNodeId, conn->route, headSeq, conn->pkt_q_size, conn->pkt_q_head, conn->pkt_q_tail); @@ -6113,7 +6434,7 @@ handleDataPacket(MotionConn *conn, 
icpkthdr *pkt, struct sockaddr_storage *peer, /* send an ack for out-of-order packet */ ic_statistics.disorderedPktNum++; - handleDisorderPacket(conn, pos, headSeq + conn->pkt_q_size, pkt); + handleDisorderPacket(&conn->mConn, pos, headSeq + conn->pkt_q_size, pkt); } } else /* duplicate pkt */ @@ -6121,7 +6442,7 @@ handleDataPacket(MotionConn *conn, icpkthdr *pkt, struct sockaddr_storage *peer, if (DEBUG1 >= log_min_messages) write_log("DUPLICATE pkt [seq %d], [head seq] %d, queue size %d, queue head %d queue tail %d", pkt->seq, headSeq, conn->pkt_q_size, conn->pkt_q_head, conn->pkt_q_tail); - setAckSendParam(param, conn, UDPIC_FLAGS_DUPLICATE | conn->conn_info.flags, pkt->seq, conn->conn_info.seq - 1); + setAckSendParam(param, &conn->mConn, UDPIC_FLAGS_DUPLICATE | conn->conn_info.flags, pkt->seq, conn->conn_info.seq - 1); ic_statistics.duplicatedPktNum++; return false; } @@ -6536,11 +6857,10 @@ handleMismatch(icpkthdr *pkt, struct sockaddr_storage *peer, int peer_len) if (need_ack) { - MotionConn dummyconn; + MotionConnUDP dummyconn; char buf[128]; /* numeric IP addresses shouldn't exceed * about 50 chars, but play it safe */ - memcpy(&dummyconn.conn_info, pkt, sizeof(icpkthdr)); dummyconn.peer = *peer; dummyconn.peer_len = peer_len; @@ -6578,7 +6898,7 @@ handleMismatch(icpkthdr *pkt, struct sockaddr_storage *peer, int peer_len) * 1) UDPIC_FLAGS_STOP 2) UDPIC_FLAGS_EOS 3) UDPIC_FLAGS_CAPACITY * which are from original packet */ - sendAck(&dummyconn, ack_flags | dummyconn.conn_info.flags, dummyconn.conn_info.seq, dummyconn.conn_info.seq); + sendAck(&dummyconn.mConn, ack_flags | dummyconn.conn_info.flags, dummyconn.conn_info.seq, dummyconn.conn_info.seq); } } else @@ -6599,20 +6919,21 @@ handleMismatch(icpkthdr *pkt, struct sockaddr_storage *peer, int peer_len) static bool cacheFuturePacket(icpkthdr *pkt, struct sockaddr_storage *peer, int peer_len) { - MotionConn *conn; + MotionConn *mConn; + MotionConnUDP *conn; - conn = 
findConnByHeader(&ic_control_info.startupCacheHtab, pkt); + mConn = findConnByHeader(&ic_control_info.startupCacheHtab, pkt); - if (conn == NULL) + if (mConn == NULL) { - conn = malloc(sizeof(MotionConn)); + conn = malloc(sizeof(MotionConnUDP)); if (conn == NULL) { setRxThreadError(errno); return false; } - memset((void *) conn, 0, sizeof(MotionConn)); + memset((void *) conn, 0, sizeof(MotionConnUDP)); memcpy(&conn->conn_info, pkt, sizeof(icpkthdr)); conn->pkt_q_capacity = Gp_interconnect_queue_depth; @@ -6631,7 +6952,7 @@ cacheFuturePacket(icpkthdr *pkt, struct sockaddr_storage *peer, int peer_len) memset(conn->pkt_q, 0, Gp_interconnect_queue_depth * sizeof(uint8 *)); /* Put connection to the hashtable. */ - if (!connAddHash(&ic_control_info.startupCacheHtab, conn)) + if (!connAddHash(&ic_control_info.startupCacheHtab, &conn->mConn)) { free(conn->pkt_q); free(conn); @@ -6642,6 +6963,8 @@ cacheFuturePacket(icpkthdr *pkt, struct sockaddr_storage *peer, int peer_len) /* Setup the peer sock information. 
*/ memcpy(&conn->peer, peer, peer_len); conn->peer_len = peer_len; + } else { + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); } /* @@ -6665,7 +6988,8 @@ static void cleanupStartupCache() { ConnHtabBin *bin = NULL; - MotionConn *cachedConn = NULL; + MotionConn *cachedMotionConn = NULL; + MotionConnUDP *cachedConn = NULL; icpkthdr *pkt = NULL; int i = 0; int j = 0; @@ -6676,7 +7000,8 @@ cleanupStartupCache() while (bin) { - cachedConn = bin->conn; + cachedMotionConn = bin->conn; + cachedConn = CONTAINER_OF(cachedMotionConn, MotionConnUDP, mConn); for (j = 0; j < cachedConn->pkt_q_size; j++) { @@ -6691,7 +7016,7 @@ cleanupStartupCache() cachedConn->pkt_q[j] = NULL; } bin = bin->next; - connDelHash(&ic_control_info.startupCacheHtab, cachedConn); + connDelHash(&ic_control_info.startupCacheHtab, &cachedConn->mConn); /* * MPP-19981 free the cached connections; otherwise memory leak @@ -6797,7 +7122,8 @@ dumpConnections(ChunkTransportStateEntry *pEntry, const char *fname) { int i, j; - MotionConn *conn; + MotionConn *mConn; + MotionConnUDP *conn; FILE *ofile = fopen(fname, "w+"); @@ -6806,23 +7132,23 @@ dumpConnections(ChunkTransportStateEntry *pEntry, const char *fname) for (i = 0; i < pEntry->numConns; i++) { - conn = &pEntry->conns[i]; + getMotionConn(pEntry, i, &mConn); + conn = CONTAINER_OF(mConn, MotionConnUDP, mConn); - fprintf(ofile, "conns[%d] motNodeId=%d: remoteContentId=%d pid=%d sockfd=%d remote=%s local=%s " + fprintf(ofile, "conns[%d] motNodeId=%d: remoteContentId=%d pid=%d sockfd=%d remote=%s " "capacity=%d sentSeq=%d receivedAckSeq=%d consumedSeq=%d rtt=" UINT64_FORMAT " dev=" UINT64_FORMAT " deadlockCheckBeginTime=" UINT64_FORMAT " route=%d msgSize=%d msgPos=%p" " recvBytes=%d tupleCount=%d stillActive=%d stopRequested=%d " "state=%d\n", i, pEntry->motNodeId, - conn->remoteContentId, - conn->cdbProc ? conn->cdbProc->pid : 0, - conn->sockfd, - conn->remoteHostAndPort, - conn->localHostAndPort, + conn->mConn.remoteContentId, + conn->mConn.cdbProc ? 
conn->mConn.cdbProc->pid : 0, + conn->mConn.sockfd, + conn->mConn.remoteHostAndPort, conn->capacity, conn->sentSeq, conn->receivedAckSeq, conn->consumedSeq, - conn->rtt, conn->dev, conn->deadlockCheckBeginTime, conn->route, conn->msgSize, conn->msgPos, - conn->recvBytes, conn->tupleCount, conn->stillActive, conn->stopRequested, - conn->state); + conn->rtt, conn->dev, conn->deadlockCheckBeginTime, conn->route, conn->mConn.msgSize, conn->mConn.msgPos, + conn->mConn.recvBytes, conn->mConn.tupleCount, conn->mConn.stillActive, conn->mConn.stopRequested, + conn->mConn.state); fprintf(ofile, "conn_info [%s: seq %d extraSeq %d]: motNodeId %d, crc %d len %d " "srcContentId %d dstDesContentId %d " "srcPid %d dstPid %d " @@ -6916,7 +7242,7 @@ SendDummyPacket(void) /* * Get address info from interconnect udp listener port */ - udp_listener = (Gp_listener_port >> 16) & 0x0ffff; + udp_listener = GetListenPortUDP(); snprintf(port_str, sizeof(port_str), "%d", udp_listener); MemSet(&hint, 0, sizeof(hint)); @@ -7003,8 +7329,74 @@ SendDummyPacket(void) return; } +void logChunkParseDetails(MotionConn *conn, uint32 ic_instance_id) +{ + struct icpkthdr *pkt; + + Assert(conn != NULL); + Assert(conn->pBuff != NULL); + + pkt = (struct icpkthdr *) conn->pBuff; + + elog(LOG, "Interconnect parse details(UDP): pkt->len %d pkt->seq %d pkt->flags 0x%x conn->active %d conn->stopRequest %d pkt->icId %d my_icId %d", + pkt->len, pkt->seq, pkt->flags, conn->stillActive, conn->stopRequested, pkt->icId, ic_instance_id); + + elog(LOG, "Interconnect parse details continued: peer: srcpid %d dstpid %d recvslice %d sendslice %d srccontent %d dstcontent %d", + pkt->srcPid, pkt->dstPid, pkt->recvSliceIndex, pkt->sendSliceIndex, pkt->srcContentId, pkt->dstContentId); +} + +int GetMaxTupleChunkSizeUDP(void) +{ + return Gp_max_packet_size - sizeof(struct icpkthdr) - TUPLE_CHUNK_HEADER_SIZE; +} + +int32 GetListenPortUDP(void) +{ + return udp_listener_port; +} + uint32 -getActiveMotionConns(void) 
+GetActiveMotionConnsUDPIFC(void) { return ic_statistics.activeConnectionsNum; } + +/* + * MlPutRxBufferIFC + * + * The cdbmotion code has discarded our pointer to the motion-conn + * structure, but has enough info to fully specify it. + */ +void +MlPutRxBufferIFC(ChunkTransportState *transportStates, int motNodeID, int route) +{ + ChunkTransportStateEntry *pEntry = NULL; + MotionConn *conn = NULL; + AckSendParam param; + + getChunkTransportState(transportStates, motNodeID, &pEntry); + getMotionConn(pEntry, route, &conn); + + memset(&param, 0, sizeof(AckSendParam)); + + pthread_mutex_lock(&ic_control_info.lock); + + if (conn->pBuff != NULL) + { + putRxBufferAndSendAck(conn, &param); + } + else + { + pthread_mutex_unlock(&ic_control_info.lock); + elog(FATAL, "Interconnect error: tried to release a NULL buffer"); + } + + pthread_mutex_unlock(&ic_control_info.lock); + + /* + * real ack sending is after lock release to decrease the lock holding + * time. + */ + if (param.msg.len != 0) + sendAckWithParam(&param); +} diff --git a/contrib/interconnect/udp/ic_udpifc.h b/contrib/interconnect/udp/ic_udpifc.h new file mode 100644 index 00000000000..72cef3ab0ef --- /dev/null +++ b/contrib/interconnect/udp/ic_udpifc.h @@ -0,0 +1,214 @@ +/*------------------------------------------------------------------------- + * ic_udpifc.h + * Motion IPC UDP implementation.
+ * + * Portions Copyright (c) 2023-, Cloudberry inc + * + * + * IDENTIFICATION + * contrib/interconnect/udp/ic_udpifc.h + * + *------------------------------------------------------------------------- + */ +#ifndef IC_UDP_INTERFACE_H +#define IC_UDP_INTERFACE_H + +#include "postgres.h" + +#include "common/ip.h" +#include "nodes/execnodes.h" /* ExecSlice, SliceTable */ +#include "miscadmin.h" +#include "libpq/libpq-be.h" +#include "utils/builtins.h" +#include "utils/memutils.h" + +#include "cdb/ml_ipc.h" +#include "cdb/cdbvars.h" +#include "cdb/cdbdisp.h" + +#include +#include +#include +#include + +typedef struct icpkthdr +{ + int32 motNodeId; + + /* + * three pairs which seem useful for identifying packets. + * + * MPP-4194: It turns out that these can cause collisions; but the high + * bit (1<<31) of the dstListener port is now used for disambiguation with + * mirrors. + */ + int32 srcPid; + int32 srcListenerPort; + + int32 dstPid; + int32 dstListenerPort; + + int32 sessionId; + uint32 icId; + + int32 recvSliceIndex; + int32 sendSliceIndex; + int32 srcContentId; + int32 dstContentId; + + /* MPP-6042: add CRC field */ + uint32 crc; + + /* packet specific info */ + int32 flags; + int32 len; + + /* + * The usage of seq and extraSeq field + * a) In a normal DATA packet + * seq -> the data packet sequence number + * extraSeq -> not used + * b) In a normal ACK message (UDPIC_FLAGS_ACK | UDPIC_FLAGS_CAPACITY) + * seq -> the largest seq of the continuously cached packets + * sometimes, it is special, for example, conn req ack, mismatch ack.
+ * extraSeq -> the largest seq of the consumed packets + * c) In a start race NAK message (UDPIC_FLAGS_NAK) + * seq -> the seq from the pkt + * extraSeq -> the extraSeq from the pkt + * d) In a DISORDER message (UDPIC_FLAGS_DISORDER) + * seq -> packet sequence number that triggers the disorder message + * extraSeq -> the largest seq of the received packets + * e) In a DUPLICATE message (UDPIC_FLAGS_DUPLICATE) + * seq -> packet sequence number that triggers the duplicate message + * extraSeq -> the largest seq of the continuously cached packets + * f) In a stop message (UDPIC_FLAGS_STOP | UDPIC_FLAGS_ACK | UDPIC_FLAGS_CAPACITY) + * seq -> the largest seq of the continuously cached packets + * extraSeq -> the largest seq of the continuously cached packets + * + * + * NOTE that: EOS/STOP flags are often saved in conn_info structure of a connection. + * It is possible for them to be sent together with other flags. + * + */ + uint32 seq; + uint32 extraSeq; +} icpkthdr; + +typedef struct ICBuffer ICBuffer; +typedef struct ICBufferLink ICBufferLink; + +typedef enum ICBufferListType +{ + ICBufferListType_Primary, + ICBufferListType_Secondary, + ICBufferListType_UNDEFINED +} ICBufferListType; + +struct ICBufferLink +{ + ICBufferLink *next; + ICBufferLink *prev; +}; + +/* + * ICBufferList + * ic buffer list data structure. + * + * There are two kinds of lists. The first kind of list uses the primary next/prev pointers. + * And the second kind uses the secondary next/prev pointers. + */ +typedef struct ICBufferList +{ + int length; + ICBufferListType type; /* primary or secondary */ + + ICBufferLink head; +} ICBufferList; + + +#define GET_ICBUFFER_FROM_PRIMARY(ptr) CONTAINER_OF(ptr, ICBuffer, primary) +#define GET_ICBUFFER_FROM_SECONDARY(ptr) CONTAINER_OF(ptr, ICBuffer, secondary) + +/* + * ICBuffer + * interconnect buffer data structure. + * + * In some cases, an ICBuffer may exist in two lists/queues, + * thus it has two sets of pointers.
For example, an ICBuffer + * can exist in an unack queue and an expiration queue at the same time. + * + * It is important to get the ICBuffer address when we iterate a list of + * ICBuffers through primary/secondary links. The Macro GET_ICBUFFER_FROM_PRIMARY + * and GET_ICBUFFER_FROM_SECONDARY are for this purpose. + * + */ +struct ICBuffer +{ + /* primary next and prev pointers */ + ICBufferLink primary; + + /* secondary next and prev pointers */ + ICBufferLink secondary; + + /* connection that this buffer belongs to */ + MotionConn *conn; + + /* + * Three fields for expiration processing + * + * sentTime - the time this buffer was sent nRetry - the number of send + * retries unackQueueRingSlot - unack queue ring slot index + */ + uint64 sentTime; + uint32 nRetry; + int32 unackQueueRingSlot; + + /* real data */ + icpkthdr pkt[0]; +}; + +extern void logChunkParseDetails(MotionConn * conn, uint32 ic_instance_id); + +extern int GetMaxTupleChunkSizeUDP(void); + +extern int32 GetListenPortUDP(void); + +extern void InitMotionIPCLayerUDP(void); +extern void CleanUpMotionLayerIPCUDP(void); + +extern void WaitInterconnectQuitUDPIFC(void); + +extern void SetupInterconnectUDP(EState *estate); +extern void TeardownInterconnectUDP(ChunkTransportState * transportStates, bool hasErrors); + +extern void DeregisterReadInterestUDP(ChunkTransportState * transportStates, + int motNodeID, + int srcRoute, + const char *reason); + +extern TupleChunkListItem +RecvTupleChunkUDPIFC(MotionConn * conn, ChunkTransportState * transportStates); + +extern bool SendChunkUDPIFC(ChunkTransportState * transportStates, ChunkTransportStateEntry * pChunkEntry, + MotionConn * conn, TupleChunkListItem tcItem, int16 motionId); +extern void SendEOSUDPIFC(ChunkTransportState * transportStates, + int motNodeID, TupleChunkListItem tcItem); +extern void SendStopMessageUDPIFC(ChunkTransportState * transportStates, int16 motNodeID); + +extern TupleChunkListItem RecvTupleChunkFromAnyUDPIFC(ChunkTransportState * 
transportStates, + int16 motNodeID, + int16 *srcRoute); + +extern TupleChunkListItem RecvTupleChunkFromUDPIFC(ChunkTransportState * transportStates, + int16 motNodeID, + int16 srcRoute); +extern uint32 GetActiveMotionConnsUDPIFC(void); + +void MlPutRxBufferIFC(ChunkTransportState * transportStates, int motNodeID, int route); + +/* debug function for udpifc */ +extern void dumpICBufferList(ICBufferList * list, const char *fname); +extern void dumpUnackQueueRing(const char *fname); +extern void dumpConnections(ChunkTransportStateEntry * pEntry, const char *fname); + +#endif // IC_UDP_INTERFACE_H diff --git a/gpAux/gpdemo/demo_cluster.sh b/gpAux/gpdemo/demo_cluster.sh index 5eb7161f5a4..a89c3ac114a 100755 --- a/gpAux/gpdemo/demo_cluster.sh +++ b/gpAux/gpdemo/demo_cluster.sh @@ -338,7 +338,6 @@ cat >> $CLUSTER_CONFIG <<-EOF DEFAULT_QD_MAX_CONNECT=$DEFAULT_QD_MAX_CONNECT QE_CONNECT_FACTOR=5 - EOF if [ -n "${STATEMENT_MEM}" ]; then diff --git a/src/Makefile b/src/Makefile index ebbaeea9685..ee7c06bb4ca 100644 --- a/src/Makefile +++ b/src/Makefile @@ -74,6 +74,7 @@ distclean maintainer-clean: $(MAKE) -C tutorial NO_PGXS=1 $@ $(MAKE) -C test/isolation $@ rm -f Makefile.port Makefile.global + rm -f contrib/interconnect/Makefile.interconnect .PHONY: install-local installdirs-local uninstall-local diff --git a/src/Makefile.global.in b/src/Makefile.global.in index 1d0acc04c21..9e03d53d547 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -892,7 +892,7 @@ ifndef PGXS # write `include $(top_builddir)/src/Makefile.global', not some # shortcut thereof. $(top_builddir)/src/Makefile.global: $(top_srcdir)/src/Makefile.global.in $(top_builddir)/config.status - cd $(top_builddir) && ./config.status src/Makefile.global + cd $(top_builddir) && ./config.status src/Makefile.global && ./config.status contrib/interconnect/Makefile.interconnect # Remake pg_config.h from pg_config.h.in if the latter changed. 
# config.status will not change the timestamp on pg_config.h if it diff --git a/src/backend/cdb/cdbutil.c b/src/backend/cdb/cdbutil.c index c01f8d0108c..5fdd45ea116 100644 --- a/src/backend/cdb/cdbutil.c +++ b/src/backend/cdb/cdbutil.c @@ -1056,9 +1056,15 @@ cdb_setup(void) if (Gp_role != GP_ROLE_UTILITY) { + if (!CurrentMotionIPCLayer) { + ereport(ERROR, + (errmsg("Interconnect moudle have not been preloaded"), + errdetail("Please make sure interconnect is included in option shared_preload_libraries"))); + } + ensureInterconnectAddress(); /* Initialize the Motion Layer IPC subsystem. */ - InitMotionLayerIPC(); + CurrentMotionIPCLayer->InitMotionLayerIPC(); } /* @@ -1111,7 +1117,7 @@ cdb_cleanup(int code pg_attribute_unused(), Datum arg if (Gp_role != GP_ROLE_UTILITY) { /* shutdown our listener socket */ - CleanUpMotionLayerIPC(); + CurrentMotionIPCLayer->CleanUpMotionLayerIPC(); } } @@ -3375,9 +3381,14 @@ cdb_setup(void) if (Gp_role != GP_ROLE_UTILITY) { + if (!CurrentMotionIPCLayer) { + ereport(ERROR, + (errmsg("Interconnect moudle have not been preloaded"), + errdetail("Please make sure interconnect is included in option shared_preload_libraries"))); + } ensureInterconnectAddress(); /* Initialize the Motion Layer IPC subsystem. */ - InitMotionLayerIPC(); + CurrentMotionIPCLayer->InitMotionLayerIPC(); } /* @@ -3430,7 +3441,7 @@ cdb_cleanup(int code pg_attribute_unused(), Datum arg if (Gp_role != GP_ROLE_UTILITY) { /* shutdown our listener socket */ - CleanUpMotionLayerIPC(); + CurrentMotionIPCLayer->CleanUpMotionLayerIPC(); } } diff --git a/src/backend/cdb/dispatcher/cdbgang.c b/src/backend/cdb/dispatcher/cdbgang.c index f2349ee58ef..bc4382effcb 100644 --- a/src/backend/cdb/dispatcher/cdbgang.c +++ b/src/backend/cdb/dispatcher/cdbgang.c @@ -40,6 +40,7 @@ #include "cdb/cdbutil.h" /* CdbComponentDatabaseInfo */ #include "cdb/cdbvars.h" /* Gp_role, etc. 
*/ #include "cdb/cdbconn.h" /* cdbconn_* */ +#include "cdb/ml_ipc.h" #include "libpq/libpq-be.h" #include "utils/guc_tables.h" @@ -578,10 +579,10 @@ makeCdbProcess(SegmentDatabaseDescriptor *segdbDesc) process->listenerAddr = pstrdup(qeinfo->config->hostip); - if (Gp_interconnect_type == INTERCONNECT_TYPE_UDPIFC) + if (CurrentMotionIPCLayer->ic_type == INTERCONNECT_TYPE_UDPIFC) process->listenerPort = (segdbDesc->motionListener >> 16) & 0x0ffff; - else if (Gp_interconnect_type == INTERCONNECT_TYPE_TCP || - Gp_interconnect_type == INTERCONNECT_TYPE_PROXY) + else if (CurrentMotionIPCLayer->ic_type == INTERCONNECT_TYPE_TCP || + CurrentMotionIPCLayer->ic_type == INTERCONNECT_TYPE_PROXY) process->listenerPort = (segdbDesc->motionListener & 0x0ffff); process->pid = segdbDesc->backendPid; @@ -672,12 +673,7 @@ getCdbProcessesForQD(int isPrimary) * 2. When the segments have their own ADDRESS, the connection address could be confusing. */ proc->listenerAddr = pstrdup(qdinfo->config->hostip); - - if (Gp_interconnect_type == INTERCONNECT_TYPE_UDPIFC) - proc->listenerPort = (Gp_listener_port >> 16) & 0x0ffff; - else if (Gp_interconnect_type == INTERCONNECT_TYPE_TCP || - Gp_interconnect_type == INTERCONNECT_TYPE_PROXY) - proc->listenerPort = (Gp_listener_port & 0x0ffff); + proc->listenerPort = CurrentMotionIPCLayer->GetListenPort(); proc->pid = MyProcPid; proc->contentid = -1; diff --git a/src/backend/cdb/motion/Makefile b/src/backend/cdb/motion/Makefile index 354b77f537a..1e45deeb9a8 100644 --- a/src/backend/cdb/motion/Makefile +++ b/src/backend/cdb/motion/Makefile @@ -13,25 +13,9 @@ include $(top_builddir)/src/Makefile.global override CPPFLAGS := -I$(libpq_srcdir) $(CPPFLAGS) OBJS = cdbmotion.o tupchunklist.o tupser.o \ - ic_common.o ic_tcp.o ic_udpifc.o htupfifo.o tupleremap.o - + htupfifo.o tupleremap.o ifeq ($(enable_ic_proxy),yes) -# server OBJS += ic_proxy_bgworker.o -OBJS += ic_proxy_main.o -OBJS += ic_proxy_client.o -OBJS += ic_proxy_peer.o -OBJS += ic_proxy_router.o - 
-# backend -OBJS += ic_proxy_backend.o - -# utils -OBJS += ic_proxy_addr.o -OBJS += ic_proxy_key.o -OBJS += ic_proxy_packet.o -OBJS += ic_proxy_pkt_cache.o -OBJS += ic_proxy_iobuf.o endif # enable_ic_proxy include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/cdb/motion/cdbmotion.c b/src/backend/cdb/motion/cdbmotion.c index 27105b68afa..63e27a4b741 100644 --- a/src/backend/cdb/motion/cdbmotion.c +++ b/src/backend/cdb/motion/cdbmotion.c @@ -30,6 +30,7 @@ #include "utils/memutils.h" #include "utils/typcache.h" +MotionIPCLayer *CurrentMotionIPCLayer = NULL; /* * MOTION NODE INFO DATA STRUCTURES @@ -55,13 +56,12 @@ static ChunkSorterEntry *getChunkSorterEntry(MotionLayerState *mlStates, MotionNodeEntry *motNodeEntry, int16 srcRoute); static void addChunkToSorter(ChunkTransportState *transportStates, - MotionNodeEntry *pMNEntry, - TupleChunkListItem tcItem, - int16 motNodeID, - ChunkSorterEntry *chunkSorterEntry, - ChunkTransportStateEntry *pEntry, - MotionConn *conn, - int16 srcRoute); + MotionNodeEntry *pMNEntry, + TupleChunkListItem tcItem, + int16 motNodeID, + int16 srcRoute, + ChunkSorterEntry *chunkSorterEntry, + TupleRemapper *tuple_remapper); static void processIncomingChunks(MotionLayerState *mlStates, ChunkTransportState *transportStates, @@ -77,10 +77,8 @@ static void statSendEOS(MotionLayerState *mlStates, MotionNodeEntry *pMNEntry); static void statChunksProcessed(MotionLayerState *mlStates, MotionNodeEntry *pMNEntry, int chunksProcessed, int chunkBytes, int tupleBytes); static void statNewTupleArrived(MotionNodeEntry *pMNEntry, ChunkSorterEntry *pCSEntry); static void statRecvTuple(MotionNodeEntry *pMNEntry, ChunkSorterEntry *pCSEntry); -static bool ShouldSendRecordCache(MotionConn *conn, SerTupInfo *pSerInfo); -static void UpdateSentRecordCache(MotionConn *conn); - - +static bool ShouldSendRecordCache(const int32 conn, SerTupInfo *pSerInfo); +static void UpdateSentRecordCache(int32 *conn); /* Helper function to perform the operations 
necessary to reconstruct a * HeapTuple from a list of tuple-chunks, and then update the Motion Layer @@ -165,11 +163,7 @@ createMotionLayerState(int maxMotNodeID) if (Gp_role == GP_ROLE_UTILITY) return NULL; - if (Gp_interconnect_type == INTERCONNECT_TYPE_UDPIFC) - Gp_max_tuple_chunk_size = Gp_max_packet_size - sizeof(struct icpkthdr) - TUPLE_CHUNK_HEADER_SIZE; - else if (Gp_interconnect_type == INTERCONNECT_TYPE_TCP || - Gp_interconnect_type == INTERCONNECT_TYPE_PROXY) - Gp_max_tuple_chunk_size = Gp_max_packet_size - PACKET_HEADER_SIZE - TUPLE_CHUNK_HEADER_SIZE; + Gp_max_tuple_chunk_size = CurrentMotionIPCLayer->GetMaxTupleChunkSize(); /* * Use the statically allocated chunk that is intended for sending end-of- @@ -350,8 +344,8 @@ SendStopMessage(MotionLayerState *mlStates, MotionNodeEntry *pEntry = getMotionNodeEntry(mlStates, motNodeID); pEntry->stopped = true; - if (transportStates != NULL && transportStates->doSendStopMessage != NULL) - transportStates->doSendStopMessage(transportStates, motNodeID); + if (transportStates != NULL && CurrentMotionIPCLayer->SendStopMessage != NULL) + CurrentMotionIPCLayer->SendStopMessage(transportStates, motNodeID); } void @@ -363,19 +357,27 @@ CheckAndSendRecordCache(MotionLayerState *mlStates, MotionNodeEntry *pMNEntry; TupleChunkListData tcList; MemoryContext oldCtxt; - ChunkTransportStateEntry *pEntry = NULL; - MotionConn *conn; - - getChunkTransportState(transportStates, motNodeID, &pEntry); + bool sent_record_typmod_found = false; + MotionConnKey motion_conn_key; + MotionConnSentRecordTypmodEnt *motion_conn_ent; /* * for broadcast we only mark sent_record_typmod for connection 0 for * efficiency and convenience */ - if (targetRoute == BROADCAST_SEGIDX) - conn = &pEntry->conns[0]; - else - conn = &pEntry->conns[targetRoute]; + motion_conn_key.mot_node_id = motNodeID; + motion_conn_key.conn_index = targetRoute == BROADCAST_SEGIDX ? 
0 : targetRoute; + + motion_conn_ent = (MotionConnSentRecordTypmodEnt *)hash_search(transportStates->conn_sent_record_typmod, + &motion_conn_key, HASH_FIND, &sent_record_typmod_found); + + if (!sent_record_typmod_found) { + ereport(ERROR, + (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), + errmsg("interconnect error: Unexpected Motion Node Id: %d, targetRoute: %d", + motNodeID, targetRoute), + errdetail("Fail to get sent_record_typmod from motion conntion"))); + } /* * Analyze tools. Do not send any thing if this slice is in the bit mask @@ -390,7 +392,7 @@ CheckAndSendRecordCache(MotionLayerState *mlStates, */ pMNEntry = getMotionNodeEntry(mlStates, motNodeID); - if (!ShouldSendRecordCache(conn, &pMNEntry->ser_tup_info)) + if (!ShouldSendRecordCache(motion_conn_ent->sent_record_typmod, &pMNEntry->ser_tup_info)) return; #ifdef AMS_VERBOSE_LOGGING @@ -400,7 +402,7 @@ CheckAndSendRecordCache(MotionLayerState *mlStates, /* Create and store the serialized form, and some stats about it. */ oldCtxt = MemoryContextSwitchTo(mlStates->motion_layer_mctx); - SerializeRecordCacheIntoChunks(&pMNEntry->ser_tup_info, &tcList, conn); + SerializeRecordCacheIntoChunks(&pMNEntry->ser_tup_info, &tcList, motion_conn_ent->sent_record_typmod); MemoryContextSwitchTo(oldCtxt); @@ -415,7 +417,7 @@ CheckAndSendRecordCache(MotionLayerState *mlStates, #endif /* do the send. 
*/ - if (!SendTupleChunkToAMS(mlStates, transportStates, motNodeID, targetRoute, tcList.p_first)) + if (!CurrentMotionIPCLayer->SendTupleChunkToAMS(transportStates, motNodeID, targetRoute, tcList.p_first)) { pMNEntry->stopped = true; } @@ -428,7 +430,7 @@ CheckAndSendRecordCache(MotionLayerState *mlStates, /* cleanup */ clearTCList(&pMNEntry->ser_tup_info.chunkCache, &tcList); - UpdateSentRecordCache(conn); + UpdateSentRecordCache(&motion_conn_ent->sent_record_typmod); } /* @@ -467,7 +469,7 @@ SendTuple(MotionLayerState *mlStates, struct directTransportBuffer b; if (targetRoute != BROADCAST_SEGIDX) - getTransportDirectBuffer(transportStates, motNodeID, targetRoute, &b); + CurrentMotionIPCLayer->GetTransportDirectBuffer(transportStates, motNodeID, targetRoute, &b); int sent = 0; @@ -479,7 +481,7 @@ SendTuple(MotionLayerState *mlStates, MemoryContextSwitchTo(oldCtxt); if (sent > 0) { - putTransportDirectBuffer(transportStates, motNodeID, targetRoute, sent); + CurrentMotionIPCLayer->PutTransportDirectBuffer(transportStates, motNodeID, targetRoute, sent); /* fill-in tcList fields to update stats */ tcList.num_chunks = 1; @@ -503,7 +505,7 @@ SendTuple(MotionLayerState *mlStates, #endif /* do the send. 
*/ - if (!SendTupleChunkToAMS(mlStates, transportStates, motNodeID, targetRoute, tcList.p_first)) + if (!CurrentMotionIPCLayer->SendTupleChunkToAMS(transportStates, motNodeID, targetRoute, tcList.p_first)) { pMNEntry->stopped = true; rc = STOP_SENDING; @@ -546,7 +548,7 @@ SendEndOfStream(MotionLayerState *mlStates, */ pMNEntry = getMotionNodeEntry(mlStates, motNodeID); - transportStates->SendEos(transportStates, motNodeID, s_eos_chunk_data); + CurrentMotionIPCLayer->SendEOS(transportStates, motNodeID, s_eos_chunk_data); /* * We increment our own "stream-ends received" count when we send our own, @@ -654,8 +656,7 @@ processIncomingChunks(MotionLayerState *mlStates, tcNext; MemoryContext oldCtxt; ChunkSorterEntry *chunkSorterEntry; - ChunkTransportStateEntry *pEntry = NULL; - MotionConn *conn; + TupleRemapper * tuple_remapper; /* Keep track of processed chunk stats. */ int numChunks, @@ -669,14 +670,14 @@ processIncomingChunks(MotionLayerState *mlStates, * the chunk-sorter. */ if (srcRoute == ANY_ROUTE) - tcItem = transportStates->RecvTupleChunkFromAny(transportStates, motNodeID, &srcRoute); + tcItem = CurrentMotionIPCLayer->RecvTupleChunkFromAny(transportStates, motNodeID, &srcRoute); else - tcItem = transportStates->RecvTupleChunkFrom(transportStates, motNodeID, srcRoute); + tcItem = CurrentMotionIPCLayer->RecvTupleChunkFrom(transportStates, motNodeID, srcRoute); /* Look up various things related to the sender that we received chunks from. 
*/ chunkSorterEntry = getChunkSorterEntry(mlStates, pMNEntry, srcRoute); - getChunkTransportState(transportStates, motNodeID, &pEntry); - conn = pEntry->conns + srcRoute; + + tuple_remapper = CurrentMotionIPCLayer->GetMotionConnTupleRemapper(transportStates, motNodeID, srcRoute); numChunks = 0; chunkBytes = 0; @@ -708,17 +709,16 @@ processIncomingChunks(MotionLayerState *mlStates, pMNEntry, tcItem, motNodeID, + srcRoute, chunkSorterEntry, - pEntry, - conn, - srcRoute); + tuple_remapper); tcItem = tcNext; } /* The chunk list we just processed freed-up our rx-buffer space. */ - if (numChunks > 0 && Gp_interconnect_type == INTERCONNECT_TYPE_UDPIFC) - MlPutRxBufferIFC(transportStates, motNodeID, srcRoute); + if (numChunks > 0 && CurrentMotionIPCLayer->ic_type == INTERCONNECT_TYPE_UDPIFC) + CurrentMotionIPCLayer->DirectPutRxBuffer(transportStates, motNodeID, srcRoute); /* Stats */ statChunksProcessed(mlStates, pMNEntry, numChunks, chunkBytes, tupleBytes); @@ -1003,10 +1003,9 @@ addChunkToSorter(ChunkTransportState *transportStates, MotionNodeEntry *pMNEntry, TupleChunkListItem tcItem, int16 motNodeID, + int16 srcRoute, ChunkSorterEntry *chunkSorterEntry, - ChunkTransportStateEntry *pEntry, - MotionConn *conn, - int16 srcRoute) + TupleRemapper *tuple_remapper) { TupleChunkType tcType; @@ -1030,7 +1029,7 @@ addChunkToSorter(ChunkTransportState *transportStates, /* Put this chunk into the list, then turn it into a HeapTuple! */ appendChunkToTCList(&chunkSorterEntry->chunk_list, tcItem); - reconstructTuple(pMNEntry, chunkSorterEntry, conn->remapper); + reconstructTuple(pMNEntry, chunkSorterEntry, tuple_remapper); break; @@ -1091,7 +1090,7 @@ addChunkToSorter(ChunkTransportState *transportStates, /* Put this chunk into the list, then turn it into a HeapTuple! 
*/ appendChunkToTCList(&chunkSorterEntry->chunk_list, tcItem); - reconstructTuple(pMNEntry, chunkSorterEntry, conn->remapper); + reconstructTuple(pMNEntry, chunkSorterEntry, tuple_remapper); break; @@ -1129,7 +1128,7 @@ addChunkToSorter(ChunkTransportState *transportStates, * Since we received an end-of-stream. Then we no longer need * read interest in the interconnect. */ - DeregisterReadInterest(transportStates, motNodeID, srcRoute, + CurrentMotionIPCLayer->DeregisterReadInterest(transportStates, motNodeID, srcRoute, "end of stream"); break; @@ -1248,7 +1247,7 @@ statRecvTuple(MotionNodeEntry *pMNEntry, ChunkSorterEntry *pCSEntry) * Return true if the record cache should be sent to master */ static bool -ShouldSendRecordCache(MotionConn *conn, SerTupInfo *pSerInfo) +ShouldSendRecordCache(const int32 sent_record_typmod, SerTupInfo *pSerInfo) { int32 typmod; @@ -1257,21 +1256,21 @@ ShouldSendRecordCache(MotionConn *conn, SerTupInfo *pSerInfo) return pSerInfo->has_record_types && typmod > 0 && - typmod > conn->sent_record_typmod; + typmod > sent_record_typmod; } /* * Update the number of sent record types. */ static void -UpdateSentRecordCache(MotionConn *conn) +UpdateSentRecordCache(int32 *sent_record_typmod) { if (CurrentSession->shared_typmod_registry != NULL) { - conn->sent_record_typmod = GetSharedNextRecordTypmod(CurrentSession->shared_typmod_registry); + *sent_record_typmod = GetSharedNextRecordTypmod(CurrentSession->shared_typmod_registry); } else { - conn->sent_record_typmod = NextRecordTypmod; + *sent_record_typmod = NextRecordTypmod; } } diff --git a/src/backend/cdb/motion/ic_common.c b/src/backend/cdb/motion/ic_common.c deleted file mode 100644 index 74be198ed5b..00000000000 --- a/src/backend/cdb/motion/ic_common.c +++ /dev/null @@ -1,878 +0,0 @@ -/*------------------------------------------------------------------------- - * ic_common.c - * Interconnect code shared between UDP, and TCP IPC Layers. 
- * - * Portions Copyright (c) 2005-2008, Cloudberry - * Portions Copyright (c) 2012-Present VMware, Inc. or its affiliates. - * - * - * IDENTIFICATION - * src/backend/cdb/motion/ic_common.c - * - * Reviewers: jzhang, tkordas - *------------------------------------------------------------------------- - */ - -#include "postgres.h" - -#include "common/ip.h" -#include "nodes/execnodes.h" /* ExecSlice, SliceTable */ -#include "miscadmin.h" -#include "libpq/libpq-be.h" -#include "utils/builtins.h" -#include "utils/memutils.h" - -#include "cdb/ml_ipc.h" -#include "cdb/cdbvars.h" -#include "cdb/cdbdisp.h" - -#include -#include -#include -#include - -/* - #define AMS_VERBOSE_LOGGING -*/ - -/*========================================================================= - * STRUCTS - */ -typedef struct interconnect_handle_t -{ - ChunkTransportState *interconnect_context; /* Interconnect state */ - - ResourceOwner owner; /* owner of this handle */ - struct interconnect_handle_t *next; - struct interconnect_handle_t *prev; -} interconnect_handle_t; - -/*========================================================================= - * GLOBAL STATE VARIABLES - */ - -/* Socket file descriptor for the listener. 
*/ -int TCP_listenerFd; -int UDP_listenerFd; - -static interconnect_handle_t *open_interconnect_handles; -static bool interconnect_resowner_callback_registered; - -/*========================================================================= - * FUNCTIONS PROTOTYPES - */ - -static void interconnect_abort_callback(ResourceReleasePhase phase, - bool isCommit, - bool isTopLevel, - void *arg); -static void cleanup_interconnect_handle(interconnect_handle_t *h); -static interconnect_handle_t *allocate_interconnect_handle(void); -static void destroy_interconnect_handle(interconnect_handle_t *h); -static interconnect_handle_t *find_interconnect_handle(ChunkTransportState *icContext); - -static void -logChunkParseDetails(MotionConn *conn, uint32 ic_instance_id) -{ - struct icpkthdr *pkt; - - Assert(conn != NULL); - Assert(conn->pBuff != NULL); - - pkt = (struct icpkthdr *) conn->pBuff; - - elog(LOG, "Interconnect parse details: pkt->len %d pkt->seq %d pkt->flags 0x%x conn->active %d conn->stopRequest %d pkt->icId %d my_icId %d", - pkt->len, pkt->seq, pkt->flags, conn->stillActive, conn->stopRequested, pkt->icId, ic_instance_id); - - elog(LOG, "Interconnect parse details continued: peer: srcpid %d dstpid %d recvslice %d sendslice %d srccontent %d dstcontent %d", - pkt->srcPid, pkt->dstPid, pkt->recvSliceIndex, pkt->sendSliceIndex, pkt->srcContentId, pkt->dstContentId); -} - -TupleChunkListItem -RecvTupleChunk(MotionConn *conn, ChunkTransportState *transportStates) -{ - TupleChunkListItem tcItem; - TupleChunkListItem firstTcItem = NULL; - TupleChunkListItem lastTcItem = NULL; - uint32 tcSize; - int bytesProcessed = 0; - - if (Gp_interconnect_type == INTERCONNECT_TYPE_TCP || - Gp_interconnect_type == INTERCONNECT_TYPE_PROXY) - { - /* read the packet in from the network. */ - readPacket(conn, transportStates); - - /* go through and form us some TupleChunks. */ - bytesProcessed = PACKET_HEADER_SIZE; - } - else - { - /* go through and form us some TupleChunks. 
*/ - bytesProcessed = sizeof(struct icpkthdr); - } - -#ifdef AMS_VERBOSE_LOGGING - elog(DEBUG5, "recvtuple chunk recv bytes %d msgsize %d conn->pBuff %p conn->msgPos: %p", - conn->recvBytes, conn->msgSize, conn->pBuff, conn->msgPos); -#endif - - while (bytesProcessed != conn->msgSize) - { - if (conn->msgSize - bytesProcessed < TUPLE_CHUNK_HEADER_SIZE) - { - logChunkParseDetails(conn, transportStates->sliceTable->ic_instance_id); - - ereport(ERROR, - (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), - errmsg("interconnect error parsing message: insufficient data received"), - errdetail("conn->msgSize %d bytesProcessed %d < chunk-header %d", - conn->msgSize, bytesProcessed, TUPLE_CHUNK_HEADER_SIZE))); - } - - tcSize = TUPLE_CHUNK_HEADER_SIZE + (*(uint16 *) (conn->msgPos + bytesProcessed)); - - /* sanity check */ - if (tcSize > Gp_max_packet_size) - { - /* - * see MPP-720: it is possible that our message got messed up by a - * cancellation ? - */ - ML_CHECK_FOR_INTERRUPTS(transportStates->teardownActive); - - /* - * MPP-4010: add some extra debugging. - */ - if (lastTcItem != NULL) - elog(LOG, "Interconnect error parsing message: last item length %d inplace %p", lastTcItem->chunk_length, lastTcItem->inplace); - else - elog(LOG, "Interconnect error parsing message: no last item"); - - logChunkParseDetails(conn, transportStates->sliceTable->ic_instance_id); - - ereport(ERROR, - (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), - errmsg("interconnect error parsing message"), - errdetail("tcSize %d > max %d header %d processed %d/%d from %p", - tcSize, Gp_max_packet_size, - TUPLE_CHUNK_HEADER_SIZE, bytesProcessed, - conn->msgSize, conn->msgPos))); - } - - - /* - * we only check for interrupts here when we don't have a guaranteed - * full-message - */ - if (Gp_interconnect_type == INTERCONNECT_TYPE_TCP || - Gp_interconnect_type == INTERCONNECT_TYPE_PROXY) - { - if (tcSize >= conn->msgSize) - { - /* - * see MPP-720: it is possible that our message got messed up - * by a cancellation ? 
- */ - ML_CHECK_FOR_INTERRUPTS(transportStates->teardownActive); - - logChunkParseDetails(conn, transportStates->sliceTable->ic_instance_id); - - ereport(ERROR, - (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), - errmsg("interconnect error parsing message"), - errdetail("tcSize %d >= conn->msgSize %d", - tcSize, conn->msgSize))); - } - } - Assert(tcSize < conn->msgSize); - - /* - * We store the data inplace, and handle any necessary copying later - * on - */ - tcItem = (TupleChunkListItem) palloc(sizeof(TupleChunkListItemData)); - - tcItem->p_next = NULL; - tcItem->chunk_length = tcSize; - tcItem->inplace = (char *) (conn->msgPos + bytesProcessed); - - bytesProcessed += tcSize; - - if (firstTcItem == NULL) - { - firstTcItem = tcItem; - lastTcItem = tcItem; - } - else - { - lastTcItem->p_next = tcItem; - lastTcItem = tcItem; - } - } - - conn->recvBytes -= conn->msgSize; - if (conn->recvBytes != 0) - { -#ifdef AMS_VERBOSE_LOGGING - elog(DEBUG5, "residual message %d bytes", conn->recvBytes); -#endif - conn->msgPos += conn->msgSize; - } - - conn->msgSize = 0; - - return firstTcItem; -} - -/*========================================================================= - * VISIBLE FUNCTIONS - */ - -/* See ml_ipc.h */ -void -InitMotionLayerIPC(void) -{ - uint16 tcp_listener = 0; - uint16 udp_listener = 0; - - /* activated = false; */ - - if (Gp_interconnect_type == INTERCONNECT_TYPE_TCP || - Gp_interconnect_type == INTERCONNECT_TYPE_PROXY) - InitMotionTCP(&TCP_listenerFd, &tcp_listener); - else if (Gp_interconnect_type == INTERCONNECT_TYPE_UDPIFC) - InitMotionUDPIFC(&UDP_listenerFd, &udp_listener); - - Gp_listener_port = (udp_listener << 16) | tcp_listener; - - elog(DEBUG1, "Interconnect listening on tcp port %d udp port %d (0x%x)", tcp_listener, udp_listener, Gp_listener_port); -} - -/* See ml_ipc.h */ -void -CleanUpMotionLayerIPC(void) -{ - if (gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG) - elog(DEBUG3, "Cleaning Up Motion Layer IPC..."); - - if (Gp_interconnect_type == 
INTERCONNECT_TYPE_TCP || - Gp_interconnect_type == INTERCONNECT_TYPE_PROXY) - CleanupMotionTCP(); - else if (Gp_interconnect_type == INTERCONNECT_TYPE_UDPIFC) - CleanupMotionUDPIFC(); - - /* close down the Interconnect listener socket. */ - if (TCP_listenerFd >= 0) - closesocket(TCP_listenerFd); - - if (UDP_listenerFd >= 0) - closesocket(UDP_listenerFd); - - /* be safe and reset global state variables. */ - Gp_listener_port = 0; - TCP_listenerFd = -1; - UDP_listenerFd = -1; -} - -/* See ml_ipc.h */ -bool -SendTupleChunkToAMS(MotionLayerState *mlStates, - ChunkTransportState *transportStates, - int16 motNodeID, - int16 targetRoute, - TupleChunkListItem tcItem) -{ - int i, - recount = 0; - ChunkTransportStateEntry *pEntry = NULL; - MotionConn *conn; - TupleChunkListItem currItem; - - if (!transportStates) - elog(FATAL, "SendTupleChunkToAMS: no transport-states."); - if (!transportStates->activated) - elog(FATAL, "SendTupleChunkToAMS: transport states inactive"); - - /* check em' */ - ML_CHECK_FOR_INTERRUPTS(transportStates->teardownActive); - -#ifdef AMS_VERBOSE_LOGGING - elog(DEBUG3, "sendtuplechunktoams: calling get_transport_state" - "w/transportStates %p transportState->size %d motnodeid %d route %d", - transportStates, transportStates->size, motNodeID, targetRoute); -#endif - - getChunkTransportState(transportStates, motNodeID, &pEntry); - - /* - * tcItem can actually be a chain of tcItems. we need to send out all of - * them. - */ - for (currItem = tcItem; currItem != NULL; currItem = currItem->p_next) - { -#ifdef AMS_VERBOSE_LOGGING - elog(DEBUG5, "SendTupleChunkToAMS: chunk length %d", currItem->chunk_length); -#endif - - if (targetRoute == BROADCAST_SEGIDX) - { - doBroadcast(transportStates, pEntry, currItem, &recount); - } - else - { - if (targetRoute < 0 || targetRoute >= pEntry->numConns) - { - elog(FATAL, "SendTupleChunkToAMS: targetRoute is %d, must be between 0 and %d .", - targetRoute, pEntry->numConns); - } - /* handle pt-to-pt message. 
Primary */ - conn = pEntry->conns + targetRoute; - /* only send to interested connections */ - if (conn->stillActive) - { - transportStates->SendChunk(transportStates, pEntry, conn, currItem, motNodeID); - if (!conn->stillActive) - recount = 1; - } - /* in 4.0 logical mirror xmit eliminated. */ - } - } - - if (recount == 0) - return true; - - /* if we don't have any connections active, return false */ - for (i = 0; i < pEntry->numConns; i++) - { - conn = pEntry->conns + i; - if (conn->stillActive) - break; - } - - /* if we found an active connection we're not done */ - return (i < pEntry->numConns); -} - -/* - * The fetches a direct pointer into our transmit buffers, along with - * an indication as to how much data can be safely shoved into the - * buffer (started at the pointed location). - * - * This works a lot like SendTupleChunkToAMS(). - */ -void -getTransportDirectBuffer(ChunkTransportState *transportStates, - int16 motNodeID, - int16 targetRoute, - struct directTransportBuffer *b) -{ - ChunkTransportStateEntry *pEntry = NULL; - MotionConn *conn; - - if (!transportStates) - { - elog(FATAL, "getTransportDirectBuffer: no transport states"); - } - else if (!transportStates->activated) - { - elog(FATAL, "getTransportDirectBuffer: inactive transport states"); - } - else if (targetRoute == BROADCAST_SEGIDX) - { - elog(FATAL, "getTransportDirectBuffer: can't direct-transport to broadcast"); - } - - Assert(b != NULL); - - do - { - getChunkTransportState(transportStates, motNodeID, &pEntry); - - /* handle pt-to-pt message. Primary */ - conn = pEntry->conns + targetRoute; - /* only send to interested connections */ - if (!conn->stillActive) - { - break; - } - - b->pri = conn->pBuff + conn->msgSize; - b->prilen = Gp_max_packet_size - conn->msgSize; - - /* got buffer. */ - return; - } - while (0); - - /* buffer is missing ? 
*/ - - b->pri = NULL; - b->prilen = 0; - - return; -} - -/* - * The fetches a direct pointer into our transmit buffers, along with - * an indication as to how much data can be safely shoved into the - * buffer (started at the pointed location). - * - * This works a lot like SendTupleChunkToAMS(). - */ -void -putTransportDirectBuffer(ChunkTransportState *transportStates, - int16 motNodeID, - int16 targetRoute, int length) -{ - ChunkTransportStateEntry *pEntry = NULL; - MotionConn *conn; - - if (!transportStates) - { - elog(FATAL, "putTransportDirectBuffer: no transport states"); - } - else if (!transportStates->activated) - { - elog(FATAL, "putTransportDirectBuffer: inactive transport states"); - } - else if (targetRoute == BROADCAST_SEGIDX) - { - elog(FATAL, "putTransportDirectBuffer: can't direct-transport to broadcast"); - } - - getChunkTransportState(transportStates, motNodeID, &pEntry); - - /* handle pt-to-pt message. Primary */ - conn = pEntry->conns + targetRoute; - /* only send to interested connections */ - if (conn->stillActive) - { - conn->msgSize += length; - conn->tupleCount++; - } - - /* put buffer. 
*/ - return; -} - -/* - * DeregisterReadInterest is called on receiving nodes when they - * believe that they're done with the receiver - */ -void -DeregisterReadInterest(ChunkTransportState *transportStates, - int motNodeID, - int srcRoute, - const char *reason) -{ - ChunkTransportStateEntry *pEntry = NULL; - MotionConn *conn; - - if (!transportStates) - { - elog(FATAL, "DeregisterReadInterest: no transport states"); - } - - if (!transportStates->activated) - return; - - getChunkTransportState(transportStates, motNodeID, &pEntry); - conn = pEntry->conns + srcRoute; - - if (gp_log_interconnect >= GPVARS_VERBOSITY_DEBUG) - { - elog(DEBUG3, "Interconnect finished receiving " - "from seg%d slice%d %s pid=%d sockfd=%d; %s", - conn->remoteContentId, - pEntry->sendSlice->sliceIndex, - conn->remoteHostAndPort, - conn->cdbProc->pid, - conn->sockfd, - reason); - } - - if (Gp_interconnect_type == INTERCONNECT_TYPE_UDPIFC) - { -#ifdef AMS_VERBOSE_LOGGING - elog(LOG, "deregisterReadInterest set stillactive = false for node %d route %d (%s)", motNodeID, srcRoute, reason); -#endif - markUDPConnInactiveIFC(conn); - } - else - { - /* - * we also mark the connection as "done." The way synchronization - * works is strange. 
On QDs the "teardown" doesn't get called until - * all segments are finished, which means that we need some way for - * the QEs to know that Teardown should complete, otherwise we - * deadlock the entire query (QEs wait in their Teardown calls, while - * the QD waits for them to finish) - */ - shutdown(conn->sockfd, SHUT_WR); - - MPP_FD_CLR(conn->sockfd, &pEntry->readSet); - } - return; -} - -void -SetupInterconnect(EState *estate) -{ - interconnect_handle_t *h; - MemoryContext oldContext; - - if (estate->interconnect_context) - { - elog(ERROR, "SetupInterconnect: already initialized."); - } - else if (!estate->es_sliceTable) - { - elog(ERROR, "SetupInterconnect: no slice table ?"); - } - - h = allocate_interconnect_handle(); - - Assert(InterconnectContext != NULL); - oldContext = MemoryContextSwitchTo(InterconnectContext); - - if (Gp_interconnect_type == INTERCONNECT_TYPE_UDPIFC) - SetupUDPIFCInterconnect(estate); - else if (Gp_interconnect_type == INTERCONNECT_TYPE_TCP || - Gp_interconnect_type == INTERCONNECT_TYPE_PROXY) - SetupTCPInterconnect(estate); - else - elog(ERROR, "unsupported expected interconnect type"); - - MemoryContextSwitchTo(oldContext); - - h->interconnect_context = estate->interconnect_context; -} - -/* TeardownInterconnect() function is used to cleanup interconnect resources that - * were allocated during SetupInterconnect(). This function should ALWAYS be - * called after SetupInterconnect to avoid leaking resources (like sockets) - * even if SetupInterconnect did not complete correctly. 
- */ -void -TeardownInterconnect(ChunkTransportState *transportStates, bool hasErrors) -{ - interconnect_handle_t *h = find_interconnect_handle(transportStates); - - if (Gp_interconnect_type == INTERCONNECT_TYPE_UDPIFC) - { - TeardownUDPIFCInterconnect(transportStates, hasErrors); - } - else if (Gp_interconnect_type == INTERCONNECT_TYPE_TCP || - Gp_interconnect_type == INTERCONNECT_TYPE_PROXY) - { - TeardownTCPInterconnect(transportStates, hasErrors); - } - - if (h != NULL) - destroy_interconnect_handle(h); -} - -/*========================================================================= - * HELPER FUNCTIONS - */ - - -/* Function createChunkTransportState() is used to create a ChunkTransportState struct and - * place it in the hashtab hashtable based on the motNodeID. - * - * PARAMETERS - * - * motNodeID - motion node ID for this ChunkTransportState. - * - * numConns - number of primary connections for this motion node. - * All are incoming if this is a receiving motion node. - * All are outgoing if this is a sending motion node. - * - * RETURNS - * An empty and initialized ChunkTransportState struct for the given motion node. If - * a ChuckTransportState struct is already registered for the motNodeID an ERROR is - * thrown. 
- */ -ChunkTransportStateEntry * -createChunkTransportState(ChunkTransportState *transportStates, - ExecSlice *sendSlice, - ExecSlice *recvSlice, - int numConns) -{ - ChunkTransportStateEntry *pEntry; - int motNodeID; - int i; - - Assert(recvSlice->sliceIndex >= 0); - Assert(sendSlice->sliceIndex > 0); - - motNodeID = sendSlice->sliceIndex; - if (motNodeID > transportStates->size) - { - /* increase size of our table */ - ChunkTransportStateEntry *newTable; - - newTable = repalloc(transportStates->states, motNodeID * sizeof(ChunkTransportStateEntry)); - transportStates->states = newTable; - /* zero-out the new piece at the end */ - MemSet(&transportStates->states[transportStates->size], 0, (motNodeID - transportStates->size) * sizeof(ChunkTransportStateEntry)); - transportStates->size = motNodeID; - } - - pEntry = &transportStates->states[motNodeID - 1]; - - if (pEntry->valid) - { - ereport(ERROR, - (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), - errmsg("interconnect error: A HTAB entry for motion node %d already exists", - motNodeID), - errdetail("conns %p numConns %d first sock %d", - pEntry->conns, pEntry->numConns, - pEntry->conns[0].sockfd))); - } - - pEntry->valid = true; - - pEntry->motNodeId = motNodeID; - pEntry->numConns = numConns; - pEntry->scanStart = 0; - pEntry->sendSlice = sendSlice; - pEntry->recvSlice = recvSlice; - - pEntry->conns = palloc0(pEntry->numConns * sizeof(pEntry->conns[0])); - - for (i = 0; i < pEntry->numConns; i++) - { - MotionConn *conn = &pEntry->conns[i]; - - /* Initialize MotionConn entry. */ - conn->state = mcsNull; - conn->sockfd = -1; - conn->msgSize = 0; - conn->tupleCount = 0; - conn->stillActive = false; - conn->stopRequested = false; - conn->wakeup_ms = 0; - conn->cdbProc = NULL; - conn->sent_record_typmod = 0; - conn->remapper = NULL; - } - - return pEntry; -} - -/* Function removeChunkTransportState() is used to remove a ChunkTransportState struct from - * the hashtab hashtable. 
- * - * This should only be called after createChunkTransportState(). - * - * PARAMETERS - * - * motNodeID - motion node ID to lookup the ChunkTransportState. - * pIncIdx - parent slice idx in child slice. If not multiplexed, should be 1. - * - * RETURNS - * The ChunkTransportState that was removed from the hashtab hashtable. - */ -ChunkTransportStateEntry * -removeChunkTransportState(ChunkTransportState *transportStates, - int16 motNodeID) -{ - ChunkTransportStateEntry *pEntry = NULL; - - if (motNodeID > transportStates->size) - { - ereport(ERROR, - (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), - errmsg("interconnect error: Unexpected Motion Node Id: %d", - motNodeID), - errdetail("During remove. (size %d)", transportStates->size))); - } - else if (!transportStates->states[motNodeID - 1].valid) - { - ereport(ERROR, - (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), - errmsg("interconnect error: Unexpected Motion Node Id: %d", - motNodeID), - errdetail("During remove. State not valid"))); - } - else - { - transportStates->states[motNodeID - 1].valid = false; - pEntry = &transportStates->states[motNodeID - 1]; - } - - MPP_FD_ZERO(&pEntry->readSet); - - return pEntry; -} - -/* - * checkForCancelFromQD - * Check for cancel from QD. 
- * - * Should be called only inside the dispatcher - */ -void -checkForCancelFromQD(ChunkTransportState *pTransportStates) -{ - Assert(Gp_role == GP_ROLE_DISPATCH); - Assert(pTransportStates); - Assert(pTransportStates->estate); - - if (cdbdisp_checkForCancel(pTransportStates->estate->dispatcherState)) - { - ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), - errmsg(CDB_MOTION_LOST_CONTACT_STRING))); - /* not reached */ - } -} - -/* -* WaitInterconnectQuit -* -* Wait for the ic thread to quit, don't clean any resource owned by ic thread -*/ -void -WaitInterconnectQuit(void) -{ - if (Gp_interconnect_type == INTERCONNECT_TYPE_UDPIFC) - { - WaitInterconnectQuitUDPIFC(); - } -} - -interconnect_handle_t * -allocate_interconnect_handle(void) -{ - interconnect_handle_t *h; - - if (InterconnectContext == NULL) - InterconnectContext = AllocSetContextCreate(TopMemoryContext, - "Interconnect Context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); - - h = MemoryContextAllocZero(InterconnectContext, sizeof(interconnect_handle_t)); - - h->owner = CurrentResourceOwner; - h->next = open_interconnect_handles; - h->prev = NULL; - if (open_interconnect_handles) - open_interconnect_handles->prev = h; - open_interconnect_handles = h; - - if (!interconnect_resowner_callback_registered) - { - RegisterResourceReleaseCallback(interconnect_abort_callback, NULL); - interconnect_resowner_callback_registered = true; - } - return h; -} - -static void -destroy_interconnect_handle(interconnect_handle_t *h) -{ - h->interconnect_context = NULL; - /* unlink from linked list first */ - if (h->prev) - h->prev->next = h->next; - else - open_interconnect_handles = h->next; - if (h->next) - h->next->prev = h->prev; - - pfree(h); - - if (open_interconnect_handles == NULL) - MemoryContextReset(InterconnectContext); -} - -static interconnect_handle_t * -find_interconnect_handle(ChunkTransportState *icContext) -{ - interconnect_handle_t *head = 
open_interconnect_handles; - while (head != NULL) - { - if (head->interconnect_context == icContext) - return head; - head = head->next; - } - return NULL; -} - -static void -cleanup_interconnect_handle(interconnect_handle_t *h) -{ - if (h->interconnect_context == NULL) - { - destroy_interconnect_handle(h); - return; - } - TeardownInterconnect(h->interconnect_context, true); -} - -static void -interconnect_abort_callback(ResourceReleasePhase phase, - bool isCommit, - bool isTopLevel, - void *arg) -{ - interconnect_handle_t *curr; - interconnect_handle_t *next; - - if (phase != RESOURCE_RELEASE_AFTER_LOCKS) - return; - - next = open_interconnect_handles; - while (next) - { - curr = next; - next = curr->next; - - if (curr->owner == CurrentResourceOwner) - { - if (isCommit) - elog(WARNING, "interconnect reference leak: %p still referenced", curr); - - cleanup_interconnect_handle(curr); - } - } -} - -/* - * format_sockaddr - * Format a sockaddr to a human readable string - * - * This function must be kept threadsafe, elog/ereport/palloc etc are not - * allowed within this function. 
- */ -char * -format_sockaddr(struct sockaddr_storage *sa, char *buf, size_t len) -{ - int ret; - char remote_host[NI_MAXHOST]; - char remote_port[NI_MAXSERV]; - - ret = pg_getnameinfo_all(sa, sizeof(struct sockaddr_storage), - remote_host, sizeof(remote_host), - remote_port, sizeof(remote_port), - NI_NUMERICHOST | NI_NUMERICSERV); - - if (ret != 0) - snprintf(buf, len, "?host?:?port?"); - else - { -#ifdef HAVE_IPV6 - if (sa->ss_family == AF_INET6) - snprintf(buf, len, "[%s]:%s", remote_host, remote_port); - else -#endif - snprintf(buf, len, "%s:%s", remote_host, remote_port); - } - - return buf; -} diff --git a/src/backend/cdb/motion/ic_proxy_bgworker.c b/src/backend/cdb/motion/ic_proxy_bgworker.c index f2ebdd28ffd..ba50dbf9bc6 100644 --- a/src/backend/cdb/motion/ic_proxy_bgworker.c +++ b/src/backend/cdb/motion/ic_proxy_bgworker.c @@ -16,9 +16,8 @@ #include "postgres.h" #include "storage/ipc.h" - #include "cdb/ic_proxy_bgworker.h" -#include "ic_proxy_server.h" +#include "cdb/ml_ipc.h" bool ICProxyStartRule(Datum main_arg) @@ -32,6 +31,12 @@ ICProxyStartRule(Datum main_arg) void ICProxyMain(Datum main_arg) { - /* main loop */ - proc_exit(ic_proxy_server_main()); + /* in utility mode, won't preload interconnect module. + * also won't call cdb_setup(). + */ + if (CurrentMotionIPCLayer) { + proc_exit(CurrentMotionIPCLayer->IcProxyServiceMain()); + } else { + proc_exit(0); + } } diff --git a/src/backend/cdb/motion/tupser.c b/src/backend/cdb/motion/tupser.c index 40c658ea7d1..334275df66f 100644 --- a/src/backend/cdb/motion/tupser.c +++ b/src/backend/cdb/motion/tupser.c @@ -270,7 +270,7 @@ addByteStringToChunkList(TupleChunkList tcList, char *data, int datalen, TupleCh void SerializeRecordCacheIntoChunks(SerTupInfo *pSerInfo, TupleChunkList tcList, - MotionConn *conn) + int32 sent_record_typmod) { TupleChunkListItem tcItem = NULL; MemoryContext oldCtxt; @@ -305,7 +305,7 @@ SerializeRecordCacheIntoChunks(SerTupInfo *pSerInfo, * sent by sender. 
*/ oldCtxt = MemoryContextSwitchTo(s_tupSerMemCtxt); - typelist = build_tuple_node_list(conn->sent_record_typmod); + typelist = build_tuple_node_list(sent_record_typmod); buf = serializeNode((Node *) typelist, &size, NULL); MemoryContextSwitchTo(oldCtxt); diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 835d0179600..d9c4b6ad7ef 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -452,7 +452,7 @@ standard_ExecutorStart(QueryDesc *queryDesc, int eflags) * Initialize the motion layer for this query. */ Assert(!estate->interconnect_context); - SetupInterconnect(estate); + CurrentMotionIPCLayer->SetupInterconnect(estate); Assert(estate->interconnect_context); UpdateMotionExpectedReceivers(estate->motionlayer_context, estate->es_sliceTable); @@ -666,7 +666,7 @@ standard_ExecutorStart(QueryDesc *queryDesc, int eflags) !estate->es_interconnect_is_setup) { Assert(!estate->interconnect_context); - SetupInterconnect(estate); + CurrentMotionIPCLayer->SetupInterconnect(estate); Assert(estate->interconnect_context); UpdateMotionExpectedReceivers(estate->motionlayer_context, estate->es_sliceTable); } diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c index b5d9cba778a..af4ee939a37 100644 --- a/src/backend/executor/execUtils.c +++ b/src/backend/executor/execUtils.c @@ -2007,7 +2007,8 @@ void mppExecutorFinishup(QueryDesc *queryDesc) /* Teardown the Interconnect */ if (estate->es_interconnect_is_setup) { - TeardownInterconnect(estate->interconnect_context, false); + Assert(CurrentMotionIPCLayer); + CurrentMotionIPCLayer->TeardownInterconnect(estate->interconnect_context, false); estate->interconnect_context = NULL; estate->es_interconnect_is_setup = false; } @@ -2092,7 +2093,8 @@ void mppExecutorCleanup(QueryDesc *queryDesc) /* Clean up the interconnect. 
*/ if (estate->es_interconnect_is_setup) { - TeardownInterconnect(estate->interconnect_context, true); + Assert(CurrentMotionIPCLayer); + CurrentMotionIPCLayer->TeardownInterconnect(estate->interconnect_context, true); estate->es_interconnect_is_setup = false; } diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c index b5d65741943..a8c634c83de 100644 --- a/src/backend/executor/nodeSubplan.c +++ b/src/backend/executor/nodeSubplan.c @@ -1160,7 +1160,7 @@ PG_TRY(); * Set up the interconnect for execution of the initplan root slice. */ Assert(!(queryDesc->estate->interconnect_context)); - SetupInterconnect(queryDesc->estate); + CurrentMotionIPCLayer->SetupInterconnect(queryDesc->estate); Assert((queryDesc->estate->interconnect_context)); UpdateMotionExpectedReceivers(queryDesc->estate->motionlayer_context, queryDesc->estate->es_sliceTable); @@ -1381,7 +1381,8 @@ PG_TRY(); /* Clean up the interconnect. */ if (queryDesc && queryDesc->estate && queryDesc->estate->es_interconnect_is_setup) { - TeardownInterconnect(queryDesc->estate->interconnect_context, false); /* following success on QD */ + Assert(CurrentMotionIPCLayer); + CurrentMotionIPCLayer->TeardownInterconnect(queryDesc->estate->interconnect_context, false); /* following success on QD */ queryDesc->estate->interconnect_context = NULL; queryDesc->estate->es_interconnect_is_setup = false; } @@ -1444,7 +1445,8 @@ PG_CATCH(); */ if (queryDesc && queryDesc->estate && queryDesc->estate->es_interconnect_is_setup) { - TeardownInterconnect(queryDesc->estate->interconnect_context, true); + Assert(CurrentMotionIPCLayer); + CurrentMotionIPCLayer->TeardownInterconnect(queryDesc->estate->interconnect_context, true); queryDesc->estate->interconnect_context = NULL; queryDesc->estate->es_interconnect_is_setup = false; } diff --git a/src/backend/storage/ipc/ipc.c b/src/backend/storage/ipc/ipc.c index 4476bb6f204..a727ea4a7ed 100644 --- a/src/backend/storage/ipc/ipc.c +++ 
b/src/backend/storage/ipc/ipc.c @@ -212,8 +212,12 @@ proc_exit_prepare(int code) * * It's ok to shutdown Interconnect background thread here, process is dying, no * necessary to receive more motion data. + * + * Current motion ipc layer can be NULL in UTILITY mode, in this case, + * it is not necessary to wait for the interconnect to exit. */ - WaitInterconnectQuit(); + if (CurrentMotionIPCLayer) + CurrentMotionIPCLayer->WaitInterconnectQuit(); elog(DEBUG3, "proc_exit(%d)", code); diff --git a/src/backend/tcop/dest.c b/src/backend/tcop/dest.c index 309b10962af..3db078a2f00 100644 --- a/src/backend/tcop/dest.c +++ b/src/backend/tcop/dest.c @@ -42,6 +42,7 @@ #include "utils/portal.h" #include "cdb/cdbvars.h" +#include "cdb/ml_ipc.h" #include "utils/vmem_tracker.h" /* ---------------- @@ -325,7 +326,11 @@ sendQEDetails(void) StringInfoData msgbuf; char port_str[11]; - snprintf(port_str, sizeof(port_str), "%u", Gp_listener_port); + if (CurrentMotionIPCLayer->ic_type == INTERCONNECT_TYPE_TCP || CurrentMotionIPCLayer->ic_type == INTERCONNECT_TYPE_PROXY) { + snprintf(port_str, sizeof(port_str), "%u", (0 << 16) | CurrentMotionIPCLayer->GetListenPort()); + } else if (CurrentMotionIPCLayer->ic_type == INTERCONNECT_TYPE_UDPIFC) { + snprintf(port_str, sizeof(port_str), "%u", (CurrentMotionIPCLayer->GetListenPort() << 16) | 0); + } pq_beginmessage(&msgbuf, 'S'); pq_sendstring(&msgbuf, "qe_listener_port"); diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index 666722d464e..abdc2567914 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -1759,6 +1759,12 @@ process_shared_preload_libraries(void) "shared_preload_libraries", false); +#ifdef ENABLE_PRELOAD_IC_MODULE + load_libraries("interconnect", + "preload interconnect module", + false); +#endif + process_shared_preload_libraries_in_progress = false; } diff --git a/src/backend/utils/misc/guc_gp.c b/src/backend/utils/misc/guc_gp.c index 538d1b03af5..a48aa60affb 
100644 --- a/src/backend/utils/misc/guc_gp.c +++ b/src/backend/utils/misc/guc_gp.c @@ -101,7 +101,7 @@ static bool check_gp_resource_group_bypass(bool *newval, void **extra, GucSource static int guc_array_compare(const void *a, const void *b); static bool check_max_running_tasks(int *newval, void **extra, GucSource source); -extern int listenerBacklog; +int listenerBacklog = 128; /* For synchornized GUC value is cache in HashTable, * dispatch value along with query when some guc changed diff --git a/src/include/cdb/cdbinterconnect.h b/src/include/cdb/cdbinterconnect.h index 2354999a9f7..4d7e733448e 100644 --- a/src/include/cdb/cdbinterconnect.h +++ b/src/include/cdb/cdbinterconnect.h @@ -27,337 +27,6 @@ #include "cdb/tupchunklist.h" #include "cdb/tupleremap.h" -struct CdbProcess; /* #include "nodes/execnodes.h" */ -struct ExecSlice; /* #include "nodes/execnodes.h" */ -struct SliceTable; /* #include "nodes/execnodes.h" */ -struct EState; /* #include "nodes/execnodes.h" */ -/* TODO: move "src/backend/cdb/motion/ic_proxy_backend.h" into public include folder*/ -struct ICProxyBackendContext; - -typedef struct icpkthdr -{ - int32 motNodeId; - - /* - * three pairs which seem useful for identifying packets. - * - * MPP-4194: - * It turns out that these can cause collisions; but the - * high bit (1<<31) of the dstListener port is now used - * for disambiguation with mirrors. 
- */ - int32 srcPid; - int32 srcListenerPort; - - int32 dstPid; - int32 dstListenerPort; - - int32 sessionId; - uint32 icId; - - int32 recvSliceIndex; - int32 sendSliceIndex; - int32 srcContentId; - int32 dstContentId; - - /* MPP-6042: add CRC field */ - uint32 crc; - - /* packet specific info */ - int32 flags; - int32 len; - - /* - * The usage of seq and extraSeq field - * a) In a normal DATA packet - * seq -> the data packet sequence number - * extraSeq -> not used - * b) In a normal ACK message (UDPIC_FLAGS_ACK | UDPIC_FLAGS_CAPACITY) - * seq -> the largest seq of the continuously cached packets - * sometimes, it is special, for exampke, conn req ack, mismatch ack. - * extraSeq -> the largest seq of the consumed packets - * c) In a start race NAK message (UPDIC_FLAGS_NAK) - * seq -> the seq from the pkt - * extraSeq -> the extraSeq from the pkt - * d) In a DISORDER message (UDPIC_FLAGS_DISORDER) - * seq -> packet sequence number that triggers the disorder message - * extraSeq -> the largest seq of the received packets - * e) In a DUPLICATE message (UDPIC_FLAGS_DUPLICATE) - * seq -> packet sequence number that triggers the duplicate message - * extraSeq -> the largest seq of the continuously cached packets - * f) In a stop messege (UDPIC_FLAGS_STOP | UDPIC_FLAGS_ACK | UDPIC_FLAGS_CAPACITY) - * seq -> the largest seq of the continuously cached packets - * extraSeq -> the largest seq of the continuously cached packets - * - * - * NOTE that: EOS/STOP flags are often saved in conn_info structure of a connection. - * It is possible for them to be sent together with other flags. 
- * - */ - uint32 seq; - uint32 extraSeq; -} icpkthdr; - -typedef enum MotionConnState -{ - mcsNull, - mcsAccepted, - mcsSetupOutgoingConnection, - mcsConnecting, - mcsRecvRegMsg, - mcsSendRegMsg, - mcsStarted, - mcsEosSent -} MotionConnState; - -typedef struct ICBuffer ICBuffer; -typedef struct ICBufferLink ICBufferLink; - -typedef enum ICBufferListType -{ - ICBufferListType_Primary, - ICBufferListType_Secondary, - ICBufferListType_UNDEFINED -} ICBufferListType; - -struct ICBufferLink -{ - ICBufferLink *next; - ICBufferLink *prev; -}; - -/* - * ICBufferList - * ic buffer list data structure. - * - * There are two kinds of lists. The first kind of list uses the primary next/prev pointers. - * And the second kind uses the secondary next/prev pointers. - */ -typedef struct ICBufferList -{ - int length; - ICBufferListType type; /* primary or secondary */ - - ICBufferLink head; -} ICBufferList; - -#define CONTAINER_OF(ptr, type, member) \ - ({ \ - const typeof( ((type *)0)->member ) *__member_ptr = (ptr); \ - (type *)( (char *)__member_ptr - offsetof(type,member) ); \ - }) - -#define GET_ICBUFFER_FROM_PRIMARY(ptr) CONTAINER_OF(ptr, ICBuffer, primary) -#define GET_ICBUFFER_FROM_SECONDARY(ptr) CONTAINER_OF(ptr, ICBuffer, secondary) - -/* - * ICBuffer - * interconnect buffer data structure. - * - * In some cases, an ICBuffer may exists in two lists/queues, - * thus it has two sets of pointers. For example, an ICBuffer - * can exist in an unack queue and an expiration queue at the same time. - * - * It is important to get the ICBuffer address when we iterate a list of - * ICBuffers through primary/secondary links. The Macro GET_ICBUFFER_FROM_PRIMARY - * and GET_ICBUFFER_FROM_SECONDARY are for this purpose. 
- * - */ -struct ICBuffer -{ - /* primary next and prev pointers */ - ICBufferLink primary; - - /* secondary next and prev pointers */ - ICBufferLink secondary; - - /* connection that this buffer belongs to */ - MotionConn *conn; - - /* - * Three fields for expiration processing - * - * sentTime - the time this buffer was sent - * nRetry - the number of send retries - * unackQueueRingSlot - unack queue ring slot index - */ - uint64 sentTime; - uint32 nRetry; - int32 unackQueueRingSlot; - - /* real data */ - icpkthdr pkt[0]; -}; - - -/* - * Structure used for keeping track of a pt-to-pt connection between two - * Cdb Entities (either QE or QD). - */ -struct MotionConn -{ - /* socket file descriptor. */ - int sockfd; - - /* send side queue for packets to be sent */ - ICBufferList sndQueue; - int capacity; - - /* seq already sent */ - uint32 sentSeq; - - /* ack of this seq and packets with smaller seqs have been received */ - uint32 receivedAckSeq; - - /* packets with this seq or smaller seqs have been consumed */ - uint32 consumedSeq; - - uint64 rtt; - uint64 dev; - uint64 deadlockCheckBeginTime; - - - ICBuffer *curBuff; - - /* send side unacked packet queue. Since it is often - * accessed at the same time with unack queue ring, - * it is protected with unqck queue ring lock. - */ - ICBufferList unackQueue; - - /* pointer to the data buffer. */ - uint8 *pBuff; - - uint16 route; - - /* size of the message in the buffer, if any. */ - int32 msgSize; - - /* position of message inside of buffer, "cursor" pointer */ - uint8 *msgPos; - - /* - * recv bytes: we can have more than one message/message fragment in recv - * queue at once - */ - int32 recvBytes; - - int tupleCount; - - /* - * false means 1) received a stop message and has handled it. 2) received - * EOS message or sent out EOS message 3) received a QueryFinishPending - * notify and has handled it. 
- */ - bool stillActive; - /* - * used both by motion sender and motion receiver - * - * sender: true means receiver don't need to consume tuples any more, sender - * is also responsible to send stop message to its senders. - * - * receiver: true means have sent out a stop message to its senders. The stop - * message might be lost, stopRequested can also tell sender that no more - * data needed in the ack message. - */ - bool stopRequested; - - MotionConnState state; - - uint64 wakeup_ms; - - struct icpkthdr conn_info; - - struct CdbProcess *cdbProc; - int remoteContentId; - char remoteHostAndPort[128]; /* Numeric IP addresses should never be longer than about 50 chars, but play it safe */ - char localHostAndPort[128]; - - struct sockaddr_storage peer; /* Allow for IPv4 or IPv6 */ - socklen_t peer_len; /* And remember the actual length */ - - /* a queue of maximum length Gp_interconnect_queue_depth */ - int pkt_q_capacity; /*max capacity of the queue*/ - int pkt_q_size; /*number of packets in the queue*/ - int pkt_q_head; - int pkt_q_tail; - uint8 **pkt_q; - - uint64 stat_total_ack_time; - uint64 stat_count_acks; - uint64 stat_max_ack_time; - uint64 stat_min_ack_time; - uint64 stat_count_resent; - uint64 stat_max_resent; - uint64 stat_count_dropped; - - /* - * used by the sender. - * - * the typmod of last sent record type in current connection, - * if the connection is for broadcasting then we only check - * and update this attribute on connection 0. - */ - int32 sent_record_typmod; - - /* - * used by the receiver. - * - * all the remap information. - */ - TupleRemapper *remapper; -}; - -/* - * Used to organize all of the information for a given motion node. - */ -typedef struct ChunkTransportStateEntry -{ - int motNodeId; - bool valid; - - /* Connection array */ - MotionConn *conns; - int numConns; - - /* - * used for receiving. to select() from a set of interesting MotionConns - * to see when data is ready to be read. 
When the incoming connections - * are established, read interest is turned on. It is turned off when an - * EOS (End of Stream) message is read. - */ - mpp_fd_set readSet; - - /* highest file descriptor in the readSet. */ - int highReadSock; - - int scanStart; - - /* slice table entries */ - struct ExecSlice *sendSlice; - struct ExecSlice *recvSlice; - - /* setup info */ - int txfd; - int txfd_family; - unsigned short txport; - - bool sendingEos; - - /* Statistics info for this motion on the interconnect level */ - uint64 stat_total_ack_time; - uint64 stat_count_acks; - uint64 stat_max_ack_time; - uint64 stat_min_ack_time; - uint64 stat_count_resent; - uint64 stat_max_resent; - uint64 stat_count_dropped; - -} ChunkTransportStateEntry; - -/* ChunkTransportState array initial size */ -#define CTS_INITIAL_SIZE (10) - /* * This structure is used to keep track of partially completed tuples, * and tuples that have been completed but have not been consumed by @@ -497,11 +166,35 @@ typedef struct MotionLayerState } MotionLayerState; + +/* ChunkTransportState array initial size */ +#define CTS_INITIAL_SIZE (10) + +struct SliceTable; /* #include "nodes/execnodes.h" */ +struct EState; /* #include "nodes/execnodes.h" */ +struct ICProxyBackendContext; +struct MotionConn; +struct ChunkTransportStateEntry; +typedef struct MotionConn MotionConn; +typedef struct ChunkTransportStateEntry ChunkTransportStateEntry; + +typedef struct MotionConnKey +{ + int mot_node_id; + int conn_index; +} MotionConnKey; + +typedef struct MotionConnSentRecordTypmodEnt +{ + MotionConnKey key; + int32 sent_record_typmod; +} MotionConnSentRecordTypmodEnt; + typedef struct ChunkTransportState { /* array of per-motion-node chunk transport state */ int size; - ChunkTransportStateEntry *states; + struct ChunkTransportStateEntry *states; /* keeps track of if we've "activated" connections via SetupInterconnect(). 
*/ bool activated; @@ -521,19 +214,19 @@ typedef struct ChunkTransportState /* Estate pointer for this statement */ struct EState *estate; - /* Function pointers to our send/receive functions */ - bool (*SendChunk)(struct ChunkTransportState *transportStates, ChunkTransportStateEntry *pEntry, MotionConn *conn, TupleChunkListItem tcItem, int16 motionId); - TupleChunkListItem (*RecvTupleChunkFrom)(struct ChunkTransportState *transportStates, int16 motNodeID, int16 srcRoute); - TupleChunkListItem (*RecvTupleChunkFromAny)(struct ChunkTransportState *transportStates, int16 motNodeID, int16 *srcRoute); - void (*doSendStopMessage)(struct ChunkTransportState *transportStates, int16 motNodeID); - void (*SendEos)(struct ChunkTransportState *transportStates, int motNodeID, TupleChunkListItem tcItem); + /* + * used by the sender. + * + * the typmod of last sent record type in current connection, + * if the connection is for broadcasting then we only check + * and update this attribute on connection 0. + * + * mapping the MotionConn -> int32 + */ + HTAB* conn_sent_record_typmod; /* ic_proxy backend context */ struct ICProxyBackendContext *proxyContext; } ChunkTransportState; -extern void dumpICBufferList(ICBufferList *list, const char *fname); -extern void dumpUnackQueueRing(const char *fname); -extern void dumpConnections(ChunkTransportStateEntry *pEntry, const char *fname); - #endif /* CDBINTERCONNECT_H */ diff --git a/src/include/cdb/cdbmotion.h b/src/include/cdb/cdbmotion.h index b6b02ef1a58..5f50548dc6e 100644 --- a/src/include/cdb/cdbmotion.h +++ b/src/include/cdb/cdbmotion.h @@ -22,7 +22,6 @@ /* Define this if you want tons of logs! */ #undef AMS_VERBOSE_LOGGING - typedef enum SendReturnCode { SEND_COMPLETE, @@ -31,7 +30,7 @@ typedef enum SendReturnCode /* * Struct describing the direct transmit buffer. see: - * getTransportDirectBuffer() (in ic_common.c) and + * GetTransportDirectBuffer() (in ic_common.c) and * SerializeTupleDirect() (in cdbmotion.c). 
* * Simplified somewhat in 4.0 to remove mirror-data. diff --git a/src/include/cdb/ml_ipc.h b/src/include/cdb/ml_ipc.h index b6dfb51aeb2..927fc3e3f16 100644 --- a/src/include/cdb/ml_ipc.h +++ b/src/include/cdb/ml_ipc.h @@ -1,6 +1,6 @@ /*------------------------------------------------------------------------- * ml_ipc.h - * Motion Layer IPC Layer. + * Motion IPC Layer. * * Portions Copyright (c) 2005-2008, Cloudberry inc * Portions Copyright (c) 2012-Present VMware, Inc. or its affiliates. @@ -27,300 +27,266 @@ struct EState; /* #include "nodes/execnodes.h" */ extern int TCP_listenerFd; extern int UDP_listenerFd; -/* - * Registration message - * - * Upon making a connection, the sender sends a registration message to - * identify itself to the receiver. A lot of the fields are just there - * for validity checking. - */ -typedef struct RegisterMessage -{ - int32 msgBytes; - int32 recvSliceIndex; - int32 sendSliceIndex; - int32 srcContentId; - int32 srcListenerPort; - int32 srcPid; - int32 srcSessionId; - int32 srcCommandCount; -} RegisterMessage; - /* 2 bytes to store the size of the entire packet. a packet is composed of * of one or more serialized TupleChunks (each of which has a TupleChunk * header. */ #define PACKET_HEADER_SIZE 4 -/* Performs initialization of the MotionLayerIPC. This should be called before - * any work is performed through functions here. Generally, this should only - * need to be called only once during process startup. - * - * Errors are indicated by calls to ereport(), and are therefore not indicated - * by a return code. - * - */ -extern void InitMotionLayerIPC(void); - -/* Performs any cleanup necessary by the Motion Layer IPC. This is the cleanup - * function that matches InitMotionLayerIPC, it should only be called during - * shutdown of the process. This includes shutting down the Motion Listener. - * - * Errors are indicated by calls to ereport(), and are therefore not indicated - * in the return code. 
- */ -extern void CleanUpMotionLayerIPC(void); - -/* - * Wait interconnect thread to quit, called when proc exit. - */ -extern void WaitInterconnectQuit(void); - -/* - * checkForCancelFromQD - * Check for cancel from QD. - * - * Should be called only inside the dispatcher - */ -void -checkForCancelFromQD(ChunkTransportState *pTransportStates); - -/* The SetupInterconnect() function should be called at the beginning of - * executing any DML statement that will need to use the interconnect. - * - * This function goes through the slicetable and makes any appropriate - * outgoing connections as well as accepts any incoming connections. Incoming - * connections will have a "Register" message from them to see which remote - * CdbProcess sent it. - * - * So this function essentially performs all of the setup the interconnect has - * to perform for all of the motion nodes in the upcoming DML statement. - * - * PARAMETERS - * - * mySliceTable - slicetable structure that correlates to the upcoming DML - * statement. - * - * mySliceId - the index of the slice in the slicetable that we are a member of. - * - */ -extern void SetupInterconnect(struct EState *estate); - -/* The TeardownInterconnect() function should be called at the end of executing - * a DML statement to close down all socket resources that were setup during - * SetupInterconnect(). - * - * NOTE: it is important that TeardownInterconnect() happens - * regardless of the outcome of the statement. i.e. gets called - * even if an ERROR occurs during the statement. For abnormal - * statement termination we can force an end-of-stream notification. - * - */ -extern void TeardownInterconnect(ChunkTransportState *transportStates, +typedef struct MotionIPCLayer +{ + GpVars_Interconnect_Type ic_type; + + /* Get max tuple chunk size. + */ + int (*GetMaxTupleChunkSize) (void); + + /* Get IPC service listen port. 
+ * Interface to replace `Gp_listener_port` + */ + int32 (*GetListenPort) (void); + + /* Performs initialization of the MotionLayerIPC. This should be called before + * any work is performed through functions here. Generally, this should only + * need to be called only once during process startup. + * + * Errors are indicated by calls to ereport(), and are therefore not indicated + * by a return code. + * + */ + void (*InitMotionLayerIPC) (void); + + /* Performs any cleanup necessary by the Motion Layer IPC. This is the cleanup + * function that matches InitMotionLayerIPC, it should only be called during + * shutdown of the process. This includes shutting down the Motion Listener. + * + * Errors are indicated by calls to ereport(), and are therefore not indicated + * in the return code. + */ + void (*CleanUpMotionLayerIPC) (void); + + /* + * Wait interconnect thread to quit, called when proc exit. + */ + void (*WaitInterconnectQuit) (void); + + /* The SetupInterconnect() function should be called at the beginning of + * executing any DML statement that will need to use the interconnect. + * + * This function goes through the slicetable and makes any appropriate + * outgoing connections as well as accepts any incoming connections. Incoming + * connections will have a "Register" message from them to see which remote + * CdbProcess sent it. + * + * So this function essentially performs all of the setup the interconnect has + * to perform for all of the motion nodes in the upcoming DML statement. + * + * PARAMETERS + * + * mySliceTable - slicetable structure that correlates to the upcoming DML + * statement. + * + * mySliceId - the index of the slice in the slicetable that we are a member of. + * + */ + void (*SetupInterconnect)(struct EState *estate); + + + /* The TeardownInterconnect() function should be called at the end of executing + * a DML statement to close down all socket resources that were setup during + * SetupInterconnect(). 
+ * + * NOTE: it is important that TeardownInterconnect() happens + * regardless of the outcome of the statement. i.e. gets called + * even if an ERROR occurs during the statement. For abnormal + * statement termination we can force an end-of-stream notification. + * + */ + void (*TeardownInterconnect)(ChunkTransportState *transportStates, bool hasErrors); -extern void WaitInterconnectQuit(void); - - -/* Sends a tuple chunk from the Postgres process to the local AMS process via - * IPC. This function does not block; if the IPC channel cannot accept the - * tuple chunk for some reason, then this is indicated by a return-code. - * - * Errors are indicated by calls to ereport(), and are therefore not indicated - * in the return code. - * - * - * PARAMETERS: - * - motNodeID: motion node Id that the tcItem belongs to. - * - targetRoute: route to send this tcItem out over. - * - tcItem: The tuple-chunk data to send. - * - */ -extern bool SendTupleChunkToAMS(MotionLayerState *mlStates, - ChunkTransportState *transportStates, + /* Sends a tuple chunk from the Postgres process to the local AMS process via + * IPC. This function does not block; if the IPC channel cannot accept the + * tuple chunk for some reason, then this is indicated by a return-code. + * + * Errors are indicated by calls to ereport(), and are therefore not indicated + * in the return code. + * + * + * PARAMETERS: + * - motNodeID: motion node Id that the tcItem belongs to. + * - targetRoute: route to send this tcItem out over. + * - tcItem: The tuple-chunk data to send. + * + */ + bool (*SendTupleChunkToAMS)(ChunkTransportState *transportStates, int16 motNodeID, int16 targetRoute, TupleChunkListItem tcItem); -/* The SendEosToAMS() function is used to send an "End Of Stream" message to - * all connected receivers (generally this is a broadcast) - * - * PARAMETERS: - * - motNodeID: motion node Id that the tcItem belongs to. - * - tcItem: The tuple-chunk data to send. 
- * - */ -extern void SendEosToAMS(MotionLayerState *mlStates, - ChunkTransportState *transportStates, - int motNodeID, - TupleChunkListItem tcItem); - -/* The RecvTupleChunkFromAny() function attempts to receive one or more tuple - * chunks from any of the incoming connections. This function blocks until - * at least one TupleChunk is received. (Although PG Interrupts are still - * checked for within this call). - * - * This function makes some effort to "fairly" pull data from peers with data - * available (a peer with data available is always better than waiting for - * one without data available; but a peer with data available which hasn't been - * read from recently is better than a peer with data available which has - * been read from recently). - * - * NOTE: The TupleChunkListItem can have other's chained to it. The caller - * should check and process all in list. - * - * PARAMETERS: - * - motNodeID: motion node id to receive for. - * - srcRoute: output parameter that allows the function to return back which - * route the TupleChunkListItem is from. - * - * RETURN: - * - A populated TupleChunkListItemData structure (allocated with palloc()). - */ -extern TupleChunkListItem RecvTupleChunkFromAny(MotionLayerState *mlStates, - ChunkTransportState *transportStates, - int16 motNodeID, - int16 *srcRoute); - - -/* The RecvTupleChunkFrom() function is similar to the RecvTupleChunkFromAny() - * function except that the connection we are interested in is specified with - * srcRoute. - * - * PARAMETERS: - * - motNodeID: motion node id to receive for. - * - srcRoute: which connection to receive on. - * RETURN: - * - A populated TupleChunkListItemData structure (allocated with palloc()). - */ -extern TupleChunkListItem RecvTupleChunkFrom(ChunkTransportState *transportStates, + /* Internal API - should not call outside interconnection layer + * + * Sends a tuple chunk from the Postgres process to remote or local via + * IPC. 
This function does not block; if the IPC channel cannot accept the + * tuple chunk for some reason, then this is indicated by a return-code. + * + * Errors are indicated by calls to ereport(), and are therefore not indicated + * in the return code. + * + * + * PARAMETERS: + * - conn: active connection. + * - tcItem: The tuple-chunk data to send. + * - motionId: motion node Id that the tcItem belongs to. + * + */ + bool (*SendChunk)(struct ChunkTransportState *transportStates, + struct ChunkTransportStateEntry *pEntry, + struct MotionConn *conn, + TupleChunkListItem tcItem, + int16 motionId); + + /* The SendEOS() function is used to send an "End Of Stream" message to + * one of connected receivers + * PARAMETERS: + * - motNodeID: motion node Id that the tcItem belongs to. + * - tcItem: The tuple-chunk data to send. + * + */ + void (*SendEOS)(struct ChunkTransportState *transportStates, int motNodeID, TupleChunkListItem tcItem); + + /* The SendStopMessage() function is used to send stop messages to all senders. + * + * PARAMETERS: + * - motNodeID: motion node Id that the tcItem belongs to. + */ + void (*SendStopMessage)(struct ChunkTransportState *transportStates, int16 motNodeID); + + /* The RecvTupleChunkFromAny() function attempts to receive one or more tuple + * chunks from any of the incoming connections. This function blocks until + * at least one TupleChunk is received. (Although PG Interrupts are still + * checked for within this call). + * + * This function makes some effort to "fairly" pull data from peers with data + * available (a peer with data available is always better than waiting for + * one without data available; but a peer with data available which hasn't been + * read from recently is better than a peer with data available which has + * been read from recently). + * + * NOTE: The TupleChunkListItem can have others chained to it. The caller + * should check and process all in list. 
+ * + * PARAMETERS: + * - motNodeID: motion node id to receive for. + * - srcRoute: output parameter that allows the function to return back which + * route the TupleChunkListItem is from. + * + * RETURN: + * - A populated TupleChunkListItemData structure (allocated with palloc()). + */ + TupleChunkListItem (*RecvTupleChunkFromAny)(ChunkTransportState *transportStates, + int16 motNodeID, + int16 *srcRoute); + + /* The RecvTupleChunkFrom() function is similar to the RecvTupleChunkFromAny() + * function except that the connection we are interested in is specified with + * srcRoute. + * + * PARAMETERS: + * - motNodeID: motion node id to receive for. + * - srcRoute: which connection to receive on. + * RETURN: + * - A populated TupleChunkListItemData structure (allocated with palloc()). + */ + TupleChunkListItem (*RecvTupleChunkFrom)(ChunkTransportState *transportStates, int16 motNodeID, int16 srcRoute); -/* The DeregisterReadInterest() function is used to specify that we are no - * longer interested in reading from the specified srcRoute. After calling this - * function, we should no longer ever return TupleChunks from this srcRoute - * when calling RecvTupleChunkFromAny(). - * - * PARAMTERS: - * - motNodeID: motion node id that this applies to. - * - srcRoute: which connection to turn off reads for. - * - */ -extern void DeregisterReadInterest(ChunkTransportState *transportStates, + /* Internal API - should not call outside interconnection layer + * + * Recv a tuple chunk from the remote or local via IPC. + * This function does not block; if the IPC channel cannot accept the + * tuple chunk for some reason, then this is indicated by a return-code. + * + * Errors are indicated by calls to ereport(), and are therefore not indicated + * in the return code. + * + * PARAMETERS: + * - conn: actived connection. + * RETURN: + * - A populated TupleChunkListItemData structure (allocated with palloc()). 
+ * + */ + TupleChunkListItem (*RecvTupleChunk)(struct MotionConn *conn, ChunkTransportState *transportStates); + + /* + * Direct access receive buffer to our freelist. + * + * allows us to "keep" a buffer held for a connection, to avoid a copy + * (see inplace in chunklist). + * + * The cdbmotion code has discarded our pointer to the motion-conn + * structure, but has enough info to fully specify it. + * + * DirectPutRxBuffer() is specific to UDPIFC and can't be used with other IPC + */ + void (*DirectPutRxBuffer)(ChunkTransportState *transportStates, int motNodeID, int route); + + /* The DeregisterReadInterest() function is used to specify that we are no + * longer interested in reading from the specified srcRoute. After calling this + * function, we should no longer ever return TupleChunks from this srcRoute + * when calling RecvTupleChunkFromAny(). + * + * PARAMETERS: + * - motNodeID: motion node id that this applies to. + * - srcRoute: which connection to turn off reads for. + * + */ + void (*DeregisterReadInterest)(ChunkTransportState *transportStates, int motNodeID, int srcRoute, const char *reason); -extern void readPacket(MotionConn *conn, ChunkTransportState *transportStates); - -/* - * Return a UDP receive buffer to our freelist. - * - * allows us to "keep" a buffer held for a connection, to avoid a copy - * (see inplace in chunklist). 
- */ -extern void MlPutRxBufferIFC(ChunkTransportState *transportStates, int motNodeID, int route); - -#define getChunkTransportState(transportState, motNodeID, ppEntry) \ - do { \ - Assert((transportState) != NULL); \ - if ((motNodeID) > 0 && \ - (transportState) && \ - (motNodeID) <= (transportState)->size && \ - (transportState)->states[(motNodeID)-1].motNodeId == (motNodeID) && \ - (transportState)->states[(motNodeID)-1].valid) \ - { \ - *(ppEntry) = &(transportState)->states[(motNodeID) - 1]; \ - } \ - else \ - { \ - ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), \ - errmsg("Interconnect Error: Unexpected Motion Node Id: %d (size %d). This means" \ - " a motion node that wasn't setup is requesting interconnect" \ - " resources.", (motNodeID), (transportState)->size))); \ - /* not reached */ \ - } \ - } while (0) - -#define ML_CHECK_FOR_INTERRUPTS(teardownActive) \ - do {if (!teardownActive && InterruptPending) CHECK_FOR_INTERRUPTS(); } while (0) - -/* - * Return a direct pointer to a transmit buffer. This is actually two pointers - * with accompanying lengths since we have separate xmit buffers for primary and mirror - * segments. - */ -extern void getTransportDirectBuffer(ChunkTransportState *transportStates, + /* Get the number of active motion connections. + * + */ + uint32 (*GetActiveMotionConns)(void); + + /* + * Return a direct pointer to a transmit buffer. This is actually two pointers + * with accompanying lengths since we have separate xmit buffers for primary and mirror + * segments. + */ + void (*GetTransportDirectBuffer)(ChunkTransportState *transportStates, int16 motNodeID, int16 targetRoute, struct directTransportBuffer *b); -/* - * Advance direct buffer beyond the message we just added. - */ -extern void putTransportDirectBuffer(ChunkTransportState *transportStates, + /* + * Advance direct buffer beyond the message we just added. 
+ */ + void (*PutTransportDirectBuffer)(ChunkTransportState *transportStates, int16 motNodeID, - int16 targetRoute, int serializedLength); - -/* doBroadcast() is used to send a TupleChunk to all recipients. - * - * PARAMETERS - * mlStates - motion-layer state ptr. - * transportStates - IC-instance ptr. - * pEntry - ChunkTransportState context that contains everything we need to send. - * tcItem - TupleChunk to send. - */ -#define doBroadcast(transportStates, pEntry, tcItem, inactiveCountPtr) \ - do { \ - MotionConn *conn; \ - int *p_inactive = inactiveCountPtr; \ - int i, index, inactive = 0; \ - /* add our tcItem to each of the outgoing buffers. */ \ - index = Max(0, GpIdentity.segindex); /* entry-db has -1 */ \ - for (i = 0; i < pEntry->numConns; i++, index++) \ - { \ - if (index >= pEntry->numConns) \ - index = 0; \ - conn = pEntry->conns + index; \ - /* only send to still interested receivers. */ \ - if (conn->stillActive) \ - { \ - transportStates->SendChunk(transportStates, pEntry, conn, tcItem, pEntry->motNodeId); \ - if (!conn->stillActive) \ - inactive++; \ - } \ - } \ - if (p_inactive != NULL) \ - *p_inactive = (inactive ? 
1 : 0); \ - } while (0) - - -extern ChunkTransportStateEntry *createChunkTransportState(ChunkTransportState *transportStates, - ExecSlice *sendSlice, - ExecSlice *recvSlice, - int numConns); - -extern ChunkTransportStateEntry *removeChunkTransportState(ChunkTransportState *transportStates, - int16 motNodeID); - -extern TupleChunkListItem RecvTupleChunk(MotionConn *conn, ChunkTransportState *transportStates); - -extern void InitMotionTCP(int *listenerSocketFd, uint16 *listenerPort); -extern void InitMotionUDPIFC(int *listenerSocketFd, uint16 *listenerPort); -extern void markUDPConnInactiveIFC(MotionConn *conn); -extern void CleanupMotionTCP(void); -extern void CleanupMotionUDPIFC(void); -extern void WaitInterconnectQuitUDPIFC(void); -extern void SetupTCPInterconnect(EState *estate); -extern void SetupUDPIFCInterconnect(EState *estate); -extern void TeardownTCPInterconnect(ChunkTransportState *transportStates, - bool hasErrors); -extern void TeardownUDPIFCInterconnect(ChunkTransportState *transportStates, - bool hasErrors); - -extern uint32 getActiveMotionConns(void); - -extern char *format_sockaddr(struct sockaddr_storage *sa, char *buf, size_t len); + int16 targetRoute, + int serializedLength); + + /** + * bgworker call extension method + * Only for ic_proxy. 
+ */ + int (*IcProxyServiceMain) (void); + + /* + * Get the TupleRemapper from MotionConn + * TupleRemapper will be set in interconnect and used in cdbmotion layer + */ + TupleRemapper *(*GetMotionConnTupleRemapper) (ChunkTransportState *transportStates, + int16 motNodeID, + int16 targetRoute); + +} MotionIPCLayer; + +/* MotionIPCLayer selected */ +extern MotionIPCLayer *CurrentMotionIPCLayer; #endif /* ML_IPC_H */ diff --git a/src/include/cdb/tupser.h b/src/include/cdb/tupser.h index 8b3b010766a..cd033bd686f 100644 --- a/src/include/cdb/tupser.h +++ b/src/include/cdb/tupser.h @@ -19,9 +19,6 @@ #include "lib/stringinfo.h" #include "cdb/tupleremap.h" - -typedef struct MotionConn MotionConn; - /* * The next two structures are for cached tuple serialization and * deserialization information. This information is cached since there will @@ -79,7 +76,7 @@ extern void CleanupSerTupInfo(SerTupInfo *pSerInfo); /* Convert RecordCache into chunks ready to send out, in one pass */ extern void SerializeRecordCacheIntoChunks(SerTupInfo *pSerInfo, TupleChunkList tcList, - MotionConn *conn); + int32 sent_record_typmod); /* Convert a tuple into chunks directly in a set of transport buffers */ extern int SerializeTuple(TupleTableSlot *tuple, SerTupInfo *pSerInfo, struct directTransportBuffer *b, TupleChunkList tcList, int16 targetRoute); diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 603b659e1e3..5557c826fe0 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -60,6 +60,9 @@ /* Define to 1 to build with ic-proxy support (--enable-ic-proxy) */ #undef ENABLE_IC_PROXY +/* Define to 1 to disable preload interconnect module */ +#undef ENABLE_PRELOAD_IC_MODULE + /* Define to 1 to build client libraries as thread-safe code. 
(--enable-thread-safety) */ #undef ENABLE_THREAD_SAFETY diff --git a/src/test/regress/expected/ic.out b/src/test/regress/expected/ic.out index 02b8315148a..9d404d972d1 100644 --- a/src/test/regress/expected/ic.out +++ b/src/test/regress/expected/ic.out @@ -449,7 +449,7 @@ DROP TABLE a; RESET search_path; DROP SCHEMA ic_udp_test CASCADE; /* - * If ack packet is lost in doSendStopMessageUDPIFC(), transaction with cursor + * If ack packet is lost in SendStopMessageUDPIFC(), transaction with cursor * should still be able to commit. */ --start_ignore diff --git a/src/test/regress/expected/ic_1.out b/src/test/regress/expected/ic_1.out index d1e2113279a..223ceaa605f 100644 --- a/src/test/regress/expected/ic_1.out +++ b/src/test/regress/expected/ic_1.out @@ -449,7 +449,7 @@ DROP TABLE a; RESET search_path; DROP SCHEMA ic_udp_test CASCADE; /* - * If ack packet is lost in doSendStopMessageUDPIFC(), transaction with cursor + * If ack packet is lost in SendStopMessageUDPIFC(), transaction with cursor * should still be able to commit. */ --start_ignore diff --git a/src/test/regress/regress_gp.c b/src/test/regress/regress_gp.c index 3da688a3f58..f48f7402b71 100644 --- a/src/test/regress/regress_gp.c +++ b/src/test/regress/regress_gp.c @@ -517,7 +517,7 @@ Datum numActiveMotionConns(PG_FUNCTION_ARGS) { uint32 num = 0; if (Gp_interconnect_type == INTERCONNECT_TYPE_UDPIFC) - num = getActiveMotionConns(); + num = CurrentMotionIPCLayer->GetActiveMotionConns(); PG_RETURN_UINT32(num); } diff --git a/src/test/regress/sql/ic.sql b/src/test/regress/sql/ic.sql index 93773241769..79c4f0cccb8 100644 --- a/src/test/regress/sql/ic.sql +++ b/src/test/regress/sql/ic.sql @@ -185,7 +185,7 @@ RESET search_path; DROP SCHEMA ic_udp_test CASCADE; /* - * If ack packet is lost in doSendStopMessageUDPIFC(), transaction with cursor + * If ack packet is lost in SendStopMessageUDPIFC(), transaction with cursor * should still be able to commit. 
*/ --start_ignore From 40e980ca39f0aeb722d1511744e2201307efd5e9 Mon Sep 17 00:00:00 2001 From: zhaoxi Date: Mon, 17 Jul 2023 17:03:26 +0800 Subject: [PATCH 8/9] Fix vectorized query call function results loss problem. This commit fixes the loss of results when a vectorized query calls a function. In the vectorization scenario, define a function as follows: CREATE FUNCTION equipment_named_ambiguous_2b(hobby text) RETURNS setof equipment_r AS 'select * from equipment_r where equipment_r.hobby = hobby' LANGUAGE SQL; Then execute select equipment_named_ambiguous_2b(text 'skywalking'), and results will be lost. The internal SQL of the function, 'select * from equipment_r where equipment_r.hobby = hobby', follows the vectorized execution path, and the returned result is multiple tuples. The implementation of the function fmgr_sql contains the following code: /* Extract the result as a datum, and copy out from the slot */ result = postquel_get_single_result(slot, fcinfo, fcache, oldcontext); /* Clear the tuplestore, but keep it for next time */ /* NB: this might delete the slot's content, but we don't care */ tuplestore_clear(fcache->tstore); Every time a tuple is returned, tuplestore_clear is called to clean up, which causes data loss in the query result.
--- src/backend/executor/functions.c | 15 ++++++++++----- src/backend/utils/sort/tuplestore.c | 20 ++++++++++++++++++++ src/include/utils/tuplestore.h | 4 ++++ 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/src/backend/executor/functions.c b/src/backend/executor/functions.c index d9820d6327d..e411dfa3876 100644 --- a/src/backend/executor/functions.c +++ b/src/backend/executor/functions.c @@ -1321,8 +1321,8 @@ PG_TRY(); PushActiveSnapshot(es->qd->snapshot); pushed_snapshot = true; } - - completed = postquel_getnext(es, fcache); + if (!tuplestore_has_remaining_tuples(fcache->tstore) || completed || !fcache->returnsSet) + completed = postquel_getnext(es, fcache); /* * If we ran the command to completion, we can shut it down now. Any @@ -1395,6 +1395,7 @@ PG_END_TRY(); Assert(es->lazyEval); /* Re-use the junkfilter's output slot to fetch back the tuple */ Assert(fcache->junkFilter); + tuplestore_consume_tuple(fcache->tstore); slot = fcache->junkFilter->jf_resultSlot; if (!tuplestore_gettupleslot(fcache->tstore, true, false, slot)) elog(ERROR, "failed to fetch lazy-eval tuple"); @@ -1403,7 +1404,8 @@ PG_END_TRY(); fcache, oldcontext); /* Clear the tuplestore, but keep it for next time */ /* NB: this might delete the slot's content, but we don't care */ - tuplestore_clear(fcache->tstore); + if (!tuplestore_has_remaining_tuples(fcache->tstore)) + tuplestore_clear(fcache->tstore); /* * Let caller know we're not finished. @@ -1427,7 +1429,8 @@ PG_END_TRY(); /* * We are done with a lazy evaluation. Clean up. */ - tuplestore_clear(fcache->tstore); + if (!tuplestore_has_remaining_tuples(fcache->tstore)) + tuplestore_clear(fcache->tstore); /* * Let caller know we're finished. 
@@ -1482,6 +1485,7 @@ PG_END_TRY(); { /* Re-use the junkfilter's output slot to fetch back the tuple */ slot = fcache->junkFilter->jf_resultSlot; + tuplestore_consume_tuple(fcache->tstore); if (tuplestore_gettupleslot(fcache->tstore, true, false, slot)) result = postquel_get_single_result(slot, fcinfo, fcache, oldcontext); @@ -1500,7 +1504,8 @@ PG_END_TRY(); } /* Clear the tuplestore, but keep it for next time */ - tuplestore_clear(fcache->tstore); + if (!tuplestore_has_remaining_tuples(fcache->tstore)) + tuplestore_clear(fcache->tstore); } /* Pop snapshot if we have pushed one */ diff --git a/src/backend/utils/sort/tuplestore.c b/src/backend/utils/sort/tuplestore.c index 29e86af3a01..0a50edf1897 100644 --- a/src/backend/utils/sort/tuplestore.c +++ b/src/backend/utils/sort/tuplestore.c @@ -153,6 +153,7 @@ struct Tuplestorestate SharedFileSet *fileset; char *shared_filename; workfile_set *work_set; /* workfile set to use when using workfile manager */ + int64 remaining_tuples; /* number of tuples remaining */ /* * These function pointers decouple the routines that must know what kind @@ -328,6 +329,7 @@ tuplestore_begin_common(int eflags, bool interXact, int maxKBytes) state->memtupdeleted = 0; state->memtupcount = 0; state->tuples = 0; + state->remaining_tuples = 0; /* * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD; @@ -495,6 +497,7 @@ tuplestore_clear(Tuplestorestate *state) state->memtupdeleted = 0; state->memtupcount = 0; state->tuples = 0; + state->remaining_tuples = 0; readptr = state->readptrs; for (i = 0; i < state->readptrcount; readptr++, i++) { @@ -862,6 +865,7 @@ tuplestore_puttuple_common(Tuplestorestate *state, void *tuple) elog(ERROR, "cannot write new tuples to frozen tuplestore"); state->tuples++; + state->remaining_tuples++; switch (state->status) { @@ -1786,3 +1790,19 @@ tuplestore_open_shared(SharedFileSet *fileset, const char *filename) return state; } +/* + * When the remaining quantity is greater than zero, it needs to 
return true. + */ +extern bool tuplestore_has_remaining_tuples(Tuplestorestate *state) +{ + return state->remaining_tuples > 0; +} + +/* + * Each time a tuple is processed, the counter of the number of remaining tuples is decremented. + */ +extern void tuplestore_consume_tuple(Tuplestorestate *state) +{ + --state->remaining_tuples; +} + diff --git a/src/include/utils/tuplestore.h b/src/include/utils/tuplestore.h index ff30ed7391a..d26b34a5377 100644 --- a/src/include/utils/tuplestore.h +++ b/src/include/utils/tuplestore.h @@ -101,4 +101,8 @@ extern void tuplestore_make_shared(Tuplestorestate *state, SharedFileSet *filese extern void tuplestore_freeze(Tuplestorestate *state); extern Tuplestorestate *tuplestore_open_shared(SharedFileSet *fileset, const char *filename); +extern bool tuplestore_has_remaining_tuples(Tuplestorestate *state); + +extern void tuplestore_consume_tuple(Tuplestorestate *state); + #endif /* TUPLESTORE_H */ From 1b67fef61e9b84d2444215993ca99e34228f9974 Mon Sep 17 00:00:00 2001 From: zhoujiaqi Date: Tue, 18 Jul 2023 10:59:32 +0800 Subject: [PATCH 9/9] Exclude unnecessary files when ic_proxy is not enabled We don't need extra libs like libuv when configured without --enable-ic-proxy. Use ENABLE_IC_PROXY to identify the dependency. --- contrib/interconnect/ic_modules.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/contrib/interconnect/ic_modules.c b/contrib/interconnect/ic_modules.c index 8a1c95a0cc8..c3a61195490 100644 --- a/contrib/interconnect/ic_modules.c +++ b/contrib/interconnect/ic_modules.c @@ -18,7 +18,10 @@ #include "ic_common.h" #include "tcp/ic_tcp.h" #include "udp/ic_udpifc.h" + +#ifdef ENABLE_IC_PROXY #include "proxy/ic_proxy_server.h" +#endif PG_MODULE_MAGIC;