From 3a3cbc71cf4e128153c595ad8c323a807b859b56 Mon Sep 17 00:00:00 2001 From: HouLei Date: Mon, 30 Jan 2023 17:09:38 +0800 Subject: [PATCH 01/11] Add ZSTD compression support for gpfdist writable external table (#14250) In the previous https://github.com/greenplum-db/gpdb/pull/14144, we finished the support of compression transmission for readable gpfdist external table. In this PR, I developed the code for zstd compression transmission for the writable gpfdist external table. The pr involves three aspects of changes. The first part is adding decompression code for gpfdist. The second part is adding compression code for gpdb. Other modifications mainly involve the regression test for writable gpfdist external table. Finally, to use the zstd compression to transfer data for a writable external table, we need to use the flag --compress to turn on the compression transmission for gpfdist. --- src/backend/access/external/url_curl.c | 72 +++++- src/bin/gpfdist/gpfdist.c | 236 +++++++++++++++--- .../regress/input/gpfdist2_compress.source | 128 +++++++++- .../regress/output/gpfdist2_compress.source | 135 ++++++++++ 4 files changed, 525 insertions(+), 46 deletions(-) diff --git a/src/backend/access/external/url_curl.c b/src/backend/access/external/url_curl.c index bd86d8af8d8..16b89e1ff3b 100644 --- a/src/backend/access/external/url_curl.c +++ b/src/backend/access/external/url_curl.c @@ -47,7 +47,8 @@ typedef struct curlhandle_t { CURL *handle; /* The curl handle */ #ifdef USE_ZSTD - ZSTD_DCtx *zstd_dctx; /* The zstd context */ + ZSTD_DCtx *zstd_dctx; /* The zstd decompression context */ + ZSTD_CCtx *zstd_cctx; /* The zstd compression context */ #endif struct curl_slist *x_httpheader; /* list of headers */ bool in_multi_handle; /* T, if the handle is in global @@ -86,7 +87,8 @@ typedef struct struct { - char *ptr; /* palloc-ed buffer */ + char *ptr; /* palloc-ed buffer */ + char *cptr; int max; int bot, top; @@ -119,7 +121,6 @@ typedef struct #define HOST_NAME_SIZE 100 #define FDIST_TIMEOUT 408 #define MAX_TRY_WAIT_TIME 64 - /* * SSL support GUCs - should be added soon. 
Until then we will use stubs * @@ -189,6 +190,7 @@ static curlhandle_t *open_curl_handles; static bool url_curl_resowner_callback_registered; + static curlhandle_t * create_curlhandle(void) { @@ -201,6 +203,7 @@ create_curlhandle(void) #ifdef USE_ZSTD h->zstd_dctx = NULL; + h->zstd_cctx = NULL; #endif h->owner = CurrentResourceOwner; @@ -252,6 +255,11 @@ destroy_curlhandle(curlhandle_t *h) ZSTD_freeDCtx(h->zstd_dctx); h->zstd_dctx = NULL; } + if (h->zstd_cctx) + { + ZSTD_freeCCtx(h->zstd_cctx); + h->zstd_cctx = NULL; + } #endif pfree(h); } @@ -401,7 +409,15 @@ header_callback(void *ptr_, size_t size, size_t nmemb, void *userp) buf[i] = 0; #ifdef USE_ZSTD url->zstd = strtol(buf, 0, 0); - if (!url->for_write && url->zstd) + + if (url->for_write && url->zstd) + { + url->curl->zstd_cctx = ZSTD_createCCtx(); + // allocate out.cptr whose size equals to out.ptr + url->out.cptr = (char *) palloc(writable_external_table_bufsize * 1024); + url->lastsize = 0; + } + else if (url->zstd) { url->curl->zstd_dctx = ZSTD_createDCtx(); url->lastsize = ZSTD_initDStream(url->curl->zstd_dctx); @@ -1259,6 +1275,9 @@ url_curl_fopen(char *url, bool forwrite, extvar_t *ev, CopyFormatOptions *opts) set_httpheader(file, "X-GP-SEGMENT-COUNT", ev->GP_SEGMENT_COUNT); set_httpheader(file, "X-GP-LINE-DELIM-STR", ev->GP_LINE_DELIM_STR); set_httpheader(file, "X-GP-LINE-DELIM-LENGTH", ev->GP_LINE_DELIM_LENGTH); +#ifdef USE_ZSTD + set_httpheader(file, "X-GP-ZSTD", "1"); +#endif if (forwrite) { @@ -1279,9 +1298,6 @@ url_curl_fopen(char *url, bool forwrite, extvar_t *ev, CopyFormatOptions *opts) { /* read specific - (TODO: unclear why some of these are needed) */ set_httpheader(file, "X-GP-PROTO", "1"); -#ifdef USE_ZSTD - set_httpheader(file, "X-GP-ZSTD", "1"); -#endif set_httpheader(file, "X-GP-MASTER_HOST", ev->GP_MASTER_HOST); set_httpheader(file, "X-GP-MASTER_PORT", ev->GP_MASTER_PORT); set_httpheader(file, "X-GP-CSVOPT", ev->GP_CSVOPT); @@ -1486,6 +1502,13 @@ url_curl_fclose(URL_FILE *fileg, bool failOnError, const char *relname) file->out.ptr = NULL; } + if (file->out.cptr) + { + Assert(file->for_write); + pfree(file->out.cptr); + file->out.cptr = NULL; + } + file->gp_proto = 0; file->error = file->eof = 0; memset(&file->in, 0, sizeof(file->in)); @@ -1564,6 +1587,12 @@ decompress_zstd_data(ZSTD_DCtx* ctx, ZSTD_inBuffer* bin, ZSTD_outBuffer* bout) } return ret; } + +static int +compress_zstd_data(URL_CURL_FILE *file) +{ + return ZSTD_compressCCtx(file->curl->zstd_cctx, file->out.cptr, file->out.top, file->out.ptr, file->out.top, file->zstd); +} #endif /* @@ -1820,6 +1849,7 @@ gp_proto1_read(char *buf, int bufsz, URL_CURL_FILE *file, CopyFromState pstate, return n; } + /* * gp_proto0_write * @@ -1828,9 +1858,20 @@ gp_proto1_read(char *buf, int bufsz, URL_CURL_FILE *file, CopyFromState pstate, */ static void gp_proto0_write(URL_CURL_FILE *file, CopyToState pstate) -{ - char* buf = file->out.ptr; - int nbytes = file->out.top; +{ + char* buf; + int nbytes; +#ifdef USE_ZSTD + if(file->zstd){ + nbytes = compress_zstd_data(file); + buf = file->out.cptr; + } + else +#endif + { + buf = file->out.ptr; + nbytes = file->out.top; + } if (nbytes == 0) return; @@ -1927,11 +1968,18 @@ curl_fwrite(char *buf, int nbytes, URL_CURL_FILE *file, CopyToState pstate) char* newbuf; newbuf = repalloc(file->out.ptr, n); - if (!newbuf) elog(ERROR, "out of memory (curl_fwrite)"); - file->out.ptr = newbuf; + + if (file->zstd) + { + newbuf = repalloc(file->out.cptr, n); + if (!newbuf) + elog(ERROR, "out of compress memory (curl_fwrite)"); + 
file->out.cptr = newbuf; + } + file->out.max = n; Assert(nbytes < file->out.max); diff --git a/src/bin/gpfdist/gpfdist.c b/src/bin/gpfdist/gpfdist.c index a6e368c549d..b731e67b9b4 100644 --- a/src/bin/gpfdist/gpfdist.c +++ b/src/bin/gpfdist/gpfdist.c @@ -97,6 +97,14 @@ struct block_t char* cdata; }; +typedef struct zstd_buffer zstd_buffer; +struct zstd_buffer +{ + char *buf; + int size; + int pos; +}; + /* Get session id for this request */ #define GET_SID(r) ((r->sid)) @@ -312,17 +320,23 @@ struct request_t char* dbuf; /* buffer for raw data from a POST request */ int dbuftop; /* # bytes used in dbuf */ int dbufmax; /* size of dbuf[] */ + + char* wbuf; /* data buf for decompressed data for writing into file, + its capacity equals to MAX_FRAME_SIZE. */ + int wbuftop; /* last index for decompressed data */ + int woffset; /* mark whether there is left data in compress ctx */ } in; block_t outblock; /* next block to send out */ char* line_delim_str; int line_delim_length; #ifdef USE_ZSTD - ZSTD_CCtx* zstd_cctx; /* zstd context */ + ZSTD_CCtx* zstd_cctx; /* zstd compression context */ + ZSTD_DCtx* zstd_dctx; /* zstd decompression context */ #endif - int zstd; /* request use zstd compress */ - int zstd_err_len; /* space allocate for zstd_error string */ - char* zstd_error; /* string contains zstd error*/ + int zstd; /* request use zstd compress */ + int zstd_err_len; /* space allocate for zstd_error string */ + char* zstd_error; /* string contains zstd error*/ #ifdef USE_SSL /* SSL related */ BIO *io; /* for the i.o. */ @@ -384,7 +398,10 @@ static int request_validate(request_t *r); static int request_set_path(request_t *r, const char* d, char* p, char* pp, char* path); static int request_path_validate(request_t *r, const char* path); #ifdef USE_ZSTD -static int compress_zstd(const request_t *r, block_t *blk, int buflen); +static int compress_zstd(const request_t *r, block_t* block, int buflen); +static int decompress_data(request_t *r, zstd_buffer *in, zstd_buffer *out); +static int decompress_zstd(request_t* r, ZSTD_inBuffer* bin, ZSTD_outBuffer* bout); +static int decompress_write_loop(request_t *r); #endif static int request_parse_gp_headers(request_t *r, int opt_g); static void free_session_cb(int fd, short event, void* arg); @@ -3122,6 +3139,47 @@ static void handle_get_request(request_t *r) } } +static +int check_output_to_file(request_t *r, int wrote) +{ + session_t *session = r->session; + char *buf; + int *buftop; + if (r->zstd) + { + buf = r->in.wbuf; + buftop = &r->in.wbuftop; + } + else + { + buf = r->in.dbuf; + buftop = &r->in.dbuftop; + } + + if (wrote == -1) + { + /* write error */ + gwarning(r, "handle_post_request, write error: %s", fstream_get_error(session->fstream)); + http_error(r, FDIST_INTERNAL_ERROR, fstream_get_error(session->fstream)); + request_end(r, 1, 0, 0); + return -1; + } + else if(wrote == *buftop) + { + /* wrote the whole buffer. clean it for next round */ + *buftop = 0; + } + else + { + /* wrote up to last line, some data left over in buffer. 
move to front */ + int bytes_left_over = *buftop - wrote; + + memmove(buf, buf + wrote, bytes_left_over); + *buftop = bytes_left_over; + } + return 0; +} + static void handle_post_request(request_t *r, int header_end) { int h_count = r->in.req->hc; @@ -3218,7 +3276,10 @@ static void handle_post_request(request_t *r, int header_end) /* create a buffer to hold the incoming raw data */ r->in.dbufmax = opt.m; /* size of max line size */ r->in.dbuftop = 0; + r->in.wbuftop = 0; r->in.dbuf = palloc_safe(r, r->pool, r->in.dbufmax, "out of memory when allocating r->in.dbuf: %d bytes", r->in.dbufmax); + if(r->zstd) + r->in.wbuf = palloc_safe(r, r->pool, MAX_FRAME_SIZE, "out of memory when allocating r->in.wbuf: %d bytes", MAX_FRAME_SIZE); /* if some data come along with the request, copy it first */ data_start = strstr(r->in.hbuf, "\r\n\r\n"); @@ -3232,21 +3293,35 @@ static void handle_post_request(request_t *r, int header_end) { /* we have data after the request headers. consume it */ /* should make sure r->in.dbuftop + data_bytes_in_req < r->in.dbufmax */ + memcpy(r->in.dbuf, data_start, data_bytes_in_req); r->in.dbuftop += data_bytes_in_req; + + r->in.davailable -= data_bytes_in_req; /* only write it out if no more data is expected */ if(r->in.davailable == 0) { - wrote = fstream_write(session->fstream, r->in.dbuf, data_bytes_in_req, 1, r->line_delim_str, r->line_delim_length); - delay_watchdog_timer(); - if(wrote == -1) +#ifdef USE_ZSTD + if(r->zstd) { - /* write error */ - http_error(r, FDIST_INTERNAL_ERROR, fstream_get_error(session->fstream)); - request_end(r, 1, 0, 0); - return; + wrote = decompress_write_loop(r); + if (wrote == -1) + return; + } + else +#endif + { + wrote = fstream_write(session->fstream, r->in.dbuf, data_bytes_in_req, 1, r->line_delim_str, r->line_delim_length); + delay_watchdog_timer(); + if (wrote == -1) + { + /* write error */ + http_error(r, FDIST_INTERNAL_ERROR, fstream_get_error(session->fstream)); + request_end(r, 1, 0, 0); + return; + } } } } @@ -3258,7 +3333,7 @@ static void handle_post_request(request_t *r, int header_end) while(r->in.davailable > 0) { size_t want; - ssize_t n; + ssize_t n = 0; size_t buf_space_left = r->in.dbufmax - r->in.dbuftop; if (r->in.davailable > buf_space_left) @@ -3302,36 +3377,33 @@ static void handle_post_request(request_t *r, int header_end) r->in.davailable -= n; r->in.dbuftop += n; + /* success is a flag to check whether data is written into file successfully. + * There is no need to do anything when success is less than 0, since all + * error handling has been done in 'check_output_to_file' function. + */ + int success = 0; + /* if filled our buffer or no more data expected, write it */ if (r->in.dbufmax == r->in.dbuftop || r->in.davailable == 0) { +#ifdef USE_ZSTD /* only write up to end of last row */ - wrote = fstream_write(session->fstream, r->in.dbuf, r->in.dbuftop, 1, r->line_delim_str, r->line_delim_length); - gdebug(r, "wrote %d bytes to file", wrote); - delay_watchdog_timer(); - - if (wrote == -1) + if(r->zstd) { - /* write error */ - gwarning(r, "handle_post_request, write error: %s", fstream_get_error(session->fstream)); - http_error(r, FDIST_INTERNAL_ERROR, fstream_get_error(session->fstream)); - request_end(r, 1, 0, 0); - return; - } - else if(wrote == r->in.dbuftop) - { - /* wrote the whole buffer. clean it for next round */ - r->in.dbuftop = 0; + success = decompress_write_loop(r); } else +#endif { - /* wrote up to last line, some data left over in buffer. 
move to front */ - int bytes_left_over = r->in.dbuftop - wrote; + wrote = fstream_write(session->fstream, r->in.dbuf, r->in.dbuftop, 1, r->line_delim_str, r->line_delim_length); + gdebug(r, "wrote %d bytes to file", wrote); + delay_watchdog_timer(); - memmove(r->in.dbuf, r->in.dbuf + wrote, bytes_left_over); - r->in.dbuftop = bytes_left_over; + success = check_output_to_file(r, wrote); } } + if (success < 0) + return; } } @@ -3563,6 +3635,9 @@ static int request_parse_gp_headers(request_t *r, int opt_g) #ifdef USE_ZSTD if (r->zstd) { + if (!r->is_get) + r->zstd_dctx = ZSTD_createDCtx(); + OUT_BUFFER_SIZE = ZSTD_CStreamOutSize(); r->zstd_err_len = 1024; r->outblock.cdata = palloc_safe(r, r->pool, opt.m, "out of memory when allocating buffer for compressed data: %d bytes", opt.m); @@ -4564,6 +4639,10 @@ static void request_cleanup(request_t *r) { ZSTD_freeCCtx(r->zstd_cctx); } + if ( r->zstd && !r->is_get ) + { + ZSTD_freeDCtx(r->zstd_dctx); + } #endif } @@ -4675,6 +4754,7 @@ static void delay_watchdog_timer() shutdown_time = apr_time_now() + gcb.wdtimer * APR_USEC_PER_SEC; } } + #else static void delay_watchdog_timer() { @@ -4682,6 +4762,96 @@ static void delay_watchdog_timer() #endif #ifdef USE_ZSTD + +/* decompress the data and write data to the file. + * Finally, the function will check the write result, + * and change the related value about data buffer. + */ +static +int decompress_write_loop(request_t *r) +{ + session_t *session = r->session; + int wrote_total = 0; + do + { + int offset = 0; + if (r->in.woffset) + offset = r->in.woffset; + + zstd_buffer in = {r->in.dbuf, r->in.dbuftop, offset}; + zstd_buffer out = {r->in.wbuf + r->in.wbuftop, MAX_FRAME_SIZE - r->in.wbuftop, 0}; + + int res = decompress_data(r, &in, &out); + + if (res < 0) + { + http_error(r, FDIST_INTERNAL_ERROR, r->zstd_error); + request_end(r, 1, 0, 0); + return res; + } + + int wrote = fstream_write(session->fstream, r->in.wbuf, r->in.wbuftop, 0, r->line_delim_str, r->line_delim_length); + wrote_total += wrote; + gdebug(r, "wrote %d bytes to file", wrote); + delay_watchdog_timer(); + + res = check_output_to_file(r, wrote); + if (res < 0) + { + return -1; + } + + } while(r->in.woffset); + return wrote_total; +} + +static int decompress_zstd(request_t* r, ZSTD_inBuffer* bin, ZSTD_outBuffer* bout) +{ + int ret; + /* The return code is zero if the frame is complete, but there may + * be multiple frames concatenated together. Zstd will automatically + * reset the context when a frame is complete. Still, calling + * ZSTD_DCtx_reset() can be useful to reset the context to a clean + * state, for instance if the last decompression call returned an + * error. 
+ */ + + ret = ZSTD_decompressStream(r->zstd_dctx, bout, bin); + size_t const err = ret; + if(ZSTD_isError(err)){ + snprintf(r->zstd_error, r->zstd_err_len, "zstd decompression error, error is %s", ZSTD_getErrorName(err)); + gwarning(NULL, "%s", r->zstd_error); + return -1; + } + return bout->pos; +} + +static int decompress_data(request_t* r, zstd_buffer *in, zstd_buffer *out){ + ZSTD_inBuffer inbuf = {in->buf , in->size, in->pos}; + ZSTD_outBuffer obuf = {out->buf, out->size, out->pos}; + + if(!r->zstd_dctx) { + gwarning(r, "%s", "Out of memory when ZSTD_createDCtx"); + return -1; + } + + int outSize = decompress_zstd(r, &inbuf, &obuf); + if(outSize < 0){ + return outSize; + } + + r->in.wbuftop += outSize; + if (inbuf.pos == inbuf.size) + { + r->in.woffset = 0; + } + else + { + r->in.woffset = inbuf.pos; + } + gdebug(NULL, "decompress_zstd finished, input size = %d, output size = %d.", r->in.wbuftop, r->in.dbuftop); + return outSize; +} /* * compress_zstd * It is for compress data in buffer. Return is the length of data after compression. @@ -4738,6 +4908,8 @@ static int compress_zstd(const request_t *r, block_t *blk, int buflen) } offset += output.pos; + gdebug(NULL, "compress_zstd finished, input size = %d, output size = %d.", buflen, offset); + return offset; } #endif diff --git a/src/bin/gpfdist/regress/input/gpfdist2_compress.source b/src/bin/gpfdist/regress/input/gpfdist2_compress.source index 8c249ca6d74..ba9379fc9e5 100644 --- a/src/bin/gpfdist/regress/input/gpfdist2_compress.source +++ b/src/bin/gpfdist/regress/input/gpfdist2_compress.source @@ -55,7 +55,69 @@ FORMAT 'text' DELIMITER AS '|' ) ; +CREATE TABLE lineitem (like ext_lineitem); SELECT count(*) FROM ext_lineitem; +INSERT INTO lineitem SELECT * FROM ext_lineitem; +DROP EXTERNAL TABLE ext_lineitem; +--test 1.1 test writable table using compression +CREATE WRITABLE EXTERNAL TABLE ext_lineitem_w ( + L_ORDERKEY INT8, + L_PARTKEY INTEGER, + L_SUPPKEY INTEGER, + L_LINENUMBER integer, + L_QUANTITY decimal, + L_EXTENDEDPRICE decimal, + L_DISCOUNT decimal, + L_TAX decimal, + L_RETURNFLAG CHAR(1), + L_LINESTATUS CHAR(1), + L_SHIPDATE date, + L_COMMITDATE date, + L_RECEIPTDATE date, + L_SHIPINSTRUCT CHAR(25), + L_SHIPMODE CHAR(10), + L_COMMENT VARCHAR(44) + ) +LOCATION +( + 'gpfdist://@hostname@:7070/gpfdist2/lineitem.tbl.w' +) +FORMAT 'text' +( + DELIMITER AS '|' +) +; +CREATE EXTERNAL TABLE ext_lineitem ( + L_ORDERKEY INT8, + L_PARTKEY INTEGER, + L_SUPPKEY INTEGER, + L_LINENUMBER integer, + L_QUANTITY decimal, + L_EXTENDEDPRICE decimal, + L_DISCOUNT decimal, + L_TAX decimal, + L_RETURNFLAG CHAR(1), + L_LINESTATUS CHAR(1), + L_SHIPDATE date, + L_COMMITDATE date, + L_RECEIPTDATE date, + L_SHIPINSTRUCT CHAR(25), + L_SHIPMODE CHAR(10), + L_COMMENT VARCHAR(44) + ) +LOCATION +( + 'gpfdist://@hostname@:7070/gpfdist2/lineitem.tbl.w' +) +FORMAT 'text' +( + DELIMITER AS '|' +) +; +INSERT INTO ext_lineitem_w SELECT * FROM lineitem; +DROP TABLE lineitem; +SELECT count(*) FROM ext_lineitem; +DROP EXTERNAL TABLE ext_lineitem_w; DROP EXTERNAL TABLE ext_lineitem; -- test 2 use a bigger file. 
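As context for these gpfdist2_compress tests: outside the test harness, the compressed write-then-read round trip they exercise looks roughly like the sketch below. It is illustrative only, not part of the suite; the host, port, file name, and table shape are assumptions, and the one hard requirement stated in the commit message is starting gpfdist with --compress.

```
-- Assumed invocation (illustrative directory and port):
--   gpfdist -d /data/staging -p 7070 --compress

-- Writable side: each segment zstd-compresses its stream (see
-- gp_proto0_write/compress_zstd_data above) before POSTing to gpfdist,
-- which decompresses and writes plain text to sales.tbl.
CREATE WRITABLE EXTERNAL TABLE ext_sales_w (id int, note text)
LOCATION ('gpfdist://etlhost:7070/sales.tbl')
FORMAT 'text' (DELIMITER '|');

INSERT INTO ext_sales_w SELECT i, 'row ' || i FROM generate_series(1, 1000) i;

-- Readable side: the file on disk is uncompressed, so it reads back
-- with an ordinary external table definition.
CREATE EXTERNAL TABLE ext_sales_r (id int, note text)
LOCATION ('gpfdist://etlhost:7070/sales.tbl')
FORMAT 'text' (DELIMITER '|');

SELECT count(*) FROM ext_sales_r;
```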
@@ -87,7 +149,70 @@ FORMAT 'text' DELIMITER AS '|' ) ; +CREATE TABLE lineitem (like ext_lineitem); SELECT count(*) FROM ext_lineitem; +INSERT INTO lineitem SELECT * FROM ext_lineitem; +DROP EXTERNAL TABLE ext_lineitem; + +--test 2.1 test writable table using compression with big data +CREATE WRITABLE EXTERNAL TABLE ext_lineitem_w ( + L_ORDERKEY INT8, + L_PARTKEY INTEGER, + L_SUPPKEY INTEGER, + L_LINENUMBER integer, + L_QUANTITY decimal, + L_EXTENDEDPRICE decimal, + L_DISCOUNT decimal, + L_TAX decimal, + L_RETURNFLAG CHAR(1), + L_LINESTATUS CHAR(1), + L_SHIPDATE date, + L_COMMITDATE date, + L_RECEIPTDATE date, + L_SHIPINSTRUCT CHAR(25), + L_SHIPMODE CHAR(10), + L_COMMENT VARCHAR(44) + ) +LOCATION +( + 'gpfdist://@hostname@:7070/gpfdist2/lineitem.tbl.w' +) +FORMAT 'text' +( + DELIMITER AS '|' +) +; +CREATE EXTERNAL TABLE ext_lineitem ( + L_ORDERKEY INT8, + L_PARTKEY INTEGER, + L_SUPPKEY INTEGER, + L_LINENUMBER integer, + L_QUANTITY decimal, + L_EXTENDEDPRICE decimal, + L_DISCOUNT decimal, + L_TAX decimal, + L_RETURNFLAG CHAR(1), + L_LINESTATUS CHAR(1), + L_SHIPDATE date, + L_COMMITDATE date, + L_RECEIPTDATE date, + L_SHIPINSTRUCT CHAR(25), + L_SHIPMODE CHAR(10), + L_COMMENT VARCHAR(44) + ) +LOCATION +( + 'gpfdist://@hostname@:7070/gpfdist2/lineitem.tbl.w' +) +FORMAT 'text' +( + DELIMITER AS '|' +) +; +INSERT INTO ext_lineitem_w SELECT * FROM lineitem; +DROP TABLE lineitem; +SELECT count(*) FROM ext_lineitem; +DROP EXTERNAL TABLE ext_lineitem_w; DROP EXTERNAL TABLE ext_lineitem; -- test 3 line too long with defaults @@ -193,7 +318,6 @@ FORMAT 'text' ; SELECT count(*) FROM ext_lineitem; DROP EXTERNAL TABLE ext_lineitem; - -- test 2 use a bigger file. CREATE EXTERNAL TABLE ext_lineitem ( @@ -247,4 +371,4 @@ DROP EXTERNAL TABLE ext_test; --test 4 using csv data CREATE EXTERNAL TABLE ext_crlf_with_lf_column(c1 int, c2 text) LOCATION ('gpfdist://@hostname@:7070/gpfdist2/crlf_with_lf_column.csv') FORMAT 'csv' (NEWLINE 'CRLF'); SELECT count(*) FROM ext_crlf_with_lf_column; -DROP EXTERNAL TABLE ext_crlf_with_lf_column; \ No newline at end of file +DROP EXTERNAL TABLE ext_crlf_with_lf_column; diff --git a/src/bin/gpfdist/regress/output/gpfdist2_compress.source b/src/bin/gpfdist/regress/output/gpfdist2_compress.source index 16d9a0d7cb8..a14f9c7a16d 100644 --- a/src/bin/gpfdist/regress/output/gpfdist2_compress.source +++ b/src/bin/gpfdist/regress/output/gpfdist2_compress.source @@ -61,12 +61,80 @@ FORMAT 'text' DELIMITER AS '|' ) ; +CREATE TABLE lineitem (like ext_lineitem); +NOTICE: table doesn't have 'DISTRIBUTED BY' clause, defaulting to distribution columns from LIKE table SELECT count(*) FROM ext_lineitem; count ------- 256 (1 row) +INSERT INTO lineitem SELECT * FROM ext_lineitem; +DROP EXTERNAL TABLE ext_lineitem; +--test 1.1 test writable table using compression +CREATE WRITABLE EXTERNAL TABLE ext_lineitem_w ( + L_ORDERKEY INT8, + L_PARTKEY INTEGER, + L_SUPPKEY INTEGER, + L_LINENUMBER integer, + L_QUANTITY decimal, + L_EXTENDEDPRICE decimal, + L_DISCOUNT decimal, + L_TAX decimal, + L_RETURNFLAG CHAR(1), + L_LINESTATUS CHAR(1), + L_SHIPDATE date, + L_COMMITDATE date, + L_RECEIPTDATE date, + L_SHIPINSTRUCT CHAR(25), + L_SHIPMODE CHAR(10), + L_COMMENT VARCHAR(44) + ) +LOCATION +( + 'gpfdist://@hostname@:7070/gpfdist2/lineitem.tbl.w' +) +FORMAT 'text' +( + DELIMITER AS '|' +) +; +CREATE EXTERNAL TABLE ext_lineitem ( + L_ORDERKEY INT8, + L_PARTKEY INTEGER, + L_SUPPKEY INTEGER, + L_LINENUMBER integer, + L_QUANTITY decimal, + L_EXTENDEDPRICE decimal, + L_DISCOUNT decimal, + L_TAX decimal, + L_RETURNFLAG 
CHAR(1), + L_LINESTATUS CHAR(1), + L_SHIPDATE date, + L_COMMITDATE date, + L_RECEIPTDATE date, + L_SHIPINSTRUCT CHAR(25), + L_SHIPMODE CHAR(10), + L_COMMENT VARCHAR(44) + ) +LOCATION +( + 'gpfdist://@hostname@:7070/gpfdist2/lineitem.tbl.w' +) +FORMAT 'text' +( + DELIMITER AS '|' +) +; +INSERT INTO ext_lineitem_w SELECT * FROM lineitem; +DROP TABLE lineitem; +SELECT count(*) FROM ext_lineitem; + count +------- + 256 +(1 row) + +DROP EXTERNAL TABLE ext_lineitem_w; DROP EXTERNAL TABLE ext_lineitem; -- test 2 use a bigger file. CREATE EXTERNAL TABLE ext_lineitem ( @@ -96,12 +164,79 @@ FORMAT 'text' DELIMITER AS '|' ) ; +CREATE TABLE lineitem (like ext_lineitem); SELECT count(*) FROM ext_lineitem; count -------- 100000 (1 row) +INSERT INTO lineitem SELECT * FROM ext_lineitem; +DROP EXTERNAL TABLE ext_lineitem; +--test 2.1 test writable table using compression with big data +CREATE WRITABLE EXTERNAL TABLE ext_lineitem_w ( + L_ORDERKEY INT8, + L_PARTKEY INTEGER, + L_SUPPKEY INTEGER, + L_LINENUMBER integer, + L_QUANTITY decimal, + L_EXTENDEDPRICE decimal, + L_DISCOUNT decimal, + L_TAX decimal, + L_RETURNFLAG CHAR(1), + L_LINESTATUS CHAR(1), + L_SHIPDATE date, + L_COMMITDATE date, + L_RECEIPTDATE date, + L_SHIPINSTRUCT CHAR(25), + L_SHIPMODE CHAR(10), + L_COMMENT VARCHAR(44) + ) +LOCATION +( + 'gpfdist://@hostname@:7070/gpfdist2/lineitem.tbl.w' +) +FORMAT 'text' +( + DELIMITER AS '|' +) +; +CREATE EXTERNAL TABLE ext_lineitem ( + L_ORDERKEY INT8, + L_PARTKEY INTEGER, + L_SUPPKEY INTEGER, + L_LINENUMBER integer, + L_QUANTITY decimal, + L_EXTENDEDPRICE decimal, + L_DISCOUNT decimal, + L_TAX decimal, + L_RETURNFLAG CHAR(1), + L_LINESTATUS CHAR(1), + L_SHIPDATE date, + L_COMMITDATE date, + L_RECEIPTDATE date, + L_SHIPINSTRUCT CHAR(25), + L_SHIPMODE CHAR(10), + L_COMMENT VARCHAR(44) + ) +LOCATION +( + 'gpfdist://@hostname@:7070/gpfdist2/lineitem.tbl.w' +) +FORMAT 'text' +( + DELIMITER AS '|' +) +; +INSERT INTO ext_lineitem_w SELECT * FROM lineitem; +DROP TABLE lineitem; +SELECT count(*) FROM ext_lineitem; + count +-------- + 100256 +(1 row) + +DROP EXTERNAL TABLE ext_lineitem_w; DROP EXTERNAL TABLE ext_lineitem; -- test 3 line too long with defaults CREATE EXTERNAL TABLE ext_test ( From 7b5250e953669e73b55015d4d5d959db51802b48 Mon Sep 17 00:00:00 2001 From: Adam Lee Date: Tue, 31 Jan 2023 13:53:36 +0800 Subject: [PATCH 02/11] Fix a flaky test case querying pg_class `relname` of `pg_class` has an index, the stats and plan can be influenced by other tests running in parallel. Use `relnatts` instead. ``` { 'id' => 5, 'parent' => 2, - 'short' => 'Seq Scan on pg_class' + 'short' => 'Index Only Scan using pg_class_tblspc_relfilenode_index on pg_class' } ], 'id' => 2, ``` --- src/test/regress/expected/aggregates.out | 9 +++------ src/test/regress/expected/aggregates_optimizer.out | 9 +++------ src/test/regress/sql/aggregates.sql | 9 +++------ 3 files changed, 9 insertions(+), 18 deletions(-) diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index b932b05dfb0..f8572ddb2ae 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -3258,12 +3258,9 @@ drop table agg_hash_1; -- drop table agg_hash_2; drop table agg_hash_3; drop table agg_hash_4; --- fix github issue #12061 numsegments of general locus is not -1 on create_minmaxagg_path -/* - * On the arm platform, `Seq Scan` is executed frequently, resulting in unstable output. 
- */
-set enable_indexonlyscan = off;
-explain analyze select count(*) from pg_class, (select count(*) >0 from (select count(*) from pg_class where relname like 't%')x)y;
+-- GitHub issue https://github.com/greenplum-db/gpdb/issues/12061
+-- numsegments of the general locus should be -1 on create_minmaxagg_path
+explain analyze select count(*) from pg_class, (select count(*) > 0 from (select count(*) from pg_class where relnatts > 8) x) y;
                                                       QUERY PLAN
-----------------------------------------------------------------------------------------------------------------------
 Aggregate  (cost=10000000025.03..10000000025.05 rows=1 width=8) (actual time=0.214..0.214 rows=1 loops=1)
diff --git a/src/test/regress/expected/aggregates_optimizer.out b/src/test/regress/expected/aggregates_optimizer.out
index 46e3b26094e..5aac06ae992 100644
--- a/src/test/regress/expected/aggregates_optimizer.out
+++ b/src/test/regress/expected/aggregates_optimizer.out
@@ -3482,12 +3482,9 @@ drop table agg_hash_1;
 -- drop table agg_hash_2;
 drop table agg_hash_3;
 drop table agg_hash_4;
--- fix github issue #12061 numsegments of general locus is not -1 on create_minmaxagg_path
-/*
- * On the arm platform, `Seq Scan` is executed frequently, resulting in unstable output.
- */
-set enable_indexonlyscan = off;
-explain analyze select count(*) from pg_class, (select count(*) >0 from (select count(*) from pg_class where relname like 't%')x)y;
+-- GitHub issue https://github.com/greenplum-db/gpdb/issues/12061
+-- numsegments of the general locus should be -1 on create_minmaxagg_path
+explain analyze select count(*) from pg_class, (select count(*) > 0 from (select count(*) from pg_class where relnatts > 8) x) y;
 INFO:  GPORCA failed to produce a plan, falling back to planner
 DETAIL:  Feature not supported: Queries on master-only tables
        QUERY PLAN
diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql
index 16b811d9bbe..4ae3bb5bfda 100644
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -1436,9 +1436,6 @@ drop table agg_hash_1;
 drop table agg_hash_3;
 drop table agg_hash_4;
 
--- fix github issue #12061 numsegments of general locus is not -1 on create_minmaxagg_path
-/*
- * On the arm platform, `Seq Scan` is executed frequently, resulting in unstable output.
- */
-set enable_indexonlyscan = off;
-explain analyze select count(*) from pg_class, (select count(*) >0 from (select count(*) from pg_class where relname like 't%')x)y;
+-- GitHub issue https://github.com/greenplum-db/gpdb/issues/12061
+-- numsegments of the general locus should be -1 on create_minmaxagg_path
+explain analyze select count(*) from pg_class, (select count(*) > 0 from (select count(*) from pg_class where relnatts > 8) x) y;

From 4cd373927243866d1542dc1edb0831e9f9751cef Mon Sep 17 00:00:00 2001
From: Zhenglong Li
Date: Wed, 1 Feb 2023 11:11:40 +0800
Subject: [PATCH 03/11] fix the issue of cannot create temporary table like
 existing table with comments (#14742)

This PR fixes issue #14649. The QD generates a query tree, serializes it
to a string, and sends it to the QEs, which deserialize it back into a
plan tree. In this process Greenplum does not distinguish between NULL
and an empty string, so if the original value was NULL a QE may
deserialize it as an empty string, which leads to errors later on. It is
hard to change deserializeNode() itself, since Greenplum has no
mechanism for distinguishing NULL from an empty string during
serialization and deserialization.
So this PR is just fixing the special issue, not for general using. --- src/backend/catalog/namespace.c | 22 ++++++++++++++++++ .../regress/expected/create_table_like_gp.out | 23 +++++++++++++++++++ src/test/regress/sql/create_table_like_gp.sql | 18 +++++++++++++++ 3 files changed, 63 insertions(+) diff --git a/src/backend/catalog/namespace.c b/src/backend/catalog/namespace.c index 9a3b347aa0e..a2ce73578b0 100644 --- a/src/backend/catalog/namespace.c +++ b/src/backend/catalog/namespace.c @@ -3159,11 +3159,33 @@ makeRangeVarFromNameList(List *names) break; case 2: rel->schemaname = strVal(linitial(names)); + + /* GPDB: When QD generates query tree and serializes it to string + * and sends it to QE, and QE will deserialize it to a plan tree. + * In this process, Greenplum will not consider the difference + * between NULL and an empty string, so if the original value is + * a NULL, QE may deserialize it to an empty string, which could + * lead to error in the following process. + */ + if (rel->schemaname && strlen(rel->schemaname) == 0) + rel->schemaname = NULL; + rel->relname = strVal(lsecond(names)); break; case 3: rel->catalogname = strVal(linitial(names)); rel->schemaname = strVal(lsecond(names)); + + /* GPDB: When QD generates query tree and serializes it to string + * and sends it to QE, and QE will deserialize it to a plan tree. + * In this process, Greenplum will not consider the difference + * between NULL and an empty string, so if the original value is + * a NULL, QE may deserialize it to an empty string, which could + * lead to error in the following process. + */ + if (rel->schemaname && strlen(rel->schemaname) == 0) + rel->schemaname = NULL; + rel->relname = strVal(lthird(names)); break; default: diff --git a/src/test/regress/expected/create_table_like_gp.out b/src/test/regress/expected/create_table_like_gp.out index d290707b3de..0990a65d232 100644 --- a/src/test/regress/expected/create_table_like_gp.out +++ b/src/test/regress/expected/create_table_like_gp.out @@ -107,3 +107,26 @@ WHERE t_ext_r | f | {format=csv,"delimiter= ",null=,"escape=\"","quote=\"",format_type=c,location_uris=gpfdist://127.0.0.1:8080/tmp/dummy,execute_on=ALL_SEGMENTS,log_errors=f,encoding=6,is_writable=false} (3 rows) +-- TEMP TABLE WITH COMMENTS +-- More details can be found at https://github.com/greenplum-db/gpdb/issues/14649 +CREATE TABLE t_comments_a (a integer); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. 
+COMMENT ON COLUMN t_comments_a.a IS 'Airflow';
+CREATE TEMPORARY TABLE t_comments_b (LIKE t_comments_a INCLUDING COMMENTS);
+NOTICE:  table doesn't have 'DISTRIBUTED BY' clause, defaulting to distribution columns from LIKE table
+-- Verify the copied comment
+SELECT
+    c.column_name,
+    pgd.description
+FROM pg_catalog.pg_statio_all_tables st
+    inner join pg_catalog.pg_description pgd on (pgd.objoid=st.relid)
+    inner join information_schema.columns c on (pgd.objsubid=c.ordinal_position and c.table_schema=st.schemaname and c.table_name=st.relname)
+WHERE c.table_name = 't_comments_b';
+ column_name | description
+-------------+-------------
+ a           | Airflow
+(1 row)
+
+DROP TABLE t_comments_a;
+DROP TABLE t_comments_b;
diff --git a/src/test/regress/sql/create_table_like_gp.sql b/src/test/regress/sql/create_table_like_gp.sql
index a0960da7b41..e6d0f44fd67 100644
--- a/src/test/regress/sql/create_table_like_gp.sql
+++ b/src/test/regress/sql/create_table_like_gp.sql
@@ -78,3 +78,21 @@ FROM
 	LEFT OUTER JOIN pg_catalog.pg_foreign_table f ON (c.oid = f.ftrelid)
 WHERE
 	c.relname LIKE 't_ext%';
+
+-- TEMP TABLE WITH COMMENTS
+-- More details can be found at https://github.com/greenplum-db/gpdb/issues/14649
+CREATE TABLE t_comments_a (a integer);
+COMMENT ON COLUMN t_comments_a.a IS 'Airflow';
+CREATE TEMPORARY TABLE t_comments_b (LIKE t_comments_a INCLUDING COMMENTS);
+
+-- Verify the copied comment
+SELECT
+    c.column_name,
+    pgd.description
+FROM pg_catalog.pg_statio_all_tables st
+    inner join pg_catalog.pg_description pgd on (pgd.objoid=st.relid)
+    inner join information_schema.columns c on (pgd.objsubid=c.ordinal_position and c.table_schema=st.schemaname and c.table_name=st.relname)
+WHERE c.table_name = 't_comments_b';
+
+DROP TABLE t_comments_a;
+DROP TABLE t_comments_b;

From 591450711ccf3108cd494285fa92fe92ae283eee Mon Sep 17 00:00:00 2001
From: Adam Lee
Date: Wed, 1 Feb 2023 13:46:52 +0800
Subject: [PATCH 04/11] pfree() the pstrdup()'d string userDoption

```
$ postgres --boot -D ~/greenplum-db-data/gpseg-1
LOG:  gp_role forced to 'utility' in single-user mode
free(): invalid pointer
Aborted (core dumped)
```
---
 src/backend/bootstrap/bootstrap.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index 4bfd33ccc68..7d131e75e63 100644
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -375,10 +375,11 @@ AuxiliaryProcessMain(int argc, char *argv[])
 		if (!SelectConfigFiles(userDoption, progname))
 			proc_exit(1);
 	}
+
 	if (userDoption)
 	{
 		/* userDoption isn't used any more */
-		free(userDoption);
+		pfree(userDoption);
 		userDoption = NULL;
 	}

From 46e06cf61653f2e0189d682eb84bf668452e9403 Mon Sep 17 00:00:00 2001
From: Divyesh Vanjare
Date: Tue, 24 Jan 2023 11:18:15 -0500
Subject: [PATCH 05/11] If QEs hit errors in explain analyze, rethrow the
 error before ExplainPrintPlan (#14588)

The execution sequence of EXPLAIN ANALYZE is as follows:

ExecutorStart
ExecutorRun                 --> dispatch plan to QEs
cdbdisp_checkDispatchResult --> wait for all QEs to finish
ExplainPrintXXX             --> print explain info
ExecutorEnd                 --> rethrow QE errors, etc.

When we run EXPLAIN ANALYZE, we might not receive stats from the QEs if
some of them hit errors, yet ExplainPrintXXX still uses those stats to
print the explain output; without a null check this can dereference a
null pointer. The QE error is only rethrown later, in ExecutorEnd. So,
if a QE throws an error, rethrow it before executing ExplainPrintXXX.
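A hypothetical repro of that sequence (not taken from this patch; the table and data are made up): any query that errors out on a segment during EXPLAIN ANALYZE leaves the QD without complete QE stats, so the QE error must be surfaced instead of printing a plan from partial data.

```
-- Illustrative only: seed a value that makes the segments fail mid-query.
CREATE TABLE t_qe_err (a int, b int) DISTRIBUTED BY (a);
INSERT INTO t_qe_err SELECT i, i - 5 FROM generate_series(1, 10) i;

-- One row has b = 0, so the QEs hit a division-by-zero error while the
-- QD is gathering execution stats; with this fix the error is rethrown
-- before ExplainPrintXXX instead of risking a null-pointer dereference.
EXPLAIN ANALYZE SELECT a / b FROM t_qe_err;
```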
--- src/backend/access/aocs/aocssegfiles.c | 123 ++++++++++-------- src/include/access/aocssegfiles.h | 1 - .../regress/expected/alter_table_aocs.out | 90 +++++++++++++ src/test/regress/sql/alter_table_aocs.sql | 15 +++ 4 files changed, 177 insertions(+), 52 deletions(-) diff --git a/src/backend/access/aocs/aocssegfiles.c b/src/backend/access/aocs/aocssegfiles.c index e9ea524a48b..b0ca80b1143 100644 --- a/src/backend/access/aocs/aocssegfiles.c +++ b/src/backend/access/aocs/aocssegfiles.c @@ -63,8 +63,7 @@ static AOCSFileSegInfo **GetAllAOCSFileSegInfo_pg_aocsseg_rel( - int numOfColumsn, - char *relationName, + Relation rel, Relation pg_aocsseg_rel, Snapshot appendOnlyMetaDataSnapshot, int32 *totalseg); @@ -126,6 +125,68 @@ InsertInitialAOCSFileSegInfo(Relation prel, int32 segno, int32 nvp, Oid segrelid pfree(values); } +/* + * This is a routine to extract the vpinfo underlying the untoasted datum from + * the pg_aocsseg relation row, given the aocs relation's relnatts, into the supplied + * AOCSFileSegInfo structure. + * + * Sometimes the number of columns represented in the vpinfo inside pg_aocsseg + * the row may not match pg_class.relnatts. For instance, when we do an ADD + * COLUMN operation, we will have lesser number of columns in the table row + * than pg_class.relnatts. + * On the other hand, following an aborted ADD COLUMN operation, + * the number of columns in the table row will be + * greater than pg_class.relnatts. + */ +static void +deformAOCSVPInfo(Relation rel, struct varlena *v, AOCSFileSegInfo *aocs_seginfo) +{ + int16 relnatts = RelationGetNumberOfAttributes(rel); + struct varlena *dv = pg_detoast_datum(v); + int source_size = VARSIZE(dv); + int target_size = aocs_vpinfo_size(relnatts); + + + if (source_size <= target_size) + { + /* The source fits into the target, simply memcpy. */ + memcpy(&aocs_seginfo->vpinfo, dv, source_size); + Assert(aocs_seginfo->vpinfo.nEntry <= relnatts); + } + else + { + /* + * We have more columns represented in the vpinfo recorded inside the + * pg_aocsseg row, than pg_class.relnatts. Perform additional validation + * on these extra column entries and then simply copy over relnatts + * worth of entries from within the datum. + */ + AOCSVPInfo *vpInfo = (AOCSVPInfo *) dv; + + for (int i = relnatts; i < vpInfo->nEntry; ++i) + { + /* + * These extra entries must have be the initial frozen inserts + * from when InsertInitialAOCSFileSegInfo() was called during + * an aborted ADD COLUMN operation. Such entries should have eofs = 0, + * indicating that there is no data. If not, there is something + * seriously wrong. Yell appropriately. + */ + if(vpInfo->entry[i].eof > 0 || vpInfo->entry[i].eof_uncompressed > 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("For relation \"%s\" aborted column %d has non-zero eof %d or non-zero uncompressed eof %d", + RelationGetRelationName(rel), i, (int) vpInfo->entry[i].eof, (int) vpInfo->entry[i].eof_uncompressed))); + } + + memcpy(&aocs_seginfo->vpinfo, dv, aocs_vpinfo_size(relnatts)); + aocs_seginfo->vpinfo.nEntry = relnatts; + } + + if (dv != v) + pfree(dv); +} + /* * GetAOCSFileSegInfo. 
* @@ -240,15 +301,9 @@ GetAOCSFileSegInfo(Relation prel, seginfo ->state = DatumGetInt16(d[Anum_pg_aocs_state - 1]); Assert(!null[Anum_pg_aocs_vpinfo - 1]); - { - struct varlena *v = (struct varlena *) DatumGetPointer(d[Anum_pg_aocs_vpinfo - 1]); - struct varlena *dv = pg_detoast_datum(v); - - Assert(VARSIZE(dv) <= aocs_vpinfo_size(nvp)); - memcpy(&seginfo->vpinfo, dv, aocs_vpinfo_size(nvp)); - if (dv != v) - pfree(dv); - } + deformAOCSVPInfo(prel, + (struct varlena *) DatumGetPointer(d[Anum_pg_aocs_vpinfo - 1]), + seginfo); pfree(d); pfree(null); @@ -285,9 +340,7 @@ GetAllAOCSFileSegInfo(Relation prel, pg_aocsseg_rel = relation_open(segrelid, AccessShareLock); - results = GetAllAOCSFileSegInfo_pg_aocsseg_rel( - RelationGetNumberOfAttributes(prel), - RelationGetRelationName(prel), + results = GetAllAOCSFileSegInfo_pg_aocsseg_rel(prel, pg_aocsseg_rel, appendOnlyMetaDataSnapshot, totalseg); @@ -315,17 +368,13 @@ aocsFileSegInfoCmp(const void *left, const void *right) return 0; } - static AOCSFileSegInfo ** -GetAllAOCSFileSegInfo_pg_aocsseg_rel(int numOfColumns, - char *relationName, +GetAllAOCSFileSegInfo_pg_aocsseg_rel(Relation rel, Relation pg_aocsseg_rel, Snapshot snapshot, int32 *totalseg) { - int32 nvp = numOfColumns; - SysScanDesc scan; HeapTuple tup; @@ -357,7 +406,7 @@ GetAllAOCSFileSegInfo_pg_aocsseg_rel(int numOfColumns, allseg = (AOCSFileSegInfo **) repalloc(allseg, sizeof(AOCSFileSegInfo *) * seginfo_slot_no); } - aocs_seginfo = (AOCSFileSegInfo *) palloc0(aocsfileseginfo_size(nvp)); + aocs_seginfo = (AOCSFileSegInfo *) palloc0(aocsfileseginfo_size(RelationGetNumberOfAttributes(rel))); allseg[cur_seg] = aocs_seginfo; @@ -389,33 +438,7 @@ GetAllAOCSFileSegInfo_pg_aocsseg_rel(int numOfColumns, aocs_seginfo->state = DatumGetInt16(d[Anum_pg_aocs_state - 1]); Assert(!null[Anum_pg_aocs_vpinfo - 1]); - { - uint32 len; - struct varlena *v = (struct varlena *) DatumGetPointer(d[Anum_pg_aocs_vpinfo - 1]); - struct varlena *dv = pg_detoast_datum(v); - - /* - * VARSIZE(dv) may be less than aocs_vpinfo_size, in case of add - * column, we try to do a ctas from old table to new table. - * - * Also, VARSIZE(dv) may be greater than aocs_vpinfo_size, in case - * of begin transaction, add column and assign a new segno for insert, - * and then rollback transaction. In this case, we must be using - * RESERVED_SEGNO (check ChooseSegnoForWrite() for more detail). - */ - Assert(VARSIZE(dv) <= aocs_vpinfo_size(nvp) || aocs_seginfo->segno == RESERVED_SEGNO); - - len = (VARSIZE(dv) <= aocs_vpinfo_size(nvp)) ? 
VARSIZE(dv) : aocs_vpinfo_size(nvp); - memcpy(&aocs_seginfo->vpinfo, dv, len); - if (len != VARSIZE(dv)) - { - /* truncate VPInfoEntry of useless columns */ - SET_VARSIZE(&aocs_seginfo->vpinfo, len); - aocs_seginfo->vpinfo.nEntry = nvp; - } - if (dv != v) - pfree(dv); - } + deformAOCSVPInfo(rel, (struct varlena *) DatumGetPointer(d[Anum_pg_aocs_vpinfo - 1]), aocs_seginfo); ++cur_seg; } @@ -1255,8 +1278,7 @@ gp_aocsseg_internal(PG_FUNCTION_ARGS, Oid aocsRelOid) pg_aocsseg_rel = heap_open(segrelid, AccessShareLock); context->aocsSegfileArray = GetAllAOCSFileSegInfo_pg_aocsseg_rel( - aocsRel->rd_rel->relnatts, - RelationGetRelationName(aocsRel), + aocsRel, pg_aocsseg_rel, appendOnlyMetaDataSnapshot, &context->totalAocsSegFiles); @@ -1468,8 +1490,7 @@ gp_aocsseg_history(PG_FUNCTION_ARGS) pg_aocsseg_rel = heap_open(segrelid, AccessShareLock); context->aocsSegfileArray = GetAllAOCSFileSegInfo_pg_aocsseg_rel( - RelationGetNumberOfAttributes(aocsRel), - RelationGetRelationName(aocsRel), + aocsRel, pg_aocsseg_rel, SnapshotAny, //Get ALL tuples from pg_aocsseg_ % including aborted and in - progress ones. & context->totalAocsSegFiles); diff --git a/src/include/access/aocssegfiles.h b/src/include/access/aocssegfiles.h index fc0ec005291..18d67e6df26 100644 --- a/src/include/access/aocssegfiles.h +++ b/src/include/access/aocssegfiles.h @@ -135,7 +135,6 @@ extern AOCSFileSegInfo *GetAOCSFileSegInfo(Relation prel, Snapshot appendOnlyMetaDataSnapshot, int32 segno, bool locked); - extern AOCSFileSegInfo **GetAllAOCSFileSegInfo(Relation prel, Snapshot appendOnlyMetaDataSnapshot, int *totalseg, diff --git a/src/test/regress/expected/alter_table_aocs.out b/src/test/regress/expected/alter_table_aocs.out index b8fc977c6f4..c6806a0d0a1 100644 --- a/src/test/regress/expected/alter_table_aocs.out +++ b/src/test/regress/expected/alter_table_aocs.out @@ -836,3 +836,93 @@ Distributed by: (a) Options: compresstype=rle_type, compresslevel=4, blocksize=65536 DROP TABLE aocs_alter_add_col_reorganize; +-- test case: Ensure that reads don't fail after aborting an add column + insert operation and we don't project the aborted column +CREATE TABLE aocs_addcol_abort(a int, b int) USING ao_column; +INSERT INTO aocs_addcol_abort SELECT i,i FROM generate_series(1,10)i; +BEGIN; +ALTER TABLE aocs_addcol_abort ADD COLUMN c int; +INSERT INTO aocs_addcol_abort SELECT i,i,i FROM generate_series(1,10)i; +-- check state of aocsseg for entries of add column + insert +SELECT * FROM gp_toolkit.__gp_aocsseg('aocs_addcol_abort') ORDER BY segment_id, column_num; + segment_id | segno | column_num | physical_segno | tupcount | eof | eof_uncompressed | modcount | formatversion | state +------------+-------+------------+----------------+----------+-----+------------------+----------+---------------+------- + 0 | 0 | 0 | 0 | 5 | 64 | 64 | 1 | 3 | 1 + 0 | 1 | 0 | 1 | 5 | 64 | 64 | 2 | 3 | 1 + 0 | 0 | 1 | 128 | 5 | 64 | 64 | 1 | 3 | 1 + 0 | 1 | 1 | 129 | 5 | 64 | 64 | 2 | 3 | 1 + 0 | 0 | 2 | 256 | 5 | 64 | 64 | 1 | 3 | 1 + 0 | 1 | 2 | 257 | 5 | 48 | 48 | 2 | 3 | 1 + 1 | 0 | 0 | 0 | 1 | 48 | 48 | 1 | 3 | 1 + 1 | 1 | 0 | 1 | 1 | 48 | 48 | 2 | 3 | 1 + 1 | 0 | 1 | 128 | 1 | 48 | 48 | 1 | 3 | 1 + 1 | 1 | 1 | 129 | 1 | 48 | 48 | 2 | 3 | 1 + 1 | 0 | 2 | 256 | 1 | 48 | 48 | 1 | 3 | 1 + 1 | 1 | 2 | 257 | 1 | 48 | 48 | 2 | 3 | 1 + 2 | 0 | 0 | 0 | 4 | 56 | 56 | 1 | 3 | 1 + 2 | 1 | 0 | 1 | 4 | 56 | 56 | 2 | 3 | 1 + 2 | 0 | 1 | 128 | 4 | 56 | 56 | 1 | 3 | 1 + 2 | 1 | 1 | 129 | 4 | 56 | 56 | 2 | 3 | 1 + 2 | 0 | 2 | 256 | 4 | 56 | 56 | 1 | 3 | 1 + 2 | 1 | 2 | 257 | 
4 | 48 | 48 | 2 | 3 | 1 +(18 rows) + +SELECT * FROM aocs_addcol_abort; + a | b | c +----+----+---- + 5 | 5 | 5 + 6 | 6 | 6 + 9 | 9 | 9 + 10 | 10 | 10 + 5 | 5 | + 6 | 6 | + 9 | 9 | + 10 | 10 | + 1 | 1 | 1 + 1 | 1 | + 2 | 2 | 2 + 3 | 3 | 3 + 4 | 4 | 4 + 7 | 7 | 7 + 8 | 8 | 8 + 2 | 2 | + 3 | 3 | + 4 | 4 | + 7 | 7 | + 8 | 8 | +(20 rows) + +ABORT; +-- check state of aocsseg if entries for new column are rolled back correctly +SELECT * FROM gp_toolkit.__gp_aocsseg('aocs_addcol_abort') ORDER BY segment_id, column_num; + segment_id | segno | column_num | physical_segno | tupcount | eof | eof_uncompressed | modcount | formatversion | state +------------+-------+------------+----------------+----------+-----+------------------+----------+---------------+------- + 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 + 0 | 1 | 0 | 1 | 5 | 64 | 64 | 1 | 3 | 1 + 0 | 0 | 1 | 128 | 0 | 0 | 0 | 0 | 3 | 1 + 0 | 1 | 1 | 129 | 5 | 64 | 64 | 1 | 3 | 1 + 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 + 1 | 1 | 0 | 1 | 1 | 48 | 48 | 1 | 3 | 1 + 1 | 0 | 1 | 128 | 0 | 0 | 0 | 0 | 3 | 1 + 1 | 1 | 1 | 129 | 1 | 48 | 48 | 1 | 3 | 1 + 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 + 2 | 1 | 0 | 1 | 4 | 56 | 56 | 1 | 3 | 1 + 2 | 0 | 1 | 128 | 0 | 0 | 0 | 0 | 3 | 1 + 2 | 1 | 1 | 129 | 4 | 56 | 56 | 1 | 3 | 1 +(12 rows) + +SELECT * FROM aocs_addcol_abort; + a | b +----+---- + 5 | 5 + 6 | 6 + 9 | 9 + 10 | 10 + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 7 | 7 + 8 | 8 +(10 rows) + +DROP TABLE aocs_addcol_abort; diff --git a/src/test/regress/sql/alter_table_aocs.sql b/src/test/regress/sql/alter_table_aocs.sql index 46d28049061..d6e542c1c7c 100644 --- a/src/test/regress/sql/alter_table_aocs.sql +++ b/src/test/regress/sql/alter_table_aocs.sql @@ -480,3 +480,18 @@ RESET gp_default_storage_options; ALTER TABLE aocs_alter_add_col_reorganize ADD COLUMN d int; \d+ aocs_alter_add_col_reorganize DROP TABLE aocs_alter_add_col_reorganize; + +-- test case: Ensure that reads don't fail after aborting an add column + insert operation and we don't project the aborted column +CREATE TABLE aocs_addcol_abort(a int, b int) USING ao_column; +INSERT INTO aocs_addcol_abort SELECT i,i FROM generate_series(1,10)i; +BEGIN; +ALTER TABLE aocs_addcol_abort ADD COLUMN c int; +INSERT INTO aocs_addcol_abort SELECT i,i,i FROM generate_series(1,10)i; +-- check state of aocsseg for entries of add column + insert +SELECT * FROM gp_toolkit.__gp_aocsseg('aocs_addcol_abort') ORDER BY segment_id, column_num; +SELECT * FROM aocs_addcol_abort; +ABORT; +-- check state of aocsseg if entries for new column are rolled back correctly +SELECT * FROM gp_toolkit.__gp_aocsseg('aocs_addcol_abort') ORDER BY segment_id, column_num; +SELECT * FROM aocs_addcol_abort; +DROP TABLE aocs_addcol_abort; From 8cd84e3b4fc114ad3e25e0eaa938876616a5c58c Mon Sep 17 00:00:00 2001 From: "xuqi.wxq" Date: Tue, 27 Sep 2022 16:19:25 +0800 Subject: [PATCH 06/11] Add test case for reloptions when adding child partition w/ different AM than parent Was fixed in 3e943c17edf7. Now adding a test and adjusting a comment. 
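The behavior being pinned down can also be checked straight from the catalog. A sketch, assuming the pt_co_to_heap table from the test below and the usual <parent>_1_prt_<name> child naming convention (both assumptions, not part of the patch):

```
-- The child added WITH (appendonly=false) should come out as a heap and
-- must not inherit the parent's AO reloptions; pg_class.relam and pg_am
-- are the standard catalogs, the partition names here are assumed.
SELECT c.relname, a.amname, c.reloptions
FROM pg_class c
JOIN pg_am a ON a.oid = c.relam
WHERE c.relname IN ('pt_co_to_heap_1_prt_part1', 'pt_co_to_heap_1_prt_p4');
```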
--- .../regress/expected/partition_storage.out | 19 ++++++++++++++++++ src/test/regress/sql/partition_storage.sql | 20 +++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/src/test/regress/expected/partition_storage.out b/src/test/regress/expected/partition_storage.out index 88eb1c59db3..4b26601f571 100644 --- a/src/test/regress/expected/partition_storage.out +++ b/src/test/regress/expected/partition_storage.out @@ -787,3 +787,22 @@ Options: compresstype=zlib, compresslevel=1 alter table pt_co_tab_rng add default partition dft; -- Split default partition alter table pt_co_tab_rng split default partition start(45) end(60) into (partition dft, partition two); +-- Add partition with different table am + SET gp_default_storage_options = 'blocksize=32768,compresstype=zstd,compresslevel=9'; + SET default_table_access_method TO ao_column; + CREATE TABLE pt_co_to_heap + ( + col1 int, + col2 decimal, + col3 text, + col4 bool + ) + distributed by (col1) + partition by list(col2) + ( + partition part1 values(1,2,3) + ); + ALTER TABLE pt_co_to_heap ADD PARTITION p4 values(4) WITH (appendonly=false); + DROP TABLE pt_co_to_heap; + SET gp_default_storage_options TO DEFAULT; + SET default_table_access_method TO DEFAULT; diff --git a/src/test/regress/sql/partition_storage.sql b/src/test/regress/sql/partition_storage.sql index f178956fec6..fa3b0716cde 100644 --- a/src/test/regress/sql/partition_storage.sql +++ b/src/test/regress/sql/partition_storage.sql @@ -472,3 +472,23 @@ drop table if exists co_can cascade; -- Split default partition alter table pt_co_tab_rng split default partition start(45) end(60) into (partition dft, partition two); + +-- Add partition with different table am + SET gp_default_storage_options = 'blocksize=32768,compresstype=zstd,compresslevel=9'; + SET default_table_access_method TO ao_column; + CREATE TABLE pt_co_to_heap + ( + col1 int, + col2 decimal, + col3 text, + col4 bool + ) + distributed by (col1) + partition by list(col2) + ( + partition part1 values(1,2,3) + ); + ALTER TABLE pt_co_to_heap ADD PARTITION p4 values(4) WITH (appendonly=false); + DROP TABLE pt_co_to_heap; + SET gp_default_storage_options TO DEFAULT; + SET default_table_access_method TO DEFAULT; From 58083e73baab03324fe127cd72c7eb098227c612 Mon Sep 17 00:00:00 2001 From: Brent Doil Date: Fri, 3 Feb 2023 14:07:27 -0500 Subject: [PATCH 07/11] Fix syntax error with CREATE MATERIALIZED VIEW Allow DISTRIBUTED BY when using the CREATE MATERIALIZED VIEW IF NOT EXISTS SYNTAX. 
Resolves error: CREATE MATERIALIZED VIEW IF NOT EXISTS mvtest AS SELECT * FROM mvtest_tvvmv DISTRIBUTED RANDOMLY; ERROR: syntax error at or near "DISTRIBUTED" Update matview regress tests to test `IF NOT EXISTS` with `DISTRIBUTED BY` Authored-by: Brent Doil --- src/test/regress/expected/matview.out | 4 ++-- src/test/regress/expected/matview_ao.out | 6 +++--- src/test/regress/expected/matview_optimizer.out | 4 ++-- src/test/regress/sql/matview.sql | 4 ++-- src/test/regress/sql/matview_ao.sql | 6 +++--- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/test/regress/expected/matview.out b/src/test/regress/expected/matview.out index 55e16a03f31..f0b1ad9c3dc 100644 --- a/src/test/regress/expected/matview.out +++ b/src/test/regress/expected/matview.out @@ -19,7 +19,7 @@ SELECT * FROM mvtest_tv ORDER BY type; -- create a materialized view with no data, and confirm correct behavior EXPLAIN (costs off) - CREATE MATERIALIZED VIEW mvtest_tm AS SELECT type, sum(amt) AS totamt FROM mvtest_t GROUP BY type WITH NO DATA distributed by(type); + CREATE MATERIALIZED VIEW IF NOT EXISTS mvtest_tm AS SELECT type, sum(amt) AS totamt FROM mvtest_t GROUP BY type WITH NO DATA distributed by(type); QUERY PLAN ------------------------------------------------------ HashAggregate @@ -30,7 +30,7 @@ EXPLAIN (costs off) Optimizer: Postgres query optimizer (6 rows) -CREATE MATERIALIZED VIEW mvtest_tm AS SELECT type, sum(amt) AS totamt FROM mvtest_t GROUP BY type WITH NO DATA distributed by(type); +CREATE MATERIALIZED VIEW IF NOT EXISTS mvtest_tm AS SELECT type, sum(amt) AS totamt FROM mvtest_t GROUP BY type WITH NO DATA distributed by(type); SELECT relispopulated FROM pg_class WHERE oid = 'mvtest_tm'::regclass; relispopulated ---------------- diff --git a/src/test/regress/expected/matview_ao.out b/src/test/regress/expected/matview_ao.out index b1941224590..7999cf2990c 100644 --- a/src/test/regress/expected/matview_ao.out +++ b/src/test/regress/expected/matview_ao.out @@ -7,7 +7,7 @@ INSERT INTO t_matview_ao VALUES (3, 'y', 5), (4, 'y', 7), (5, 'z', 11); -CREATE MATERIALIZED VIEW m_heap AS SELECT type, sum(amt) AS totamt FROM t_matview_ao GROUP BY type WITH NO DATA distributed by(type); +CREATE MATERIALIZED VIEW IF NOT EXISTS m_heap AS SELECT type, sum(amt) AS totamt FROM t_matview_ao GROUP BY type WITH NO DATA distributed by(type); CREATE UNIQUE INDEX m_heap_index ON m_heap(type); SELECT * from m_heap; ERROR: materialized view "m_heap" has not been populated @@ -54,7 +54,7 @@ SELECT * FROM m_heap; z | 11 (3 rows) -CREATE MATERIALIZED VIEW m_ao with (appendonly=true) AS SELECT type, sum(amt) AS totamt FROM t_matview_ao GROUP BY type WITH NO DATA distributed by(type); +CREATE MATERIALIZED VIEW IF NOT EXISTS m_ao with (appendonly=true) AS SELECT type, sum(amt) AS totamt FROM t_matview_ao GROUP BY type WITH NO DATA distributed by(type); SELECT * from m_ao; ERROR: materialized view "m_ao" has not been populated HINT: Use the REFRESH MATERIALIZED VIEW command. @@ -80,7 +80,7 @@ SELECT * FROM m_ao; z | 11 (3 rows) -CREATE MATERIALIZED VIEW m_aocs with (appendonly=true, orientation=column) AS SELECT type, sum(amt) AS totamt FROM t_matview_ao GROUP BY type WITH NO DATA distributed by(type); +CREATE MATERIALIZED VIEW IF NOT EXISTS m_aocs with (appendonly=true, orientation=column) AS SELECT type, sum(amt) AS totamt FROM t_matview_ao GROUP BY type WITH NO DATA distributed by(type); SELECT * from m_aocs; ERROR: materialized view "m_aocs" has not been populated HINT: Use the REFRESH MATERIALIZED VIEW command. 
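As an extra sanity check that DISTRIBUTED BY survives the IF NOT EXISTS path, the resulting policy can be read from gp_distribution_policy. A sketch, assuming the mvtest_tm view from the tests above:

```
-- policytype is 'r' for replicated relations; hash- and randomly-
-- distributed relations (such as this DISTRIBUTED BY (type) view)
-- are recorded as 'p'.
SELECT policytype, numsegments
FROM gp_distribution_policy
WHERE localoid = 'mvtest_tm'::regclass;
```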
diff --git a/src/test/regress/expected/matview_optimizer.out b/src/test/regress/expected/matview_optimizer.out index c54b1943ecb..a0c61de98d7 100644 --- a/src/test/regress/expected/matview_optimizer.out +++ b/src/test/regress/expected/matview_optimizer.out @@ -19,7 +19,7 @@ SELECT * FROM mvtest_tv ORDER BY type; -- create a materialized view with no data, and confirm correct behavior EXPLAIN (costs off) - CREATE MATERIALIZED VIEW mvtest_tm AS SELECT type, sum(amt) AS totamt FROM mvtest_t GROUP BY type WITH NO DATA distributed by(type); + CREATE MATERIALIZED VIEW IF NOT EXISTS mvtest_tm AS SELECT type, sum(amt) AS totamt FROM mvtest_t GROUP BY type WITH NO DATA distributed by(type); QUERY PLAN ------------------------------------------------------------ GroupAggregate @@ -32,7 +32,7 @@ EXPLAIN (costs off) Optimizer: Pivotal Optimizer (GPORCA) version 3.83.0 (8 rows) -CREATE MATERIALIZED VIEW mvtest_tm AS SELECT type, sum(amt) AS totamt FROM mvtest_t GROUP BY type WITH NO DATA distributed by(type); +CREATE MATERIALIZED VIEW IF NOT EXISTS mvtest_tm AS SELECT type, sum(amt) AS totamt FROM mvtest_t GROUP BY type WITH NO DATA distributed by(type); SELECT relispopulated FROM pg_class WHERE oid = 'mvtest_tm'::regclass; relispopulated ---------------- diff --git a/src/test/regress/sql/matview.sql b/src/test/regress/sql/matview.sql index 76739f281af..ead969f8e4b 100644 --- a/src/test/regress/sql/matview.sql +++ b/src/test/regress/sql/matview.sql @@ -14,8 +14,8 @@ SELECT * FROM mvtest_tv ORDER BY type; -- create a materialized view with no data, and confirm correct behavior EXPLAIN (costs off) - CREATE MATERIALIZED VIEW mvtest_tm AS SELECT type, sum(amt) AS totamt FROM mvtest_t GROUP BY type WITH NO DATA distributed by(type); -CREATE MATERIALIZED VIEW mvtest_tm AS SELECT type, sum(amt) AS totamt FROM mvtest_t GROUP BY type WITH NO DATA distributed by(type); + CREATE MATERIALIZED VIEW IF NOT EXISTS mvtest_tm AS SELECT type, sum(amt) AS totamt FROM mvtest_t GROUP BY type WITH NO DATA distributed by(type); +CREATE MATERIALIZED VIEW IF NOT EXISTS mvtest_tm AS SELECT type, sum(amt) AS totamt FROM mvtest_t GROUP BY type WITH NO DATA distributed by(type); SELECT relispopulated FROM pg_class WHERE oid = 'mvtest_tm'::regclass; SELECT * FROM mvtest_tm ORDER BY type; REFRESH MATERIALIZED VIEW mvtest_tm; diff --git a/src/test/regress/sql/matview_ao.sql b/src/test/regress/sql/matview_ao.sql index c3d072f2884..412017c82c5 100644 --- a/src/test/regress/sql/matview_ao.sql +++ b/src/test/regress/sql/matview_ao.sql @@ -8,7 +8,7 @@ INSERT INTO t_matview_ao VALUES (4, 'y', 7), (5, 'z', 11); -CREATE MATERIALIZED VIEW m_heap AS SELECT type, sum(amt) AS totamt FROM t_matview_ao GROUP BY type WITH NO DATA distributed by(type); +CREATE MATERIALIZED VIEW IF NOT EXISTS m_heap AS SELECT type, sum(amt) AS totamt FROM t_matview_ao GROUP BY type WITH NO DATA distributed by(type); CREATE UNIQUE INDEX m_heap_index ON m_heap(type); SELECT * from m_heap; REFRESH MATERIALIZED VIEW CONCURRENTLY m_heap; @@ -23,7 +23,7 @@ select relispopulated from gp_dist_random('pg_class') where relname = 'm_heap'; REFRESH MATERIALIZED VIEW m_heap; SELECT * FROM m_heap; -CREATE MATERIALIZED VIEW m_ao with (appendonly=true) AS SELECT type, sum(amt) AS totamt FROM t_matview_ao GROUP BY type WITH NO DATA distributed by(type); +CREATE MATERIALIZED VIEW IF NOT EXISTS m_ao with (appendonly=true) AS SELECT type, sum(amt) AS totamt FROM t_matview_ao GROUP BY type WITH NO DATA distributed by(type); SELECT * from m_ao; REFRESH MATERIALIZED VIEW m_ao; SELECT * 
FROM m_ao; @@ -33,7 +33,7 @@ REFRESH MATERIALIZED VIEW m_ao; SELECT * FROM m_ao; -CREATE MATERIALIZED VIEW m_aocs with (appendonly=true, orientation=column) AS SELECT type, sum(amt) AS totamt FROM t_matview_ao GROUP BY type WITH NO DATA distributed by(type); +CREATE MATERIALIZED VIEW IF NOT EXISTS m_aocs with (appendonly=true, orientation=column) AS SELECT type, sum(amt) AS totamt FROM t_matview_ao GROUP BY type WITH NO DATA distributed by(type); SELECT * from m_aocs; REFRESH MATERIALIZED VIEW m_aocs; SELECT * FROM m_aocs; From 96e0f19c2ae299dd45deda4654c2daffc23666b6 Mon Sep 17 00:00:00 2001 From: Xiaoran Wang Date: Mon, 6 Feb 2023 14:10:44 +0800 Subject: [PATCH 08/11] Add assert to the length of shared hash table name (#14163) The max size for shared memory hash table name is SHMEM_INDEX_KEYSIZE - 1 (shared hash table name is stored and indexed by ShmemIndex hash table, the key size of it is SHMEM_INDEX_KEYSIZE), but when caller using a longer hash table name, it doesn't report any error, instead it just uses the first SHMEM_INDEX_KEYSIZE chars as the hash table name. When some hash tables' names have a same prefix which is longer than (SHMEM_INDEX_KEYSIZE - 1), issues will come: those hash tables actually are created as a same hash table whose name is the prefix. So add the assert to prevent it. --- src/backend/storage/ipc/shmem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c index 0214f3a05d6..17e6d9e4775 100644 --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -465,6 +465,7 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr) return structPtr; } + Assert(strlen(name) < SHMEM_INDEX_KEYSIZE); /* look it up in the shmem index */ result = (ShmemIndexEnt *) hash_search(ShmemIndex, name, HASH_ENTER_NULL, foundPtr); From d24483b5e50eaec462fab4e98194172548b159ed Mon Sep 17 00:00:00 2001 From: Andrew Repp Date: Fri, 3 Feb 2023 14:53:16 -0600 Subject: [PATCH 09/11] Fix cumulative statistics collection for AO aux tables Currently AO auxiliary files never have any cumulative statistics collected (pg_stat_*) by the stats collector. This means that autovacuum doesn't consider these tables (see relation_needs_vacanalyze()), and contravenes expectations for these tables. Fix this collection and display. # Conflicts: # src/backend/catalog/system_views.sql # src/backend/postmaster/pgstat.c # src/include/catalog/catversion.h --- src/backend/catalog/system_views.sql | 40 +++++++++------- src/include/catalog/catversion.h | 2 +- .../regress/input/pgstat_qd_tabstat.source | 21 +++++++++ .../regress/output/pgstat_qd_tabstat.source | 46 +++++++++++++++++++ 4 files changed, 92 insertions(+), 17 deletions(-) diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 312ba17ddce..561f89de8fe 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -690,7 +690,7 @@ CREATE VIEW pg_stat_all_tables_internal AS FROM pg_class C LEFT JOIN pg_index I ON C.oid = I.indrelid LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace) - WHERE C.relkind IN ('r', 't', 'm', 'p') + WHERE C.relkind IN ('r', 't', 'm', 'o', 'b', 'M', 'p') GROUP BY C.oid, N.nspname, C.relname; -- Gather data from segments on user tables, and use data on coordinator on system tables. 
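The "gather from segments" pattern in the hunks that follow is built on gp_dist_random(), which runs the scan on every segment and unions the rows on the coordinator. A minimal illustration of the mechanism (not part of the patch):

```
-- Each segment returns its local rows; for replicated tables the view
-- then divides the summed counters by numsegments to avoid counting
-- the same logical table once per segment.
SELECT gp_segment_id, count(*)
FROM gp_dist_random('pg_class')
GROUP BY gp_segment_id;
```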
@@ -722,9 +722,9 @@ SELECT s.autoanalyze_count FROM (SELECT - relid, - schemaname, - relname, + allt.relid, + allt.schemaname, + allt.relname, case when d.policytype = 'r' then (sum(seq_scan)/d.numsegments)::bigint else sum(seq_scan) end seq_scan, case when d.policytype = 'r' then (sum(seq_tup_read)/d.numsegments)::bigint else sum(seq_tup_read) end seq_tup_read, case when d.policytype = 'r' then (sum(idx_scan)/d.numsegments)::bigint else sum(idx_scan) end idx_scan, @@ -746,10 +746,18 @@ FROM max(analyze_count) as analyze_count, max(autoanalyze_count) as autoanalyze_count FROM - gp_dist_random('pg_stat_all_tables_internal'), gp_distribution_policy as d + gp_dist_random('pg_stat_all_tables_internal') allt + inner join pg_class c + on allt.relid = c.oid + left outer join gp_distribution_policy d + on allt.relid = d.localoid WHERE - relid >= 16384 and relid = d.localoid - GROUP BY relid, schemaname, relname, d.policytype, d.numsegments + relid >= 16384 + and ( + d.localoid is not null + or c.relkind in ('o', 'b', 'M') + ) + GROUP BY allt.relid, allt.schemaname, allt.relname, d.policytype, d.numsegments UNION ALL @@ -778,12 +786,12 @@ CREATE VIEW pg_stat_xact_all_tables AS FROM pg_class C LEFT JOIN pg_index I ON C.oid = I.indrelid LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace) - WHERE C.relkind IN ('r', 't', 'm', 'p') + WHERE C.relkind IN ('r', 't', 'm', 'o', 'b', 'M', 'p') GROUP BY C.oid, N.nspname, C.relname; CREATE VIEW pg_stat_sys_tables AS SELECT * FROM pg_stat_all_tables - WHERE schemaname IN ('pg_catalog', 'information_schema') OR + WHERE schemaname IN ('pg_catalog', 'information_schema', 'pg_aoseg') OR schemaname ~ '^pg_toast'; -- In singlenode mode, the result of pg_stat_sys_tables will be messed up, @@ -796,7 +804,7 @@ CREATE VIEW pg_stat_sys_tables_single_node AS CREATE VIEW pg_stat_xact_sys_tables AS SELECT * FROM pg_stat_xact_all_tables - WHERE schemaname IN ('pg_catalog', 'information_schema') OR + WHERE schemaname IN ('pg_catalog', 'information_schema', 'pg_aoseg') OR schemaname ~ '^pg_toast'; CREATE VIEW pg_stat_user_tables AS @@ -839,12 +847,12 @@ CREATE VIEW pg_statio_all_tables AS pg_class T ON C.reltoastrelid = T.oid LEFT JOIN pg_index X ON T.oid = X.indrelid LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace) - WHERE C.relkind IN ('r', 't', 'm') + WHERE C.relkind IN ('r', 't', 'm', 'o', 'b', 'M') GROUP BY C.oid, N.nspname, C.relname, T.oid, X.indexrelid; CREATE VIEW pg_statio_sys_tables AS SELECT * FROM pg_statio_all_tables - WHERE schemaname IN ('pg_catalog', 'information_schema') OR + WHERE schemaname IN ('pg_catalog', 'information_schema', 'pg_aoseg') OR schemaname ~ '^pg_toast'; CREATE VIEW pg_statio_user_tables AS @@ -866,7 +874,7 @@ CREATE VIEW pg_stat_all_indexes_internal AS pg_index X ON C.oid = X.indrelid JOIN pg_class I ON I.oid = X.indexrelid LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace) - WHERE C.relkind IN ('r', 't', 'm'); + WHERE C.relkind IN ('r', 't', 'm', 'o', 'b', 'M'); -- Gather data from segments on user tables, and use data on coordinator on system tables. 
@@ -908,7 +916,7 @@ WHERE m.relid = s.relid; CREATE VIEW pg_stat_sys_indexes AS SELECT * FROM pg_stat_all_indexes - WHERE schemaname IN ('pg_catalog', 'information_schema') OR + WHERE schemaname IN ('pg_catalog', 'information_schema', 'pg_aoseg') OR schemaname ~ '^pg_toast'; CREATE VIEW pg_stat_user_indexes AS @@ -930,11 +938,11 @@ CREATE VIEW pg_statio_all_indexes AS pg_index X ON C.oid = X.indrelid JOIN pg_class I ON I.oid = X.indexrelid LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace) - WHERE C.relkind IN ('r', 't', 'm'); + WHERE C.relkind IN ('r', 't', 'm', 'o', 'b', 'M'); CREATE VIEW pg_statio_sys_indexes AS SELECT * FROM pg_statio_all_indexes - WHERE schemaname IN ('pg_catalog', 'information_schema') OR + WHERE schemaname IN ('pg_catalog', 'information_schema', 'pg_aoseg') OR schemaname ~ '^pg_toast'; CREATE VIEW pg_statio_user_indexes AS diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 857724d1153..799916398b7 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -56,6 +56,6 @@ */ /* 3yyymmddN */ -#define CATALOG_VERSION_NO 302412242 +#define CATALOG_VERSION_NO 302501601 #endif diff --git a/src/test/regress/input/pgstat_qd_tabstat.source b/src/test/regress/input/pgstat_qd_tabstat.source index f963ff1bdbf..0fb8c0ccb71 100644 --- a/src/test/regress/input/pgstat_qd_tabstat.source +++ b/src/test/regress/input/pgstat_qd_tabstat.source @@ -254,3 +254,24 @@ select n_tup_ins, n_tup_upd, n_tup_del, n_tup_hot_upd, n_live_tup, n_dead_tup, n select n_tup_ins, n_tup_upd, n_tup_del, n_tup_hot_upd, n_live_tup, n_dead_tup, n_mod_since_analyze from pg_stat_all_tables where relid = 'mt'::regclass; reset gp_autostats_mode; + + +-- Test stats creation for AO auxiliary tables +create table tabstat_ao(i int, j int) using ao_row distributed by (i); +create index on tabstat_ao(i); +insert into tabstat_ao select 1,1; +delete from tabstat_ao; +select pg_sleep(0.77); -- Force pgstat_report_stat() to send tabstat. + +select count(*) from pg_stat_all_tables +where + relid = (select segrelid from pg_appendonly where relid = 'tabstat_ao'::regclass) + OR relid = (select blkdirrelid from pg_appendonly where relid = 'tabstat_ao'::regclass) + OR relid = (select visimaprelid from pg_appendonly where relid = 'tabstat_ao'::regclass); + +select pg_sleep(0.77); -- Force pgstat_report_stat() to send tabstat. +select n_tup_ins from pg_stat_all_tables where relid = (select segrelid from pg_appendonly where relid = 'tabstat_ao'::regclass); +select n_tup_ins from pg_stat_all_tables where relid = (select blkdirrelid from pg_appendonly where relid = 'tabstat_ao'::regclass); +select n_tup_ins from pg_stat_all_tables where relid = (select visimaprelid from pg_appendonly where relid = 'tabstat_ao'::regclass); + +drop table tabstat_ao; diff --git a/src/test/regress/output/pgstat_qd_tabstat.source b/src/test/regress/output/pgstat_qd_tabstat.source index f1155e75839..1583cabf345 100644 --- a/src/test/regress/output/pgstat_qd_tabstat.source +++ b/src/test/regress/output/pgstat_qd_tabstat.source @@ -569,3 +569,49 @@ select n_tup_ins, n_tup_upd, n_tup_del, n_tup_hot_upd, n_live_tup, n_dead_tup, n (1 row) reset gp_autostats_mode; +-- Test stats creation for AO auxiliary tables +create table tabstat_ao(i int, j int) using ao_row distributed by (i); +create index on tabstat_ao(i); +insert into tabstat_ao select 1,1; +delete from tabstat_ao; +select pg_sleep(0.77); -- Force pgstat_report_stat() to send tabstat. 
+ pg_sleep 
+----------
+ 
+(1 row)
+
+select count(*) from pg_stat_all_tables
+where
+	relid = (select segrelid from pg_appendonly where relid = 'tabstat_ao'::regclass)
+	OR relid = (select blkdirrelid from pg_appendonly where relid = 'tabstat_ao'::regclass)
+	OR relid = (select visimaprelid from pg_appendonly where relid = 'tabstat_ao'::regclass);
+ count 
+-------
+     3
+(1 row)
+
+select pg_sleep(0.77); -- Force pgstat_report_stat() to send tabstat.
+ pg_sleep 
+----------
+ 
+(1 row)
+
+select n_tup_ins from pg_stat_all_tables where relid = (select segrelid from pg_appendonly where relid = 'tabstat_ao'::regclass);
+ n_tup_ins 
+-----------
+         1
+(1 row)
+
+select n_tup_ins from pg_stat_all_tables where relid = (select blkdirrelid from pg_appendonly where relid = 'tabstat_ao'::regclass);
+ n_tup_ins 
+-----------
+         1
+(1 row)
+
+select n_tup_ins from pg_stat_all_tables where relid = (select visimaprelid from pg_appendonly where relid = 'tabstat_ao'::regclass);
+ n_tup_ins 
+-----------
+         1
+(1 row)
+
+drop table tabstat_ao;

From 577b4e7690787b3659d33553de4a3664defafc6f Mon Sep 17 00:00:00 2001
From: Yini Li <90589201+yinil-hello@users.noreply.github.com>
Date: Fri, 18 Nov 2022 09:52:28 +0800
Subject: [PATCH 10/11] Declare gp_gettmid as an extern function (#14498)

The extension metrics collector needs gp_gettmid to get the timestamp.
---
 src/backend/executor/instrument.c | 4 ++--
 src/include/executor/instrument.h | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index ea879180e6a..9a57c680d9c 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -40,7 +40,6 @@ static bool shouldPickInstrInShmem(NodeTag tag);
 static Instrumentation *pickInstrFromShmem(const Plan *plan, int instrument_options);
 static void instrShmemRecycleCallback(ResourceReleasePhase phase,
 									  bool isCommit, bool isTopLevel, void *arg);
-static void gp_gettmid(int32* tmid);
 
 InstrumentationHeader *InstrumentGlobal = NULL;
 static int	scanNodeCounter = 0;
@@ -555,7 +554,8 @@ static int32 gp_gettmid_helper()
 /*
  * Wrapper for gp_gettmid_helper()
  */
-static void gp_gettmid(int32* tmid)
+void
+gp_gettmid(int32* tmid)
 {
 	int32 time = gp_gettmid_helper();
 	if (time == -1)
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index b3b050c0d54..78a81f709f6 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -174,6 +174,9 @@ extern Size InstrShmemSize(void);
 extern void InstrShmemInit(void);
 extern Instrumentation *GpInstrAlloc(const Plan *node, int instrument_options, bool async_mode);
 
+/* needed by metrics_collector */
+extern void gp_gettmid(int32*);
+
 /*
  * For each free slot in shmem, fill it with specific pattern
  * Use this pattern to detect the slot has been recycled.

From 8b9dc71c400911215322cd6cebb56c0949e48f15 Mon Sep 17 00:00:00 2001
From: Zhenghua Lyu
Date: Fri, 18 Nov 2022 08:50:01 +0800
Subject: [PATCH 11/11] Refactor SaveOidAssignments and RestoreOidAssignments
 logic.

The two functions SaveOidAssignments and RestoreOidAssignments should
come in pairs: before a procedure that must not touch the global vars
(dispatch_oids or preassigned_oids), we first save the oid
assignments, then do the job, and finally restore the oid assignments.
A typical usage looks like this:
A typical usage should be as below: List *l = SaveOidAssignments(); do_the_job(); RestoreOidAssignments(l); The global var dispatch_oids is only used on QD, and the global var preassigned_oids is only used on QEs. They are both Lists, in a specific memorycontext, normally the memorycontext will be reset at the end of transaction. Greenplum's MPP architecture need to make some OIDs consistent among coordinator and segments (like table OIDs). The oid assignments are generated on QD and then dispatched to QEs. A single SQL might involve sever dispatch events, for example, there are some functions involving SQLs and these functions are evaluated during planning stage before we dispatch the final Utility plan. We do not want to the dispatches during plannign stage to touch oid assignments. Another subtle case that the pair of functions are useful is that subtransaction abort will lead to reset of the oid assignments memory context. Subtransaction abort might happen for UDF with exception handling and nothing to do with the main statement needed to dispatch. That is why we deep copy the content to CurrentMemoryContext and reset oid assignment context during SaveOidAssignments and bring everything back during RestoreOidAssignments. This commit adds the two functions before eval_constant_expressions() to make sure the procedure does not touch oid assignments. Fix issue: https://github.com/greenplum-db/gpdb/issues/14465. --- src/backend/catalog/oid_dispatch.c | 83 ++++++++++++++++++- src/backend/commands/matview.c | 2 +- src/backend/optimizer/util/clauses.c | 9 +- src/include/catalog/oid_dispatch.h | 1 + src/test/regress/expected/oid_consistency.out | 14 ++++ src/test/regress/sql/oid_consistency.sql | 13 +++ 6 files changed, 119 insertions(+), 3 deletions(-) diff --git a/src/backend/catalog/oid_dispatch.c b/src/backend/catalog/oid_dispatch.c index 72a3d3ce6f7..22de9432eb4 100644 --- a/src/backend/catalog/oid_dispatch.c +++ b/src/backend/catalog/oid_dispatch.c @@ -202,10 +202,91 @@ ClearOidAssignmentsOnCommit(void) preserve_oids_on_commit = false; } +/* + * Comments for SaveOidAssignments and RestoreOidAssignments + * The two functions should come together, before some procedures + * that do not want to touch the global vars (dispatch_oids or preassigned_oids), + * we need to first save the oid assignments, and then do the job, finally + * restore oid assignments. A typical usage should be as below: + * List *l = SaveOidAssignments(); + * do_the_job(); + * RestoreOidAssignments(l); + * + * The global var dispatch_oids is only used on QD, and the global + * var preassigned_oids is only used on QEs. They are both Lists, + * in a specific memorycontext, normally the memorycontext will be + * reset at the end of transaction. + * + * Greenplum's MPP architecture need to make some OIDs consistent + * among coordinator and segments (like table OIDs). The oid assignments + * are generated on QD and then dispatched to QEs. A single SQL might + * involve sever dispatch events, for example, there are some functions + * involving SQLs and these functions are evaluated during planning stage + * before we dispatch the final Utility plan. We do not want to the dispatches + * during plannign stage to touch oid assignments. + * + * Another subtle case that the pair of functions are useful is that + * subtransaction abort will lead to reset of the oid assignments memory context. + * Subtransaction abort might happen for UDF with exception handling and nothing + * to do with the main statement needed to dispatch. 
That is why we deep copy + * the content to CurrentMemoryContext and reset oid assignment context during + * SaveOidAssignments and bring everything back during RestoreOidAssignments. + * + * Note: these two functions only do memory related operations when the gloabl + * vars are not empty. + */ +List * +SaveOidAssignments(void) +{ + List *l = NIL; + List *src = NIL; + + if (Gp_role == GP_ROLE_DISPATCH) + { + src = dispatch_oids; + dispatch_oids = NIL; + } + else if (Gp_role == GP_ROLE_EXECUTE) + { + src = preassigned_oids; + preassigned_oids = NIL; + } + else + return NIL; + + if (src == NIL) + return NIL; + + Assert(CurrentMemoryContext != get_oids_context()); + + l = copyObject(src); + MemoryContextReset(get_oids_context()); + return l; +} + void RestoreOidAssignments(List *oid_assignments) { - dispatch_oids = oid_assignments; + MemoryContext old; + List **target; + + if (oid_assignments == NIL) + return; + + if (Gp_role == GP_ROLE_DISPATCH) + target = &dispatch_oids; + else if (Gp_role == GP_ROLE_EXECUTE) + target = &preassigned_oids; + else + return; + + Assert(CurrentMemoryContext != get_oids_context()); + + old = MemoryContextSwitchTo(get_oids_context()); + *target = copyObject(oid_assignments); + MemoryContextSwitchTo(old); + + list_free_deep(oid_assignments); } /* ---------------------------------------------------------------- diff --git a/src/backend/commands/matview.c b/src/backend/commands/matview.c index 5065f063a12..8bd075e8706 100644 --- a/src/backend/commands/matview.c +++ b/src/backend/commands/matview.c @@ -785,7 +785,7 @@ refresh_matview_datafill(DestReceiver *dest, Query *query, * * See Github Issue for details: https://github.com/greenplum-db/gpdb/issues/11956 */ - List *saved_dispatch_oids = GetAssignedOidsForDispatch(); + List *saved_dispatch_oids = SaveOidAssignments(); /* Lock and rewrite, using a copy to preserve the original query. 
*/ copied_query = copyObject(query); diff --git a/src/backend/optimizer/util/clauses.c b/src/backend/optimizer/util/clauses.c index c79268a1648..b80ac211a9d 100644 --- a/src/backend/optimizer/util/clauses.c +++ b/src/backend/optimizer/util/clauses.c @@ -23,6 +23,7 @@ #include "postgres.h" #include "access/htup_details.h" +#include "catalog/oid_dispatch.h" #include "catalog/pg_aggregate.h" #include "catalog/pg_class.h" #include "catalog/pg_language.h" @@ -2293,6 +2294,8 @@ Node * eval_const_expressions(PlannerInfo *root, Node *node) { eval_const_expressions_context context; + Node *result; + List *saved_oid_assignments; if (root) context.boundParams = root->glob->boundParams; /* bound Params */ @@ -2307,7 +2310,11 @@ eval_const_expressions(PlannerInfo *root, Node *node) context.max_size = 0; context.eval_stable_functions = should_eval_stable_functions(root); - return eval_const_expressions_mutator(node, &context); + saved_oid_assignments = SaveOidAssignments(); + result = eval_const_expressions_mutator(node, &context); + RestoreOidAssignments(saved_oid_assignments); + + return result; } #define MIN_ARRAY_SIZE_FOR_HASHED_SAOP 9 diff --git a/src/include/catalog/oid_dispatch.h b/src/include/catalog/oid_dispatch.h index 0dbb8532511..fbb7a14f59e 100644 --- a/src/include/catalog/oid_dispatch.h +++ b/src/include/catalog/oid_dispatch.h @@ -128,6 +128,7 @@ extern void RememberPreassignedIndexNameForChildIndex(Oid parentIdxOid, Oid chil /* Functions used in master and QE nodes */ extern void PreserveOidAssignmentsOnCommit(void); extern void ClearOidAssignmentsOnCommit(void); +extern List * SaveOidAssignments(void); extern void RestoreOidAssignments(List *oid_assignments); /* Functions used in binary upgrade */ diff --git a/src/test/regress/expected/oid_consistency.out b/src/test/regress/expected/oid_consistency.out index 8073b97f07b..d26f3a6da37 100644 --- a/src/test/regress/expected/oid_consistency.out +++ b/src/test/regress/expected/oid_consistency.out @@ -797,3 +797,17 @@ select verify('trigger_oid'); 1 (1 row) +-- Case for Issue: https://github.com/greenplum-db/gpdb/issues/14465 +create function func_fail_14465(int) returns int + immutable language plpgsql as $$ +begin + perform unwanted_grant(); + raise warning 'owned'; + return 1; +exception when others then + return 2; +end$$; +create materialized view mv_14465 as select 1 as c; +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 'c' as the Greenplum Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +create index on mv_14465 (c) where func_fail_14465(1) > 0; diff --git a/src/test/regress/sql/oid_consistency.sql b/src/test/regress/sql/oid_consistency.sql index fa0f8c58333..4e775c19f10 100644 --- a/src/test/regress/sql/oid_consistency.sql +++ b/src/test/regress/sql/oid_consistency.sql @@ -434,3 +434,16 @@ $$ language plpgsql no sql; create trigger troid_trigger after insert on trigger_oid for each row execute procedure trig_func(); select verify('trigger_oid'); + +-- Case for Issue: https://github.com/greenplum-db/gpdb/issues/14465 +create function func_fail_14465(int) returns int + immutable language plpgsql as $$ +begin + perform unwanted_grant(); + raise warning 'owned'; + return 1; +exception when others then + return 2; +end$$; +create materialized view mv_14465 as select 1 as c; +create index on mv_14465 (c) where func_fail_14465(1) > 0;
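
A minimal sketch of the calling pattern that the PATCH 11/11 commit
message describes, written out as a backend caller might use it. This
is not part of any patch above: it assumes compilation inside the GPDB
backend, and evaluate_helper_sql() is a hypothetical stand-in for any
side work that may dispatch SQL internally, not a real GPDB symbol.

    #include "postgres.h"
    #include "catalog/oid_dispatch.h"	/* SaveOidAssignments(), RestoreOidAssignments() */
    #include "nodes/pg_list.h"

    /* Hypothetical side work; stands in for anything that may dispatch SQL. */
    static void
    evaluate_helper_sql(void)
    {
    }

    static void
    run_side_work_preserving_oid_assignments(void)
    {
    	/*
    	 * SaveOidAssignments() deep-copies any pending assignments into
    	 * CurrentMemoryContext and resets the oids context, so a
    	 * subtransaction abort inside the side work cannot clobber them.
    	 */
    	List	   *saved = SaveOidAssignments();

    	evaluate_helper_sql();

    	/*
    	 * RestoreOidAssignments() copies the saved list back into the oids
    	 * context and frees the temporary copy; both calls are no-ops when
    	 * nothing was pending.
    	 */
    	RestoreOidAssignments(saved);
    }

The pairing matters: SaveOidAssignments() empties the global list, so
skipping the restore would silently drop the OID assignments that the
main statement still needs to dispatch.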