diff --git a/gpcontrib/gp_internal_tools/gp_ao_co_diagnostics.c b/gpcontrib/gp_internal_tools/gp_ao_co_diagnostics.c index 2579075ef96..807e52ceaa2 100644 --- a/gpcontrib/gp_internal_tools/gp_ao_co_diagnostics.c +++ b/gpcontrib/gp_internal_tools/gp_ao_co_diagnostics.c @@ -48,6 +48,9 @@ gp_aoseg(PG_FUNCTION_ARGS); extern Datum gp_aocsseg_history(PG_FUNCTION_ARGS); +extern Datum +gp_aoblkdir(PG_FUNCTION_ARGS); + extern Datum gp_aovisimap(PG_FUNCTION_ARGS); @@ -67,6 +70,7 @@ PG_FUNCTION_INFO_V1(gp_aoseg_history_wrapper); PG_FUNCTION_INFO_V1(gp_aoseg_wrapper); PG_FUNCTION_INFO_V1(gp_aocsseg_wrapper); PG_FUNCTION_INFO_V1(gp_aocsseg_history_wrapper); +PG_FUNCTION_INFO_V1(gp_aoblkdir_wrapper); PG_FUNCTION_INFO_V1(gp_aovisimap_wrapper); PG_FUNCTION_INFO_V1(gp_aovisimap_entry_wrapper); PG_FUNCTION_INFO_V1(gp_aovisimap_hidden_info_wrapper); @@ -84,6 +88,8 @@ gp_aocsseg_wrapper(PG_FUNCTION_ARGS); extern Datum gp_aocsseg_history_wrapper(PG_FUNCTION_ARGS); extern Datum +gp_aoblkdir_wrapper(PG_FUNCTION_ARGS); +extern Datum gp_aovisimap_wrapper(PG_FUNCTION_ARGS); extern Datum gp_aovisimap_entry_wrapper(PG_FUNCTION_ARGS); @@ -228,6 +234,21 @@ gp_aocsseg_history_wrapper(PG_FUNCTION_ARGS) PG_RETURN_DATUM(returnValue); } +/* + * Interface to gp_aoblkdir_wrapper function. + * + * CREATE FUNCTION gp_aoblkdir_wrapper(regclass) RETURNS TABLE + * (segno integer, columngroup_no integer, first_row_no bigint, file_offset bigint, row_count bigint) + * AS '$libdir/gp_ao_co_diagnostics.so', 'gp_aoblkdir_wrapper' LANGUAGE C STRICT; + */ +Datum +gp_aoblkdir_wrapper(PG_FUNCTION_ARGS) +{ + Datum returnValue = gp_aoblkdir(fcinfo); + + PG_RETURN_DATUM(returnValue); +} + /* * Interface to gp_aovisimap_wrapper function. 
* diff --git a/src/backend/access/aocs/aocs_compaction.c b/src/backend/access/aocs/aocs_compaction.c index bb801acfeb2..e80887af9ba 100644 --- a/src/backend/access/aocs/aocs_compaction.c +++ b/src/backend/access/aocs/aocs_compaction.c @@ -282,6 +282,14 @@ AOCSSegmentFileFullCompaction(Relation aorel, estate->es_opened_result_relations = lappend(estate->es_opened_result_relations, resultRelInfo); + /* + * We don't want uniqueness checks to be performed while "insert"ing tuples + * to a destination segfile during AOCSMoveTuple(). This is to ensure that + * we can avoid spurious conflicts between the moved tuple and the original + * tuple. + */ + estate->gp_bypass_unique_check = true; + while (aocs_getnext(scanDesc, ForwardScanDirection, slot)) { CHECK_FOR_INTERRUPTS(); diff --git a/src/backend/access/aocs/aocsam.c b/src/backend/access/aocs/aocsam.c index 267dd837697..2e580a7f23a 100644 --- a/src/backend/access/aocs/aocsam.c +++ b/src/backend/access/aocs/aocsam.c @@ -868,7 +868,7 @@ aocs_getnext(AOCSScanDesc scan, ScanDirection direction, TupleTableSlot *slot) /* * Perform any required upgrades on the Datum we just fetched. */ - if (curseginfo->formatversion < AORelationVersion_GetLatest()) + if (curseginfo->formatversion < AOSegfileFormatVersion_GetLatest ()) { upgrade_datum_scan(scan, attno, d, null, curseginfo->formatversion); @@ -1272,6 +1272,10 @@ positionSkipCurrentBlock(DatumStreamFetchDesc datumStreamFetchDesc) datumStreamFetchDesc->currentBlock.lastRowNum + 1; } +/* + * Fetch the tuple's datum from the block indicated by the block directory entry + * that covers the tuple, given the colno. + */ static void fetchFromCurrentBlock(AOCSFetchDesc aocsFetchDesc, int64 rowNum, @@ -1313,7 +1317,7 @@ fetchFromCurrentBlock(AOCSFetchDesc aocsFetchDesc, /* * Perform any required upgrades on the Datum we just fetched. 
*/ - if (formatversion < AORelationVersion_GetLatest()) + if (formatversion < AOSegfileFormatVersion_GetLatest ()) { upgrade_datum_fetch(aocsFetchDesc, colno, values, nulls, formatversion); @@ -1331,14 +1335,49 @@ scanToFetchValue(AOCSFetchDesc aocsFetchDesc, TupleTableSlot *slot, int colno) { - DatumStreamFetchDesc datumStreamFetchDesc = aocsFetchDesc->datumStreamFetchDesc[colno]; - DatumStreamRead *datumStream = datumStreamFetchDesc->datumStream; - bool found; + DatumStreamFetchDesc datumStreamFetchDesc = aocsFetchDesc->datumStreamFetchDesc[colno]; + DatumStreamRead *datumStream = datumStreamFetchDesc->datumStream; + CurrentBlock *currentBlock = &datumStreamFetchDesc->currentBlock; + AppendOnlyBlockDirectoryEntry *entry = ¤tBlock->blockDirectoryEntry; + bool found; found = datumstreamread_find_block(datumStream, datumStreamFetchDesc, rowNum); - if (found) + if (!found) + { + if (AppendOnlyBlockDirectoryEntry_RangeHasRow(entry, rowNum)) + { + /* + * We fell into a hole inside the resolved block directory entry + * we obtained from AppendOnlyBlockDirectory_GetEntry(). + * This should not be happening for versions >= CB2. Scream + * appropriately. See AppendOnlyBlockDirectoryEntry for details. + */ + ereportif(AORelationVersion_Get(aocsFetchDesc->relation) >= AORelationVersion_CB2, + ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("datum with row number %ld and col no %d not found in block directory entry range", rowNum, colno), + errdetail("block directory entry: (fileOffset = %ld, firstRowNum = %ld, " + "afterFileOffset = %ld, lastRowNum = %ld)", + entry->range.fileOffset, + entry->range.firstRowNum, + entry->range.afterFileOffset, + entry->range.lastRowNum))); + } + else + { + /* + * The resolved block directory entry we obtained from + * AppendOnlyBlockDirectory_GetEntry() has range s.t. 
+ * firstRowNum < lastRowNum < rowNum + * This can happen when rowNum maps to an aborted transaction, and + * we find an earlier committed block directory row due to the + * <= scan condition in AppendOnlyBlockDirectory_GetEntry(). + */ + } + } + else fetchFromCurrentBlock(aocsFetchDesc, rowNum, slot, colno); return found; @@ -1412,6 +1451,11 @@ openFetchSegmentFile(AOCSFetchDesc aocsFetchDesc, return true; } +/* + * Note: we don't reset the block directory entry here. This is crucial, so we + * can use the block directory entry later on. See comment in AOFetchBlockMetadata + * FIXME: reset other fields here. + */ static void resetCurrentBlockInfo(CurrentBlock *currentBlock) { @@ -1815,119 +1859,6 @@ aocs_fetch_finish(AOCSFetchDesc aocsFetchDesc) AppendOnlyVisimap_Finish(&aocsFetchDesc->visibilityMap, AccessShareLock); } -typedef struct AOCSUpdateDescData -{ - AOCSInsertDesc insertDesc; - - /* - * visibility map - */ - AppendOnlyVisimap visibilityMap; - - /* - * Visimap delete support structure. 
Used to handle out-of-order deletes - */ - AppendOnlyVisimapDelete visiMapDelete; - -} AOCSUpdateDescData; - -AOCSUpdateDesc -aocs_update_init(Relation rel, int segno) -{ - Oid visimaprelid; - Oid visimapidxid; - AOCSUpdateDesc desc = (AOCSUpdateDesc) palloc0(sizeof(AOCSUpdateDescData)); - - desc->insertDesc = aocs_insert_init(rel, segno); - - GetAppendOnlyEntryAuxOids(rel->rd_id, - desc->insertDesc->appendOnlyMetaDataSnapshot, - NULL, NULL, NULL, - &visimaprelid, &visimapidxid); - AppendOnlyVisimap_Init(&desc->visibilityMap, - visimaprelid, - visimapidxid, - RowExclusiveLock, - desc->insertDesc->appendOnlyMetaDataSnapshot); - - AppendOnlyVisimapDelete_Init(&desc->visiMapDelete, - &desc->visibilityMap); - - return desc; -} - -void -aocs_update_finish(AOCSUpdateDesc desc) -{ - Assert(desc); - - AppendOnlyVisimapDelete_Finish(&desc->visiMapDelete); - - aocs_insert_finish(desc->insertDesc, NULL); - desc->insertDesc = NULL; - - /* Keep lock until the end of transaction */ - AppendOnlyVisimap_Finish(&desc->visibilityMap, NoLock); - - pfree(desc); -} - -TM_Result -aocs_update(AOCSUpdateDesc desc, TupleTableSlot *slot, - AOTupleId *oldTupleId, AOTupleId *newTupleId) -{ - TM_Result result; - - Assert(desc); - Assert(oldTupleId); - Assert(newTupleId); - -#ifdef FAULT_INJECTOR - FaultInjector_InjectFaultIfSet( - "appendonly_update", - DDLNotSpecified, - "", //databaseName - RelationGetRelationName(desc->insertDesc->aoi_rel)); - /* tableName */ -#endif - - result = AppendOnlyVisimapDelete_Hide(&desc->visiMapDelete, oldTupleId); - if (result != TM_Ok) - return result; - - slot_getallattrs(slot); - aocs_insert_values(desc->insertDesc, - slot->tts_values, slot->tts_isnull, - newTupleId); - - return result; -} - - -/* - * AOCSDeleteDescData is used for delete data from AOCS relations. - * It serves an equivalent purpose as AppendOnlyScanDescData - * (relscan.h) only that the later is used for scanning append-only - * relations. 
- */ -typedef struct AOCSDeleteDescData -{ - /* - * Relation to delete from - */ - Relation aod_rel; - - /* - * visibility map - */ - AppendOnlyVisimap visibilityMap; - - /* - * Visimap delete support structure. Used to handle out-of-order deletes - */ - AppendOnlyVisimapDelete visiMapDelete; - -} AOCSDeleteDescData; /* @@ -2186,7 +2117,7 @@ aocs_addcol_newsegfile(AOCSAddColumnDesc desc, int version; /* Always write in the latest format */ - version = AORelationVersion_GetLatest(); + version = AOSegfileFormatVersion_GetLatest(); FormatAOSegmentFileName(basepath, seginfo->segno, colno, &fileSegNo, fn); @@ -2674,7 +2605,7 @@ aocs_getnext_sample(AOCSScanDesc scan, ScanDirection direction, TupleTableSlot * /* * Perform any required upgrades on the Datum we just fetched. */ - if (curseginfo->formatversion < AORelationVersion_GetLatest()) + if (curseginfo->formatversion < AOSegfileFormatVersion_GetLatest ()) { upgrade_datum_scan(scan, attno, d, null, curseginfo->formatversion); diff --git a/src/backend/access/aocs/aocsam_handler.c b/src/backend/access/aocs/aocsam_handler.c index 5e20b685d20..4c6bd38e778 100644 --- a/src/backend/access/aocs/aocsam_handler.c +++ b/src/backend/access/aocs/aocsam_handler.c @@ -90,6 +90,7 @@ typedef struct AOCODMLState AOCSInsertDesc insertDesc; dlist_head head; // Head of multiple segment files insertion list. 
AOCSDeleteDesc deleteDesc; + AOCSUniqueCheckDesc uniqueCheckDesc; } AOCODMLState; static void reset_state_cb(void *arg); @@ -189,6 +190,7 @@ enter_dml_state(const Oid relationOid) state->insertDesc = NULL; state->deleteDesc = NULL; + state->uniqueCheckDesc = NULL; dlist_init(&state->head); Assert(!found); @@ -271,6 +273,7 @@ void aoco_dml_finish(Relation relation, CmdType operation) { AOCODMLState *state; + bool had_delete_desc = false; state = remove_dml_state(RelationGetRelid(relation)); @@ -289,6 +292,8 @@ aoco_dml_finish(Relation relation, CmdType operation) */ if (!state->insertDesc) AORelIncrementModCount(relation); + + had_delete_desc = true; } if (state->insertDesc) @@ -298,6 +303,29 @@ aoco_dml_finish(Relation relation, CmdType operation) state->insertDesc = NULL; } + if (state->uniqueCheckDesc) + { + /* clean up the block directory */ + AppendOnlyBlockDirectory_End_forUniqueChecks(state->uniqueCheckDesc->blockDirectory); + pfree(state->uniqueCheckDesc->blockDirectory); + state->uniqueCheckDesc->blockDirectory = NULL; + + /* + * If this fetch is a part of an update, then we have been reusing the + * visimap used by the delete half of the update, which would have + * already been cleaned up above. Clean up otherwise. 
+ */ + if (!had_delete_desc) + { + AppendOnlyVisimap_Finish_forUniquenessChecks(state->uniqueCheckDesc->visimap); + pfree(state->uniqueCheckDesc->visimap); + } + state->uniqueCheckDesc->visimap = NULL; + + pfree(state->uniqueCheckDesc); + state->uniqueCheckDesc = NULL; + } + } /* @@ -318,7 +346,8 @@ get_insert_descriptor(const Relation relation) oldcxt = MemoryContextSwitchTo(aocoLocal.stateCxt); state->insertDesc = aocs_insert_init(relation, - ChooseSegnoForWrite(relation)); + ChooseSegnoForWrite(relation)); + dlist_init(&state->head); dlist_head *head = &state->head; dlist_push_tail(head, &state->insertDesc->node); @@ -335,6 +364,18 @@ get_insert_descriptor(const Relation relation) } list_free(segments); } + + //* mark all insertDesc placeholderInserted with false */ + if (relationHasUniqueIndex(relation)) + { + dlist_iter iter; + dlist_foreach(iter, head) + { + AOCSInsertDesc insertDesc = (AOCSInsertDesc)dlist_container(AOCSInsertDescData, node, iter.cur); + insertDesc->placeholderInserted = false; + } + } + MemoryContextSwitchTo(oldcxt); } @@ -349,7 +390,47 @@ get_insert_descriptor(const Relation relation) state->insertDesc = next; } + /* + * If we have a unique index, insert a placeholder block directory row to + * entertain uniqueness checks from concurrent inserts. See + * AppendOnlyBlockDirectory_InsertPlaceholder() for details. + * + * Note: For AOCO tables, we need to only insert a placeholder block + * directory row for the 1st non-dropped column. This is because + * during a uniqueness check, only the first non-dropped column's block + * directory entry is consulted. 
(See AppendOnlyBlockDirectory_CoversTuple()) + */ + if (relationHasUniqueIndex(relation) && !state->insertDesc->placeholderInserted) + { + int firstNonDroppedColumn = -1; + int64 firstRowNum; + DatumStreamWrite *dsw; + BufferedAppend *bufferedAppend; + int64 fileOffset; + AOCSInsertDesc insertDesc; + + for(int i = 0; i < relation->rd_att->natts; i++) + { + if (!relation->rd_att->attrs[i].attisdropped) { + firstNonDroppedColumn = i; + break; + } + } + Assert(firstNonDroppedColumn != -1); + + insertDesc = state->insertDesc; + dsw = insertDesc->ds[firstNonDroppedColumn]; + firstRowNum = dsw->blockFirstRowNum; + bufferedAppend = &dsw->ao_write.bufferedAppend; + fileOffset = BufferedAppendNextBufferPosition(bufferedAppend); + + AppendOnlyBlockDirectory_InsertPlaceholder(&insertDesc->blockDirectory, + firstRowNum, + fileOffset, + firstNonDroppedColumn); + insertDesc->placeholderInserted = true; + } return state->insertDesc; } @@ -392,6 +473,50 @@ get_delete_descriptor(const Relation relation, bool forUpdate) return state->deleteDesc; } +static AOCSUniqueCheckDesc +get_or_create_unique_check_desc(Relation relation, Snapshot snapshot) +{ + AOCODMLState *state = find_dml_state(RelationGetRelid(relation)); + + if (!state->uniqueCheckDesc) + { + MemoryContext oldcxt; + AOCSUniqueCheckDesc uniqueCheckDesc; + + oldcxt = MemoryContextSwitchTo(aocoLocal.stateCxt); + uniqueCheckDesc = palloc0(sizeof(AOCSUniqueCheckDescData)); + + /* Initialize the block directory */ + uniqueCheckDesc->blockDirectory = palloc0(sizeof(AppendOnlyBlockDirectory)); + AppendOnlyBlockDirectory_Init_forUniqueChecks(uniqueCheckDesc->blockDirectory, + relation, + relation->rd_att->natts, /* numColGroups */ + snapshot); + /* + * If this is part of an update, we need to reuse the visimap used by + * the delete half of the update. This is to avoid spurious conflicts + * when the key's previous and new value are identical. 
Using the + * visimap from the delete half ensures that the visimap can recognize + * any tuples deleted by us prior to this insert, within this command. + */ + if (state->deleteDesc) + uniqueCheckDesc->visimap = &state->deleteDesc->visibilityMap; + else + { + /* Initialize the visimap */ + uniqueCheckDesc->visimap = palloc0(sizeof(AppendOnlyVisimap)); + AppendOnlyVisimap_Init_forUniqueCheck(uniqueCheckDesc->visimap, + relation, + snapshot); + } + + state->uniqueCheckDesc = uniqueCheckDesc; + MemoryContextSwitchTo(oldcxt); + } + + return state->uniqueCheckDesc; +} + /* * AO_COLUMN access method uses virtual tuples */ @@ -705,6 +830,7 @@ aoco_index_fetch_tuple(struct IndexFetchTableData *scan, bool *call_again, bool *all_dead) { IndexFetchAOCOData *aocoscan = (IndexFetchAOCOData *) scan; + bool found = false; if (!aocoscan->aocofetch) { @@ -745,10 +871,129 @@ aoco_index_fetch_tuple(struct IndexFetchTableData *scan, if (aocs_fetch(aocoscan->aocofetch, (AOTupleId *) tid, slot)) { ExecStoreVirtualTuple(slot); - return true; + found = true; } - return false; + /* + * Currently, we don't determine this parameter. By contract, it is to be + * set to true iff we can determine that this row is dead to all + * transactions. Failure to set this will lead to use of a garbage value + * in certain code, such as that for unique index checks. + * This is typically used for HOT chains, which we don't support. + */ + if (all_dead) + *all_dead = false; + + /* Currently, we don't determine this parameter. By contract, it is to be + * set to true iff there is another tuple for the tid, so that we can prompt + * the caller to call index_fetch_tuple() again for the same tid. + * This is typically used for HOT chains, which we don't support. + */ + if (call_again) + *call_again = false; + + return found; +} + +/* + * Check if a visible tuple exists given the tid and a snapshot. This is + * currently used to determine uniqueness checks. 
+ * + * We determine existence simply by checking if a *visible* block directory + * entry covers the given tid. + * + * There is no need to fetch the tuple (we actually can't reliably do so as + * we might encounter a placeholder row in the block directory) + * + * If no visible block directory entry exists, we are done. If it does, we need + * to further check the visibility of the tuple itself by consulting the visimap. + * Now, the visimap check can be skipped if the tuple was found to have been + * inserted by a concurrent in-progress transaction, in which case we return + * true and have the xwait machinery kick in. + */ +static bool +aoco_index_unique_check(Relation rel, + ItemPointer tid, + Snapshot snapshot, + bool *all_dead) +{ + AOCSUniqueCheckDesc uniqueCheckDesc; + AOTupleId *aoTupleId = (AOTupleId *) tid; + bool visible; + +#ifdef USE_ASSERT_CHECKING + int segmentFileNum = AOTupleIdGet_segmentFileNum(aoTupleId); + int64 rowNum = AOTupleIdGet_rowNum(aoTupleId); + + Assert(segmentFileNum != InvalidFileSegNumber); + Assert(rowNum != InvalidAORowNum); + /* + * Since this can only be called in the context of a unique index check, the + * snapshots that are supplied can only be non-MVCC snapshots: SELF and DIRTY. + */ + Assert(snapshot->snapshot_type == SNAPSHOT_SELF || + snapshot->snapshot_type == SNAPSHOT_DIRTY); +#endif + + /* + * Currently, we don't determine this parameter. By contract, it is to be + * set to true iff we can determine that this row is dead to all + * transactions. Failure to set this will lead to use of a garbage value + * in certain code, such as that for unique index checks. + * This is typically used for HOT chains, which we don't support. + */ + if (all_dead) + *all_dead = false; + + /* + * FIXME: for when we want CREATE UNIQUE INDEX CONCURRENTLY to work + * Unique constraint violation checks with SNAPSHOT_SELF are currently + * required to support CREATE UNIQUE INDEX CONCURRENTLY. 
Currently, the + * sole placeholder row inserted at first insert might not be visible to + * the snapshot, if it was already updated by its actual first row. So, + * we would need to flush a placeholder row at the beginning of each new + * in-memory minipage. Currently, CREATE INDEX CONCURRENTLY isn't + * supported, so we assume such a check satisfies SNAPSHOT_SELF. + */ + if (snapshot->snapshot_type == SNAPSHOT_SELF) + return true; + + uniqueCheckDesc = get_or_create_unique_check_desc(rel, snapshot); + + /* First, scan the block directory */ + if (!AppendOnlyBlockDirectory_UniqueCheck(uniqueCheckDesc->blockDirectory, + aoTupleId, + snapshot)) + return false; + + /* + * If the xmin or xmax are set for the dirty snapshot, after the block + * directory is scanned with the snapshot, it means that there is a + * concurrent in-progress transaction inserting the tuple. So, return true + * and have the xwait machinery kick in. + */ + Assert(snapshot->snapshot_type == SNAPSHOT_DIRTY); + if (TransactionIdIsValid(snapshot->xmin) || TransactionIdIsValid(snapshot->xmax)) + return true; + + /* Now, consult the visimap */ + visible = AppendOnlyVisimap_UniqueCheck(uniqueCheckDesc->visimap, + aoTupleId, + snapshot); + + /* + * Since we disallow deletes and updates running in parallel with inserts, + * there is no way that the dirty snapshot has it's xmin and xmax populated + * after the visimap has been scanned with it. + * + * Note: we disallow it by grabbing an ExclusiveLock on the QD (See + * CdbTryOpenTable()). So if we are running in utility mode, there is no + * such restriction. 
+ */ + AssertImply(Gp_role != GP_ROLE_UTILITY, + (!TransactionIdIsValid(snapshot->xmin) && !TransactionIdIsValid(snapshot->xmax))); + + return visible; } static void @@ -1964,6 +2209,7 @@ static TableAmRoutine ao_column_methods = { .index_fetch_reset = aoco_index_fetch_reset, .index_fetch_end = aoco_index_fetch_end, .index_fetch_tuple = aoco_index_fetch_tuple, + .index_unique_check = aoco_index_unique_check, .tuple_insert = aoco_tuple_insert, .tuple_insert_speculative = aoco_tuple_insert_speculative, diff --git a/src/backend/access/aocs/aocssegfiles.c b/src/backend/access/aocs/aocssegfiles.c index 9f389f94cd6..be273dee718 100644 --- a/src/backend/access/aocs/aocssegfiles.c +++ b/src/backend/access/aocs/aocssegfiles.c @@ -84,7 +84,7 @@ InsertInitialAOCSFileSegInfo(Relation prel, int32 segno, int32 nvp, Oid segrelid ValidateAppendonlySegmentDataBeforeStorage(segno); /* New segments are always created in the latest format */ - formatVersion = AORelationVersion_GetLatest(); + formatVersion = AOSegfileFormatVersion_GetLatest(); segrel = heap_open(segrelid, RowExclusiveLock); @@ -665,7 +665,7 @@ ClearAOCSFileSegInfo(Relation prel, int segno) repl[Anum_pg_aocs_varblockcount - 1] = true; /* When the segment is later recreated, it will be in new format */ - d[Anum_pg_aocs_formatversion - 1] = Int16GetDatum(AORelationVersion_GetLatest()); + d[Anum_pg_aocs_formatversion - 1] = Int16GetDatum(AOSegfileFormatVersion_GetLatest()); repl[Anum_pg_aocs_formatversion - 1] = true; /* We do not reset the modcount here */ diff --git a/src/backend/access/appendonly/Makefile b/src/backend/access/appendonly/Makefile index 7b61f42a707..430e397f2fa 100755 --- a/src/backend/access/appendonly/Makefile +++ b/src/backend/access/appendonly/Makefile @@ -15,7 +15,7 @@ OBJS = appendonlyam_handler.o appendonlyam.o aosegfiles.o aomd.o \ appendonlyblockdirectory.o appendonly_visimap.o \ appendonly_visimap_entry.o appendonly_visimap_store.o \ appendonly_compaction.o appendonly_visimap_udf.o \ - 
aomd_filehandler.o + appendonly_blkdir_udf.o aomd_filehandler.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/appendonly/README.md b/src/backend/access/appendonly/README.md index 6d43acf0dfa..31fb78f1a75 100644 --- a/src/backend/access/appendonly/README.md +++ b/src/backend/access/appendonly/README.md @@ -178,3 +178,67 @@ Vacuum drop phase, to recycle segments that have been compacted, checks the xmin of each AWAITING_DROP segment. If it's visible to everyone, the segfile is recycled. It uses the relation extension lock to protect the scan over pg_aoseg. + + +# Unique indexes + +To answer uniqueness checks for AO/AOCO tables, we have a complication. Unlike +heap, in AO/CO we don't store the xmin/xmax fields in the tuples. So, we have to +rely on block directory rows that "cover" the data rows to satisfy index lookups. +Since the block directory is maintained as a heap table, visibility checks on it +are identical to any other heap table: the xmin/xmax of the block directory +row(s) will be leveraged. This means we don't have to write any special +visibility checking code ourselves, nor do we need to worry about transactions +vs subtransactions. + +Since block directory rows are written usually much after the data row has been +inserted, there are windows in which there is no block directory row on disk +for a given data row - a problem for concurrent unique index checks. So during +INSERT/COPY, at the beginning of the insertion operation, we insert a +placeholder block directory row to cover ALL future tuples going to the current +segment file for this command. + +To answer unique index lookups, we don't have to physically fetch the tuple from +the table. This is key to answering unique index lookups against placeholder +rows which predate their corresponding data rows. We simply perform a sysscan of +the block directory, and if we have a visible entry that encompasses the rowNum +being looked up, we go on to the next check. 
Otherwise, we have no conflict and +return. The next check that we need to perform is against the visimap, to see if +the tuple is visible. If yes, then we have a conflict. Since the snapshot used +to perform uniqueness checks for AO/CO is SNAPSHOT_DIRTY (we currently don't +support SNAPSHOT_SELF used for CREATE UNIQUE INDEX CONCURRENTLY), it is possible +to detect if the block directory tuple (and by extension the data tuple) was +inserted by a concurrent in-progress transaction. In this case, we simply avoid +the visimap check and return true. The benefit of performing the sysscan on the +block directory is that HeapTupleSatisfiesDirty() is called, and in the process, +the snapshot's xmin and/or xmax fields are updated (see SNAPSHOT_DIRTY for +details on its special contract). Returning true in this situation will lead to +the unique index code's xwait mechanism to kick in (see _bt_check_unique()) and +the current transaction will wait for the one that inserted the tuple to commit +or abort. + +Tableam changes: Since there is a lot of overhead (leads to ~20x performance +degradation in the worst case) in setting up and tearing down scan descriptors +for AO/CO tables, we avoid the scanbegin..fetch..scanend construct in +table_index_fetch_tuple_check(). + +So, a new tableam API index_unique_check() is used, which is implemented +only for AO/CO tables. Here, we fetch a UniqueCheckDesc, which stores all the +in-memory state to help us perform a unique index check. This descriptor is +attached to the DMLState structs. Currently, the descriptor holds only a block +directory struct. It will be modified later on to hold a visimap reference to +help implement DELETEs/UPDATEs. Furthermore, we initialize this struct on the +first unique index check performed, akin to how we initialize descriptors for +insert and delete. + +AO lazy VACUUM is different from heap vacuum in the sense that ctids of data +tuples change (and the index tuples need to be updated as a consequence). 
It +leverages the scan and insert code to scan live tuples from each segfile and to +move (insert) them in a target segfile. While moving tuples, we need to avoid +performing uniqueness checks from the insert machinery. This is to ensure that +we avoid spurious conflicts between the moved tuple and the original tuple. We +don't need to insert a placeholder row for the backend running vacuum as the old +index entries will still point to the segment being compacted. This will be the +case up until the index entries are bulk deleted, but by then the new index +entries along with new block directory rows would already have been written and +would be able to answer uniqueness checks. diff --git a/src/backend/access/appendonly/aosegfiles.c b/src/backend/access/appendonly/aosegfiles.c index 2e651099cb8..1d832d7e0be 100644 --- a/src/backend/access/appendonly/aosegfiles.c +++ b/src/backend/access/appendonly/aosegfiles.c @@ -106,7 +106,7 @@ InsertInitialSegnoEntry(Relation parentrel, int segno) ValidateAppendonlySegmentDataBeforeStorage(segno); /* New segments are always created in the latest format */ - formatVersion = AORelationVersion_GetLatest(); + formatVersion = AOSegfileFormatVersion_GetLatest(); GetAppendOnlyEntryAuxOids(parentrel->rd_id, NULL, &segrelid, NULL, NULL, NULL, NULL); @@ -302,7 +302,7 @@ GetFileSegInfo(Relation parentrel, Snapshot appendOnlyMetaDataSnapshot, int segn ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("got invalid formatversion value: NULL"))); - AORelationVersion_CheckValid(fsinfo->formatversion); + AOSegfileFormatVersion_CheckValid(fsinfo->formatversion); /* get the state */ fsinfo->state = DatumGetInt16( @@ -494,7 +494,7 @@ GetAllFileSegInfo_pg_aoseg_rel(char *relationName, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("got invalid formatversion value: NULL"))); - AORelationVersion_CheckValid(formatversion); + AOSegfileFormatVersion_CheckValid(formatversion); oneseginfo->formatversion = DatumGetInt16(formatversion); /* get the state 
*/ @@ -665,7 +665,7 @@ ClearFileSegInfo(Relation parentrel, int segno) new_record_repl[Anum_pg_aoseg_eofuncompressed - 1] = true; /* When the segment is later recreated, it will be in new format */ - new_record[Anum_pg_aoseg_formatversion - 1] = Int16GetDatum(AORelationVersion_GetLatest()); + new_record[Anum_pg_aoseg_formatversion - 1] = Int16GetDatum(AOSegfileFormatVersion_GetLatest()); new_record_repl[Anum_pg_aoseg_formatversion - 1] = true; /* We do not reset the modcount here */ diff --git a/src/backend/access/appendonly/appendonly_blkdir_udf.c b/src/backend/access/appendonly/appendonly_blkdir_udf.c new file mode 100644 index 00000000000..2c807b58fb0 --- /dev/null +++ b/src/backend/access/appendonly/appendonly_blkdir_udf.c @@ -0,0 +1,210 @@ +/*------------------------------------------------------------------------------ + * + * AppendOnly_Blkdir UDFs + * User-defined functions (UDF) for support of append-only block directory + * + * Copyright (c) 2013-Present VMware, Inc. or its affiliates. + * + * + * IDENTIFICATION + * src/backend/access/appendonly/appendonly_blkdir_udf.c + * + *------------------------------------------------------------------------------ + */ + +#include "postgres.h" + +#include "access/appendonly_visimap.h" +#include "access/table.h" +#include "catalog/aoblkdir.h" +#include "cdb/cdbappendonlyblockdirectory.h" +#include "cdb/cdbvars.h" +#include "funcapi.h" +#include "utils/snapmgr.h" + +Datum gp_aoblkdir(PG_FUNCTION_ARGS); + +/* + * This UDF emits block directory entries for an AO/AOCO relation. It does so + * by flattening the minipage column of ao_blkdir relations, yielding 1 minipage + * entry / output row. + * + * Format: + * tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count + * + * This UDF also respects gp_select_invisible to report block directory entries + * that are invisible. 
To determine invisible entries we can use the tupleid + * projected here and tie it to the corresponding pg_aoblkdir tuple's xmax. + */ + +Datum +gp_aoblkdir(PG_FUNCTION_ARGS) +{ + Oid aoRelOid = PG_GETARG_OID(0); + HeapTuple tuple; + + typedef struct Context + { + Relation aorel; + SysScanDesc scan; + MinipagePerColumnGroup currMinipage; + bool currMinipageValid; + int currMinipageEntryIdx; + Relation blkdirrel; + } Context; + + FuncCallContext *funcctx; + Context *context; + + if (SRF_IS_FIRSTCALL()) + { + TupleDesc tupdesc; + MemoryContext oldcontext; + Snapshot sst; + Oid blkdirrelid; + + /* create a function context for cross-call persistence */ + funcctx = SRF_FIRSTCALL_INIT(); + + /* + * switch to memory context appropriate for multiple function calls + */ + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + /* build tupdesc for result tuples */ + tupdesc = CreateTemplateTupleDesc(7); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "tupleid", + TIDOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "segno", + INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "columngroup_no", + INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "entry_no", + INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 5, "first_row_no", + INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 6, "file_offset", + INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 7, "row_count", + INT8OID, -1, 0); + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + /* initialize Context for SRF */ + context = (Context *) palloc0(sizeof(Context)); + context->aorel = table_open(aoRelOid, AccessShareLock); + if (!RelationIsAppendOptimized(context->aorel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("function not supported on non append-optimized relation"))); + sst = GetLatestSnapshot(); + GetAppendOnlyEntryAuxOids(aoRelOid, sst, + NULL, &blkdirrelid, NULL, + NULL, NULL); + sst = gp_select_invisible ? 
SnapshotAny : GetLatestSnapshot(); + if (blkdirrelid == InvalidOid) + ereport(ERROR, + (errmsg("appendoptimized relation doesn't have a block directory"), + errhint("relation must have or must have had an index"))); + context->blkdirrel = table_open(blkdirrelid, AccessShareLock); + context->scan = systable_beginscan(context->blkdirrel, + InvalidOid, + false, + sst, + 0, + NULL); + context->currMinipage.minipage = palloc0(minipage_size(NUM_MINIPAGE_ENTRIES)); + context->currMinipageValid = false; + context->currMinipageEntryIdx = -1; + funcctx->user_fctx = (void *) context; + + MemoryContextSwitchTo(oldcontext); + } + + funcctx = SRF_PERCALL_SETUP(); + context = (Context *) funcctx->user_fctx; + + if (!context->currMinipageValid) + { + Datum minipage; + bool minipageNull; + + /* We need to fetch the next tuple from the blkdir relation */ + if (!systable_getnext(context->scan)) + goto srf_done; + + /* deform the tuple and populate slot->values/nulls */ + slot_getallattrs(context->scan->slot); + + minipage = slot_getattr(context->scan->slot, Anum_pg_aoblkdir_minipage, &minipageNull); + /* + * There should not really be any NULL values. We opt to report it + * instead of ERRORing out. 
+ */ + context->currMinipageValid = !minipageNull; + if (context->currMinipageValid) + { + /* + * Cache the latest scanned minipage and use it to emit the next + * (context->currMinipage->numMinipageEntries) rows + */ + copy_out_minipage(&context->currMinipage, minipage, false); + context->currMinipageEntryIdx = 0; + } + } + + { + Datum values[7]; + bool nulls[7]; + TupleTableSlot *slot = context->scan->slot; + Datum result; + + values[0] = ItemPointerGetDatum(&slot->tts_tid); + nulls[0] = false; + + values[1] = slot_getattr(slot, Anum_pg_aoblkdir_segno, &nulls[1]); + values[2] = slot_getattr(slot, Anum_pg_aoblkdir_columngroupno, &nulls[2]); + + /* emit minipage entry */ + if (context->currMinipageValid) + { + MinipagePerColumnGroup *currMinipage = &context->currMinipage; + MinipageEntry *minipageEntry; + + Assert(context->currMinipageEntryIdx < currMinipage->numMinipageEntries); + + minipageEntry = &currMinipage->minipage->entry[context->currMinipageEntryIdx]; + + values[3] = context->currMinipageEntryIdx++; + values[4] = Int64GetDatum(minipageEntry->firstRowNum); + values[5] = Int64GetDatum(minipageEntry->fileOffset); + values[6] = Int64GetDatum(minipageEntry->rowCount); + + nulls[3] = false; + nulls[4] = false; + nulls[5] = false; + nulls[6] = false; + + context->currMinipageValid = + (context->currMinipageEntryIdx != currMinipage->numMinipageEntries); + } + else + { + nulls[3] = true; + nulls[4] = true; + nulls[5] = true; + nulls[6] = true; + } + + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + result = HeapTupleGetDatum(tuple); + SRF_RETURN_NEXT(funcctx, result); + } + +srf_done: + table_close(context->aorel, AccessShareLock); + systable_endscan(context->scan); + table_close(context->blkdirrel, AccessShareLock); + pfree(context); + funcctx->user_fctx = NULL; + SRF_RETURN_DONE(funcctx); +} diff --git a/src/backend/access/appendonly/appendonly_compaction.c b/src/backend/access/appendonly/appendonly_compaction.c index 7aeb7b0382c..8fe266a38ea 
100644 --- a/src/backend/access/appendonly/appendonly_compaction.c +++ b/src/backend/access/appendonly/appendonly_compaction.c @@ -454,6 +454,14 @@ AppendOnlySegmentFileFullCompaction(Relation aorel, estate->es_opened_result_relations = lappend(estate->es_opened_result_relations, resultRelInfo); + /* + * We don't want uniqueness checks to be performed while "insert"ing tuples + * to a destination segfile during AppendOnlyMoveTuple(). This is to ensure + * that we can avoid spurious conflicts between the moved tuple and the + * original tuple. + */ + estate->gp_bypass_unique_check = true; + /* * Go through all visible tuples and move them to a new segfile. */ @@ -521,39 +529,28 @@ AppendOnlySegmentFileFullCompaction(Relation aorel, } /* - * Recycle AWAITING_DROP segments. - * - * This tries to acquire an AccessExclusiveLock on the table, if it's - * available. If it's not, no segments are dropped. + * Collect AWAITING_DROP segments. + * + * Acquire AccessShareLock with cutoff_xid to scan and collect dead + * segments. */ -void -AppendOptimizedRecycleDeadSegments(Relation aorel) +Bitmapset * +AppendOptimizedCollectDeadSegments(Relation aorel) { Relation pg_aoseg_rel; TupleDesc pg_aoseg_dsc; SysScanDesc aoscan; HeapTuple tuple; Snapshot appendOnlyMetaDataSnapshot = RegisterSnapshot(GetCatalogSnapshot(InvalidOid)); - bool got_accessexclusive_lock = false; TransactionId cutoff_xid = InvalidTransactionId; Oid segrelid; + Bitmapset *dead_segs = NULL; Assert(RelationIsAppendOptimized(aorel)); - /* - * The algorithm below for choosing a target segment is not concurrent-safe. - * Grab a lock to serialize. - * - * INterlocks with SetSegnoInternal() - */ - LockDatabaseObject(aorel->rd_node.dbNode, (Oid)aorel->rd_node.relNode, 0, ExclusiveLock); - GetAppendOnlyEntryAuxOids(aorel->rd_id, appendOnlyMetaDataSnapshot, &segrelid, NULL, NULL, NULL, NULL); - /* - * Now pick a segment that is not in use, and is not over the allowed - * size threshold (90% full). 
- */ + pg_aoseg_rel = heap_open(segrelid, AccessShareLock); pg_aoseg_dsc = RelationGetDescr(pg_aoseg_rel); @@ -573,7 +570,9 @@ AppendOptimizedRecycleDeadSegments(Relation aorel) pg_aoseg_dsc, &isNull)); Assert(!isNull); - state = fastgetattr(tuple, Anum_pg_aoseg_state, pg_aoseg_dsc, &isNull); + state = DatumGetInt16(fastgetattr(tuple, + Anum_pg_aoseg_state, + pg_aoseg_dsc, &isNull)); Assert(!isNull); } else @@ -593,24 +592,8 @@ AppendOptimizedRecycleDeadSegments(Relation aorel) continue; /* - * Upgrade our lock to AccessExclusiveLock for the drop. Upgrading a - * lock poses a deadlock risk, so give up if we cannot acquire the - * lock immediately. We'll retry dropping the segment on the next - * VACUUM. - */ - if (!got_accessexclusive_lock) - { - if (!ConditionalLockRelation(aorel, AccessExclusiveLock)) - { - if (Debug_appendonly_print_compaction) - elog(LOG, "could not acquire AccessExclusiveLock lock on %s to recycle segno %d", - RelationGetRelationName(aorel), segno); - break; - } - got_accessexclusive_lock = true; - } - - /* + * Cutoff XID Screening + * * It's in awaiting-drop state, but does everyone see it that way? * * Compare the tuple's xmin with the oldest-xmin horizon. We don't bother @@ -618,6 +601,22 @@ AppendOptimizedRecycleDeadSegments(Relation aorel) * should not be set. Even if the tuple was update, presumably an AO * segment that's in awaiting-drop state won't be resurrected, so even if * someone updates or locks the tuple, it's still safe to drop. + * + * We don't need to acquire AccessExclusiveLock any longer because we only + * scan pg_aoseg to collect dead segments but no truncate happens here. + * Consider the following two cases: + * + * a) When there was a reader accessing a segment file which was changed to + * AWAITING_DROP in later VACUUM compaction, the reader's xid should be earlier + * than this tuple's xmin hence would set visible_to_all to false.
Then the + * AWAITING_DROP segment file wouldn't be dropped in this VACUUM cleanup and + * the earlier reader could still be able to access old tuples. + * + * b) Continuing from above, there was a segment file in AWAITING_DROP state, the + * subsequent transactions can't see that hence it wouldn't be touched until + * the next VACUUM arrives. Therefore no later transaction's xid could be earlier + * than this dead segment tuple's xmin hence visible_to_all would be true. + * Then the corresponding dead segment file could be dropped later at that time. */ xmin = HeapTupleHeaderGetXmin(tuple->t_data); if (xmin == FrozenTransactionId) @@ -633,25 +632,55 @@ AppendOptimizedRecycleDeadSegments(Relation aorel) if (!visible_to_all) continue; - /* all set! */ - if (RelationIsAoRows(aorel)) - { - AppendOnlyCompaction_DropSegmentFile(aorel, segno); - ClearFileSegInfo(aorel, segno); - } - else - { - AOCSCompaction_DropSegmentFile(aorel, segno); - ClearAOCSFileSegInfo(aorel, segno); - } + /* collect dead segnos for dropping */ + dead_segs = bms_add_member(dead_segs, segno); } systable_endscan(aoscan); - UnlockDatabaseObject(aorel->rd_node.dbNode, (Oid)aorel->rd_node.relNode, 0, ExclusiveLock); - heap_close(pg_aoseg_rel, AccessShareLock); UnregisterSnapshot(appendOnlyMetaDataSnapshot); + + return dead_segs; +} + +/* + * Drop AWAITING_DROP segments. + * + * Callers should guarantee that the segfile is no longer needed by any + * running transaction. It is not necessary to hold a lock on the segfile + * row, though.
+ */ +static inline void +AppendOptimizedDropDeadSegment(Relation aorel, int segno) +{ + if (RelationIsAoRows(aorel)) + { + AppendOnlyCompaction_DropSegmentFile(aorel, segno); + ClearFileSegInfo(aorel, segno); + } + else + { + AOCSCompaction_DropSegmentFile(aorel, segno); + ClearAOCSFileSegInfo(aorel, segno); + } +} + +void +AppendOptimizedDropDeadSegments(Relation aorel, Bitmapset *segnos) +{ + int segno; + + /* + * drop segments in batch with concurrent-safety + */ + LockRelationForExtension(aorel, ExclusiveLock); + + segno = -1; + while ((segno = bms_next_member(segnos, segno)) >= 0) + AppendOptimizedDropDeadSegment(aorel, segno); + + UnlockRelationForExtension(aorel, ExclusiveLock); } /* @@ -683,10 +712,6 @@ AppendOptimizedTruncateToEOF(Relation aorel) GetAppendOnlyEntryAuxOids(aorel->rd_id, appendOnlyMetaDataSnapshot, &segrelid, NULL, NULL, NULL, NULL); - /* - * Now pick a segment that is not in use, and is not over the allowed - * size threshold (90% full). - */ pg_aoseg_rel = heap_open(segrelid, AccessShareLock); pg_aoseg_dsc = RelationGetDescr(pg_aoseg_rel); diff --git a/src/backend/access/appendonly/appendonly_visimap.c b/src/backend/access/appendonly/appendonly_visimap.c index ea9a587f2bd..d312c3a130d 100644 --- a/src/backend/access/appendonly/appendonly_visimap.c +++ b/src/backend/access/appendonly/appendonly_visimap.c @@ -864,3 +864,70 @@ AppendOnlyVisimapDelete_Finish( hash_destroy(visiMapDelete->dirtyEntryCache); BufFileClose(visiMapDelete->workfile); } + +/* + * AppendOnlyVisimap_Init_forUniqueCheck + * + * Initializes the visimap to determine if tuples were deleted as a part of + * uniqueness checks. + * + * Note: we defer setting up the appendOnlyMetaDataSnapshot for the visibility + * map to the index_unique_check() table AM call. This is because + * snapshots used for unique index lookups are special and don't follow the + * usual allocation or registration mechanism. 
They may be stack-allocated and a + * new snapshot object may be passed to every unique index check (this happens + * when SNAPSHOT_DIRTY is passed). While technically, we could set up the + * metadata snapshot in advance for SNAPSHOT_SELF, the alternative is fine. + */ +void AppendOnlyVisimap_Init_forUniqueCheck( + AppendOnlyVisimap *visiMap, + Relation aoRel, + Snapshot snapshot) +{ + Oid visimaprelid; + Oid visimapidxid; + + Assert(snapshot->snapshot_type == SNAPSHOT_DIRTY || + snapshot->snapshot_type == SNAPSHOT_SELF); + + GetAppendOnlyEntryAuxOids(aoRel->rd_id, + InvalidSnapshot, /* catalog snapshot is enough */ + NULL, NULL, NULL, &visimaprelid, &visimapidxid); + if (!OidIsValid(visimaprelid) || !OidIsValid(visimapidxid)) + elog(ERROR, "Could not find visimap for relation: %u", aoRel->rd_id); + + ereportif(Debug_appendonly_print_visimap, LOG, + (errmsg("Append-only visimap init for unique checks"), + errdetail("(aoRel = %u, visimaprel = %u, visimapidxrel = %u)", + aoRel->rd_id, visimaprelid, visimapidxid))); + + AppendOnlyVisimap_Init(visiMap, + visimaprelid, + visimapidxid, + AccessShareLock, + InvalidSnapshot /* appendOnlyMetaDataSnapshot */); +} + +void +AppendOnlyVisimap_Finish_forUniquenessChecks( + AppendOnlyVisimap *visiMap) +{ + AppendOnlyVisimapStore *visimapStore = &visiMap->visimapStore; + /* + * The snapshot was either reset to NULL in between calls or already cleaned + * up (if this was part of an update command) + */ + Assert(visimapStore->snapshot == InvalidSnapshot); + + ereportif(Debug_appendonly_print_visimap, LOG, + (errmsg("Append-only visimap finish for unique checks"), + errdetail("(visimaprel = %u, visimapidxrel = %u)", + visimapStore->visimapRelation->rd_id, + visimapStore->visimapRelation->rd_id))); + + AppendOnlyVisimapStore_Finish(&visiMap->visimapStore, AccessShareLock); + AppendOnlyVisimapEntry_Finish(&visiMap->visimapEntry); + + MemoryContextDelete(visiMap->memoryContext); + visiMap->memoryContext = NULL; +} diff --git
a/src/backend/access/appendonly/appendonlyam.c b/src/backend/access/appendonly/appendonlyam.c index 4d84e957026..0cb0e96c321 100755 --- a/src/backend/access/appendonly/appendonlyam.c +++ b/src/backend/access/appendonly/appendonlyam.c @@ -68,54 +68,6 @@ #include "utils/memutils.h" #include "utils/snapmgr.h" -/* - * AppendOnlyDeleteDescData is used for delete data from append-only - * relations. It serves an equivalent purpose as AppendOnlyScanDescData - * (cdbappendonlyam.h) only that the later is used for scanning append-only - * relations. - */ -typedef struct AppendOnlyDeleteDescData -{ - /* - * Relation to delete from - */ - Relation aod_rel; - - /* - * Snapshot to use for meta data operations - */ - Snapshot appendOnlyMetaDataSnapshot; - - /* - * visibility map - */ - AppendOnlyVisimap visibilityMap; - - /* - * Visimap delete support structure. Used to handle out-of-order deletes - */ - AppendOnlyVisimapDelete visiMapDelete; - -} AppendOnlyDeleteDescData; - -/* - * AppendOnlyUpdateDescData is used to update data from append-only - * relations. It serves an equivalent purpose as AppendOnlyScanDescData - * (cdbappendonlyam.h) only that the later is used for scanning append-only - * relations. - */ -typedef struct AppendOnlyUpdateDescData -{ - AppendOnlyInsertDesc aoInsertDesc; - - AppendOnlyVisimap visibilityMap; - - /* - * Visimap delete support structure. Used to handle out-of-order deletes - */ - AppendOnlyVisimapDelete visiMapDelete; - -} AppendOnlyUpdateDescData; typedef enum AoExecutorBlockKind { @@ -877,7 +829,7 @@ upgrade_tuple(AppendOnlyExecutorReadBlock *executorReadBlock, * stored memtuple is problematic and then create a clone of the tuple * with properly aligned bindings to be used by the executor. 
*/ - if (formatversion < AORelationVersion_Aligned64bit && + if (formatversion < AOSegfileFormatVersion_Aligned64bit && memtuple_has_misaligned_attribute(mtup, pbind)) convert_alignment = true; @@ -1021,7 +973,7 @@ AppendOnlyExecutorReadBlock_ProcessTuple(AppendOnlyExecutorReadBlock *executorRe AOTupleId *aoTupleId = (AOTupleId *) &fake_ctid; int formatVersion = executorReadBlock->storageRead->formatVersion; - AORelationVersion_CheckValid(formatVersion); + AOSegfileFormatVersion_CheckValid(formatVersion); AOTupleIdInit(aoTupleId, executorReadBlock->segmentFileNum, rowNum); @@ -1040,7 +992,7 @@ AppendOnlyExecutorReadBlock_ProcessTuple(AppendOnlyExecutorReadBlock *executorRe /* If the tuple is not in the latest format, convert it */ // GPDB_12_MERGE_FIXME: Is pg_upgrade from old versions still a thing? Can we drop this? - if (formatVersion < AORelationVersion_GetLatest()) + if (formatVersion < AOSegfileFormatVersion_GetLatest ()) tuple = upgrade_tuple(executorReadBlock, tuple, executorReadBlock->mt_bind, formatVersion, &shouldFree); ExecClearTuple(slot); @@ -1978,30 +1930,68 @@ fetchNextBlock(AppendOnlyFetchDesc aoFetchDesc) return true; } -static bool +/* + * Fetch the tuple from the block indicated by the block directory entry that + * covers the tuple. + */ +static void fetchFromCurrentBlock(AppendOnlyFetchDesc aoFetchDesc, int64 rowNum, TupleTableSlot *slot) { - Assert(aoFetchDesc->currentBlock.have); - Assert(rowNum >= aoFetchDesc->currentBlock.firstRowNum); - Assert(rowNum <= aoFetchDesc->currentBlock.lastRowNum); + bool fetched; + CurrentBlock *currentBlock = &aoFetchDesc->currentBlock; + AppendOnlyExecutorReadBlock *executorReadBlock = &aoFetchDesc->executorReadBlock; + AppendOnlyBlockDirectoryEntry *entry = ¤tBlock->blockDirectoryEntry; - if (!aoFetchDesc->currentBlock.gotContents) + if (!currentBlock->gotContents) { /* * Do decompression if necessary and get contents. 
*/ - AppendOnlyExecutorReadBlock_GetContents(&aoFetchDesc->executorReadBlock); + AppendOnlyExecutorReadBlock_GetContents(executorReadBlock); - aoFetchDesc->currentBlock.gotContents = true; + currentBlock->gotContents = true; } - return AppendOnlyExecutorReadBlock_FetchTuple(&aoFetchDesc->executorReadBlock, - rowNum, - /* nkeys */ 0, - /* key */ NULL, - slot); + fetched = AppendOnlyExecutorReadBlock_FetchTuple(executorReadBlock, + rowNum, + /* nkeys */ 0, + /* key */ NULL, + slot); + if (!fetched) + { + if (AppendOnlyBlockDirectoryEntry_RangeHasRow(entry, rowNum)) + { + /* + * We fell into a hole inside the resolved block directory entry + * we obtained from AppendOnlyBlockDirectory_GetEntry(). + * This should not be happening for versions >= CB2. Scream + * appropriately. See AppendOnlyBlockDirectoryEntry for details. + */ + ereportif(AORelationVersion_Get(aoFetchDesc->relation) >= AORelationVersion_CB2, + ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("tuple with row number %ld not found in block directory entry range", rowNum), + errdetail("block directory entry: (fileOffset = %ld, firstRowNum = %ld, " + "afterFileOffset = %ld, lastRowNum = %ld)", + entry->range.fileOffset, + entry->range.firstRowNum, + entry->range.afterFileOffset, + entry->range.lastRowNum))); + } + else + { + /* + * The resolved block directory entry we obtained from + * AppendOnlyBlockDirectory_GetEntry() has range s.t. + * firstRowNum < lastRowNum < rowNum + * This can happen when rowNum maps to an aborted transaction, and + * we find an earlier committed block directory row due to the + * <= scan condition in AppendOnlyBlockDirectory_GetEntry(). + */ + } + } } static void @@ -2106,7 +2096,10 @@ scanToFetchTuple(AppendOnlyFetchDesc aoFetchDesc, } if (rowNum <= aoFetchDesc->currentBlock.lastRowNum) - return fetchFromCurrentBlock(aoFetchDesc, rowNum, slot); + { + fetchFromCurrentBlock(aoFetchDesc, rowNum, slot); + return true; + } /* * Update information to get next block. 
@@ -2355,7 +2348,8 @@ appendonly_fetch(AppendOnlyFetchDesc aoFetchDesc, } return false; /* row has been deleted or updated. */ } - return fetchFromCurrentBlock(aoFetchDesc, rowNum, slot); + fetchFromCurrentBlock(aoFetchDesc, rowNum, slot); + return true; } /* diff --git a/src/backend/access/appendonly/appendonlyam_handler.c b/src/backend/access/appendonly/appendonlyam_handler.c index 12dce83d488..22a47d7ab2c 100644 --- a/src/backend/access/appendonly/appendonlyam_handler.c +++ b/src/backend/access/appendonly/appendonlyam_handler.c @@ -61,6 +61,7 @@ typedef struct AppendOnlyDMLState AppendOnlyInsertDesc insertDesc; dlist_head head; // Head of multiple segment files insertion list. AppendOnlyDeleteDesc deleteDesc; + AppendOnlyUniqueCheckDesc uniqueCheckDesc; } AppendOnlyDMLState; @@ -160,6 +161,7 @@ enter_dml_state(const Oid relationOid) state->insertDesc = NULL; state->deleteDesc = NULL; + state->uniqueCheckDesc = NULL; dlist_init(&state->head); Assert(!found); @@ -242,6 +244,7 @@ void appendonly_dml_finish(Relation relation, CmdType operation) { AppendOnlyDMLState *state; + bool had_delete_desc = false; state = remove_dml_state(RelationGetRelid(relation)); @@ -260,6 +263,8 @@ appendonly_dml_finish(Relation relation, CmdType operation) */ if (!state->insertDesc) AORelIncrementModCount(relation); + + had_delete_desc = true; } if (state->insertDesc) @@ -268,6 +273,29 @@ appendonly_dml_finish(Relation relation, CmdType operation) appendonly_insert_finish(state->insertDesc, &state->head); state->insertDesc = NULL; } + + if (state->uniqueCheckDesc) + { + /* clean up the block directory */ + AppendOnlyBlockDirectory_End_forUniqueChecks(state->uniqueCheckDesc->blockDirectory); + pfree(state->uniqueCheckDesc->blockDirectory); + state->uniqueCheckDesc->blockDirectory = NULL; + + /* + * If this fetch is a part of an update, then we have been reusing the + * visimap used by the delete half of the update, which would have + * already been cleaned up above. Clean up otherwise. 
+ */ + if (!had_delete_desc) + { + AppendOnlyVisimap_Finish_forUniquenessChecks(state->uniqueCheckDesc->visimap); + pfree(state->uniqueCheckDesc->visimap); + } + state->uniqueCheckDesc->visimap = NULL; + + pfree(state->uniqueCheckDesc); + state->uniqueCheckDesc = NULL; + } } /* @@ -302,12 +330,13 @@ get_insert_descriptor(const Relation relation) MemoryContext oldcxt; oldcxt = MemoryContextSwitchTo(appendOnlyLocal.stateCxt); - state->insertDesc = appendonly_insert_init(relation, - ChooseSegnoForWrite(relation)); + state->insertDesc= appendonly_insert_init(relation, + ChooseSegnoForWrite(relation)); dlist_init(&state->head); dlist_head *head = &state->head; dlist_push_tail(head, &state->insertDesc->node); + if (state->insertDesc->insertMultiFiles) { segments = lappend_int(segments, state->insertDesc->cur_segno); @@ -320,6 +349,18 @@ get_insert_descriptor(const Relation relation) } list_free(segments); } + + /* mark all insertDesc placeholderInserted as false */ + if (relationHasUniqueIndex(relation)) + { + dlist_iter iter; + dlist_foreach(iter, head) + { + AppendOnlyInsertDesc insertDesc = (AppendOnlyInsertDesc)dlist_container(AppendOnlyInsertDescData, node, iter.cur); + insertDesc->placeholderInserted = false; + } + } + MemoryContextSwitchTo(oldcxt); } @@ -334,6 +375,26 @@ get_insert_descriptor(const Relation relation) state->insertDesc = next; } + /* + * If we have a unique index, insert a placeholder block directory row + * to entertain uniqueness checks from concurrent inserts. See + * AppendOnlyBlockDirectory_InsertPlaceholder() for details.
+ */ + if (relationHasUniqueIndex(relation) && !state->insertDesc->placeholderInserted) + { + + AppendOnlyInsertDesc insertDesc = state->insertDesc; + int64 firstRowNum = insertDesc->lastSequence + 1; + BufferedAppend *bufferedAppend = &insertDesc->storageWrite.bufferedAppend; + int64 fileOffset = BufferedAppendNextBufferPosition(bufferedAppend); + + AppendOnlyBlockDirectory_InsertPlaceholder(&insertDesc->blockDirectory, + firstRowNum, + fileOffset, + 0); + insertDesc->placeholderInserted = true; + } + return state->insertDesc; } @@ -376,6 +437,50 @@ get_delete_descriptor(const Relation relation, bool forUpdate) return state->deleteDesc; } +static AppendOnlyUniqueCheckDesc +get_or_create_unique_check_desc(Relation relation, Snapshot snapshot) +{ + AppendOnlyDMLState *state = find_dml_state(RelationGetRelid(relation)); + + if (!state->uniqueCheckDesc) + { + MemoryContext oldcxt; + AppendOnlyUniqueCheckDesc uniqueCheckDesc; + + oldcxt = MemoryContextSwitchTo(appendOnlyLocal.stateCxt); + uniqueCheckDesc = palloc0(sizeof(AppendOnlyUniqueCheckDescData)); + + /* Initialize the block directory */ + uniqueCheckDesc->blockDirectory = palloc0(sizeof(AppendOnlyBlockDirectory)); + AppendOnlyBlockDirectory_Init_forUniqueChecks(uniqueCheckDesc->blockDirectory, + relation, + 1, /* numColGroups */ + snapshot); + + /* + * If this is part of an update, we need to reuse the visimap used by + * the delete half of the update. This is to avoid spurious conflicts + * when the key's previous and new value are identical. Using the + * visimap from the delete half ensures that the visimap can recognize + * any tuples deleted by us prior to this insert, within this command. 
+ */ + if (state->deleteDesc) + uniqueCheckDesc->visimap = &state->deleteDesc->visibilityMap; + else + { + /* Initialize the visimap */ + uniqueCheckDesc->visimap = palloc0(sizeof(AppendOnlyVisimap)); + AppendOnlyVisimap_Init_forUniqueCheck(uniqueCheckDesc->visimap, + relation, + snapshot); + } + + state->uniqueCheckDesc = uniqueCheckDesc; + MemoryContextSwitchTo(oldcxt); + } + + return state->uniqueCheckDesc; +} /* ------------------------------------------------------------------------ * Slot related callbacks for appendonly AM @@ -547,9 +652,128 @@ appendonly_index_fetch_tuple(struct IndexFetchTableData *scan, appendonly_fetch(aoscan->aofetch, (AOTupleId *) tid, slot); + /* + * Currently, we don't determine this parameter. By contract, it is to be + * set to true iff we can determine that this row is dead to all + * transactions. Failure to set this will lead to use of a garbage value + * in certain code, such as that for unique index checks. + * This is typically used for HOT chains, which we don't support. + */ + if (all_dead) + *all_dead = false; + + /* Currently, we don't determine this parameter. By contract, it is to be + * set to true iff there is another tuple for the tid, so that we can prompt + * the caller to call index_fetch_tuple() again for the same tid. + * This is typically used for HOT chains, which we don't support. + */ + if (call_again) + *call_again = false; + return !TupIsNull(slot); } +/* + * Check if a visible tuple exists given the tid and a snapshot. This is + * currently used to determine uniqueness checks. + * + * We determine existence simply by checking if a *visible* block directory + * entry covers the given tid. + * + * There is no need to fetch the tuple (we actually can't reliably do so as + * we might encounter a placeholder row in the block directory) + * + * If no visible block directory entry exists, we are done. If it does, we need + * to further check the visibility of the tuple itself by consulting the visimap. 
+ * Now, the visimap check can be skipped if the tuple was found to have been + * inserted by a concurrent in-progress transaction, in which case we return + * true and have the xwait machinery kick in. + */ +static bool +appendonly_index_unique_check(Relation rel, + ItemPointer tid, + Snapshot snapshot, + bool *all_dead) +{ + AppendOnlyUniqueCheckDesc uniqueCheckDesc; + AOTupleId *aoTupleId = (AOTupleId *) tid; + bool visible; + +#ifdef USE_ASSERT_CHECKING + int segmentFileNum = AOTupleIdGet_segmentFileNum(aoTupleId); + int64 rowNum = AOTupleIdGet_rowNum(aoTupleId); + + Assert(segmentFileNum != InvalidFileSegNumber); + Assert(rowNum != InvalidAORowNum); + /* + * Since this can only be called in the context of a unique index check, the + * snapshots that are supplied can only be non-MVCC snapshots: SELF and DIRTY. + */ + Assert(snapshot->snapshot_type == SNAPSHOT_SELF || + snapshot->snapshot_type == SNAPSHOT_DIRTY); +#endif + + /* + * Currently, we don't determine this parameter. By contract, it is to be + * set to true iff we can determine that this row is dead to all + * transactions. Failure to set this will lead to use of a garbage value + * in certain code, such as that for unique index checks. + * This is typically used for HOT chains, which we don't support. + */ + if (all_dead) + *all_dead = false; + + /* + * FIXME: for when we want CREATE UNIQUE INDEX CONCURRENTLY to work + * Unique constraint violation checks with SNAPSHOT_SELF are currently + * required to support CREATE UNIQUE INDEX CONCURRENTLY. Currently, the + * sole placeholder row inserted at first insert might not be visible to + * the snapshot, if it was already updated by its actual first row. So, + * we would need to flush a placeholder row at the beginning of each new + * in-memory minipage. Currently, CREATE INDEX CONCURRENTLY isn't + * supported, so we assume such a check satisfies SNAPSHOT_SELF. 
+ */ + if (snapshot->snapshot_type == SNAPSHOT_SELF) + return true; + + uniqueCheckDesc = get_or_create_unique_check_desc(rel, snapshot); + + /* First, scan the block directory */ + if (!AppendOnlyBlockDirectory_UniqueCheck(uniqueCheckDesc->blockDirectory, + aoTupleId, + snapshot)) + return false; + + /* + * If the xmin or xmax are set for the dirty snapshot, after the block + * directory is scanned with the snapshot, it means that there is a + * concurrent in-progress transaction inserting the tuple. So, return true + * and have the xwait machinery kick in. + */ + Assert(snapshot->snapshot_type == SNAPSHOT_DIRTY); + if (TransactionIdIsValid(snapshot->xmin) || TransactionIdIsValid(snapshot->xmax)) + return true; + + /* Now, consult the visimap */ + visible = AppendOnlyVisimap_UniqueCheck(uniqueCheckDesc->visimap, + aoTupleId, + snapshot); + + /* + * Since we disallow deletes and updates running in parallel with inserts, + * there is no way that the dirty snapshot has its xmin and xmax populated + * after the visimap has been scanned with it. + * + * Note: we disallow it by grabbing an ExclusiveLock on the QD (See + * CdbTryOpenTable()). So if we are running in utility mode, there is no + * such restriction.
+ */ + AssertImply(Gp_role != GP_ROLE_UTILITY, + (!TransactionIdIsValid(snapshot->xmin) && !TransactionIdIsValid(snapshot->xmax))); + + return visible; +} + /* ------------------------------------------------------------------------ * Callbacks for non-modifying operations on individual tuples for @@ -2110,6 +2334,7 @@ static const TableAmRoutine ao_row_methods = { .index_fetch_reset = appendonly_index_fetch_reset, .index_fetch_end = appendonly_index_fetch_end, .index_fetch_tuple = appendonly_index_fetch_tuple, + .index_unique_check = appendonly_index_unique_check, .tuple_insert = appendonly_tuple_insert, .tuple_insert_speculative = appendonly_tuple_insert_speculative, diff --git a/src/backend/access/appendonly/appendonlyblockdirectory.c b/src/backend/access/appendonly/appendonlyblockdirectory.c index a97dbd71ef2..5b20a82e228 100644 --- a/src/backend/access/appendonly/appendonlyblockdirectory.c +++ b/src/backend/access/appendonly/appendonlyblockdirectory.c @@ -14,6 +14,7 @@ */ #include "postgres.h" +#include "access/xact.h" #include "cdb/cdbappendonlyblockdirectory.h" #include "catalog/aoblkdir.h" #include "catalog/pg_appendonly.h" @@ -22,6 +23,7 @@ #include "parser/parse_oper.h" #include "utils/lsyscache.h" #include "utils/memutils.h" +#include "utils/faultinjector.h" #include "utils/guc.h" #include "utils/fmgroids.h" #include "cdb/cdbappendonlyam.h" @@ -29,13 +31,6 @@ int gp_blockdirectory_entry_min_range = 0; int gp_blockdirectory_minipage_size = NUM_MINIPAGE_ENTRIES; -static inline uint32 -minipage_size(uint32 nEntry) -{ - return offsetof(Minipage, entry) + - sizeof(MinipageEntry) * nEntry; -} - static void load_last_minipage( AppendOnlyBlockDirectory *blockDirectory, int64 lastSequence, @@ -62,6 +57,10 @@ static bool insert_new_entry(AppendOnlyBlockDirectory *blockDirectory, int64 fileOffset, int64 rowCount, bool addColAction); +static void clear_minipage(MinipagePerColumnGroup *minipagePerColumnGroup); +static bool blkdir_entry_exists(AppendOnlyBlockDirectory 
*blockDirectory, + AOTupleId *aoTupleId, + int columnGroupNo); void AppendOnlyBlockDirectoryEntry_GetBeginRange( @@ -152,6 +151,7 @@ init_internal(AppendOnlyBlockDirectory *blockDirectory) minipageInfo->minipage = palloc0(minipage_size(NUM_MINIPAGE_ENTRIES)); minipageInfo->numMinipageEntries = 0; + ItemPointerSetInvalid(&minipageInfo->tupleTid); } MemoryContextSwitchTo(oldcxt); @@ -220,6 +220,69 @@ AppendOnlyBlockDirectory_Init_forSearch( init_internal(blockDirectory); } +/* + * AppendOnlyBlockDirectory_Init_forUniqueChecks + * + * Initializes the block directory to handle lookups for uniqueness checks. + * + * Note: These lookups will be purely restricted to the block directory relation + * itself and will not involve the physical AO relation. + * + * Note: we defer setting up the appendOnlyMetaDataSnapshot for the block + * directory to the index_unique_check() table AM call. This is because + * snapshots used for unique index lookups are special and don't follow the + * usual allocation or registration mechanism. They may be stack-allocated and a + * new snapshot object may be passed to every unique index check (this happens + * when SNAPSHOT_DIRTY is passed). While technically, we could set up the + * metadata snapshot in advance for SNAPSHOT_SELF, the alternative is fine. 
+ */ +void +AppendOnlyBlockDirectory_Init_forUniqueChecks( + AppendOnlyBlockDirectory *blockDirectory, + Relation aoRel, + int numColumnGroups, + Snapshot snapshot) +{ + Oid blkdirrelid; + Oid blkdiridxid; + + Assert(RelationIsValid(aoRel)); + + Assert(snapshot->snapshot_type == SNAPSHOT_DIRTY || + snapshot->snapshot_type == SNAPSHOT_SELF); + + GetAppendOnlyEntryAuxOids(aoRel->rd_id, + InvalidSnapshot, /* catalog snapshot is enough */ + NULL, &blkdirrelid, &blkdiridxid, NULL, NULL); + + if (!OidIsValid(blkdirrelid) || !OidIsValid(blkdiridxid)) + elog(ERROR, "Could not find block directory for relation: %u", aoRel->rd_id); + + ereportif(Debug_appendonly_print_blockdirectory, LOG, + (errmsg("Append-only block directory init for unique checks"), + errdetail("(aoRel = %u, blkdirrel = %u, blkdiridxrel = %u, numColumnGroups = %d)", + aoRel->rd_id, blkdirrelid, blkdiridxid, numColumnGroups))); + + blockDirectory->aoRel = aoRel; + blockDirectory->isAOCol = RelationIsAoCols(aoRel); + + /* Segfile setup is not necessary as physical AO tuples will not be accessed */ + blockDirectory->segmentFileInfo = NULL; + blockDirectory->totalSegfiles = -1; + blockDirectory->currentSegmentFileNum = -1; + + /* Metadata snapshot assignment is deferred to lookup-time */ + blockDirectory->appendOnlyMetaDataSnapshot = InvalidSnapshot; + + blockDirectory->numColumnGroups = numColumnGroups; + blockDirectory->proj = NULL; + + blockDirectory->blkdirRel = heap_open(blkdirrelid, AccessShareLock); + blockDirectory->blkdirIdx = index_open(blkdiridxid, AccessShareLock); + + init_internal(blockDirectory); +} + /* * AppendOnlyBlockDirectory_Init_forInsert * @@ -585,7 +648,12 @@ AppendOnlyBlockDirectory_GetEntry( /* Ignore columns that are not projected. */ continue; } - /* Setup the scan keys for the scan. */ + /* + * Set up the scan keys values. The keys have already been set up in + * init_internal() with the following strategy: + * (=segmentFileNum, =columnGroupNo, <=rowNum) + * See init_internal(). 
+ */ Assert(scanKeys != NULL); scanKeys[0].sk_argument = Int32GetDatum(segmentFileNum); scanKeys[1].sk_argument = Int32GetDatum(tmpGroupNo); @@ -648,6 +716,15 @@ AppendOnlyBlockDirectory_GetEntry( /* * Since the last few blocks may not be logged in the block * directory, we always use the last entry. + * + * FIXME: If we didn't find a suitable entry, why even use the last + * entry? Currently, as it stands we would most likely return + * true from this function. This will lead to us having to do a + * fetch of the tuple from the physical file in the layer above (see + * scanToFetchTuple()), where we would ultimately find the tuple + * missing. Would it be correct to set the directory entry here to + * be the last one (for caching purposes) and return false, in order + * to avoid this physical file read? */ entry_no = minipageInfo->numMinipageEntries - 1; } @@ -660,6 +737,170 @@ AppendOnlyBlockDirectory_GetEntry( return false; } +/* + * AppendOnlyBlockDirectory_CoversTuple + * + * Check if there exists a visible block directory entry that represents a range + * in which this tid resides. + * + * Currently used by index fetches to perform unique constraint validation. A + * sysscan of the block directory relation is performed to determine the result. + * (see blkdir_entry_exists()) + * + * Performing a sysscan also has the distinct advantage of setting the xmin/xmax + * of the snapshot used to scan, which is a requirement when SNAPSHOT_DIRTY is + * used. See _bt_check_unique() and SNAPSHOT_DIRTY for details. + * + * Note about AOCO tables: + * For AOCO tables, there are multiple block directory entries for each tid. + * However, it is currently sufficient to check the block directory entry for + * just one of these columns. We do so for the 1st non-dropped column. 
Note that + * if we write a placeholder row for the 1st non-dropped column i, there is a + * guarantee that if there is a conflict on the placeholder row, the covering + * block directory entry will be based on the same column i (as columnar DDL + * changes need exclusive locks and placeholder rows can't be seen after tx end) + * (We could just have checked the covers condition for column 0, as block + * directory entries are inserted even for dropped columns. But, this may change + * one day, and we want our code to be future-proof) + */ +bool +AppendOnlyBlockDirectory_CoversTuple( + AppendOnlyBlockDirectory *blockDirectory, + AOTupleId *aoTupleId) +{ + Relation aoRel = blockDirectory->aoRel; + int firstNonDroppedColumn = -1; + + Assert(RelationIsValid(aoRel)); + + if (RelationIsAoRows(aoRel)) + return blkdir_entry_exists(blockDirectory, aoTupleId, 0); + else + { + for(int i = 0; i < aoRel->rd_att->natts; i++) + { + if (!aoRel->rd_att->attrs[i].attisdropped) { + firstNonDroppedColumn = i; + break; + } + } + Assert(firstNonDroppedColumn != -1); + + return blkdir_entry_exists(blockDirectory, + aoTupleId, + firstNonDroppedColumn); + } +} + +/* + * Does a visible block directory entry exist for a given aotid and column no? + * Currently used to satisfy unique constraint checks. 
+ */
+static bool
+blkdir_entry_exists(AppendOnlyBlockDirectory *blockDirectory,
+					AOTupleId *aoTupleId,
+					int columnGroupNo)
+{
+	int			segmentFileNum = AOTupleIdGet_segmentFileNum(aoTupleId);
+	int64		rowNum = AOTupleIdGet_rowNum(aoTupleId);
+	Relation	blkdirRel = blockDirectory->blkdirRel;
+	Relation	blkdirIdx = blockDirectory->blkdirIdx;
+	ScanKey		scanKeys = blockDirectory->scanKeys;
+	HeapTuple	tuple;
+	SysScanDesc idxScanDesc;
+	bool		found = false;
+	TupleDesc	blkdirTupleDesc;
+
+	Assert(RelationIsValid(blkdirRel));
+
+	ereportif(Debug_appendonly_print_blockdirectory, LOG,
+			  (errmsg("Append-only block directory covers tuple check: "
+					  "(columnGroupNo, segmentFileNum, rowNum) = "
+					  "(%d, %d, " INT64_FORMAT ")",
+					  columnGroupNo, segmentFileNum, rowNum)));
+
+	blkdirTupleDesc = RelationGetDescr(blkdirRel);
+
+	/*
+	 * Set up the scan keys values. The keys have already been set up in
+	 * init_internal() with the following strategy:
+	 * (=segmentFileNum, =columnGroupNo, <=rowNum)
+	 * See init_internal().
+	 */
+	Assert(scanKeys != NULL);
+	Assert(blockDirectory->numScanKeys == 3);
+	scanKeys[0].sk_argument = Int32GetDatum(segmentFileNum);
+	scanKeys[1].sk_argument = Int32GetDatum(columnGroupNo);
+	scanKeys[2].sk_argument = Int64GetDatum(rowNum);
+	idxScanDesc = systable_beginscan_ordered(blkdirRel, blkdirIdx,
+											 blockDirectory->appendOnlyMetaDataSnapshot,
+											 blockDirectory->numScanKeys,
+											 scanKeys);
+
+	/*
+	 * Loop until:
+	 *
+	 * (1) No rows are returned from the sysscan, as there is no visible row
+	 * satisfying the criteria. This is what happens when there is no uniqueness
+	 * conflict, when we call this in the context of a uniqueness check.
+	 *
+	 * (2) We find a row such that: rowNum ∈ [firstRowNum, firstRowNum + rowCount)
+	 * (a) The row is a regular block directory row covering the rowNum.
+	 * (b) The row is a placeholder block directory row, inserted by
+	 * AppendOnlyBlockDirectory_InsertPlaceholder(), which will always
+	 * cover the rowNum by virtue of its rowCount = AOTupleId_MaxRowNum.
+	 */
+	while (HeapTupleIsValid(tuple = systable_getnext_ordered(idxScanDesc, BackwardScanDirection)))
+	{
+		/*
+		 * Once we have found a matching row, we must also ensure that we check
+		 * for a block directory entry, in this row's minipage, that has a range
+		 * that covers the rowNum.
+		 *
+		 * This is necessary for aborted transactions where the index entry
+		 * might still be live. In such a case, since our search criteria lacks
+		 * a lastRowNum, we will match rows where:
+		 * firstRowNum < lastRowNum < rowNum
+		 * Such rows will obviously not cover the rowNum, thus making inspection
+		 * of the row's minipage a necessity.
+		 */
+		MinipagePerColumnGroup *minipageInfo;
+		int			entry_no;
+
+		BlockNumber blockNumber = ItemPointerGetBlockNumberNoCheck(&tuple->t_self);
+		OffsetNumber offsetNumber = ItemPointerGetOffsetNumberNoCheck(&tuple->t_self);
+		elogif(Debug_appendonly_print_blockdirectory, LOG,
+			   "For segno = %d, rownum = " INT64_FORMAT ", tid returned: (%u,%u) "
+			   "tuple (xmin, xmax) = (%lu, %lu), snaptype = %d",
+			   segmentFileNum, rowNum, blockNumber, offsetNumber,
+			   (unsigned long) HeapTupleHeaderGetRawXmin(tuple->t_data),
+			   (unsigned long) HeapTupleHeaderGetRawXmax(tuple->t_data),
+			   blockDirectory->appendOnlyMetaDataSnapshot->snapshot_type);
+
+		/* Set this so that we don't blow up in the assert in extract_minipage */
+		blockDirectory->currentSegmentFileNum = segmentFileNum;
+		extract_minipage(blockDirectory,
+						 tuple,
+						 blkdirTupleDesc,
+						 columnGroupNo);
+
+		minipageInfo = &blockDirectory->minipages[columnGroupNo];
+		entry_no = find_minipage_entry(minipageInfo->minipage,
+									   minipageInfo->numMinipageEntries,
+									   rowNum);
+		if (entry_no != -1)
+		{
+			found = true;
+			break;
+		}
+	}
+
+	systable_endscan_ordered(idxScanDesc);
+
+	return found;
+}
+
 /*
 *
AppendOnlyBlockDirectory_InsertEntry * @@ -696,6 +937,14 @@ AppendOnlyBlockDirectory_InsertEntry( * Helper method used to insert a new minipage entry in the block * directory relation. Refer to AppendOnlyBlockDirectory_InsertEntry() * for more details. + * + * 1. Checks if the current minipage is full. If yes, it writes the current + * minipage to the block directory relation and empty the in-memory area. This + * could mean a new block directory tuple is inserted OR an old tuple is updated. + * + * 2. "Inserts" the new entry in the current in-mem minipage -> just sets the + * in-memory area with the supplied function args. + * */ static bool insert_new_entry( @@ -709,7 +958,6 @@ insert_new_entry( MinipageEntry *entry = NULL; MinipagePerColumnGroup *minipageInfo; int minipageIndex; - int lastEntryNo; if (rowCount == 0) return false; @@ -739,52 +987,22 @@ insert_new_entry( minipageInfo = &blockDirectory->minipages[minipageIndex]; Assert(minipageInfo->numMinipageEntries <= (uint32) NUM_MINIPAGE_ENTRIES); - lastEntryNo = minipageInfo->numMinipageEntries - 1; - if (lastEntryNo >= 0) - { - entry = &(minipageInfo->minipage->entry[lastEntryNo]); - - Assert(entry->firstRowNum < firstRowNum); - Assert(entry->fileOffset < fileOffset); - - if (gp_blockdirectory_entry_min_range > 0 && - fileOffset - entry->fileOffset < gp_blockdirectory_entry_min_range) - return true; - - /* Update the rowCount in the latest entry */ - Assert(entry->rowCount <= firstRowNum - entry->firstRowNum); - - ereportif(Debug_appendonly_print_blockdirectory, LOG, - (errmsg("Append-only block directory update entry: " - "(firstRowNum, columnGroupNo, fileOffset, rowCount) = (" INT64_FORMAT - ", %d, " INT64_FORMAT ", " INT64_FORMAT ") at index %d to " - "(firstRowNum, columnGroupNo, fileOffset, rowCount) = (" INT64_FORMAT - ", %d, " INT64_FORMAT ", " INT64_FORMAT ")", - entry->firstRowNum, columnGroupNo, entry->fileOffset, entry->rowCount, - minipageInfo->numMinipageEntries - 1, - entry->firstRowNum, 
columnGroupNo, entry->fileOffset, - firstRowNum - entry->firstRowNum))); - - entry->rowCount = firstRowNum - entry->firstRowNum; - } - - if (minipageInfo->numMinipageEntries >= (uint32) gp_blockdirectory_minipage_size) + /* + * Before we insert the new entry into the current minipage, we should + * check if the current minipage is full. If so, we write out the current + * minipage to the block directory relation and clear out the last minipage + * in-mem, making the current in-mem minipage empty and ready to hold the + * new entry (and beyond). + */ + if (IsMinipageFull(minipageInfo)) { write_minipage(blockDirectory, columnGroupNo, minipageInfo); - - /* Set tupleTid to invalid */ - ItemPointerSetInvalid(&minipageInfo->tupleTid); - - /* - * Clear out the entries. - */ - MemSet(minipageInfo->minipage->entry, 0, - minipageInfo->numMinipageEntries * sizeof(MinipageEntry)); - minipageInfo->numMinipageEntries = 0; + clear_minipage(minipageInfo); + SIMPLE_FAULT_INJECTOR("insert_new_entry_curr_minipage_full"); } + /* Now insert the new entry */ Assert(minipageInfo->numMinipageEntries < (uint32) gp_blockdirectory_minipage_size); - entry = &(minipageInfo->minipage->entry[minipageInfo->numMinipageEntries]); entry->firstRowNum = firstRowNum; entry->fileOffset = fileOffset; @@ -918,35 +1136,6 @@ init_scankeys(TupleDesc tupleDesc, } } -/* - * copy_out_minipage - * - * Copy out the minipage content from a deformed tuple. 
- */ -static inline void -copy_out_minipage(MinipagePerColumnGroup *minipageInfo, - Datum minipage_value, - bool minipage_isnull) -{ - struct varlena *value; - struct varlena *detoast_value; - - Assert(!minipage_isnull); - - value = (struct varlena *) - DatumGetPointer(minipage_value); - detoast_value = pg_detoast_datum(value); - Assert(VARSIZE(detoast_value) <= minipage_size(NUM_MINIPAGE_ENTRIES)); - - memcpy(minipageInfo->minipage, detoast_value, VARSIZE(detoast_value)); - if (detoast_value != value) - pfree(detoast_value); - - Assert(minipageInfo->minipage->nEntry <= NUM_MINIPAGE_ENTRIES); - - minipageInfo->numMinipageEntries = minipageInfo->minipage->nEntry; -} - /* * extract_minipage @@ -962,14 +1151,7 @@ extract_minipage(AppendOnlyBlockDirectory *blockDirectory, { Datum *values = blockDirectory->values; bool *nulls = blockDirectory->nulls; - MinipagePerColumnGroup *minipageInfo = - &blockDirectory->minipages[columnGroupNo]; - FileSegInfo *fsInfo = blockDirectory->currentSegmentFileInfo; - int64 eof; - int start, - end, - mid = 0; - bool found = false; + MinipagePerColumnGroup *minipageInfo = &blockDirectory->minipages[columnGroupNo]; heap_deform_tuple(tuple, tupleDesc, values, nulls); @@ -984,42 +1166,6 @@ extract_minipage(AppendOnlyBlockDirectory *blockDirectory, nulls[Anum_pg_aoblkdir_minipage - 1]); ItemPointerCopy(&tuple->t_self, &minipageInfo->tupleTid); - - /* - * When crashes during inserts, or cancellation during inserts, there are - * out-of-date minipage entries in the block directory. We reset those - * entries here. 
- */ - Assert(fsInfo != NULL); - if (!blockDirectory->isAOCol) - eof = fsInfo->eof; - else - eof = ((AOCSFileSegInfo *) fsInfo)->vpinfo.entry[columnGroupNo].eof; - - start = 0; - end = minipageInfo->numMinipageEntries - 1; - while (start <= end) - { - mid = (end - start + 1) / 2 + start; - if (minipageInfo->minipage->entry[mid].fileOffset > eof) - end = mid - 1; - else if (minipageInfo->minipage->entry[mid].fileOffset < eof) - start = mid + 1; - else - { - found = true; - break; - } - } - - minipageInfo->numMinipageEntries = 0; - if (found) - minipageInfo->numMinipageEntries = mid; - else if (start > 0) - { - minipageInfo->numMinipageEntries = start; - Assert(minipageInfo->minipage->entry[start - 1].fileOffset < eof); - } } /* @@ -1224,12 +1370,100 @@ write_minipage(AppendOnlyBlockDirectory *blockDirectory, CatalogTupleInsertWithInfo(blkdirRel, tuple, indinfo); } + /* memorize updated/inserted tuple header info */ + ItemPointerCopy(&tuple->t_self, &minipageInfo->tupleTid); + heap_freetuple(tuple); MemoryContextSwitchTo(oldcxt); } +static void +clear_minipage(MinipagePerColumnGroup *minipagePerColumnGroup) +{ + MemSet(minipagePerColumnGroup->minipage->entry, 0, + minipagePerColumnGroup->numMinipageEntries * sizeof(MinipageEntry)); + minipagePerColumnGroup->numMinipageEntries = 0; + ItemPointerSetInvalid(&minipagePerColumnGroup->tupleTid); +} + +/* + * AppendOnlyBlockDirectory_InsertPlaceholder + * + * We perform uniqueness checks by looking up block directory rows that cover + * the rowNum indicated by the aotid obtained from the index. See + * AppendOnlyBlockDirectory_CoversTuple() for details. + * + * However, there are multiple time windows in which there are no covering block + * directory entries in the table for already inserted data rows. Such time + * windows start from when a data row is inserted and lasts till the block + * directory row covering it is written to the block directory table (see + * write_minipage()). 
Block directory rows are written only when: + * (i) the current in-memory minipage is full + * (ii) at end of command. + * + * So we insert a placeholder entry in the current block directory row and + * persist the row before the first insert to cover rows in the range: + * [firstRowNum, lastRowNum], starting at firstOffset in the relfile + * corresponding to columnGroupNo. + * + * firstRowNum is the rowNum assigned to the 1st insert of the insert command. + * lastRowNum is the last rowNum that will be entered by the insert command, + * which is something unknown to us. So, to cover all such windows during the + * insert command's execution, we insert an entry with a placeholder + * rowcount = AOTupleId_MaxRowNum into the current minipage and write it to the + * relation (by reusing the machinery in write_minipage()). Such a row whose + * last entry is a placeholder entry is called a placeholder row. This entry + * will cover up to lastRowNum, whatever its value may be, for all such time + * windows during the insert command. + * + * Safety: + * (1) The placeholder upper bound is not a concern as this row will be consulted + * ONLY by SNAPSHOT_DIRTY (for uniqueness checks) and will be ignored by regular + * MVCC processing (for index scans). Eventually, it will be rendered invisible + * as it will be updated by a subsequent write_minipage() or by virtue of abort. + * + * (2) There is no way a placeholder row will detect spurious conflicts due to + * its loose upper bound, in the same segment file, to which it maps. This is + * because there can be no other rows inserted into a segment file other than + * the insert operation that is currently in progress on the file. 
+ */
+void
+AppendOnlyBlockDirectory_InsertPlaceholder(AppendOnlyBlockDirectory *blockDirectory,
+										   int64 firstRowNum,
+										   int64 fileOffset,
+										   int columnGroupNo)
+{
+	MinipagePerColumnGroup *minipagePerColumnGroup;
+
+	Assert(firstRowNum > 0);
+	Assert(fileOffset >= 0);
+	Assert(RelationIsValid(blockDirectory->blkdirRel));
+	Assert(columnGroupNo >= 0 &&
+		   columnGroupNo < blockDirectory->aoRel->rd_att->natts);
+
+	minipagePerColumnGroup = &blockDirectory->minipages[columnGroupNo];
+	/* insert placeholder entry with a max row count */
+	insert_new_entry(blockDirectory, columnGroupNo, firstRowNum, fileOffset,
+					 AOTupleId_MaxRowNum, false);
+	/* insert placeholder row containing placeholder entry */
+	write_minipage(blockDirectory, columnGroupNo, minipagePerColumnGroup);
+	/*
+	 * Delete the placeholder entry as it has no business being in memory.
+	 * Removing it from the current minipage will make the rest of the processing
+	 * for the current command behave as if it never existed. The absence of
+	 * this entry will help effectively "update" it once its replacement entry
+	 * is created in memory, in a subsequent call to insert_new_entry(),
+	 * followed by a write_minipage() which will make this "update" persistent.
+	 */
+	minipagePerColumnGroup->numMinipageEntries--;
+	/*
+	 * Increment the command counter, as we will be updating this temp row later
+	 * on in write_minipage().
+ */ + CommandCounterIncrement(); +} void AppendOnlyBlockDirectory_End_forInsert( @@ -1284,8 +1518,7 @@ AppendOnlyBlockDirectory_End_forSearch( { int groupNo; - if (blockDirectory->blkdirRel == NULL || - blockDirectory->blkdirIdx == NULL) + if (blockDirectory->blkdirRel == NULL) return; for (groupNo = 0; groupNo < blockDirectory->numColumnGroups; groupNo++) @@ -1308,7 +1541,8 @@ AppendOnlyBlockDirectory_End_forSearch( pfree(blockDirectory->scanKeys); pfree(blockDirectory->strategyNumbers); - index_close(blockDirectory->blkdirIdx, AccessShareLock); + if (blockDirectory->blkdirIdx) + index_close(blockDirectory->blkdirIdx, AccessShareLock); heap_close(blockDirectory->blkdirRel, AccessShareLock); MemoryContextDelete(blockDirectory->memoryContext); @@ -1368,3 +1602,27 @@ AppendOnlyBlockDirectory_End_addCol( MemoryContextDelete(blockDirectory->memoryContext); } + +void +AppendOnlyBlockDirectory_End_forUniqueChecks(AppendOnlyBlockDirectory *blockDirectory) +{ + Assert(RelationIsValid(blockDirectory->blkdirRel)); + + /* This must have been reset after each uniqueness check */ + Assert(blockDirectory->appendOnlyMetaDataSnapshot == InvalidSnapshot); + + Assert(RelationIsValid(blockDirectory->blkdirIdx)); + Assert(RelationIsValid(blockDirectory->blkdirRel)); + + ereportif(Debug_appendonly_print_blockdirectory, LOG, + (errmsg("Append-only block directory end for unique checks"), + errdetail("(aoRel = %u, blkdirrel = %u, blkdiridxrel = %u)", + blockDirectory->aoRel->rd_id, + blockDirectory->blkdirRel->rd_id, + blockDirectory->blkdirIdx->rd_id))); + + index_close(blockDirectory->blkdirIdx, AccessShareLock); + heap_close(blockDirectory->blkdirRel, AccessShareLock); + + MemoryContextDelete(blockDirectory->memoryContext); +} diff --git a/src/backend/access/appendonly/appendonlywriter.c b/src/backend/access/appendonly/appendonlywriter.c index 9405c3b1c6c..34eccbc828d 100644 --- a/src/backend/access/appendonly/appendonlywriter.c +++ b/src/backend/access/appendonly/appendonlywriter.c 
@@ -191,7 +191,7 @@ LockSegnoForWrite(Relation rel, int segno) elog(ERROR, "segfile %d is full", segno); /* Skip using the ao segment if not latest version (except as a compaction target) */ - if (formatversion != AORelationVersion_GetLatest()) + if (formatversion != AOSegfileFormatVersion_GetLatest()) elog(ERROR, "segfile %d is not of the latest version", segno); found = true; @@ -484,7 +484,7 @@ choose_segno_internal(Relation rel, List *avoid_segnos, choose_segno_mode mode) continue; /* Skip using the ao segment if not latest version (except as a compaction target) */ - if (formatversion != AORelationVersion_GetLatest()) + if (formatversion != AOSegfileFormatVersion_GetLatest()) continue; /* diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index 5dfc3d659c2..1eee611e022 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -283,6 +283,14 @@ table_index_fetch_tuple_check(Relation rel, bool call_again = false; bool found; + /* + * Optimized path for AO/CO relations as the aforementioned per-tuple + * overhead is significant for AO/CO relations. For details, please refer to + * table_index_unique_check(). 
+ */ + if (RelationIsAppendOptimized(rel)) + return table_index_unique_check(rel, tid, snapshot, all_dead); + slot = table_slot_create(rel, NULL); scan = table_index_fetch_begin(rel); found = table_index_fetch_tuple(scan, tid, snapshot, slot, &call_again, diff --git a/src/backend/catalog/aoblkdir.c b/src/backend/catalog/aoblkdir.c index d7c8e07f46b..bb64961d108 100644 --- a/src/backend/catalog/aoblkdir.c +++ b/src/backend/catalog/aoblkdir.c @@ -16,6 +16,8 @@ */ #include "postgres.h" +#include "access/aosegfiles.h" +#include "access/aocssegfiles.h" #include "access/table.h" #include "catalog/pg_am.h" #include "catalog/pg_opclass.h" @@ -118,4 +120,3 @@ AlterTableCreateAoBlkdirTable(Oid relOid) table_close(rel, NoLock); } - diff --git a/src/backend/catalog/gp_toolkit.sql b/src/backend/catalog/gp_toolkit.sql index 3bb54540d63..915c69e55c6 100644 --- a/src/backend/catalog/gp_toolkit.sql +++ b/src/backend/catalog/gp_toolkit.sql @@ -1880,6 +1880,16 @@ AS '$libdir/gp_ao_co_diagnostics' , 'gp_aocsseg_history_wrapper' LANGUAGE C STRICT EXECUTE ON ALL SEGMENTS; GRANT EXECUTE ON FUNCTION gp_toolkit.__gp_aocsseg_history(regclass) TO public; +CREATE FUNCTION gp_toolkit.__gp_aoblkdir(regclass) +RETURNS TABLE (tupleid tid, + segno integer, + columngroup_no integer, + entry_no integer, + first_row_no bigint, + file_offset bigint, + row_count bigint) +AS '$libdir/gp_ao_co_diagnostics.so', 'gp_aoblkdir_wrapper' LANGUAGE C STRICT; + CREATE FUNCTION gp_toolkit.__gp_aovisimap(regclass) RETURNS TABLE (tid tid, segno int, diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index 9bd26ad8de5..f4d2f0612c2 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -1764,7 +1764,8 @@ heap_create_with_catalog(const char *relname, InvalidOid, InvalidOid, InvalidOid, - InvalidOid); + InvalidOid, + AORelationVersion_GetLatest()); } /* diff --git a/src/backend/catalog/pg_appendonly.c b/src/backend/catalog/pg_appendonly.c index d3b49753af5..a051bd9288d 100644 --- 
a/src/backend/catalog/pg_appendonly.c +++ b/src/backend/catalog/pg_appendonly.c @@ -55,7 +55,8 @@ InsertAppendOnlyEntry(Oid relid, Oid blkdirrelid, Oid blkdiridxid, Oid visimaprelid, - Oid visimapidxid) + Oid visimapidxid, + int16 version) { Relation pg_appendonly_rel; HeapTuple pg_appendonly_tuple = NULL; @@ -93,10 +94,12 @@ InsertAppendOnlyEntry(Oid relid, values[Anum_pg_appendonly_columnstore - 1] = BoolGetDatum(columnstore); values[Anum_pg_appendonly_segrelid - 1] = ObjectIdGetDatum(segrelid); values[Anum_pg_appendonly_segfilecount- 1] = Int16GetDatum(0); + values[Anum_pg_appendonly_version - 1] = Int16GetDatum(version); values[Anum_pg_appendonly_blkdirrelid - 1] = ObjectIdGetDatum(blkdirrelid); values[Anum_pg_appendonly_blkdiridxid - 1] = ObjectIdGetDatum(blkdiridxid); values[Anum_pg_appendonly_visimaprelid - 1] = ObjectIdGetDatum(visimaprelid); values[Anum_pg_appendonly_visimapidxid - 1] = ObjectIdGetDatum(visimapidxid); + /* * form the tuple and insert it @@ -670,3 +673,19 @@ GetAppendOnlySegmentFilesCount(Relation rel) table_close(pg_aoseg_rel, AccessShareLock); return result; } + +int16 +AORelationVersion_Get(Relation rel) +{ + FormData_pg_appendonly aoFormData; + + GetAppendOnlyEntry(rel->rd_id, &aoFormData); + + return aoFormData.version; +} + +bool +AORelationVersion_Validate(Relation rel, int16 version) +{ + return AORelationVersion_Get(rel) >= version; +} diff --git a/src/backend/cdb/cdbappendonlystorageread.c b/src/backend/cdb/cdbappendonlystorageread.c index 7588fe15df4..76f9b4c8f59 100755 --- a/src/backend/cdb/cdbappendonlystorageread.c +++ b/src/backend/cdb/cdbappendonlystorageread.c @@ -267,7 +267,7 @@ AppendOnlyStorageRead_FinishOpenFile(AppendOnlyStorageRead *storageRead, { MemoryContext oldMemoryContext; - AORelationVersion_CheckValid(version); + AOSegfileFormatVersion_CheckValid(version); storageRead->file = file; storageRead->formatVersion = version; diff --git a/src/backend/cdb/cdbappendonlystoragewrite.c 
b/src/backend/cdb/cdbappendonlystoragewrite.c index 0f3155d5b25..16e5569378d 100755 --- a/src/backend/cdb/cdbappendonlystoragewrite.c +++ b/src/backend/cdb/cdbappendonlystoragewrite.c @@ -306,7 +306,7 @@ AppendOnlyStorageWrite_OpenFile(AppendOnlyStorageWrite *storageWrite, * Assume that we only write in the current latest format. (it's redundant * to pass the version number as argument, currently) */ - if (version != AORelationVersion_GetLatest()) + if (version != AOSegfileFormatVersion_GetLatest()) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("cannot write append-only table version %d", version))); diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 71c85687345..51e30960223 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -701,6 +701,7 @@ DefineIndex(Oid relationId, int root_save_nestlevel; int i; bool shouldDispatch; + Oid blkdirrelid = InvalidOid; shouldDispatch = (Gp_role == GP_ROLE_DISPATCH && ENABLE_DISPATCH() && @@ -819,7 +820,6 @@ DefineIndex(Oid relationId, rel = table_open(relationId, NoLock); if (RelationIsAppendOptimized(rel)) { - Oid blkdirrelid = InvalidOid; GetAppendOnlyEntryAuxOids(relationId, NULL, NULL, &blkdirrelid, NULL, NULL, NULL); if (!OidIsValid(blkdirrelid)) @@ -1071,9 +1071,32 @@ DefineIndex(Oid relationId, accessMethodName))); if (stmt->unique && RelationIsAppendOptimized(rel)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("append-only tables do not support unique indexes"))); + { + if (stmt->concurrent) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("append-only tables do not support unique indexes built concurrently"))); + + /* Additional version checks needed if block directory already exists */ + if (OidIsValid(blkdirrelid) && !AORelationVersion_Validate(rel, AORelationVersion_CB2)) + { + /* + * We currently raise an error in this scenario. 
We could alternatively + * recreate the block directory (and perform a relfile swap of the block + * directory relation, similar to alter table rewrites). Such a solution is + * complex enough and can be explored with appropriate user need. Block + * directory creation during DefineIndex() has exposed complexities in the + * past too, especially around locking when multiple indexes are being + * created at a time. + */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("append-only tables with older relation versions do not support unique indexes"), + errdetail("version found = %d, minimum version required = %d", AORelationVersion_Get(rel), + AORelationVersion_CB2), + errhint("ALTER TABLE SET WITH (REORGANIZE = true) before creating the unique index"))); + } + } /* * The TableAmRoutine of AO/AOCS does not implement the index_validate_scan method, diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 0bac5c35cba..ab15d32e47c 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -6789,8 +6789,26 @@ ATAocsWriteNewColumns(AlteredTableInfo *tab) rel = heap_open(tab->relid, NoLock); Assert(RelationIsAoCols(rel)); - /* Try to recycle any old segfiles first. */ - AppendOptimizedRecycleDeadSegments(rel); + /* + * There might be AWAITING_DROP segments occupying spaces for failing + * to drop at VACUUM in the case of cleaning up happened concurrently + * with earlier readers which was accessing the dead segment files. + * + * We used to call AppendOptimizedRecycleDeadSegments() (current name is + * ao_vacuum_rel_recycle_dead_segments) to recycle those segfiles to save + * spaces in this scenario. But it didn't do corresponding index tuples + * cleanup for unknown reason. + * + * After optimizing VACUUM AO strategy, we did refactor for + * AppendOptimizedRecycleDeadSegments() a little bit and combine + * dead segfiles cleanup with corresponding indexes cleanup together. 
+ * While it seems to be impossible to pass index vacuuming parameter in + * this scenario, so we removed AppendOptimizedRecycleDeadSegments() out + * of this function and dedicated it to be called only in VACUUM scenario. + * + * We are supposed to be fine without recycling spaces here, or find + * another way to fix it if that does become a real problem. + */ segInfos = GetAllAOCSFileSegInfo(rel, snapshot, &nseg, NULL); basepath = relpathbackend(rel->rd_node, rel->rd_backend, MAIN_FORKNUM); diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 0c8ea4aaf20..d8cb3975af5 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -2558,27 +2558,7 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, LockRelation(rel, ShareLock); } - /* - * Do the actual work --- either FULL or "lazy" vacuum - */ - if (ao_vacuum_phase == VACOPT_AO_PRE_CLEANUP_PHASE) - { - ao_vacuum_rel_pre_cleanup(rel, params->options, params, vac_strategy); - } - else if (ao_vacuum_phase == VACOPT_AO_COMPACT_PHASE) - { - ao_vacuum_rel_compact(rel, params->options, params, vac_strategy); - } - else if (ao_vacuum_phase == VACOPT_AO_POST_CLEANUP_PHASE) - { - ao_vacuum_rel_post_cleanup(rel, params->options, params, vac_strategy); - } - else if (is_appendoptimized) - { - /* Do nothing here, we will launch the stages later */ - Assert(ao_vacuum_phase == 0); - } - else if ((params->options & VACOPT_FULL)) + if (!is_appendoptimized && (params->options & VACOPT_FULL)) { ClusterParams cluster_params = {0}; @@ -2592,7 +2572,7 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, /* VACUUM FULL is now a variant of CLUSTER; see cluster.c */ cluster_rel(relid, InvalidOid, &cluster_params); } - else + else /* Heap vacuum or AO/CO vacuum in specific phase */ table_relation_vacuum(rel, params, vac_strategy); /* Roll back any GUC changes executed by index functions */ @@ -2611,10 +2591,13 @@ vacuum_rel(Oid relid, RangeVar *relation, 
VacuumParams *params, PopActiveSnapshot(); CommitTransactionCommand(); + /* entrance of Append-Optimized table vacuum */ if (is_appendoptimized && ao_vacuum_phase == 0) { - int orig_options = params->options; + int orig_options = params->options; + /* orchestrate the AO vacuum phases */ + /* * Do cleanup first, to reclaim as much space as possible that * was left behind from previous VACUUMs. This runs under local @@ -2627,7 +2610,8 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, params->options = orig_options | VACOPT_AO_COMPACT_PHASE; vacuum_rel(relid, this_rangevar, params, false); - /* Do a final round of cleanup. Hopefully, this can drop the segments + /* + * Do a final round of cleanup. Hopefully, this can drop the segments * that were compacted in the previous phase. */ params->options = orig_options | VACOPT_AO_POST_CLEANUP_PHASE; diff --git a/src/backend/commands/vacuum_ao.c b/src/backend/commands/vacuum_ao.c index e9a4b1265d6..17c4d7826f2 100644 --- a/src/backend/commands/vacuum_ao.c +++ b/src/backend/commands/vacuum_ao.c @@ -115,6 +115,7 @@ */ #include "postgres.h" +#include "access/table.h" #include "access/aocs_compaction.h" #include "access/appendonlywriter.h" #include "access/appendonly_compaction.h" @@ -142,19 +143,10 @@ #include "utils/pg_rusage.h" #include "cdb/cdbappendonlyblockdirectory.h" -/* - * State information used during the vacuum of indexes on append-only tables - */ -typedef struct AppendOnlyIndexVacuumState -{ - AppendOnlyVisimap visiMap; - AppendOnlyBlockDirectory blockDirectory; - AppendOnlyBlockDirectoryEntry blockDirectoryEntry; -} AppendOnlyIndexVacuumState; static void vacuum_appendonly_index(Relation indexRelation, - AppendOnlyIndexVacuumState *vacuumIndexState, double rel_tuple_count, + Bitmapset *dead_segs, int elevel, BufferAccessStrategy bstrategy); @@ -163,19 +155,17 @@ static bool appendonly_tid_reaped(ItemPointer itemptr, void *state); static void vacuum_appendonly_fill_stats(Relation aorel, Snapshot 
snapshot, int elevel, BlockNumber *rel_pages, double *rel_tuples, bool *relhasindex, BlockNumber *total_file_segs); -static int vacuum_appendonly_indexes(Relation aoRelation, int options, +static int vacuum_appendonly_indexes(Relation aoRelation, int options, Bitmapset *dead_segs, BufferAccessStrategy bstrategy); -static void scan_index(Relation indrel, - AppendOnlyIndexVacuumState *vacuumIndexState, - double num_tuples, - int elevel, BufferAccessStrategy vac_strategy); +static void ao_vacuum_rel_recycle_dead_segments(Relation onerel, VacuumParams *params, + BufferAccessStrategy bstrategy); -void -ao_vacuum_rel_pre_cleanup(Relation onerel, int options, VacuumParams *params, - BufferAccessStrategy bstrategy) +static void +ao_vacuum_rel_pre_cleanup(Relation onerel, VacuumParams *params, BufferAccessStrategy bstrategy) { char *relname; int elevel; + int options = params->options; if (options & VACOPT_VERBOSE) elevel = INFO; @@ -185,7 +175,13 @@ ao_vacuum_rel_pre_cleanup(Relation onerel, int options, VacuumParams *params, if (Gp_role == GP_ROLE_DISPATCH) elevel = DEBUG2; /* vacuum and analyze messages aren't interesting from the QD */ - /* + relname = RelationGetRelationName(onerel); + ereport(elevel, + (errmsg("vacuuming \"%s.%s\"", + get_namespace_name(RelationGetNamespace(onerel)), + relname))); + + /* * Truncate AWAITING_DROP segments that are no longer visible to anyone * to 0 bytes. We cannot actually remove them yet, because there might * still be index entries pointing to them. We cannot recycle the segments @@ -197,14 +193,7 @@ ao_vacuum_rel_pre_cleanup(Relation onerel, int options, VacuumParams *params, * * This could run in a local transaction. 
*/ - - relname = RelationGetRelationName(onerel); - ereport(elevel, - (errmsg("vacuuming \"%s.%s\"", - get_namespace_name(RelationGetNamespace(onerel)), - relname))); - - AppendOptimizedRecycleDeadSegments(onerel); + ao_vacuum_rel_recycle_dead_segments(onerel, params, bstrategy); /* * Also truncate all live segments to the EOF values stored in pg_aoseg. @@ -214,9 +203,8 @@ ao_vacuum_rel_pre_cleanup(Relation onerel, int options, VacuumParams *params, } -void -ao_vacuum_rel_post_cleanup(Relation onerel, int options, VacuumParams *params, - BufferAccessStrategy bstrategy) +static void +ao_vacuum_rel_post_cleanup(Relation onerel, VacuumParams *params, BufferAccessStrategy bstrategy) { BlockNumber relpages; double reltuples; @@ -226,6 +214,7 @@ ao_vacuum_rel_post_cleanup(Relation onerel, int options, VacuumParams *params, */ BlockNumber total_file_segs; int elevel; + int options = params->options; TransactionId OldestXmin; TransactionId FreezeLimit; MultiXactId MultiXactCutoff; @@ -240,20 +229,21 @@ ao_vacuum_rel_post_cleanup(Relation onerel, int options, VacuumParams *params, if (Gp_role == GP_ROLE_DISPATCH) elevel = DEBUG2; /* vacuum and analyze messages aren't interesting from the QD */ - /*----- + /* * This could run in a *local* transaction: * * 1. Recycled any dead AWAITING_DROP segments, like in the * pre-cleanup phase. * * 2. Vacuum indexes. - *---- + * + * 3. Drop/Truncate dead segments. + * + * 4. Update statistics. 
*/ Assert(RelationIsAoRows(onerel) || RelationIsAoCols(onerel)); - AppendOptimizedRecycleDeadSegments(onerel); - - vacuum_appendonly_indexes(onerel, options, bstrategy); + ao_vacuum_rel_recycle_dead_segments(onerel, params, bstrategy); /* Update statistics in pg_class */ vacuum_appendonly_fill_stats(onerel, GetActiveSnapshot(), @@ -285,17 +275,16 @@ ao_vacuum_rel_post_cleanup(Relation onerel, int options, VacuumParams *params, true /* isvacuum */); } -void -ao_vacuum_rel_compact(Relation onerel, int options, VacuumParams *params, - BufferAccessStrategy bstrategy) +static void +ao_vacuum_rel_compact(Relation onerel, VacuumParams *params, BufferAccessStrategy bstrategy) { int compaction_segno; int insert_segno; List *compacted_segments = NIL; List *compacted_and_inserted_segments = NIL; - Snapshot appendOnlyMetaDataSnapshot = RegisterSnapshot(GetCatalogSnapshot(InvalidOid)); char *relname; int elevel; + int options = params->options; /* * This should run in a distributed transaction. But also allow utility @@ -372,8 +361,6 @@ ao_vacuum_rel_compact(Relation onerel, int options, VacuumParams *params, */ CommandCounterIncrement(); } - - UnregisterSnapshot(appendOnlyMetaDataSnapshot); } /* @@ -393,62 +380,73 @@ ao_vacuum_rel(Relation rel, VacuumParams *params, BufferAccessStrategy bstrategy * Do the actual work --- either FULL or "lazy" vacuum */ if (ao_vacuum_phase == VACOPT_AO_PRE_CLEANUP_PHASE) - ao_vacuum_rel_pre_cleanup(rel, params->options, params, bstrategy); + ao_vacuum_rel_pre_cleanup(rel, params, bstrategy); else if (ao_vacuum_phase == VACOPT_AO_COMPACT_PHASE) - ao_vacuum_rel_compact(rel, params->options, params, bstrategy); + ao_vacuum_rel_compact(rel, params, bstrategy); else if (ao_vacuum_phase == VACOPT_AO_POST_CLEANUP_PHASE) - ao_vacuum_rel_post_cleanup(rel, params->options, params, bstrategy); + ao_vacuum_rel_post_cleanup(rel, params, bstrategy); else /* Do nothing here, we will launch the stages later */ Assert(ao_vacuum_phase == 0); } - -static bool 
-vacuum_appendonly_index_should_vacuum(Relation aoRelation, - int options, - Snapshot snapshot, - AppendOnlyIndexVacuumState *vacuumIndexState, - double *rel_tuple_count) +/* + * Recycling AWAITING_DROP segments. + */ +static void +ao_vacuum_rel_recycle_dead_segments(Relation onerel, VacuumParams *params, BufferAccessStrategy bstrategy) { - int64 hidden_tupcount; - FileSegTotals *totals; - - Assert(RelationIsAppendOptimized(aoRelation)); + Bitmapset *dead_segs; + int options = params->options; + bool need_drop; - if (Gp_role == GP_ROLE_DISPATCH) + dead_segs = AppendOptimizedCollectDeadSegments(onerel); + need_drop = !bms_is_empty(dead_segs); + if (need_drop) { - if (rel_tuple_count) - { - *rel_tuple_count = 0.0; - } - return false; - } - - if (RelationIsAoRows(aoRelation)) - { - totals = GetSegFilesTotals(aoRelation, snapshot); + /* + * Vacuum indexes only when we do find AWAITING_DROP segments. + * + * Do index vacuuming before dropping dead segments for data + * consistency and crash safety. If dropping dead segments before + * cleaning up index tuples, the following issues may occur: + * + * a) The dead segment file becomes available as soon as the drop + * completes. Concurrent inserts may fill it with new tuples which + * might then be deleted wrongly by the following index vacuuming; + * + * b) A crash in-between ao_vacuum_rel_recycle_dead_segments() + * and vacuum_appendonly_indexes() results in losing the opportunity + * to fully clean index entries, as the state recording which index + * tuples to delete would be lost in this case. + * + * So make sure index vacuuming is based on persistent information + * (AWAITING_DROP state in pg_aoseg) to clean up dead index tuples + * effectively. + */ + vacuum_appendonly_indexes(onerel, options, dead_segs, bstrategy); + /* + * Truncate above collected AWAITING_DROP segments to 0 byte. 
+ * AppendOptimizedCollectDeadSegments() should guarantee that + * no transaction is able to access the dead segments for being + * marked as AWAITING_DROP as well as cutoff xid screening. + * ExclusiveLock will be held in case of concurrent VACUUM being + * on the same file. + */ + AppendOptimizedDropDeadSegments(onerel, dead_segs); } else { - Assert(RelationIsAoCols(aoRelation)); - totals = GetAOCSSSegFilesTotals(aoRelation, snapshot); - } - hidden_tupcount = AppendOnlyVisimap_GetRelationHiddenTupleCount(&vacuumIndexState->visiMap); - - if (rel_tuple_count) - { - *rel_tuple_count = (double)(totals->totaltuples - hidden_tupcount); - Assert((*rel_tuple_count) > -1.0); + /* + * If no AWAITING_DROP segments were found, we called + * vacuum_appendonly_indexes() in post_cleanup phase + * for updating statistics. + */ + if ((options & VACUUM_AO_PHASE_MASK) == VACOPT_AO_POST_CLEANUP_PHASE) + vacuum_appendonly_indexes(onerel, options, dead_segs, bstrategy); } - pfree(totals); - - if (hidden_tupcount > 0 || (options & VACOPT_FULL) != 0) - { - return true; - } - return false; + bms_free(dead_segs); } /* @@ -456,30 +454,18 @@ vacuum_appendonly_index_should_vacuum(Relation aoRelation, * * Perform a vacuum on all indexes of an append-only relation. * - * The page and tuplecount information in vacrelstats are used, the - * nindex value is set by this function. - * * It returns the number of indexes on the relation. 
*/ static int -vacuum_appendonly_indexes(Relation aoRelation, int options, +vacuum_appendonly_indexes(Relation aoRelation, int options, Bitmapset *dead_segs, BufferAccessStrategy bstrategy) { - int reindex_count = 1; int i; Relation *Irel; int nindexes; - AppendOnlyIndexVacuumState vacuumIndexState; - FileSegInfo **segmentFileInfo = NULL; /* Might be a casted AOCSFileSegInfo */ - int totalSegfiles; - Snapshot appendOnlyMetaDataSnapshot; - Oid visimaprelid; - Oid visimapidxid; Assert(RelationIsAppendOptimized(aoRelation)); - memset(&vacuumIndexState, 0, sizeof(vacuumIndexState)); - if (Debug_appendonly_print_compaction) elog(LOG, "Vacuum indexes for append-only relation %s", RelationGetRelationName(aoRelation)); @@ -490,94 +476,35 @@ vacuum_appendonly_indexes(Relation aoRelation, int options, else vac_open_indexes(aoRelation, RowExclusiveLock, &nindexes, &Irel); - appendOnlyMetaDataSnapshot = GetActiveSnapshot(); - - if (RelationIsAoRows(aoRelation)) - { - segmentFileInfo = GetAllFileSegInfo(aoRelation, - appendOnlyMetaDataSnapshot, - &totalSegfiles, - NULL); - } - else - { - Assert(RelationIsAoCols(aoRelation)); - segmentFileInfo = (FileSegInfo **) GetAllAOCSFileSegInfo(aoRelation, - appendOnlyMetaDataSnapshot, - &totalSegfiles, - NULL); - } - - GetAppendOnlyEntryAuxOids(aoRelation->rd_id, - appendOnlyMetaDataSnapshot, - NULL, NULL, NULL, - &visimaprelid, &visimapidxid); - - AppendOnlyVisimap_Init( - &vacuumIndexState.visiMap, - visimaprelid, - visimapidxid, - AccessShareLock, - appendOnlyMetaDataSnapshot); - - AppendOnlyBlockDirectory_Init_forSearch(&vacuumIndexState.blockDirectory, - appendOnlyMetaDataSnapshot, - segmentFileInfo, - totalSegfiles, - aoRelation, - 1, - RelationIsAoCols(aoRelation), - NULL); - /* Clean/scan index relation(s) */ if (Irel != NULL) { - double rel_tuple_count = 0.0; - int elevel; + int elevel; - /* just scan indexes to update statistic */ if (options & VACOPT_VERBOSE) elevel = INFO; else elevel = DEBUG2; - if 
(vacuum_appendonly_index_should_vacuum(aoRelation, options, - appendOnlyMetaDataSnapshot, - &vacuumIndexState, - &rel_tuple_count)) + /* just scan indexes to update statistic */ + if (Gp_role == GP_ROLE_DISPATCH || bms_is_empty(dead_segs)) { - Assert(rel_tuple_count > -1.0); - for (i = 0; i < nindexes; i++) { - vacuum_appendonly_index(Irel[i], &vacuumIndexState, - rel_tuple_count, - elevel, - bstrategy); + scan_index(Irel[i], aoRelation , elevel, bstrategy); } - reindex_count++; } else { for (i = 0; i < nindexes; i++) - scan_index(Irel[i], &vacuumIndexState, rel_tuple_count, elevel, bstrategy); - } - } - - AppendOnlyVisimap_Finish(&vacuumIndexState.visiMap, AccessShareLock); - AppendOnlyBlockDirectory_End_forSearch(&vacuumIndexState.blockDirectory); - - if (segmentFileInfo) - { - if (RelationIsAoRows(aoRelation)) - { - FreeAllSegFileInfo(segmentFileInfo, totalSegfiles); - } - else - { - FreeAllAOCSSegFileInfo((AOCSFileSegInfo **)segmentFileInfo, totalSegfiles); + { + vacuum_appendonly_index(Irel[i], + aoRelation->rd_rel->reltuples, + dead_segs, + elevel, + bstrategy); + } } - pfree(segmentFileInfo); } vac_close_indexes(nindexes, Irel, NoLock); @@ -593,18 +520,17 @@ vacuum_appendonly_indexes(Relation aoRelation, int options, */ static void vacuum_appendonly_index(Relation indexRelation, - AppendOnlyIndexVacuumState *vacuumIndexState, double rel_tuple_count, + Bitmapset *dead_segs, int elevel, BufferAccessStrategy bstrategy) { - Assert(RelationIsValid(indexRelation)); - Assert(vacuumIndexState); - IndexBulkDeleteResult *stats; IndexVacuumInfo ivinfo; PGRUsage ru0; + Assert(RelationIsValid(indexRelation)); + pg_rusage_init(&ru0); ivinfo.index = indexRelation; @@ -614,7 +540,9 @@ vacuum_appendonly_index(Relation indexRelation, /* Do bulk deletion */ stats = index_bulk_delete(&ivinfo, NULL, appendonly_tid_reaped, - (void *) vacuumIndexState); + (void *) dead_segs); + + SIMPLE_FAULT_INJECTOR("vacuum_ao_after_index_delete"); /* Do post-VACUUM cleanup */ stats = 
index_vacuum_cleanup(&ivinfo, stats); @@ -651,70 +579,26 @@ vacuum_appendonly_index(Relation indexRelation, pfree(stats); } -static bool -appendonly_tid_reaped_check_block_directory(AppendOnlyIndexVacuumState *vacuumState, - AOTupleId *aoTupleId) -{ - if (vacuumState->blockDirectory.currentSegmentFileNum == - AOTupleIdGet_segmentFileNum(aoTupleId) && - AppendOnlyBlockDirectoryEntry_RangeHasRow(&vacuumState->blockDirectoryEntry, - AOTupleIdGet_rowNum(aoTupleId))) - { - return true; - } - - if (!AppendOnlyBlockDirectory_GetEntry(&vacuumState->blockDirectory, - aoTupleId, - 0, - &vacuumState->blockDirectoryEntry)) - { - return false; - } - return (vacuumState->blockDirectory.currentSegmentFileNum == - AOTupleIdGet_segmentFileNum(aoTupleId) && - AppendOnlyBlockDirectoryEntry_RangeHasRow(&vacuumState->blockDirectoryEntry, - AOTupleIdGet_rowNum(aoTupleId))); -} - /* * appendonly_tid_reaped() * - * Is a particular tid for an appendonly reaped? - * state should contain an integer list of all compacted - * segment files. + * Is a particular tid for an appendonly reaped? The input state + * is a bitmap of dropped segnos. The index entry is reaped only + * if its segno is a member of dead_segs. This way, there is no + * need to scan the visibility map, so the performance is good. * * This has the right signature to be an IndexBulkDeleteCallback. 
*/ static bool appendonly_tid_reaped(ItemPointer itemptr, void *state) { - AOTupleId *aoTupleId; - AppendOnlyIndexVacuumState *vacuumState; - bool reaped; - - Assert(itemptr); - Assert(state); - - aoTupleId = (AOTupleId *)itemptr; - vacuumState = (AppendOnlyIndexVacuumState *)state; - - reaped = !appendonly_tid_reaped_check_block_directory(vacuumState, - aoTupleId); - if (!reaped) - { - /* Also check visi map */ - reaped = !AppendOnlyVisimap_IsVisible(&vacuumState->visiMap, - aoTupleId); - } + Bitmapset *dead_segs = (Bitmapset *) state; + int segno = AOTupleIdGet_segmentFileNum((AOTupleId *)itemptr); - if (Debug_appendonly_print_compaction) - ereport(DEBUG3, - (errmsg("Index vacuum %s %d", - AOTupleIdToString(aoTupleId), reaped))); - return reaped; + return bms_is_member(segno, dead_segs); } - /* * Fills in the relation statistics for an append-only relation. * @@ -795,10 +679,9 @@ vacuum_appendonly_fill_stats(Relation aorel, Snapshot snapshot, int elevel, * * We use this when we have no deletions to do. 
*/ -static void +void scan_index(Relation indrel, - AppendOnlyIndexVacuumState *vacuumIndexState, - double num_tuples, + Relation aorel, int elevel, BufferAccessStrategy vac_strategy) { IndexBulkDeleteResult *stats; @@ -811,15 +694,12 @@ scan_index(Relation indrel, ivinfo.analyze_only = false; ivinfo.estimated_count = false; ivinfo.message_level = elevel; - ivinfo.num_heap_tuples = num_tuples; + ivinfo.num_heap_tuples = aorel->rd_rel->reltuples; ivinfo.strategy = vac_strategy; - /* Do bulk deletion */ - stats = index_bulk_delete(&ivinfo, NULL, appendonly_tid_reaped, - (void *) vacuumIndexState); /* Do post-VACUUM cleanup */ - stats = index_vacuum_cleanup(&ivinfo, stats); + stats = index_vacuum_cleanup(&ivinfo, NULL); if (!stats) return; diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index 74becdc85df..1f524040fd3 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -273,6 +273,14 @@ ExecCloseIndices(ResultRelInfo *resultRelInfo) * * If 'arbiterIndexes' is nonempty, noDupErr applies only to * those indexes. NIL means noDupErr applies to all indexes. + * + * GPDB: gp_bypass_unique_check is introduced so that routines + * such as AO vacuum which don't need to run uniqueness checks + * while inserting tuples can do so. + * + * CAUTION: this must not be called for a HOT update. + * We can't defend against that here for lack of info. + * Should we change the API to make it safer? * ---------------------------------------------------------------- */ List * @@ -388,7 +396,7 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, * For a speculative insertion (used by INSERT ... ON CONFLICT), do * the same as for a deferrable unique index. 
*/ - if (!indexRelation->rd_index->indisunique) + if (!indexRelation->rd_index->indisunique || estate->gp_bypass_unique_check) checkUnique = UNIQUE_CHECK_NO; else if (applyNoDupErr) checkUnique = UNIQUE_CHECK_PARTIAL; diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c index 0ce8f755548..aaf6f13434a 100644 --- a/src/backend/executor/execUtils.c +++ b/src/backend/executor/execUtils.c @@ -207,6 +207,8 @@ CreateExecutorState(void) estate->useMppParallelMode = false; estate->eliminateAliens = false; + estate->gp_bypass_unique_check = false; + /* * Return the executor state structure */ diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index 3193dd320b3..719049a646b 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -964,19 +964,6 @@ LockAcquireExtended(const LOCKTAG *locktag, } } - /* - * We don't acquire any other heavyweight lock while holding the relation - * extension lock. We do allow to acquire the same relation extension - * lock more than once but that case won't reach here. - */ - Assert(!IsRelationExtensionLockHeld); - - /* - * We don't acquire any other heavyweight lock while holding the page lock - * except for relation extension. 
- */ - Assert(!IsPageLockHeld || - (locktag->locktag_type == LOCKTAG_RELATION_EXTEND)); /* * Prepare to emit a WAL record if acquisition of this lock needs to be diff --git a/src/include/access/appendonly_compaction.h b/src/include/access/appendonly_compaction.h index e07c5a65a60..ce44d2a4979 100644 --- a/src/include/access/appendonly_compaction.h +++ b/src/include/access/appendonly_compaction.h @@ -21,7 +21,8 @@ #define APPENDONLY_COMPACTION_SEGNO_INVALID (-1) -extern void AppendOptimizedRecycleDeadSegments(Relation aorel); +extern Bitmapset *AppendOptimizedCollectDeadSegments(Relation aorel); +extern void AppendOptimizedDropDeadSegments(Relation aorel, Bitmapset *segnos); extern void AppendOnlyCompact(Relation aorel, int compaction_segno, int *insert_segno, diff --git a/src/include/access/appendonly_visimap.h b/src/include/access/appendonly_visimap.h index ce7257cd467..ababd194296 100644 --- a/src/include/access/appendonly_visimap.h +++ b/src/include/access/appendonly_visimap.h @@ -132,6 +132,14 @@ void AppendOnlyVisimapScan_Init( LOCKMODE lockmode, Snapshot appendonlyMetadataSnapshot); +extern void AppendOnlyVisimap_Init_forUniqueCheck( + AppendOnlyVisimap *visiMap, + Relation aoRel, + Snapshot snapshot); + +extern void AppendOnlyVisimap_Finish_forUniquenessChecks( + AppendOnlyVisimap *visiMap); + bool AppendOnlyVisimapScan_GetNextInvisible( AppendOnlyVisimapScan *visiMapScan, AOTupleId *tupleId); @@ -149,4 +157,35 @@ TM_Result AppendOnlyVisimapDelete_Hide( void AppendOnlyVisimapDelete_Finish( AppendOnlyVisimapDelete *visiMapDelete); + +/* + * AppendOnlyVisimap_UniqueCheck + * + * During a uniqueness check, look up the visimap to see if a tuple was deleted + * by a *committed* transaction. + * + * Note: We need to use the passed in per-tuple snapshot to perform the block + * directory lookup. See AppendOnlyVisimap_Init_forUniqueCheck() for details on + * why we can't set up the metadata snapshot at init time. 
+ * If this is part of an update, we are reusing the visimap from the delete half + * of the update, so better restore its snapshot once we are done. + */ +static inline bool AppendOnlyVisimap_UniqueCheck( + AppendOnlyVisimap *visiMap, + AOTupleId *aoTupleId, + Snapshot appendOnlyMetaDataSnapshot) +{ + Snapshot save_snapshot; + bool visible; + + Assert(appendOnlyMetaDataSnapshot->snapshot_type == SNAPSHOT_DIRTY || + appendOnlyMetaDataSnapshot->snapshot_type == SNAPSHOT_SELF); + + save_snapshot = visiMap->visimapStore.snapshot; + visiMap->visimapStore.snapshot = appendOnlyMetaDataSnapshot; + visible = AppendOnlyVisimap_IsVisible(visiMap, aoTupleId); + visiMap->visimapStore.snapshot = save_snapshot; + return visible; +} + #endif diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index ff0bf69a670..d9a3b37e9d3 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -458,6 +458,11 @@ typedef struct TableAmRoutine TupleTableSlot *slot, bool *call_again, bool *all_dead); + /* See table_index_unique_check() for details */ + bool (*index_unique_check) (Relation rel, + ItemPointer tid, + Snapshot snapshot, + bool *all_dead); /* ------------------------------------------------------------------------ * Callbacks for non-modifying operations on individual tuples @@ -1325,6 +1330,24 @@ extern bool table_index_fetch_tuple_check(Relation rel, Snapshot snapshot, bool *all_dead); +/* + * GPDB: Check if a tuple exists for a given tid obtained from an index. + * This is used to entertain unique index checks on AO/CO tables. For heap + * tables, the regular method of beginindexscan..fetchtuple..endindexscan + * can be used. Creating/destroying scan descriptors for AO/CO tables are + * too expensive to be done on a per-tuple basis. + * + * This has to have an identical signature to table_index_fetch_tuple_check(). 
+ */ +static inline bool +table_index_unique_check(Relation rel, + ItemPointer tid, + Snapshot snapshot, + bool *all_dead) +{ + return rel->rd_tableam->index_unique_check(rel, tid, snapshot, + all_dead); +} /* ------------------------------------------------------------------------ * Functions for non-modifying operations on individual tuples diff --git a/src/include/catalog/pg_appendonly.h b/src/include/catalog/pg_appendonly.h index 1800c33e278..45a0ab1ca37 100644 --- a/src/include/catalog/pg_appendonly.h +++ b/src/include/catalog/pg_appendonly.h @@ -34,6 +34,7 @@ CATALOG(pg_appendonly,6105,AppendOnlyRelationId) bool columnstore; /* true if orientation is column */ Oid segrelid; /* OID of aoseg table; 0 if none */ int16 segfilecount; /* the (per seg) average total number of segment file */ + int16 version; /* AO relation version see AORelationVersion for detail */ Oid blkdirrelid; /* OID of aoblkdir table; 0 if none */ Oid blkdiridxid; /* if aoblkdir table, OID of aoblkdir index */ Oid visimaprelid; /* OID of the aovisimap table */ @@ -59,29 +60,46 @@ typedef FormData_pg_appendonly *Form_pg_appendonly; /* * AORelationVersion defines valid values for the version of AppendOnlyEntry. - * NOTE: When this is updated, AoRelationVersion_GetLatest() must be updated accordingly. + * NOTE: When this is updated, AORelationVersion_GetLatest() must be updated accordingly. */ typedef enum AORelationVersion { - AORelationVersion_None = 0, - AORelationVersion_Original = 1, /* first valid version */ - AORelationVersion_Aligned64bit = 2, /* version where the fixes for AOBlock and MemTuple - * were introduced, see MPP-7251 and MPP-7372. */ - AORelationVersion_PG83 = 3, /* Same as Aligned64bit, but numerics are stored - * in the PostgreSQL 8.3 format. 
*/ - MaxAORelationVersion /* must always be last */ + AORelationVersion_None = 0, + AORelationVersion_CB1 = 1, + AORelationVersion_CB2 = 2, /* version after aoblkdir removed the hole-filling + * mechanism used for unique index */ + MaxAORelationVersion } AORelationVersion; -#define AORelationVersion_GetLatest() AORelationVersion_PG83 - +#define AORelationVersion_GetLatest() AORelationVersion_CB2 #define AORelationVersion_IsValid(version) \ - (version > AORelationVersion_None && version < MaxAORelationVersion) + ((version) > AORelationVersion_None && (version) < MaxAORelationVersion) + +/* + * AOSegfileFormatVersion defines valid values for the on-disk segment file format version. + * NOTE: When this is updated, AOSegfileFormatVersion_GetLatest() must be updated accordingly. + */ +typedef enum AOSegfileFormatVersion +{ + AOSegfileFormatVersion_None = 0, + AOSegfileFormatVersion_Original = 1, /* first valid version */ + AOSegfileFormatVersion_Aligned64bit = 2, /* version where the fixes for AOBlock and MemTuple + * were introduced, see MPP-7251 and MPP-7372. */ + AOSegfileFormatVersion_GP5 = 3, /* Same as Aligned64bit, but numerics are stored + * in the PostgreSQL 8.3 format. 
*/ + MaxAOSegfileFormatVersion /* must always be last */ +} AOSegfileFormatVersion; + +#define AOSegfileFormatVersion_GetLatest() AOSegfileFormatVersion_GP5 + +#define AOSegfileFormatVersion_IsValid(version) \ + (version > AOSegfileFormatVersion_None && version < MaxAOSegfileFormatVersion) extern bool Debug_appendonly_print_verify_write_block; -static inline void AORelationVersion_CheckValid(int version) +static inline void AOSegfileFormatVersion_CheckValid(int version) { - if (!AORelationVersion_IsValid(version)) + if (!AOSegfileFormatVersion_IsValid(version)) { ereport(Debug_appendonly_print_verify_write_block?PANIC:ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), @@ -91,13 +109,13 @@ static inline void AORelationVersion_CheckValid(int version) } /* - * Versions higher than AORelationVersion_Original include the fixes for AOBlock and + * Versions higher than AOSegfileFormatVersion_Original include the fixes for AOBlock and * MemTuple alignment. */ #define IsAOBlockAndMemtupleAlignmentFixed(version) \ ( \ - AORelationVersion_CheckValid(version), \ - (version > AORelationVersion_Original) \ + AOSegfileFormatVersion_CheckValid(version), \ + (version > AOSegfileFormatVersion_Original) \ ) /* @@ -105,8 +123,8 @@ static inline void AORelationVersion_CheckValid(int version) */ #define PG82NumericConversionNeeded(version) \ ( \ - AORelationVersion_CheckValid(version), \ - (version < AORelationVersion_PG83) \ + AOSegfileFormatVersion_CheckValid(version), \ + (version < AOSegfileFormatVersion_GP5) \ ) extern void @@ -121,7 +139,8 @@ InsertAppendOnlyEntry(Oid relid, Oid blkdirrelid, Oid blkdiridxid, Oid visimaprelid, - Oid visimapidxid); + Oid visimapidxid, + int16 version); void GetAppendOnlyEntryAttributes(Oid relid, @@ -147,7 +166,6 @@ GetAppendOnlyEntryAuxOids(Oid relid, Oid *visimaprelid, Oid *visimapidxid); - void GetAppendOnlyEntry(Oid relid, Form_pg_appendonly aoEntry); /* @@ -171,4 +189,10 @@ SwapAppendonlyEntries(Oid entryRelId1, Oid entryRelId2); extern 
int16 GetAppendOnlySegmentFilesCount(Relation rel); +extern int16 +AORelationVersion_Get(Relation rel); + +extern bool +AORelationVersion_Validate(Relation rel, int16 version); + #endif /* PG_APPENDONLY_H */ diff --git a/src/include/cdb/cdbaocsam.h b/src/include/cdb/cdbaocsam.h index 1cf80410190..ed06d6cb522 100644 --- a/src/include/cdb/cdbaocsam.h +++ b/src/include/cdb/cdbaocsam.h @@ -77,6 +77,8 @@ typedef struct AOCSInsertDescData bool insertMultiFiles; /* insert into multi files */ dlist_node node; /* node of segfiles list */ int range; /* inserted tuples of each range */ + /* flag for insert placeholder in unique index */ + bool placeholderInserted; } AOCSInsertDescData; typedef AOCSInsertDescData *AOCSInsertDesc; @@ -258,9 +260,40 @@ typedef struct AOCSFetchDescData typedef AOCSFetchDescData *AOCSFetchDesc; -typedef struct AOCSUpdateDescData *AOCSUpdateDesc; +/* + * AOCSDeleteDescData is used for delete data from AOCS relations. + * It serves an equivalent purpose as AppendOnlyScanDescData + * (relscan.h) only that the later is used for scanning append-only + * relations. + */ +typedef struct AOCSDeleteDescData +{ + /* + * Relation to delete from + */ + Relation aod_rel; + + /* + * visibility map + */ + AppendOnlyVisimap visibilityMap; + + /* + * Visimap delete support structure. Used to handle out-of-order deletes + */ + AppendOnlyVisimapDelete visiMapDelete; + +} AOCSDeleteDescData; typedef struct AOCSDeleteDescData *AOCSDeleteDesc; +typedef struct AOCSUniqueCheckDescData +{ + AppendOnlyBlockDirectory *blockDirectory; + AppendOnlyVisimap *visimap; +} AOCSUniqueCheckDescData; + +typedef struct AOCSUniqueCheckDescData *AOCSUniqueCheckDesc; + /* * Descriptor for fetches from table via an index. 
*/ @@ -341,12 +374,6 @@ extern bool aocs_fetch(AOCSFetchDesc aocsFetchDesc, AOTupleId *aoTupleId, TupleTableSlot *slot); extern void aocs_fetch_finish(AOCSFetchDesc aocsFetchDesc); - -extern AOCSUpdateDesc aocs_update_init(Relation rel, int segno); -extern void aocs_update_finish(AOCSUpdateDesc desc); -extern TM_Result aocs_update(AOCSUpdateDesc desc, TupleTableSlot *slot, - AOTupleId *oldTupleId, AOTupleId *newTupleId); - extern AOCSDeleteDesc aocs_delete_init(Relation rel); extern TM_Result aocs_delete(AOCSDeleteDesc desc, AOTupleId *aoTupleId); diff --git a/src/include/cdb/cdbappendonlyam.h b/src/include/cdb/cdbappendonlyam.h index 8a9bc451bdb..1242b3b8ce7 100644 --- a/src/include/cdb/cdbappendonlyam.h +++ b/src/include/cdb/cdbappendonlyam.h @@ -55,6 +55,8 @@ #define DEFAULT_VARBLOCK_TEMPSPACE_LEN (4 * 1024) #define DEFAULT_FS_SAFE_WRITE_SIZE (0) +extern AppendOnlyBlockDirectory *GetAOBlockDirectory(Relation relation); + /* * AppendOnlyInsertDescData is used for inserting data into append-only * relations. It serves an equivalent purpose as AppendOnlyScanDescData @@ -116,6 +118,9 @@ typedef struct AppendOnlyInsertDescData bool insertMultiFiles; /* insert into multi files */ dlist_node node; /* node of segfiles list */ int range; /* inserted tuples of each range */ + /* flag for insert placeholder in unique index */ + bool placeholderInserted; + } AppendOnlyInsertDescData; typedef AppendOnlyInsertDescData *AppendOnlyInsertDesc; @@ -128,7 +133,7 @@ typedef struct AppendOnlyExecutorReadBlock MemTupleBinding *mt_bind; /* - * When reading a segfile that's using version < AORelationVersion_PG83, + * When reading a segfile that's using version < AOSegfileFormatVersion_GP5, * that is, was created before GPDB 5.0 and upgraded with pg_upgrade, we need * to convert numeric attributes on the fly to new format. 
numericAtts * is an array of attribute numbers (0-based), of all numeric columns (including @@ -366,8 +371,45 @@ typedef struct AppendOnlyFetchDescData typedef AppendOnlyFetchDescData *AppendOnlyFetchDesc; +/* + * AppendOnlyDeleteDescData is used for delete data from append-only + * relations. It serves an equivalent purpose as AppendOnlyScanDescData + * (relscan.h) only that the later is used for scanning append-only + * relations. + */ +typedef struct AppendOnlyDeleteDescData +{ + /* + * Relation to delete from + */ + Relation aod_rel; + + /* + * Snapshot to use for meta data operations + */ + Snapshot appendOnlyMetaDataSnapshot; + + /* + * visibility map + */ + AppendOnlyVisimap visibilityMap; + + /* + * Visimap delete support structure. Used to handle out-of-order deletes + */ + AppendOnlyVisimapDelete visiMapDelete; + +} AppendOnlyDeleteDescData; + typedef struct AppendOnlyDeleteDescData *AppendOnlyDeleteDesc; +typedef struct AppendOnlyUniqueCheckDescData +{ + AppendOnlyBlockDirectory *blockDirectory; + AppendOnlyVisimap *visimap; +} AppendOnlyUniqueCheckDescData; + +typedef struct AppendOnlyUniqueCheckDescData *AppendOnlyUniqueCheckDesc; /* * Descriptor for fetches from table via an index. */ diff --git a/src/include/cdb/cdbappendonlyblockdirectory.h b/src/include/cdb/cdbappendonlyblockdirectory.h index 7a314490105..9e4f92a1453 100644 --- a/src/include/cdb/cdbappendonlyblockdirectory.h +++ b/src/include/cdb/cdbappendonlyblockdirectory.h @@ -23,10 +23,21 @@ extern int gp_blockdirectory_entry_min_range; extern int gp_blockdirectory_minipage_size; +/* + * In-memory equivalent of on-disk data structure MinipageEntry, used to + * represent a block directory entry. + */ typedef struct AppendOnlyBlockDirectoryEntry { /* - * The range of blocks covered by the Block Directory entry. + * The range of blocks covered by the Block Directory entry, which is the + * continuous range [firstRowNum, lastRowNum]. There are no gaps (or holes) + * within this range. 
However, there may be gaps between successive block + * directory entries. For e.g. entry0 could have range [1,50] and entry1 + * could have: [100,150]. The reason gaps arise between successive entries + * is that we allocate row numbers using the gp_fastsequence mechanism, + * which allocates blocks of row numbers of a pre-determined size (that may + * be larger than the number of blocks being inserted) */ struct range { @@ -81,6 +92,9 @@ typedef struct MinipagePerColumnGroup #define NUM_MINIPAGE_ENTRIES (((MaxHeapTupleSize)/8 - sizeof(HeapTupleHeaderData) - 64 * 3)\ / sizeof(MinipageEntry)) +#define IsMinipageFull(minipagePerColumnGroup) \ + ((minipagePerColumnGroup)->numMinipageEntries == (uint32) gp_blockdirectory_minipage_size) + /* * Define a structure for the append-only relation block directory. */ @@ -126,6 +140,12 @@ typedef struct AppendOnlyBlockDirectory typedef struct CurrentBlock { + /* + * Current cached block directory entry. + * FIXME: At times, we rely upon the values in this struct to be valid even + * when AOFetchBlockMetadata->valid = false. This indicates that this should + * live elsewhere. 
+ */ AppendOnlyBlockDirectoryEntry blockDirectoryEntry; bool have; @@ -152,6 +172,11 @@ typedef struct CurrentSegmentFile int64 logicalEof; } CurrentSegmentFile; +typedef struct AppendOnlyBlockDirectorySeqScan { + AppendOnlyBlockDirectory blkdir; + SysScanDesc sysScan; +} AppendOnlyBlockDirectorySeqScan; + extern void AppendOnlyBlockDirectoryEntry_GetBeginRange( AppendOnlyBlockDirectoryEntry *directoryEntry, int64 *fileOffset, @@ -168,6 +193,9 @@ extern bool AppendOnlyBlockDirectory_GetEntry( AOTupleId *aoTupleId, int columnGroupNo, AppendOnlyBlockDirectoryEntry *directoryEntry); +extern bool AppendOnlyBlockDirectory_CoversTuple( + AppendOnlyBlockDirectory *blockDirectory, + AOTupleId *aoTupleId); extern void AppendOnlyBlockDirectory_Init_forInsert( AppendOnlyBlockDirectory *blockDirectory, Snapshot appendOnlyMetaDataSnapshot, @@ -186,6 +214,10 @@ extern void AppendOnlyBlockDirectory_Init_forSearch( int numColumnGroups, bool isAOCol, bool *proj); +extern void AppendOnlyBlockDirectory_Init_forUniqueChecks(AppendOnlyBlockDirectory *blockDirectory, + Relation aoRel, + int numColumnGroups, + Snapshot snapshot); extern void AppendOnlyBlockDirectory_Init_addCol( AppendOnlyBlockDirectory *blockDirectory, Snapshot appendOnlyMetaDataSnapshot, @@ -225,4 +257,85 @@ extern void AppendOnlyBlockDirectory_DeleteSegmentFile( Snapshot snapshot, int segno, int columnGroupNo); +extern void AppendOnlyBlockDirectory_End_forUniqueChecks( + AppendOnlyBlockDirectory *blockDirectory); + +extern void AppendOnlyBlockDirectory_InsertPlaceholder(AppendOnlyBlockDirectory *blockDirectory, + int64 firstRowNum, + int64 fileOffset, + int columnGroupNo); + +/* + * AppendOnlyBlockDirectory_UniqueCheck + * + * Check to see if there is a block directory entry for the tuple. If no such + * entry exists, the tuple doesn't exist physically in the segfile. + * + * Note: We need to use the passed in per-tuple snapshot to perform the block + * directory lookup. 
See AppendOnlyBlockDirectory_Init_forUniqueCheck() for + * details on why we can't set up the metadata snapshot at init time. + */ +static inline bool AppendOnlyBlockDirectory_UniqueCheck( + AppendOnlyBlockDirectory *blockDirectory, + AOTupleId *aoTupleId, + Snapshot appendOnlyMetaDataSnapshot +) +{ + bool covers; + + Assert(appendOnlyMetaDataSnapshot->snapshot_type == SNAPSHOT_DIRTY || + appendOnlyMetaDataSnapshot->snapshot_type == SNAPSHOT_SELF); + + Assert(blockDirectory->appendOnlyMetaDataSnapshot == InvalidSnapshot); + + /* Set up the snapshot to use for the block directory scan */ + blockDirectory->appendOnlyMetaDataSnapshot = appendOnlyMetaDataSnapshot; + + covers = AppendOnlyBlockDirectory_CoversTuple(blockDirectory, + aoTupleId); + + /* + * Reset the metadata snapshot to avoid leaking a stack reference. We have + * to do this since SNAPSHOT_DIRTY is stack-allocated. + */ + blockDirectory->appendOnlyMetaDataSnapshot = InvalidSnapshot; + + return covers; +} + +static inline uint32 +minipage_size(uint32 nEntry) +{ + return offsetof(Minipage, entry) + sizeof(MinipageEntry) * nEntry; +} + +/* + * copy_out_minipage + * + * Copy out the minipage content from a deformed tuple. 
+ */ +static inline void +copy_out_minipage(MinipagePerColumnGroup *minipageInfo, + Datum minipage_value, + bool minipage_isnull) +{ + struct varlena *value; + struct varlena *detoast_value; + + Assert(!minipage_isnull); + + value = (struct varlena *) + DatumGetPointer(minipage_value); + detoast_value = pg_detoast_datum(value); + Assert(VARSIZE(detoast_value) <= minipage_size(NUM_MINIPAGE_ENTRIES)); + + memcpy(minipageInfo->minipage, detoast_value, VARSIZE(detoast_value)); + if (detoast_value != value) + pfree(detoast_value); + + Assert(minipageInfo->minipage->nEntry <= NUM_MINIPAGE_ENTRIES); + + minipageInfo->numMinipageEntries = minipageInfo->minipage->nEntry; +} + #endif diff --git a/src/include/cdb/cdbappendonlystoragewrite.h b/src/include/cdb/cdbappendonlystoragewrite.h index 80127661c36..acbdfe0211b 100755 --- a/src/include/cdb/cdbappendonlystoragewrite.h +++ b/src/include/cdb/cdbappendonlystoragewrite.h @@ -51,7 +51,7 @@ typedef struct AppendOnlyStorageWrite /* * Version number indicating the AO table format version to write in. */ - AORelationVersion formatVersion; + AOSegfileFormatVersion formatVersion; /* * Name of the relation to use in system logging and error messages. 
diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index daeade87983..3fba8197b95 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -385,16 +385,8 @@ extern void analyze_rel(Oid relid, RangeVar *relation, /* in commands/vacuumlazy.c */ extern void lazy_vacuum_rel_heap(Relation onerel, VacuumParams *params, BufferAccessStrategy bstrategy); - +extern void scan_index(Relation indrel, Relation aorel, int elevel, BufferAccessStrategy bstrategy); /* in commands/vacuum_ao.c */ - -extern void ao_vacuum_rel_pre_cleanup(Relation onerel, int options, VacuumParams *params, - BufferAccessStrategy bstrategy); -extern void ao_vacuum_rel_compact(Relation onerel, int options, VacuumParams *params, - BufferAccessStrategy bstrategy); -extern void ao_vacuum_rel_post_cleanup(Relation onerel, int options, VacuumParams *params, - BufferAccessStrategy bstrategy); - extern void ao_vacuum_rel(Relation rel, VacuumParams *params, BufferAccessStrategy bstrategy); extern bool std_typanalyze(VacAttrStats *stats); diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 9a998212fcc..2b4f7ff23b2 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -713,6 +713,12 @@ typedef struct EState bool eliminateAliens; Bitmapset *locallyExecutableSubplans; + /* + * GPDB: gp_bypass_unique_check is introduced so that routines, such as AO + * vacuum, can avoid running uniqueness checks while inserting tuples. 
+ */ + bool gp_bypass_unique_check; + } EState; struct PlanState; diff --git a/src/test/isolation2/expected/add_column_after_vacuum_skip_drop_column.out b/src/test/isolation2/expected/add_column_after_vacuum_skip_drop_column.out index 74b4423cc06..62468f7904a 100644 --- a/src/test/isolation2/expected/add_column_after_vacuum_skip_drop_column.out +++ b/src/test/isolation2/expected/add_column_after_vacuum_skip_drop_column.out @@ -1,5 +1,7 @@ -- @Description Ensures that an ALTER TABLE ADD COLUMN will drop segfiles in --- AOSEG_STATE_AWAITING_DROP state left over by a previous vacuum +-- AOSEG_STATE_AWAITING_DROP state left over by a previous vacuum. +-- We removed recycling dead segfiles from ADD COLUMN workflow, so +-- the test expected result were adjusted accordingly. -- CREATE TABLE aoco_add_column_after_vacuum_skip_drop (a INT, b INT) WITH (appendonly=true, orientation=column); CREATE @@ -46,9 +48,9 @@ ALTER 0U: SELECT segno, column_num, state FROM gp_toolkit.__gp_aocsseg('aoco_add_column_after_vacuum_skip_drop'); segno | column_num | state -------+------------+------- - 1 | 0 | 1 - 1 | 1 | 1 - 1 | 2 | 1 + 1 | 0 | 2 + 1 | 1 | 2 + 1 | 2 | 2 2 | 0 | 1 2 | 1 | 1 2 | 2 | 1 @@ -60,10 +62,10 @@ INSERT 100 0U: SELECT segno, tupcount > 0, state FROM gp_toolkit.__gp_aocsseg('aoco_add_column_after_vacuum_skip_drop'); segno | ?column? | state -------+----------+------- - 1 | t | 1 - 1 | t | 1 - 1 | t | 1 - 2 | f | 1 - 2 | f | 1 - 2 | f | 1 + 1 | t | 2 + 1 | t | 2 + 1 | t | 2 + 2 | t | 1 + 2 | t | 1 + 2 | t | 1 (6 rows) diff --git a/src/test/isolation2/expected/ao_blkdir.out b/src/test/isolation2/expected/ao_blkdir.out new file mode 100644 index 00000000000..c798a23c8f6 --- /dev/null +++ b/src/test/isolation2/expected/ao_blkdir.out @@ -0,0 +1,819 @@ +-- White-box tests asserting composition of AO/CO block directory entries. +-- All tuples are directed to seg0 and each INSERT has an increasing row count +-- to make their identification easy. 
+ +-------------------------------------------------------------------------------- +-- AO tables +-------------------------------------------------------------------------------- + +CREATE TABLE ao_blkdir_test(i int, j int) USING ao_row DISTRIBUTED BY (j); +CREATE +CREATE INDEX ao_blkdir_test_idx ON ao_blkdir_test(i); +CREATE + +1: INSERT INTO ao_blkdir_test SELECT i, 2 FROM generate_series(1, 10) i; +INSERT 10 +-- There should be 1 block directory row with a single entry covering 10 rows +SELECT (gp_toolkit.__gp_aoblkdir('ao_blkdir_test')).* FROM gp_dist_random('gp_id') WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count +---------+-------+----------------+----------+--------------+-------------+----------- + (0,1) | 1 | 0 | 0 | 1 | 0 | 10 +(1 row) + +1: INSERT INTO ao_blkdir_test SELECT i, 2 FROM generate_series(11, 30) i; +INSERT 20 +-- There should be 2 block directory entries in a new block directory row, and +-- the row from the previous INSERT should not be visible. The entry from the +-- first INSERT should remain unchanged. +SELECT (gp_toolkit.__gp_aoblkdir('ao_blkdir_test')).* FROM gp_dist_random('gp_id') WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count +---------+-------+----------------+----------+--------------+-------------+----------- + (0,2) | 1 | 0 | 0 | 1 | 0 | 10 + (0,2) | 1 | 0 | 1 | 101 | 216 | 20 +(2 rows) + +1: BEGIN; +BEGIN +1: INSERT INTO ao_blkdir_test SELECT i, 2 FROM generate_series(31, 60) i; +INSERT 30 +2: BEGIN; +BEGIN +2: INSERT INTO ao_blkdir_test SELECT i, 2 FROM generate_series(71, 110) i; +INSERT 40 +1: COMMIT; +COMMIT +2: COMMIT; +COMMIT +-- The second INSERT of 40 rows above would have landed in segfile 1 (unlike +-- segfile 0, like the first INSERT of 30 rows above). This should be reflected +-- in the block directory entries for these rows. 
+SELECT (gp_toolkit.__gp_aoblkdir('ao_blkdir_test')).* FROM gp_dist_random('gp_id') WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count +---------+-------+----------------+----------+--------------+-------------+----------- + (0,3) | 1 | 0 | 0 | 1 | 0 | 10 + (0,3) | 1 | 0 | 1 | 101 | 216 | 20 + (0,3) | 1 | 0 | 2 | 201 | 608 | 30 + (0,4) | 2 | 0 | 0 | 1 | 0 | 40 +(4 rows) + +TRUNCATE ao_blkdir_test; +TRUNCATE +-- Insert enough rows to overflow the first block directory minipage by 2. +INSERT INTO ao_blkdir_test SELECT i, 2 FROM generate_series(1, 292700) i; +INSERT 292700 +-- There should be 2 block directory rows, one with 161 entries covering 292698 +-- rows and the other with 1 entry covering the 2 overflow rows. +SELECT (gp_toolkit.__gp_aoblkdir('ao_blkdir_test')).* FROM gp_dist_random('gp_id') WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count +---------+-------+----------------+----------+--------------+-------------+----------- + (0,1) | 1 | 0 | 0 | 1 | 0 | 1818 + (0,1) | 1 | 0 | 1 | 1819 | 32760 | 1818 + (0,1) | 1 | 0 | 2 | 3637 | 65520 | 1818 + (0,1) | 1 | 0 | 3 | 5455 | 98280 | 1818 + (0,1) | 1 | 0 | 4 | 7273 | 131040 | 1818 + (0,1) | 1 | 0 | 5 | 9091 | 163800 | 1818 + (0,1) | 1 | 0 | 6 | 10909 | 196560 | 1818 + (0,1) | 1 | 0 | 7 | 12727 | 229320 | 1818 + (0,1) | 1 | 0 | 8 | 14545 | 262080 | 1818 + (0,1) | 1 | 0 | 9 | 16363 | 294840 | 1818 + (0,1) | 1 | 0 | 10 | 18181 | 327600 | 1818 + (0,1) | 1 | 0 | 11 | 19999 | 360360 | 1818 + (0,1) | 1 | 0 | 12 | 21817 | 393120 | 1818 + (0,1) | 1 | 0 | 13 | 23635 | 425880 | 1818 + (0,1) | 1 | 0 | 14 | 25453 | 458640 | 1818 + (0,1) | 1 | 0 | 15 | 27271 | 491400 | 1818 + (0,1) | 1 | 0 | 16 | 29089 | 524160 | 1818 + (0,1) | 1 | 0 | 17 | 30907 | 556920 | 1818 + (0,1) | 1 | 0 | 18 | 32725 | 589680 | 1818 + (0,1) | 1 | 0 | 19 | 34543 | 622440 | 1818 + (0,1) | 1 | 0 | 
20 | 36361 | 655200 | 1818 + (0,1) | 1 | 0 | 21 | 38179 | 687960 | 1818 + (0,1) | 1 | 0 | 22 | 39997 | 720720 | 1818 + (0,1) | 1 | 0 | 23 | 41815 | 753480 | 1818 + (0,1) | 1 | 0 | 24 | 43633 | 786240 | 1818 + (0,1) | 1 | 0 | 25 | 45451 | 819000 | 1818 + (0,1) | 1 | 0 | 26 | 47269 | 851760 | 1818 + (0,1) | 1 | 0 | 27 | 49087 | 884520 | 1818 + (0,1) | 1 | 0 | 28 | 50905 | 917280 | 1818 + (0,1) | 1 | 0 | 29 | 52723 | 950040 | 1818 + (0,1) | 1 | 0 | 30 | 54541 | 982800 | 1818 + (0,1) | 1 | 0 | 31 | 56359 | 1015560 | 1818 + (0,1) | 1 | 0 | 32 | 58177 | 1048320 | 1818 + (0,1) | 1 | 0 | 33 | 59995 | 1081080 | 1818 + (0,1) | 1 | 0 | 34 | 61813 | 1113840 | 1818 + (0,1) | 1 | 0 | 35 | 63631 | 1146600 | 1818 + (0,1) | 1 | 0 | 36 | 65449 | 1179360 | 1818 + (0,1) | 1 | 0 | 37 | 67267 | 1212120 | 1818 + (0,1) | 1 | 0 | 38 | 69085 | 1244880 | 1818 + (0,1) | 1 | 0 | 39 | 70903 | 1277640 | 1818 + (0,1) | 1 | 0 | 40 | 72721 | 1310400 | 1818 + (0,1) | 1 | 0 | 41 | 74539 | 1343160 | 1818 + (0,1) | 1 | 0 | 42 | 76357 | 1375920 | 1818 + (0,1) | 1 | 0 | 43 | 78175 | 1408680 | 1818 + (0,1) | 1 | 0 | 44 | 79993 | 1441440 | 1818 + (0,1) | 1 | 0 | 45 | 81811 | 1474200 | 1818 + (0,1) | 1 | 0 | 46 | 83629 | 1506960 | 1818 + (0,1) | 1 | 0 | 47 | 85447 | 1539720 | 1818 + (0,1) | 1 | 0 | 48 | 87265 | 1572480 | 1818 + (0,1) | 1 | 0 | 49 | 89083 | 1605240 | 1818 + (0,1) | 1 | 0 | 50 | 90901 | 1638000 | 1818 + (0,1) | 1 | 0 | 51 | 92719 | 1670760 | 1818 + (0,1) | 1 | 0 | 52 | 94537 | 1703520 | 1818 + (0,1) | 1 | 0 | 53 | 96355 | 1736280 | 1818 + (0,1) | 1 | 0 | 54 | 98173 | 1769040 | 1818 + (0,1) | 1 | 0 | 55 | 99991 | 1801800 | 1818 + (0,1) | 1 | 0 | 56 | 101809 | 1834560 | 1818 + (0,1) | 1 | 0 | 57 | 103627 | 1867320 | 1818 + (0,1) | 1 | 0 | 58 | 105445 | 1900080 | 1818 + (0,1) | 1 | 0 | 59 | 107263 | 1932840 | 1818 + (0,1) | 1 | 0 | 60 | 109081 | 1965600 | 1818 + (0,1) | 1 | 0 | 61 | 110899 | 1998360 | 1818 + (0,1) | 1 | 0 | 62 | 112717 | 2031120 | 1818 + (0,1) | 1 | 0 | 63 | 114535 | 2063880 | 
1818 + (0,1) | 1 | 0 | 64 | 116353 | 2096640 | 1818 + (0,1) | 1 | 0 | 65 | 118171 | 2129400 | 1818 + (0,1) | 1 | 0 | 66 | 119989 | 2162160 | 1818 + (0,1) | 1 | 0 | 67 | 121807 | 2194920 | 1818 + (0,1) | 1 | 0 | 68 | 123625 | 2227680 | 1818 + (0,1) | 1 | 0 | 69 | 125443 | 2260440 | 1818 + (0,1) | 1 | 0 | 70 | 127261 | 2293200 | 1818 + (0,1) | 1 | 0 | 71 | 129079 | 2325960 | 1818 + (0,1) | 1 | 0 | 72 | 130897 | 2358720 | 1818 + (0,1) | 1 | 0 | 73 | 132715 | 2391480 | 1818 + (0,1) | 1 | 0 | 74 | 134533 | 2424240 | 1818 + (0,1) | 1 | 0 | 75 | 136351 | 2457000 | 1818 + (0,1) | 1 | 0 | 76 | 138169 | 2489760 | 1818 + (0,1) | 1 | 0 | 77 | 139987 | 2522520 | 1818 + (0,1) | 1 | 0 | 78 | 141805 | 2555280 | 1818 + (0,1) | 1 | 0 | 79 | 143623 | 2588040 | 1818 + (0,1) | 1 | 0 | 80 | 145441 | 2620800 | 1818 + (0,1) | 1 | 0 | 81 | 147259 | 2653560 | 1818 + (0,1) | 1 | 0 | 82 | 149077 | 2686320 | 1818 + (0,1) | 1 | 0 | 83 | 150895 | 2719080 | 1818 + (0,1) | 1 | 0 | 84 | 152713 | 2751840 | 1818 + (0,1) | 1 | 0 | 85 | 154531 | 2784600 | 1818 + (0,1) | 1 | 0 | 86 | 156349 | 2817360 | 1818 + (0,1) | 1 | 0 | 87 | 158167 | 2850120 | 1818 + (0,1) | 1 | 0 | 88 | 159985 | 2882880 | 1818 + (0,1) | 1 | 0 | 89 | 161803 | 2915640 | 1818 + (0,1) | 1 | 0 | 90 | 163621 | 2948400 | 1818 + (0,1) | 1 | 0 | 91 | 165439 | 2981160 | 1818 + (0,1) | 1 | 0 | 92 | 167257 | 3013920 | 1818 + (0,1) | 1 | 0 | 93 | 169075 | 3046680 | 1818 + (0,1) | 1 | 0 | 94 | 170893 | 3079440 | 1818 + (0,1) | 1 | 0 | 95 | 172711 | 3112200 | 1818 + (0,1) | 1 | 0 | 96 | 174529 | 3144960 | 1818 + (0,1) | 1 | 0 | 97 | 176347 | 3177720 | 1818 + (0,1) | 1 | 0 | 98 | 178165 | 3210480 | 1818 + (0,1) | 1 | 0 | 99 | 179983 | 3243240 | 1818 + (0,1) | 1 | 0 | 100 | 181801 | 3276000 | 1818 + (0,1) | 1 | 0 | 101 | 183619 | 3308760 | 1818 + (0,1) | 1 | 0 | 102 | 185437 | 3341520 | 1818 + (0,1) | 1 | 0 | 103 | 187255 | 3374280 | 1818 + (0,1) | 1 | 0 | 104 | 189073 | 3407040 | 1818 + (0,1) | 1 | 0 | 105 | 190891 | 3439800 | 1818 + (0,1) | 1 | 
0 | 106 | 192709 | 3472560 | 1818 + (0,1) | 1 | 0 | 107 | 194527 | 3505320 | 1818 + (0,1) | 1 | 0 | 108 | 196345 | 3538080 | 1818 + (0,1) | 1 | 0 | 109 | 198163 | 3570840 | 1818 + (0,1) | 1 | 0 | 110 | 199981 | 3603600 | 1818 + (0,1) | 1 | 0 | 111 | 201799 | 3636360 | 1818 + (0,1) | 1 | 0 | 112 | 203617 | 3669120 | 1818 + (0,1) | 1 | 0 | 113 | 205435 | 3701880 | 1818 + (0,1) | 1 | 0 | 114 | 207253 | 3734640 | 1818 + (0,1) | 1 | 0 | 115 | 209071 | 3767400 | 1818 + (0,1) | 1 | 0 | 116 | 210889 | 3800160 | 1818 + (0,1) | 1 | 0 | 117 | 212707 | 3832920 | 1818 + (0,1) | 1 | 0 | 118 | 214525 | 3865680 | 1818 + (0,1) | 1 | 0 | 119 | 216343 | 3898440 | 1818 + (0,1) | 1 | 0 | 120 | 218161 | 3931200 | 1818 + (0,1) | 1 | 0 | 121 | 219979 | 3963960 | 1818 + (0,1) | 1 | 0 | 122 | 221797 | 3996720 | 1818 + (0,1) | 1 | 0 | 123 | 223615 | 4029480 | 1818 + (0,1) | 1 | 0 | 124 | 225433 | 4062240 | 1818 + (0,1) | 1 | 0 | 125 | 227251 | 4095000 | 1818 + (0,1) | 1 | 0 | 126 | 229069 | 4127760 | 1818 + (0,1) | 1 | 0 | 127 | 230887 | 4160520 | 1818 + (0,1) | 1 | 0 | 128 | 232705 | 4193280 | 1818 + (0,1) | 1 | 0 | 129 | 234523 | 4226040 | 1818 + (0,1) | 1 | 0 | 130 | 236341 | 4258800 | 1818 + (0,1) | 1 | 0 | 131 | 238159 | 4291560 | 1818 + (0,1) | 1 | 0 | 132 | 239977 | 4324320 | 1818 + (0,1) | 1 | 0 | 133 | 241795 | 4357080 | 1818 + (0,1) | 1 | 0 | 134 | 243613 | 4389840 | 1818 + (0,1) | 1 | 0 | 135 | 245431 | 4422600 | 1818 + (0,1) | 1 | 0 | 136 | 247249 | 4455360 | 1818 + (0,1) | 1 | 0 | 137 | 249067 | 4488120 | 1818 + (0,1) | 1 | 0 | 138 | 250885 | 4520880 | 1818 + (0,1) | 1 | 0 | 139 | 252703 | 4553640 | 1818 + (0,1) | 1 | 0 | 140 | 254521 | 4586400 | 1818 + (0,1) | 1 | 0 | 141 | 256339 | 4619160 | 1818 + (0,1) | 1 | 0 | 142 | 258157 | 4651920 | 1818 + (0,1) | 1 | 0 | 143 | 259975 | 4684680 | 1818 + (0,1) | 1 | 0 | 144 | 261793 | 4717440 | 1818 + (0,1) | 1 | 0 | 145 | 263611 | 4750200 | 1818 + (0,1) | 1 | 0 | 146 | 265429 | 4782960 | 1818 + (0,1) | 1 | 0 | 147 | 267247 | 4815720 | 
1818 + (0,1) | 1 | 0 | 148 | 269065 | 4848480 | 1818 + (0,1) | 1 | 0 | 149 | 270883 | 4881240 | 1818 + (0,1) | 1 | 0 | 150 | 272701 | 4914000 | 1818 + (0,1) | 1 | 0 | 151 | 274519 | 4946760 | 1818 + (0,1) | 1 | 0 | 152 | 276337 | 4979520 | 1818 + (0,1) | 1 | 0 | 153 | 278155 | 5012280 | 1818 + (0,1) | 1 | 0 | 154 | 279973 | 5045040 | 1818 + (0,1) | 1 | 0 | 155 | 281791 | 5077800 | 1818 + (0,1) | 1 | 0 | 156 | 283609 | 5110560 | 1818 + (0,1) | 1 | 0 | 157 | 285427 | 5143320 | 1818 + (0,1) | 1 | 0 | 158 | 287245 | 5176080 | 1818 + (0,1) | 1 | 0 | 159 | 289063 | 5208840 | 1818 + (0,1) | 1 | 0 | 160 | 290881 | 5241600 | 1818 + (0,2) | 1 | 0 | 0 | 292699 | 5274360 | 2 +(162 rows) + +-- Unique index white box tests +DROP TABLE ao_blkdir_test; +DROP +CREATE TABLE ao_blkdir_test(i int UNIQUE, j int) USING ao_row DISTRIBUTED BY (i); +CREATE + +SELECT gp_inject_fault('appendonly_insert', 'suspend', '', '', 'ao_blkdir_test', 1, 1, 0, dbid) FROM gp_segment_configuration WHERE role = 'p' AND content = 0; + gp_inject_fault +----------------- + Success: +(1 row) +1: BEGIN; +BEGIN +1&: INSERT INTO ao_blkdir_test VALUES (2, 2); + +-- There should be a placeholder row inserted to cover the rows for each INSERT +-- session, before we insert the 1st row in that session, that is only visible +-- to SNAPSHOT_DIRTY. 
+SELECT gp_wait_until_triggered_fault('appendonly_insert', 1, dbid) FROM gp_segment_configuration WHERE role = 'p' AND content = 0; + gp_wait_until_triggered_fault +------------------------------- + Success: +(1 row) +SELECT (gp_toolkit.__gp_aoblkdir('ao_blkdir_test')).* FROM gp_dist_random('gp_id') WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count +---------+-------+----------------+----------+--------------+-------------+----------- +(0 rows) +SET gp_select_invisible TO ON; +SET +SELECT (gp_toolkit.__gp_aoblkdir('ao_blkdir_test')).* FROM gp_dist_random('gp_id') WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count +---------+-------+----------------+----------+--------------+-------------+--------------- + (0,1) | 1 | 0 | 0 | 1 | 0 | 1099511627775 +(1 row) +RESET gp_select_invisible; +RESET + +-- The placeholder row is invisible to other transactions (that don't perform a +-- uniqueness check) while the INSERT is in progress. +2: SELECT (gp_toolkit.__gp_aoblkdir('ao_blkdir_test')).* FROM gp_dist_random('gp_id') WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count +---------+-------+----------------+----------+--------------+-------------+----------- +(0 rows) + +SELECT gp_inject_fault('appendonly_insert', 'reset', dbid) FROM gp_segment_configuration WHERE role = 'p' AND content = 0; + gp_inject_fault +----------------- + Success: +(1 row) +1<: <... completed> +INSERT 1 + +-- The placeholder row is invisible to the INSERTing transaction. Since the +-- INSERT finished, there should be 1 visible blkdir row representing the INSERT. 
+1: SELECT (gp_toolkit.__gp_aoblkdir('ao_blkdir_test')).* FROM gp_dist_random('gp_id') WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count +---------+-------+----------------+----------+--------------+-------------+----------- + (0,2) | 1 | 0 | 0 | 1 | 0 | 1 +(1 row) + +-- The placeholder row is invisible to other transactions (that don't perform a +-- uniqueness check) even after the INSERT finishes. The blkdir row representing +-- the INSERT should not be visible as the INSERTing transaction hasn't +-- committed yet. +2: SELECT (gp_toolkit.__gp_aoblkdir('ao_blkdir_test')).* FROM gp_dist_random('gp_id') WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count +---------+-------+----------------+----------+--------------+-------------+----------- +(0 rows) + +1: COMMIT; +COMMIT + +-- The placeholder row is invisible to other transactions (that don't perform a +-- uniqueness check) even after the INSERTing transaction commits. Since the +-- INSERTing transaction has committed, the blkdir row representing the INSERT +-- should be visible now. 
+2: SELECT (gp_toolkit.__gp_aoblkdir('ao_blkdir_test')).* FROM gp_dist_random('gp_id') WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count +---------+-------+----------------+----------+--------------+-------------+----------- + (0,2) | 1 | 0 | 0 | 1 | 0 | 1 +(1 row) + +DROP TABLE ao_blkdir_test; +DROP + +-------------------------------------------------------------------------------- +-- AOCO tables +-------------------------------------------------------------------------------- + +CREATE TABLE aoco_blkdir_test(i int, j int) USING ao_column DISTRIBUTED BY (j); +CREATE +CREATE INDEX aoco_blkdir_test_idx ON aoco_blkdir_test(i); +CREATE + +1: INSERT INTO aoco_blkdir_test SELECT i, 2 FROM generate_series(1, 10) i; +INSERT 10 +-- There should be 2 block directory rows with a single entry covering 10 rows, +-- (1 for each column). +SELECT (gp_toolkit.__gp_aoblkdir('aoco_blkdir_test')).* FROM gp_dist_random('gp_id') WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count +---------+-------+----------------+----------+--------------+-------------+----------- + (0,1) | 1 | 0 | 0 | 1 | 0 | 10 + (0,2) | 1 | 1 | 0 | 1 | 0 | 10 +(2 rows) + +1: INSERT INTO aoco_blkdir_test SELECT i, 2 FROM generate_series(11, 30) i; +INSERT 20 +-- There should be 2 block directory rows, carrying 2 entries each. The rows +-- from the previous INSERT should not be visible. The entries from the first +-- INSERT should remain unchanged. 
+SELECT (gp_toolkit.__gp_aoblkdir('aoco_blkdir_test')).* FROM gp_dist_random('gp_id') WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count +---------+-------+----------------+----------+--------------+-------------+----------- + (0,3) | 1 | 0 | 0 | 1 | 0 | 10 + (0,3) | 1 | 0 | 1 | 101 | 80 | 20 + (0,4) | 1 | 1 | 0 | 1 | 0 | 10 + (0,4) | 1 | 1 | 1 | 101 | 80 | 20 +(4 rows) + +1: BEGIN; +BEGIN +1: INSERT INTO aoco_blkdir_test SELECT i, 2 FROM generate_series(31, 60) i; +INSERT 30 +2: BEGIN; +BEGIN +2: INSERT INTO aoco_blkdir_test SELECT i, 2 FROM generate_series(71, 110) i; +INSERT 40 +1: COMMIT; +COMMIT +2: COMMIT; +COMMIT +-- The second INSERT of 40 rows above would have landed in segfile 1 (unlike +-- segfile 0, like the first INSERT of 30 rows above). This should be reflected +-- in the block directory entries for these rows. +SELECT (gp_toolkit.__gp_aoblkdir('aoco_blkdir_test')).* FROM gp_dist_random('gp_id') WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count +---------+-------+----------------+----------+--------------+-------------+----------- + (0,5) | 1 | 0 | 0 | 1 | 0 | 10 + (0,5) | 1 | 0 | 1 | 101 | 80 | 20 + (0,5) | 1 | 0 | 2 | 201 | 200 | 30 + (0,6) | 1 | 1 | 0 | 1 | 0 | 10 + (0,6) | 1 | 1 | 1 | 101 | 80 | 20 + (0,6) | 1 | 1 | 2 | 201 | 200 | 30 + (0,7) | 2 | 0 | 0 | 1 | 0 | 40 + (0,8) | 2 | 1 | 0 | 1 | 0 | 40 +(8 rows) + +TRUNCATE aoco_blkdir_test; +TRUNCATE +-- Insert enough rows to overflow the first block directory minipage by 2. +INSERT INTO aoco_blkdir_test SELECT i, 2 FROM generate_series(1, 1317143) i; +INSERT 1317143 +-- There should be 2 block directory rows, 2 for each column, one with 161 +-- entries covering 1317141 rows and the other with 1 entry covering the 2 +-- overflow rows. 
+SELECT (gp_toolkit.__gp_aoblkdir('aoco_blkdir_test')).* FROM gp_dist_random('gp_id') WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count +---------+-------+----------------+----------+--------------+-------------+----------- + (0,1) | 1 | 0 | 0 | 1 | 0 | 8181 + (0,1) | 1 | 0 | 1 | 8182 | 32768 | 8181 + (0,1) | 1 | 0 | 2 | 16363 | 65536 | 8181 + (0,1) | 1 | 0 | 3 | 24544 | 98304 | 8181 + (0,1) | 1 | 0 | 4 | 32725 | 131072 | 8181 + (0,1) | 1 | 0 | 5 | 40906 | 163840 | 8181 + (0,1) | 1 | 0 | 6 | 49087 | 196608 | 8181 + (0,1) | 1 | 0 | 7 | 57268 | 229376 | 8181 + (0,1) | 1 | 0 | 8 | 65449 | 262144 | 8181 + (0,1) | 1 | 0 | 9 | 73630 | 294912 | 8181 + (0,1) | 1 | 0 | 10 | 81811 | 327680 | 8181 + (0,1) | 1 | 0 | 11 | 89992 | 360448 | 8181 + (0,1) | 1 | 0 | 12 | 98173 | 393216 | 8181 + (0,1) | 1 | 0 | 13 | 106354 | 425984 | 8181 + (0,1) | 1 | 0 | 14 | 114535 | 458752 | 8181 + (0,1) | 1 | 0 | 15 | 122716 | 491520 | 8181 + (0,1) | 1 | 0 | 16 | 130897 | 524288 | 8181 + (0,1) | 1 | 0 | 17 | 139078 | 557056 | 8181 + (0,1) | 1 | 0 | 18 | 147259 | 589824 | 8181 + (0,1) | 1 | 0 | 19 | 155440 | 622592 | 8181 + (0,1) | 1 | 0 | 20 | 163621 | 655360 | 8181 + (0,1) | 1 | 0 | 21 | 171802 | 688128 | 8181 + (0,1) | 1 | 0 | 22 | 179983 | 720896 | 8181 + (0,1) | 1 | 0 | 23 | 188164 | 753664 | 8181 + (0,1) | 1 | 0 | 24 | 196345 | 786432 | 8181 + (0,1) | 1 | 0 | 25 | 204526 | 819200 | 8181 + (0,1) | 1 | 0 | 26 | 212707 | 851968 | 8181 + (0,1) | 1 | 0 | 27 | 220888 | 884736 | 8181 + (0,1) | 1 | 0 | 28 | 229069 | 917504 | 8181 + (0,1) | 1 | 0 | 29 | 237250 | 950272 | 8181 + (0,1) | 1 | 0 | 30 | 245431 | 983040 | 8181 + (0,1) | 1 | 0 | 31 | 253612 | 1015808 | 8181 + (0,1) | 1 | 0 | 32 | 261793 | 1048576 | 8181 + (0,1) | 1 | 0 | 33 | 269974 | 1081344 | 8181 + (0,1) | 1 | 0 | 34 | 278155 | 1114112 | 8181 + (0,1) | 1 | 0 | 35 | 286336 | 1146880 | 8181 + (0,1) | 1 | 0 | 36 | 294517 | 1179648 | 8181 + (0,1) | 1 | 0 | 37 | 
302698 | 1212416 | 8181 + (0,1) | 1 | 0 | 38 | 310879 | 1245184 | 8181 + (0,1) | 1 | 0 | 39 | 319060 | 1277952 | 8181 + (0,1) | 1 | 0 | 40 | 327241 | 1310720 | 8181 + (0,1) | 1 | 0 | 41 | 335422 | 1343488 | 8181 + (0,1) | 1 | 0 | 42 | 343603 | 1376256 | 8181 + (0,1) | 1 | 0 | 43 | 351784 | 1409024 | 8181 + (0,1) | 1 | 0 | 44 | 359965 | 1441792 | 8181 + (0,1) | 1 | 0 | 45 | 368146 | 1474560 | 8181 + (0,1) | 1 | 0 | 46 | 376327 | 1507328 | 8181 + (0,1) | 1 | 0 | 47 | 384508 | 1540096 | 8181 + (0,1) | 1 | 0 | 48 | 392689 | 1572864 | 8181 + (0,1) | 1 | 0 | 49 | 400870 | 1605632 | 8181 + (0,1) | 1 | 0 | 50 | 409051 | 1638400 | 8181 + (0,1) | 1 | 0 | 51 | 417232 | 1671168 | 8181 + (0,1) | 1 | 0 | 52 | 425413 | 1703936 | 8181 + (0,1) | 1 | 0 | 53 | 433594 | 1736704 | 8181 + (0,1) | 1 | 0 | 54 | 441775 | 1769472 | 8181 + (0,1) | 1 | 0 | 55 | 449956 | 1802240 | 8181 + (0,1) | 1 | 0 | 56 | 458137 | 1835008 | 8181 + (0,1) | 1 | 0 | 57 | 466318 | 1867776 | 8181 + (0,1) | 1 | 0 | 58 | 474499 | 1900544 | 8181 + (0,1) | 1 | 0 | 59 | 482680 | 1933312 | 8181 + (0,1) | 1 | 0 | 60 | 490861 | 1966080 | 8181 + (0,1) | 1 | 0 | 61 | 499042 | 1998848 | 8181 + (0,1) | 1 | 0 | 62 | 507223 | 2031616 | 8181 + (0,1) | 1 | 0 | 63 | 515404 | 2064384 | 8181 + (0,1) | 1 | 0 | 64 | 523585 | 2097152 | 8181 + (0,1) | 1 | 0 | 65 | 531766 | 2129920 | 8181 + (0,1) | 1 | 0 | 66 | 539947 | 2162688 | 8181 + (0,1) | 1 | 0 | 67 | 548128 | 2195456 | 8181 + (0,1) | 1 | 0 | 68 | 556309 | 2228224 | 8181 + (0,1) | 1 | 0 | 69 | 564490 | 2260992 | 8181 + (0,1) | 1 | 0 | 70 | 572671 | 2293760 | 8181 + (0,1) | 1 | 0 | 71 | 580852 | 2326528 | 8181 + (0,1) | 1 | 0 | 72 | 589033 | 2359296 | 8181 + (0,1) | 1 | 0 | 73 | 597214 | 2392064 | 8181 + (0,1) | 1 | 0 | 74 | 605395 | 2424832 | 8181 + (0,1) | 1 | 0 | 75 | 613576 | 2457600 | 8181 + (0,1) | 1 | 0 | 76 | 621757 | 2490368 | 8181 + (0,1) | 1 | 0 | 77 | 629938 | 2523136 | 8181 + (0,1) | 1 | 0 | 78 | 638119 | 2555904 | 8181 + (0,1) | 1 | 0 | 79 | 646300 | 2588672 | 8181 + 
(0,1) | 1 | 0 | 80 | 654481 | 2621440 | 8181 + (0,1) | 1 | 0 | 81 | 662662 | 2654208 | 8181 + (0,1) | 1 | 0 | 82 | 670843 | 2686976 | 8181 + (0,1) | 1 | 0 | 83 | 679024 | 2719744 | 8181 + (0,1) | 1 | 0 | 84 | 687205 | 2752512 | 8181 + (0,1) | 1 | 0 | 85 | 695386 | 2785280 | 8181 + (0,1) | 1 | 0 | 86 | 703567 | 2818048 | 8181 + (0,1) | 1 | 0 | 87 | 711748 | 2850816 | 8181 + (0,1) | 1 | 0 | 88 | 719929 | 2883584 | 8181 + (0,1) | 1 | 0 | 89 | 728110 | 2916352 | 8181 + (0,1) | 1 | 0 | 90 | 736291 | 2949120 | 8181 + (0,1) | 1 | 0 | 91 | 744472 | 2981888 | 8181 + (0,1) | 1 | 0 | 92 | 752653 | 3014656 | 8181 + (0,1) | 1 | 0 | 93 | 760834 | 3047424 | 8181 + (0,1) | 1 | 0 | 94 | 769015 | 3080192 | 8181 + (0,1) | 1 | 0 | 95 | 777196 | 3112960 | 8181 + (0,1) | 1 | 0 | 96 | 785377 | 3145728 | 8181 + (0,1) | 1 | 0 | 97 | 793558 | 3178496 | 8181 + (0,1) | 1 | 0 | 98 | 801739 | 3211264 | 8181 + (0,1) | 1 | 0 | 99 | 809920 | 3244032 | 8181 + (0,1) | 1 | 0 | 100 | 818101 | 3276800 | 8181 + (0,1) | 1 | 0 | 101 | 826282 | 3309568 | 8181 + (0,1) | 1 | 0 | 102 | 834463 | 3342336 | 8181 + (0,1) | 1 | 0 | 103 | 842644 | 3375104 | 8181 + (0,1) | 1 | 0 | 104 | 850825 | 3407872 | 8181 + (0,1) | 1 | 0 | 105 | 859006 | 3440640 | 8181 + (0,1) | 1 | 0 | 106 | 867187 | 3473408 | 8181 + (0,1) | 1 | 0 | 107 | 875368 | 3506176 | 8181 + (0,1) | 1 | 0 | 108 | 883549 | 3538944 | 8181 + (0,1) | 1 | 0 | 109 | 891730 | 3571712 | 8181 + (0,1) | 1 | 0 | 110 | 899911 | 3604480 | 8181 + (0,1) | 1 | 0 | 111 | 908092 | 3637248 | 8181 + (0,1) | 1 | 0 | 112 | 916273 | 3670016 | 8181 + (0,1) | 1 | 0 | 113 | 924454 | 3702784 | 8181 + (0,1) | 1 | 0 | 114 | 932635 | 3735552 | 8181 + (0,1) | 1 | 0 | 115 | 940816 | 3768320 | 8181 + (0,1) | 1 | 0 | 116 | 948997 | 3801088 | 8181 + (0,1) | 1 | 0 | 117 | 957178 | 3833856 | 8181 + (0,1) | 1 | 0 | 118 | 965359 | 3866624 | 8181 + (0,1) | 1 | 0 | 119 | 973540 | 3899392 | 8181 + (0,1) | 1 | 0 | 120 | 981721 | 3932160 | 8181 + (0,1) | 1 | 0 | 121 | 989902 | 3964928 | 8181 + 
(0,1) | 1 | 0 | 122 | 998083 | 3997696 | 8181 + (0,1) | 1 | 0 | 123 | 1006264 | 4030464 | 8181 + (0,1) | 1 | 0 | 124 | 1014445 | 4063232 | 8181 + (0,1) | 1 | 0 | 125 | 1022626 | 4096000 | 8181 + (0,1) | 1 | 0 | 126 | 1030807 | 4128768 | 8181 + (0,1) | 1 | 0 | 127 | 1038988 | 4161536 | 8181 + (0,1) | 1 | 0 | 128 | 1047169 | 4194304 | 8181 + (0,1) | 1 | 0 | 129 | 1055350 | 4227072 | 8181 + (0,1) | 1 | 0 | 130 | 1063531 | 4259840 | 8181 + (0,1) | 1 | 0 | 131 | 1071712 | 4292608 | 8181 + (0,1) | 1 | 0 | 132 | 1079893 | 4325376 | 8181 + (0,1) | 1 | 0 | 133 | 1088074 | 4358144 | 8181 + (0,1) | 1 | 0 | 134 | 1096255 | 4390912 | 8181 + (0,1) | 1 | 0 | 135 | 1104436 | 4423680 | 8181 + (0,1) | 1 | 0 | 136 | 1112617 | 4456448 | 8181 + (0,1) | 1 | 0 | 137 | 1120798 | 4489216 | 8181 + (0,1) | 1 | 0 | 138 | 1128979 | 4521984 | 8181 + (0,1) | 1 | 0 | 139 | 1137160 | 4554752 | 8181 + (0,1) | 1 | 0 | 140 | 1145341 | 4587520 | 8181 + (0,1) | 1 | 0 | 141 | 1153522 | 4620288 | 8181 + (0,1) | 1 | 0 | 142 | 1161703 | 4653056 | 8181 + (0,1) | 1 | 0 | 143 | 1169884 | 4685824 | 8181 + (0,1) | 1 | 0 | 144 | 1178065 | 4718592 | 8181 + (0,1) | 1 | 0 | 145 | 1186246 | 4751360 | 8181 + (0,1) | 1 | 0 | 146 | 1194427 | 4784128 | 8181 + (0,1) | 1 | 0 | 147 | 1202608 | 4816896 | 8181 + (0,1) | 1 | 0 | 148 | 1210789 | 4849664 | 8181 + (0,1) | 1 | 0 | 149 | 1218970 | 4882432 | 8181 + (0,1) | 1 | 0 | 150 | 1227151 | 4915200 | 8181 + (0,1) | 1 | 0 | 151 | 1235332 | 4947968 | 8181 + (0,1) | 1 | 0 | 152 | 1243513 | 4980736 | 8181 + (0,1) | 1 | 0 | 153 | 1251694 | 5013504 | 8181 + (0,1) | 1 | 0 | 154 | 1259875 | 5046272 | 8181 + (0,1) | 1 | 0 | 155 | 1268056 | 5079040 | 8181 + (0,1) | 1 | 0 | 156 | 1276237 | 5111808 | 8181 + (0,1) | 1 | 0 | 157 | 1284418 | 5144576 | 8181 + (0,1) | 1 | 0 | 158 | 1292599 | 5177344 | 8181 + (0,1) | 1 | 0 | 159 | 1300780 | 5210112 | 8181 + (0,1) | 1 | 0 | 160 | 1308961 | 5242880 | 8181 + (0,2) | 1 | 1 | 0 | 1 | 0 | 8181 + (0,2) | 1 | 1 | 1 | 8182 | 32768 | 8181 + (0,2) | 1 | 
1 | 2 | 16363 | 65536 | 8181 + (0,2) | 1 | 1 | 3 | 24544 | 98304 | 8181 + (0,2) | 1 | 1 | 4 | 32725 | 131072 | 8181 + (0,2) | 1 | 1 | 5 | 40906 | 163840 | 8181 + (0,2) | 1 | 1 | 6 | 49087 | 196608 | 8181 + (0,2) | 1 | 1 | 7 | 57268 | 229376 | 8181 + (0,2) | 1 | 1 | 8 | 65449 | 262144 | 8181 + (0,2) | 1 | 1 | 9 | 73630 | 294912 | 8181 + (0,2) | 1 | 1 | 10 | 81811 | 327680 | 8181 + (0,2) | 1 | 1 | 11 | 89992 | 360448 | 8181 + (0,2) | 1 | 1 | 12 | 98173 | 393216 | 8181 + (0,2) | 1 | 1 | 13 | 106354 | 425984 | 8181 + (0,2) | 1 | 1 | 14 | 114535 | 458752 | 8181 + (0,2) | 1 | 1 | 15 | 122716 | 491520 | 8181 + (0,2) | 1 | 1 | 16 | 130897 | 524288 | 8181 + (0,2) | 1 | 1 | 17 | 139078 | 557056 | 8181 + (0,2) | 1 | 1 | 18 | 147259 | 589824 | 8181 + (0,2) | 1 | 1 | 19 | 155440 | 622592 | 8181 + (0,2) | 1 | 1 | 20 | 163621 | 655360 | 8181 + (0,2) | 1 | 1 | 21 | 171802 | 688128 | 8181 + (0,2) | 1 | 1 | 22 | 179983 | 720896 | 8181 + (0,2) | 1 | 1 | 23 | 188164 | 753664 | 8181 + (0,2) | 1 | 1 | 24 | 196345 | 786432 | 8181 + (0,2) | 1 | 1 | 25 | 204526 | 819200 | 8181 + (0,2) | 1 | 1 | 26 | 212707 | 851968 | 8181 + (0,2) | 1 | 1 | 27 | 220888 | 884736 | 8181 + (0,2) | 1 | 1 | 28 | 229069 | 917504 | 8181 + (0,2) | 1 | 1 | 29 | 237250 | 950272 | 8181 + (0,2) | 1 | 1 | 30 | 245431 | 983040 | 8181 + (0,2) | 1 | 1 | 31 | 253612 | 1015808 | 8181 + (0,2) | 1 | 1 | 32 | 261793 | 1048576 | 8181 + (0,2) | 1 | 1 | 33 | 269974 | 1081344 | 8181 + (0,2) | 1 | 1 | 34 | 278155 | 1114112 | 8181 + (0,2) | 1 | 1 | 35 | 286336 | 1146880 | 8181 + (0,2) | 1 | 1 | 36 | 294517 | 1179648 | 8181 + (0,2) | 1 | 1 | 37 | 302698 | 1212416 | 8181 + (0,2) | 1 | 1 | 38 | 310879 | 1245184 | 8181 + (0,2) | 1 | 1 | 39 | 319060 | 1277952 | 8181 + (0,2) | 1 | 1 | 40 | 327241 | 1310720 | 8181 + (0,2) | 1 | 1 | 41 | 335422 | 1343488 | 8181 + (0,2) | 1 | 1 | 42 | 343603 | 1376256 | 8181 + (0,2) | 1 | 1 | 43 | 351784 | 1409024 | 8181 + (0,2) | 1 | 1 | 44 | 359965 | 1441792 | 8181 + (0,2) | 1 | 1 | 45 | 368146 | 1474560 | 
8181 + (0,2) | 1 | 1 | 46 | 376327 | 1507328 | 8181 + (0,2) | 1 | 1 | 47 | 384508 | 1540096 | 8181 + (0,2) | 1 | 1 | 48 | 392689 | 1572864 | 8181 + (0,2) | 1 | 1 | 49 | 400870 | 1605632 | 8181 + (0,2) | 1 | 1 | 50 | 409051 | 1638400 | 8181 + (0,2) | 1 | 1 | 51 | 417232 | 1671168 | 8181 + (0,2) | 1 | 1 | 52 | 425413 | 1703936 | 8181 + (0,2) | 1 | 1 | 53 | 433594 | 1736704 | 8181 + (0,2) | 1 | 1 | 54 | 441775 | 1769472 | 8181 + (0,2) | 1 | 1 | 55 | 449956 | 1802240 | 8181 + (0,2) | 1 | 1 | 56 | 458137 | 1835008 | 8181 + (0,2) | 1 | 1 | 57 | 466318 | 1867776 | 8181 + (0,2) | 1 | 1 | 58 | 474499 | 1900544 | 8181 + (0,2) | 1 | 1 | 59 | 482680 | 1933312 | 8181 + (0,2) | 1 | 1 | 60 | 490861 | 1966080 | 8181 + (0,2) | 1 | 1 | 61 | 499042 | 1998848 | 8181 + (0,2) | 1 | 1 | 62 | 507223 | 2031616 | 8181 + (0,2) | 1 | 1 | 63 | 515404 | 2064384 | 8181 + (0,2) | 1 | 1 | 64 | 523585 | 2097152 | 8181 + (0,2) | 1 | 1 | 65 | 531766 | 2129920 | 8181 + (0,2) | 1 | 1 | 66 | 539947 | 2162688 | 8181 + (0,2) | 1 | 1 | 67 | 548128 | 2195456 | 8181 + (0,2) | 1 | 1 | 68 | 556309 | 2228224 | 8181 + (0,2) | 1 | 1 | 69 | 564490 | 2260992 | 8181 + (0,2) | 1 | 1 | 70 | 572671 | 2293760 | 8181 + (0,2) | 1 | 1 | 71 | 580852 | 2326528 | 8181 + (0,2) | 1 | 1 | 72 | 589033 | 2359296 | 8181 + (0,2) | 1 | 1 | 73 | 597214 | 2392064 | 8181 + (0,2) | 1 | 1 | 74 | 605395 | 2424832 | 8181 + (0,2) | 1 | 1 | 75 | 613576 | 2457600 | 8181 + (0,2) | 1 | 1 | 76 | 621757 | 2490368 | 8181 + (0,2) | 1 | 1 | 77 | 629938 | 2523136 | 8181 + (0,2) | 1 | 1 | 78 | 638119 | 2555904 | 8181 + (0,2) | 1 | 1 | 79 | 646300 | 2588672 | 8181 + (0,2) | 1 | 1 | 80 | 654481 | 2621440 | 8181 + (0,2) | 1 | 1 | 81 | 662662 | 2654208 | 8181 + (0,2) | 1 | 1 | 82 | 670843 | 2686976 | 8181 + (0,2) | 1 | 1 | 83 | 679024 | 2719744 | 8181 + (0,2) | 1 | 1 | 84 | 687205 | 2752512 | 8181 + (0,2) | 1 | 1 | 85 | 695386 | 2785280 | 8181 + (0,2) | 1 | 1 | 86 | 703567 | 2818048 | 8181 + (0,2) | 1 | 1 | 87 | 711748 | 2850816 | 8181 + (0,2) | 1 | 1 | 88 
| 719929 | 2883584 | 8181 + (0,2) | 1 | 1 | 89 | 728110 | 2916352 | 8181 + (0,2) | 1 | 1 | 90 | 736291 | 2949120 | 8181 + (0,2) | 1 | 1 | 91 | 744472 | 2981888 | 8181 + (0,2) | 1 | 1 | 92 | 752653 | 3014656 | 8181 + (0,2) | 1 | 1 | 93 | 760834 | 3047424 | 8181 + (0,2) | 1 | 1 | 94 | 769015 | 3080192 | 8181 + (0,2) | 1 | 1 | 95 | 777196 | 3112960 | 8181 + (0,2) | 1 | 1 | 96 | 785377 | 3145728 | 8181 + (0,2) | 1 | 1 | 97 | 793558 | 3178496 | 8181 + (0,2) | 1 | 1 | 98 | 801739 | 3211264 | 8181 + (0,2) | 1 | 1 | 99 | 809920 | 3244032 | 8181 + (0,2) | 1 | 1 | 100 | 818101 | 3276800 | 8181 + (0,2) | 1 | 1 | 101 | 826282 | 3309568 | 8181 + (0,2) | 1 | 1 | 102 | 834463 | 3342336 | 8181 + (0,2) | 1 | 1 | 103 | 842644 | 3375104 | 8181 + (0,2) | 1 | 1 | 104 | 850825 | 3407872 | 8181 + (0,2) | 1 | 1 | 105 | 859006 | 3440640 | 8181 + (0,2) | 1 | 1 | 106 | 867187 | 3473408 | 8181 + (0,2) | 1 | 1 | 107 | 875368 | 3506176 | 8181 + (0,2) | 1 | 1 | 108 | 883549 | 3538944 | 8181 + (0,2) | 1 | 1 | 109 | 891730 | 3571712 | 8181 + (0,2) | 1 | 1 | 110 | 899911 | 3604480 | 8181 + (0,2) | 1 | 1 | 111 | 908092 | 3637248 | 8181 + (0,2) | 1 | 1 | 112 | 916273 | 3670016 | 8181 + (0,2) | 1 | 1 | 113 | 924454 | 3702784 | 8181 + (0,2) | 1 | 1 | 114 | 932635 | 3735552 | 8181 + (0,2) | 1 | 1 | 115 | 940816 | 3768320 | 8181 + (0,2) | 1 | 1 | 116 | 948997 | 3801088 | 8181 + (0,2) | 1 | 1 | 117 | 957178 | 3833856 | 8181 + (0,2) | 1 | 1 | 118 | 965359 | 3866624 | 8181 + (0,2) | 1 | 1 | 119 | 973540 | 3899392 | 8181 + (0,2) | 1 | 1 | 120 | 981721 | 3932160 | 8181 + (0,2) | 1 | 1 | 121 | 989902 | 3964928 | 8181 + (0,2) | 1 | 1 | 122 | 998083 | 3997696 | 8181 + (0,2) | 1 | 1 | 123 | 1006264 | 4030464 | 8181 + (0,2) | 1 | 1 | 124 | 1014445 | 4063232 | 8181 + (0,2) | 1 | 1 | 125 | 1022626 | 4096000 | 8181 + (0,2) | 1 | 1 | 126 | 1030807 | 4128768 | 8181 + (0,2) | 1 | 1 | 127 | 1038988 | 4161536 | 8181 + (0,2) | 1 | 1 | 128 | 1047169 | 4194304 | 8181 + (0,2) | 1 | 1 | 129 | 1055350 | 4227072 | 8181 + (0,2) | 
1 | 1 | 130 | 1063531 | 4259840 | 8181 + (0,2) | 1 | 1 | 131 | 1071712 | 4292608 | 8181 + (0,2) | 1 | 1 | 132 | 1079893 | 4325376 | 8181 + (0,2) | 1 | 1 | 133 | 1088074 | 4358144 | 8181 + (0,2) | 1 | 1 | 134 | 1096255 | 4390912 | 8181 + (0,2) | 1 | 1 | 135 | 1104436 | 4423680 | 8181 + (0,2) | 1 | 1 | 136 | 1112617 | 4456448 | 8181 + (0,2) | 1 | 1 | 137 | 1120798 | 4489216 | 8181 + (0,2) | 1 | 1 | 138 | 1128979 | 4521984 | 8181 + (0,2) | 1 | 1 | 139 | 1137160 | 4554752 | 8181 + (0,2) | 1 | 1 | 140 | 1145341 | 4587520 | 8181 + (0,2) | 1 | 1 | 141 | 1153522 | 4620288 | 8181 + (0,2) | 1 | 1 | 142 | 1161703 | 4653056 | 8181 + (0,2) | 1 | 1 | 143 | 1169884 | 4685824 | 8181 + (0,2) | 1 | 1 | 144 | 1178065 | 4718592 | 8181 + (0,2) | 1 | 1 | 145 | 1186246 | 4751360 | 8181 + (0,2) | 1 | 1 | 146 | 1194427 | 4784128 | 8181 + (0,2) | 1 | 1 | 147 | 1202608 | 4816896 | 8181 + (0,2) | 1 | 1 | 148 | 1210789 | 4849664 | 8181 + (0,2) | 1 | 1 | 149 | 1218970 | 4882432 | 8181 + (0,2) | 1 | 1 | 150 | 1227151 | 4915200 | 8181 + (0,2) | 1 | 1 | 151 | 1235332 | 4947968 | 8181 + (0,2) | 1 | 1 | 152 | 1243513 | 4980736 | 8181 + (0,2) | 1 | 1 | 153 | 1251694 | 5013504 | 8181 + (0,2) | 1 | 1 | 154 | 1259875 | 5046272 | 8181 + (0,2) | 1 | 1 | 155 | 1268056 | 5079040 | 8181 + (0,2) | 1 | 1 | 156 | 1276237 | 5111808 | 8181 + (0,2) | 1 | 1 | 157 | 1284418 | 5144576 | 8181 + (0,2) | 1 | 1 | 158 | 1292599 | 5177344 | 8181 + (0,2) | 1 | 1 | 159 | 1300780 | 5210112 | 8181 + (0,2) | 1 | 1 | 160 | 1308961 | 5242880 | 8181 + (0,3) | 1 | 0 | 0 | 1317142 | 5275648 | 2 + (0,4) | 1 | 1 | 0 | 1317142 | 5275648 | 2 +(324 rows) + +-- Unique index white box tests +DROP TABLE aoco_blkdir_test; +DROP +CREATE TABLE aoco_blkdir_test(h int, i int UNIQUE, j int) USING ao_column DISTRIBUTED BY (i); +CREATE + +SELECT gp_inject_fault('appendonly_insert', 'suspend', '', '', 'aoco_blkdir_test', 1, 1, 0, dbid) FROM gp_segment_configuration WHERE role = 'p' AND content = 0; + gp_inject_fault +----------------- + Success: +(1 
row) +1: BEGIN; +BEGIN +1&: INSERT INTO aoco_blkdir_test VALUES (2, 2, 2); + +-- There should be a placeholder row inserted to cover the rows for each INSERT +-- session (for the first non-dropped column), before we insert the 1st row in +-- that session, that is only visible to SNAPSHOT_DIRTY. +SELECT gp_wait_until_triggered_fault('appendonly_insert', 1, dbid) FROM gp_segment_configuration WHERE role = 'p' AND content = 0; + gp_wait_until_triggered_fault +------------------------------- + Success: +(1 row) +SELECT (gp_toolkit.__gp_aoblkdir('aoco_blkdir_test')).* FROM gp_dist_random('gp_id') WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count +---------+-------+----------------+----------+--------------+-------------+----------- +(0 rows) +SET gp_select_invisible TO ON; +SET +SELECT (gp_toolkit.__gp_aoblkdir('aoco_blkdir_test')).* FROM gp_dist_random('gp_id') WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count +---------+-------+----------------+----------+--------------+-------------+--------------- + (0,1) | 1 | 0 | 0 | 1 | 0 | 1099511627775 +(1 row) +RESET gp_select_invisible; +RESET + +-- The placeholder row is invisible to other transactions (that don't perform a +-- uniqueness check) while the INSERT is in progress. +2: SELECT (gp_toolkit.__gp_aoblkdir('aoco_blkdir_test')).* FROM gp_dist_random('gp_id') WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count +---------+-------+----------------+----------+--------------+-------------+----------- +(0 rows) + +-- Before the INSERT commits, if we try to drop column 'h', for which the +-- placeholder row was created, the session will block (locking). 
So it is +-- perfectly safe to use 1 placeholder row (and not have 1 placeholder/column) +3&: ALTER TABLE aoco_blkdir_test DROP COLUMN h; + +SELECT gp_inject_fault('appendonly_insert', 'reset', dbid) FROM gp_segment_configuration WHERE role = 'p' AND content = 0; + gp_inject_fault +----------------- + Success: +(1 row) +1<: <... completed> +INSERT 1 + +-- The placeholder row is invisible to the INSERTing transaction. Since the +-- INSERT finished, there should be 3 visible blkdir rows representing the +-- INSERT, 1 for each column. +1: SELECT (gp_toolkit.__gp_aoblkdir('aoco_blkdir_test')).* FROM gp_dist_random('gp_id') WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count +---------+-------+----------------+----------+--------------+-------------+----------- + (0,2) | 1 | 0 | 0 | 1 | 0 | 1 + (0,3) | 1 | 1 | 0 | 1 | 0 | 1 + (0,4) | 1 | 2 | 0 | 1 | 0 | 1 +(3 rows) + +-- The placeholder row is invisible to other transactions (that don't perform a +-- uniqueness check) even after the INSERT finishes. The blkdir rows representing +-- the INSERT should not be visible as the INSERTing transaction hasn't +-- committed yet. +2: SELECT (gp_toolkit.__gp_aoblkdir('aoco_blkdir_test')).* FROM gp_dist_random('gp_id') WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count +---------+-------+----------------+----------+--------------+-------------+----------- +(0 rows) + +1: COMMIT; +COMMIT + +-- The placeholder row is invisible to other transactions (that don't perform a +-- uniqueness check) even after the INSERTing transaction commits. Since the +-- INSERTing transaction has committed, the blkdir rows representing the INSERT +-- should be visible now. 
+2: SELECT (gp_toolkit.__gp_aoblkdir('aoco_blkdir_test')).* FROM gp_dist_random('gp_id') WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count +---------+-------+----------------+----------+--------------+-------------+----------- + (0,2) | 1 | 0 | 0 | 1 | 0 | 1 + (0,3) | 1 | 1 | 0 | 1 | 0 | 1 + (0,4) | 1 | 2 | 0 | 1 | 0 | 1 +(3 rows) + +-- Now even though the DROP COLUMN has finished, we would still be able to +-- properly resolve uniqueness checks (by consulting the first non-dropped +-- column's block directory row). +3<: <... completed> +ALTER +4: INSERT INTO aoco_blkdir_test VALUES (2, 2); +ERROR: duplicate key value violates unique constraint "aoco_blkdir_test_i_key" (seg0 192.168.0.148:7002 pid=176693) +DETAIL: Key (i)=(2) already exists. + +DROP TABLE aoco_blkdir_test; +DROP diff --git a/src/test/isolation2/expected/ao_unique_index.out b/src/test/isolation2/expected/ao_unique_index.out new file mode 100644 index 00000000000..8748b85bd69 --- /dev/null +++ b/src/test/isolation2/expected/ao_unique_index.out @@ -0,0 +1,571 @@ +-- Tests to ensure that unique indexes work as expected w/ ao_row tables. + +-- We use a replicated table to test each table for ease in testing edge cases +-- where conflicts arise at block directory boundaries. We can treat the table +-- as if it were being populated in utility mode on a single segment, allowing +-- us to predict block directory entries without having to worry about the +-- table's distribution. 
+ +-- Case 1: Conflict with committed transaction---------------------------------- +CREATE TABLE unique_index_ao_row (a INT unique) USING ao_row DISTRIBUTED REPLICATED; +CREATE +INSERT INTO unique_index_ao_row SELECT * FROM generate_series(1, 329729); +INSERT 329729 +-- should conflict +INSERT INTO unique_index_ao_row VALUES (1); +ERROR: duplicate key value violates unique constraint "unique_index_ao_row_a_key" (seg1 192.168.0.148:7003 pid=205740) +DETAIL: Key (a)=(1) already exists. +INSERT INTO unique_index_ao_row VALUES (329729); +ERROR: duplicate key value violates unique constraint "unique_index_ao_row_a_key" (seg2 192.168.0.148:7004 pid=205741) +DETAIL: Key (a)=(329729) already exists. +-- should not conflict +INSERT INTO unique_index_ao_row VALUES (329730); +INSERT 1 +DROP TABLE unique_index_ao_row; +DROP + +-- Case 2: Conflict within the same transaction--------------------------------- +CREATE TABLE unique_index_ao_row (a INT unique) USING ao_row DISTRIBUTED REPLICATED; +CREATE +BEGIN; +BEGIN +INSERT INTO unique_index_ao_row SELECT * FROM generate_series(1, 329729); +INSERT 329729 +-- should conflict +INSERT INTO unique_index_ao_row VALUES (1); +ERROR: duplicate key value violates unique constraint "unique_index_ao_row_a_key" (seg0 192.168.0.148:7002 pid=205739) +DETAIL: Key (a)=(1) already exists. +END; +END +DROP TABLE unique_index_ao_row; +DROP + +CREATE TABLE unique_index_ao_row (a INT unique) USING ao_row DISTRIBUTED REPLICATED; +CREATE +BEGIN; +BEGIN +INSERT INTO unique_index_ao_row SELECT * FROM generate_series(1, 329729); +INSERT 329729 +-- should conflict +INSERT INTO unique_index_ao_row VALUES (329729); +ERROR: duplicate key value violates unique constraint "unique_index_ao_row_a_key" (seg0 192.168.0.148:7002 pid=205739) +DETAIL: Key (a)=(329729) already exists. 
+END; +END +DROP TABLE unique_index_ao_row; +DROP + +CREATE TABLE unique_index_ao_row (a INT unique) USING ao_row DISTRIBUTED REPLICATED; +CREATE +BEGIN; +BEGIN +INSERT INTO unique_index_ao_row SELECT * FROM generate_series(1, 329729); +INSERT 329729 +-- should not conflict +INSERT INTO unique_index_ao_row VALUES (329730); +INSERT 1 +END; +END +DROP TABLE unique_index_ao_row; +DROP + +-- Case 3: Conflict with aborted transaction is not a conflict------------------ +CREATE TABLE unique_index_ao_row (a INT unique) USING ao_row DISTRIBUTED REPLICATED; +CREATE +BEGIN; +BEGIN +INSERT INTO unique_index_ao_row SELECT * FROM generate_series(1, 329729); +INSERT 329729 +ABORT; +ABORT +-- should not conflict +INSERT INTO unique_index_ao_row VALUES (1); +INSERT 1 +INSERT INTO unique_index_ao_row VALUES (329729); +INSERT 1 +INSERT INTO unique_index_ao_row VALUES (329730); +INSERT 1 +DROP TABLE unique_index_ao_row; +DROP + +-- Case 4: Conflict with to-be-committed transaction---------------------------- +-- +-- 1. Uncommitted tx 1 has inserted non-conflicting key = 0. +-- 2. Uncommitted tx 2 has inserted (161 * 2048 + 1 = 329729 rows), which spans +-- 2 block directory rows (1st row: [1,329728] ; 2nd row: [329729,329729]) +-- 3. Tx 3 tries to insert conflicting key = 1, which maps to a rownum +-- covered by the 1st block directory row of seg 1, and blocks on tx 2. +-- 4. Tx 4 tries to insert conflicting key = 329728, which maps to the last +-- rownum covered by the 1st block directory row of seg 1, and blocks on tx 2. +-- 5. Tx 5 tries to insert conflicting key = 329729, which maps to the first +-- rownum covered by the 2nd block directory row of seg 1, and blocks on tx 2. +-- 6. Tx 6 tries to insert non-conflicting key = 329730 and is immediately +-- successful. +-- 7. Tx 2 commits +-- 8. Txs 3,4,5 report unique constraint violation +-- 9.
Tx 1 commits +-- +CREATE TABLE unique_index_ao_row (a INT unique) USING ao_row DISTRIBUTED REPLICATED; +CREATE +1: BEGIN; +BEGIN +1: INSERT INTO unique_index_ao_row VALUES (0); +INSERT 1 +2: BEGIN; +BEGIN +2: INSERT INTO unique_index_ao_row SELECT * FROM generate_series(1, 329729); +INSERT 329729 +3&: INSERT INTO unique_index_ao_row VALUES (1); +4&: INSERT INTO unique_index_ao_row VALUES (329728); +5&: INSERT INTO unique_index_ao_row VALUES (329729); +-- should succeed immediately +6: INSERT INTO unique_index_ao_row VALUES (329730); +INSERT 1 +2: COMMIT; +COMMIT +3<: <... completed> +ERROR: duplicate key value violates unique constraint "unique_index_ao_row_a_key" (seg1 192.168.0.148:7003 pid=205769) +DETAIL: Key (a)=(1) already exists. +4<: <... completed> +ERROR: duplicate key value violates unique constraint "unique_index_ao_row_a_key" (seg0 192.168.0.148:7002 pid=205777) +DETAIL: Key (a)=(329728) already exists. +5<: <... completed> +ERROR: duplicate key value violates unique constraint "unique_index_ao_row_a_key" (seg1 192.168.0.148:7003 pid=205787) +DETAIL: Key (a)=(329729) already exists. +1: COMMIT; +COMMIT +DROP TABLE unique_index_ao_row; +DROP + +-- Case 5: Conflict with to-be-aborted transaction------------------------------ +-- +-- 1. Uncommitted tx 1 has inserted non-conflicting key = 0. +-- 2. Uncommitted tx 2 has inserted (161 * 2048 + 1 = 329729 rows), which spans +-- 2 block directory rows (1st row: [1,329728] ; 2nd row: [329729,329729]) +-- 3. Tx 3 tries to insert conflicting key = 2, which maps to the second rownum +-- covered by the 1st block directory row of seg 1, and blocks on tx 2. +-- 4. Tx 4 tries to insert conflicting key = 329728, which maps to the last +-- rownum covered by the 1st block directory row of seg 1, and blocks on tx 2. +-- 5. Tx 5 tries to insert conflicting key = 329729, which maps to the first +-- rownum covered by the 2nd block directory row of seg 1, and blocks on tx 2. +-- 6. 
Tx 6 tries to insert non-conflicting key = 329730 and is immediately +-- successful. +-- 7. Tx 2 aborts +-- 8. Txs 3,4,5 succeed, since the conflicting rows were rolled back +-- 9. Tx 1 commits +-- +CREATE TABLE unique_index_ao_row (a INT unique) USING ao_row DISTRIBUTED REPLICATED; +CREATE +1: BEGIN; +BEGIN +1: INSERT INTO unique_index_ao_row VALUES (0); +INSERT 1 +2: BEGIN; +BEGIN +2: INSERT INTO unique_index_ao_row SELECT * FROM generate_series(1, 329729); +INSERT 329729 +3&: INSERT INTO unique_index_ao_row VALUES (1); +4&: INSERT INTO unique_index_ao_row VALUES (329728); +5&: INSERT INTO unique_index_ao_row VALUES (329729); +-- should succeed immediately +6: INSERT INTO unique_index_ao_row VALUES (329730); +INSERT 1 +2: ABORT; +ABORT +3<: <... completed> +INSERT 1 +4<: <... completed> +INSERT 1 +5<: <... completed> +INSERT 1 +1: COMMIT; +COMMIT +DROP TABLE unique_index_ao_row; +DROP + +-- Case 6: Conflict with aborted rows following some committed rows ------------ +CREATE TABLE unique_index_ao_row (a INT unique) USING ao_row DISTRIBUTED REPLICATED; +CREATE +-- 1. Tx 1 commits rows 1-100. +-- 2. Tx 2 inserts rows 101-200 and then aborts. +-- 3. Tx 3 tries to insert row in range [101,200] and is immediately successful. +-- 4. Tx 4 tries to insert conflicting row in range [1,100] and raises unique +-- constraint violation. +-- 5. Tx 5 tries to insert row in range [201, ) and is immediately successful. +1: INSERT INTO unique_index_ao_row SELECT generate_series(1, 100); +INSERT 100 +2: BEGIN; +BEGIN +2: INSERT INTO unique_index_ao_row SELECT generate_series(101, 200); +INSERT 100 +2: ABORT; +ABORT +3: INSERT INTO unique_index_ao_row VALUES(102); +INSERT 1 +4: INSERT INTO unique_index_ao_row VALUES(2); +ERROR: duplicate key value violates unique constraint "unique_index_ao_row_a_key" (seg0 192.168.0.148:7002 pid=659656) +DETAIL: Key (a)=(2) already exists.
+5: INSERT INTO unique_index_ao_row VALUES(202); +INSERT 1 +DROP TABLE unique_index_ao_row; +DROP + +-------------------------------------------------------------------------------- +----------------- More concurrent tests with fault injection ------------------ +-------------------------------------------------------------------------------- + +-- Case 7: Conflict with to-be-committed transaction while only 1 placeholder +-- row exists in the block directory-------------------------------------------- +-- +-- This case highlights the importance of the placeholder row, inserted at the +-- beginning of an INSERT command. +-- +-- 1. Uncommitted Tx 1 has inserted 3 out of its 10 rows and is suspended. +-- 2. Tx 2 inserts a conflicting row and blocks on Tx 1. +-- 3. Tx 3 inserts a non-conflicting row within the range [4,10] and is +-- immediately successful. (Index entries have been written only for [1,3] so +-- far, so conflicts shouldn't arise) +-- 4. Tx 4 inserts a non-conflicting row in range [11, ..) and should be +-- immediately successful. +-- 5. Now Tx 1 resumes and tries to insert a row in range [4,10] and reports a +-- unique constraint violation with Tx 3. +-- 6. Tx 2 succeeds as Tx 1 aborted. + +CREATE TABLE unique_index_ao_row (a INT unique) USING ao_row DISTRIBUTED REPLICATED; +CREATE +SELECT gp_inject_fault('appendonly_insert', 'suspend', '', '', 'unique_index_ao_row', 4, 4, 0, dbid) FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; + gp_inject_fault +----------------- + Success: + Success: + Success: +(3 rows) +1&: INSERT INTO unique_index_ao_row SELECT * FROM generate_series(1, 10); +-- Wait until 3 rows have been successfully inserted into the index and Tx 1 +-- is just beginning to insert the 4th row. 
+SELECT gp_wait_until_triggered_fault('appendonly_insert', 4, dbid) FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; + gp_wait_until_triggered_fault +------------------------------- + Success: + Success: + Success: +(3 rows) +2&: INSERT INTO unique_index_ao_row VALUES(2); +4: INSERT INTO unique_index_ao_row VALUES(11); +INSERT 1 +3: INSERT INTO unique_index_ao_row VALUES(4); +INSERT 1 +SELECT gp_inject_fault('appendonly_insert', 'reset', dbid) FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; + gp_inject_fault +----------------- + Success: + Success: + Success: +(3 rows) +1<: <... completed> +ERROR: duplicate key value violates unique constraint "unique_index_ao_row_a_key" (seg1 192.168.0.148:7003 pid=828519) +DETAIL: Key (a)=(4) already exists. +2<: <... completed> +INSERT 1 +DROP TABLE unique_index_ao_row; +DROP + +-- Case 8: Conflict with to-be-committed transaction - generalization of case 7 +-- where there are multiple minipages (and block directory rows) in play from +-- the same insert. +-- +-- This justifies why 1 placeholder row is enough and we don't need to flush a +-- placeholder row every time we insert a block directory row (i.e. start a new +-- in-memory minipage) throughout the course of a single insert. +-- +-- 1. Uncommitted Tx 1 has inserted (2048 * (161 * 2 + 1) + 3) = 661507 rows +-- and is suspended, enough rows to fill 2 entire minipages (covers +-- range [1,329728] and [329729,659456]) before suspension. +-- 2. Txs 2,3,4 inserts conflicting rows that map to the 1st minipage and block. +-- 3. Txs 5,6,7 inserts conflicting rows that map to the 2nd minipage and block. +-- 4. Tx 8 inserts a conflicting row that maps to the 3rd minipage, which is +-- currently only in-memory and it conflicts on the placeholder row and +-- blocks (showcases why 1 placeholder row is enough) +-- 5. Tx 9 inserts a non-conflicting row for which there is no index entry and +-- and is immediately successful (661510). +-- 6. 
Now Tx 1 resumes and tries to insert 661510 and reports a unique +-- constraint violation with Tx 9. +-- 7. All blocked Txs succeed. + +CREATE TABLE unique_index_ao_row (a INT unique) USING ao_row DISTRIBUTED REPLICATED; +CREATE + +SELECT gp_inject_fault('insert_new_entry_curr_minipage_full', 'suspend', '', '', '', 2, 2, 0, dbid) FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; + gp_inject_fault +----------------- + Success: + Success: + Success: +(3 rows) +1&: INSERT INTO unique_index_ao_row SELECT generate_series(1, 661510); + +-- Wait until we have inserted (2048 * (161 * 2 + 1) + 3) = 661507 rows and we +-- are about to insert the 661508th row. +SELECT gp_wait_until_triggered_fault('insert_new_entry_curr_minipage_full', 2, dbid) FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; + gp_wait_until_triggered_fault +------------------------------- + Success: + Success: + Success: +(3 rows) +SELECT gp_inject_fault('appendonly_insert', 'suspend', '', '', 'unique_index_ao_row', 4, 4, 0, dbid) FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; + gp_inject_fault +----------------- + Success: + Success: + Success: +(3 rows) +SELECT gp_inject_fault('insert_new_entry_curr_minipage_full', 'reset', dbid) FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; + gp_inject_fault +----------------- + Success: + Success: + Success: +(3 rows) +SELECT gp_wait_until_triggered_fault('appendonly_insert', 4, dbid) FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; + gp_wait_until_triggered_fault +------------------------------- + Success: + Success: + Success: +(3 rows) + +-- maps to 1st minipage +2&: INSERT INTO unique_index_ao_row VALUES(1); +3&: INSERT INTO unique_index_ao_row VALUES(300000); +4&: INSERT INTO unique_index_ao_row VALUES(329728); +-- maps to 2nd minipage +5&: INSERT INTO unique_index_ao_row VALUES(329729); +6&: INSERT INTO unique_index_ao_row VALUES(598000); +7&: INSERT INTO unique_index_ao_row 
VALUES(659456); +-- maps to 3rd minipage +8&: INSERT INTO unique_index_ao_row VALUES(661507); +-- no index entry exists for it, so should not conflict. +9: INSERT INTO unique_index_ao_row VALUES(661510); +INSERT 1 + +SELECT gp_inject_fault('appendonly_insert', 'reset', dbid) FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; + gp_inject_fault +----------------- + Success: + Success: + Success: +(3 rows) + +1<: <... completed> +ERROR: duplicate key value violates unique constraint "unique_index_ao_row_a_key" (seg1 192.168.0.148:7003 pid=630215) +DETAIL: Key (a)=(661510) already exists. +2<: <... completed> +INSERT 1 +3<: <... completed> +INSERT 1 +4<: <... completed> +INSERT 1 +5<: <... completed> +INSERT 1 +6<: <... completed> +INSERT 1 +7<: <... completed> +INSERT 1 +8<: <... completed> +INSERT 1 + +DROP TABLE unique_index_ao_row; +DROP + +-------------------------------------------------------------------------------- +--------------------------- Smoke tests for COPY ------------------------------- +-------------------------------------------------------------------------------- + +CREATE TABLE unique_index_ao_row (a INT unique) USING ao_row DISTRIBUTED REPLICATED; +CREATE + +1: BEGIN; +BEGIN +1: COPY unique_index_ao_row FROM PROGRAM 'seq 1 10'; +COPY 30 +-- concurrent tx inserting conflicting row should block. +2&: COPY unique_index_ao_row FROM PROGRAM 'seq 1 1'; +-- concurrent tx inserting non-conflicting rows should be successful. +3: COPY unique_index_ao_row FROM PROGRAM 'seq 11 20'; +COPY 30 +-- inserting a conflicting row in the same transaction should ERROR out. +1: COPY unique_index_ao_row FROM PROGRAM 'seq 1 1'; +ERROR: duplicate key value violates unique constraint "unique_index_ao_row_a_key" +DETAIL: Key (a)=(1) already exists. +CONTEXT: COPY unique_index_ao_row, line 1 +-- now that tx 1 was aborted, tx 2 is successful. +2<: <... 
completed> +COPY 3 +1: END; +END + +DROP TABLE unique_index_ao_row; +DROP + +-------------------------------------------------------------------------------- +-------------------- Smoke tests for subtransactions --------------------------- +-------------------------------------------------------------------------------- +CREATE TABLE unique_index_ao_row (a INT unique) USING ao_row DISTRIBUTED REPLICATED; +CREATE + +1: BEGIN; +BEGIN +1: SAVEPOINT a; +SAVEPOINT +1: INSERT INTO unique_index_ao_row VALUES(1); +INSERT 1 + +-- concurrent tx inserting conflicting row should block. +2: BEGIN; +BEGIN +2&: INSERT INTO unique_index_ao_row VALUES(1); +-- concurrent tx inserting non-conflicting row should be successful. +3: INSERT INTO unique_index_ao_row VALUES(2); +INSERT 1 + +-- conflict should be detected within the same subtx. +1: INSERT INTO unique_index_ao_row VALUES(1); +ERROR: duplicate key value violates unique constraint "unique_index_ao_row_a_key" (seg2 192.168.0.148:7004 pid=3396954) +DETAIL: Key (a)=(1) already exists. +-- the concurrent tx should now succeed. +2<: <... completed> +INSERT 1 +2: ABORT; +ABORT + +-- after rolling back to the savepoint, we should be able to re-insert the key +1: ROLLBACK TO SAVEPOINT a; +ROLLBACK +1: INSERT INTO unique_index_ao_row VALUES(1); +INSERT 1 +1: COMMIT; +COMMIT + +SELECT * FROM unique_index_ao_row; + a +--- + 1 + 2 +(2 rows) + +DROP TABLE unique_index_ao_row; +DROP + +-------------------------------------------------------------------------------- +-------------------- Smoke tests for repeatable read --------------------------- +-------------------------------------------------------------------------------- + +-- Test that shows that unique index checks transcend transaction isolation +-- boundaries. + +CREATE TABLE unique_index_ao_row (a INT unique) USING ao_row DISTRIBUTED REPLICATED; +CREATE + +-- Begin two txs with tx level snapshot taken early. 
+1: BEGIN ISOLATION LEVEL REPEATABLE READ; +BEGIN +1: SELECT * FROM unique_index_ao_row; + a +--- +(0 rows) +2: BEGIN ISOLATION LEVEL REPEATABLE READ; +BEGIN +2: SELECT * FROM unique_index_ao_row; + a +--- +(0 rows) + +-- Now begin a concurrent transaction which inserts a key. +3: BEGIN; +BEGIN +3: INSERT INTO unique_index_ao_row VALUES(1); +INSERT 1 + +-- And another transaction inserts a key and commits. +INSERT INTO unique_index_ao_row VALUES(2); +INSERT 1 + +-- Tx should block on insert of conflicting key, even though it can't "see" the +-- conflicting key due to its isolation level. +1: SELECT * FROM unique_index_ao_row; + a +--- +(0 rows) +1&: INSERT INTO unique_index_ao_row VALUES(1); + +3: ABORT; +ABORT +1<: <... completed> +INSERT 1 +1: ABORT; +ABORT + +-- Tx should raise a conflict, even though it can't "see" the conflicting key +-- due to its isolation level. +2: SELECT * FROM unique_index_ao_row; + a +--- +(0 rows) +2: INSERT INTO unique_index_ao_row VALUES(2); +ERROR: duplicate key value violates unique constraint "unique_index_ao_row_a_key" (seg1 192.168.0.148:7003 pid=3417060) +DETAIL: Key (a)=(2) already exists. +2: ABORT; +ABORT + +DROP TABLE unique_index_ao_row; +DROP + + +-------------------------------------------------------------------------------- +----------------------- Smoke tests for ADD CONSTRAINT ------------------------ +-------------------------------------------------------------------------------- +CREATE TABLE unique_index_ao_row (a INT) USING ao_row DISTRIBUTED REPLICATED; +CREATE +INSERT INTO unique_index_ao_row SELECT * FROM generate_series(1, 5); +INSERT 5 + +ALTER table unique_index_ao_row ADD CONSTRAINT a_unique UNIQUE(a); +ALTER +-- should conflict +INSERT INTO unique_index_ao_row VALUES (1); +DETAIL: Key (a)=(1) already exists. 
+ERROR: duplicate key value violates unique constraint "a_unique" +ALTER table unique_index_ao_row DROP CONSTRAINT a_unique; +ALTER + +INSERT INTO unique_index_ao_row VALUES (1); +INSERT 1 +-- should fail +ALTER table unique_index_ao_row ADD CONSTRAINT a_unique UNIQUE(a); +DETAIL: Key (a)=(1) is duplicated. +ERROR: could not create unique index "a_unique" + +DROP TABLE unique_index_ao_row; +DROP + + +-------------------------------------------------------------------------------- +----------------------- Smoke tests for Multiple Key --------------------------- +-------------------------------------------------------------------------------- +CREATE TABLE unique_index_ao_row (a INT, b INT) USING ao_row DISTRIBUTED REPLICATED; +CREATE +INSERT INTO unique_index_ao_row SELECT i,i FROM generate_series(1, 5) i; +INSERT 5 + +CREATE UNIQUE INDEX a_b_unique ON unique_index_ao_row(a,b); +CREATE +-- should not conflict +INSERT INTO unique_index_ao_row VALUES (1,2); +INSERT 1 +-- should conflict +INSERT INTO unique_index_ao_row VALUES (1,1); +DETAIL: Key (a, b)=(1, 1) already exists. +ERROR: duplicate key value violates unique constraint "a_b_unique" +DROP TABLE unique_index_ao_row; +DROP diff --git a/src/test/isolation2/expected/aocs_unique_index.out b/src/test/isolation2/expected/aocs_unique_index.out new file mode 100644 index 00000000000..839dc5b159d --- /dev/null +++ b/src/test/isolation2/expected/aocs_unique_index.out @@ -0,0 +1,571 @@ +-- Tests to ensure that unique indexes work as expected w/ ao_column tables. + +-- We use a replicated table to test each table for ease in testing edge cases +-- where conflicts arise at block directory boundaries. We can treat the table +-- as if it were being populated in utility mode on a single segment, allowing +-- us to predict block directory entries without having to worry about the +-- table's distribution.
+ +-- Case 1: Conflict with committed transaction---------------------------------- +CREATE TABLE unique_index_ao_column (a bigint unique) USING ao_column DISTRIBUTED REPLICATED; +CREATE +INSERT INTO unique_index_ao_column SELECT * FROM generate_series(1, 658491); +INSERT 658491 +-- should conflict +INSERT INTO unique_index_ao_column VALUES (1); +ERROR: duplicate key value violates unique constraint "unique_index_ao_column_a_key" (seg0 192.168.0.148:7002 pid=721860) +DETAIL: Key (a)=(1) already exists. +INSERT INTO unique_index_ao_column VALUES (658491); +ERROR: duplicate key value violates unique constraint "unique_index_ao_column_a_key" (seg0 192.168.0.148:7002 pid=721860) +DETAIL: Key (a)=(658491) already exists. +-- should not conflict +INSERT INTO unique_index_ao_column VALUES (658492); +INSERT 1 +DROP TABLE unique_index_ao_column; +DROP + +-- Case 2: Conflict within the same transaction--------------------------------- +CREATE TABLE unique_index_ao_column (a bigint unique) USING ao_column DISTRIBUTED REPLICATED; +CREATE +BEGIN; +BEGIN +INSERT INTO unique_index_ao_column SELECT * FROM generate_series(1, 658491); +INSERT 658491 +-- should conflict +INSERT INTO unique_index_ao_column VALUES (1); +ERROR: duplicate key value violates unique constraint "unique_index_ao_column_a_key" (seg1 192.168.0.148:7003 pid=721861) +DETAIL: Key (a)=(1) already exists. +END; +END +DROP TABLE unique_index_ao_column; +DROP + +CREATE TABLE unique_index_ao_column (a bigint unique) USING ao_column DISTRIBUTED REPLICATED; +CREATE +BEGIN; +BEGIN +INSERT INTO unique_index_ao_column SELECT * FROM generate_series(1, 658491); +INSERT 658491 +-- should conflict +INSERT INTO unique_index_ao_column VALUES (658491); +ERROR: duplicate key value violates unique constraint "unique_index_ao_column_a_key" (seg0 192.168.0.148:7002 pid=721860) +DETAIL: Key (a)=(658491) already exists. 
+END; +END +DROP TABLE unique_index_ao_column; +DROP + +CREATE TABLE unique_index_ao_column (a bigint unique) USING ao_column DISTRIBUTED REPLICATED; +CREATE +BEGIN; +BEGIN +INSERT INTO unique_index_ao_column SELECT * FROM generate_series(1, 658491); +INSERT 658491 +-- should not conflict +INSERT INTO unique_index_ao_column VALUES (658492); +INSERT 1 +END; +END +DROP TABLE unique_index_ao_column; +DROP + +-- Case 3: Conflict with aborted transaction is not a conflict------------------ +CREATE TABLE unique_index_ao_column (a bigint unique) USING ao_column DISTRIBUTED REPLICATED; +CREATE +BEGIN; +BEGIN +INSERT INTO unique_index_ao_column SELECT * FROM generate_series(1, 658491); +INSERT 658491 +ABORT; +ABORT +-- should not conflict +INSERT INTO unique_index_ao_column VALUES (1); +INSERT 1 +INSERT INTO unique_index_ao_column VALUES (658491); +INSERT 1 +INSERT INTO unique_index_ao_column VALUES (658492); +INSERT 1 +DROP TABLE unique_index_ao_column; +DROP + +-- Case 4: Conflict with to-be-committed transaction---------------------------- +-- +-- 1. Uncommitted tx 1 has inserted non-conflicting key = 0. +-- 2. Uncommitted tx 2 has inserted (161 * 4090 + 1 = 658491 rows), which spans +-- 2 block directory rows (1st row: [1,658490] ; 2nd row: [658491,658491]) +-- 3. Tx 3 tries to insert conflicting key = 2, which maps to the second rownum +-- covered by the 1st block directory row of seg 1, and blocks on tx 2. +-- 4. Tx 4 tries to insert conflicting key = 658490, which maps to the last +-- rownum covered by the 1st block directory row of seg 1, and blocks on tx 2. +-- 5. Tx 5 tries to insert conflicting key = 658491, which maps to the first +-- rownum covered by the 2nd block directory row of seg 1, and blocks on tx 2. +-- 6. Tx 6 tries to insert non-conflicting key = 658492 and is immediately +-- successful. +-- 8. Tx 2 commits +-- 9. Txs 3,4,5 report unique constraint violation +-- 10. 
Tx 1 commits +-- +CREATE TABLE unique_index_ao_column (a bigint unique) USING ao_column DISTRIBUTED REPLICATED; +CREATE +1: BEGIN; +BEGIN +1: INSERT INTO unique_index_ao_column VALUES (0); +INSERT 1 +2: BEGIN; +BEGIN +2: INSERT INTO unique_index_ao_column SELECT * FROM generate_series(1, 658491); +INSERT 658491 +3&: INSERT INTO unique_index_ao_column VALUES (1); +4&: INSERT INTO unique_index_ao_column VALUES (658490); +5&: INSERT INTO unique_index_ao_column VALUES (658491); +-- should succeed immediately +6: INSERT INTO unique_index_ao_column VALUES (658492); +INSERT 1 +2: COMMIT; +COMMIT +3<: <... completed> +ERROR: duplicate key value violates unique constraint "unique_index_ao_column_a_key" (seg0 192.168.0.148:7002 pid=722493) +DETAIL: Key (a)=(1) already exists. +4<: <... completed> +ERROR: duplicate key value violates unique constraint "unique_index_ao_column_a_key" (seg0 192.168.0.148:7002 pid=722502) +DETAIL: Key (a)=(658490) already exists. +5<: <... completed> +ERROR: duplicate key value violates unique constraint "unique_index_ao_column_a_key" (seg1 192.168.0.148:7003 pid=722513) +DETAIL: Key (a)=(658491) already exists. +1: COMMIT; +COMMIT +DROP TABLE unique_index_ao_column; +DROP + +-- Case 5: Conflict with to-be-aborted transaction------------------------------ +-- +-- 1. Uncommitted tx 1 has inserted non-conflicting key = 0. +-- 2. Uncommitted tx 2 has inserted (161 * 4090 + 1 = 658491 rows), which spans +-- 2 block directory rows (1st row: [1,658490] ; 2nd row: [658491,658491]) +-- 3. Tx 3 tries to insert conflicting key = 2, which maps to the second rownum +-- covered by the 1st block directory row of seg 1, and blocks on tx 2. +-- 4. Tx 4 tries to insert conflicting key = 658490, which maps to the last +-- rownum covered by the 1st block directory row of seg 1, and blocks on tx 2. +-- 5. Tx 5 tries to insert conflicting key = 658491, which maps to the first +-- rownum covered by the 2nd block directory row of seg 1, and blocks on tx 2. +-- 6. 
Tx 6 tries to insert non-conflicting key = 658492 and is immediately +-- successful. +-- 8. Tx 2 aborts +-- 9. Txs 3,4,5 report unique constraint violation +-- 10. Tx 1 commits +-- +CREATE TABLE unique_index_ao_column (a bigint unique) USING ao_column DISTRIBUTED REPLICATED; +CREATE +1: BEGIN; +BEGIN +1: INSERT INTO unique_index_ao_column VALUES (0); +INSERT 1 +2: BEGIN; +BEGIN +2: INSERT INTO unique_index_ao_column SELECT * FROM generate_series(1, 658491); +INSERT 658491 +3&: INSERT INTO unique_index_ao_column VALUES (1); +4&: INSERT INTO unique_index_ao_column VALUES (658490); +5&: INSERT INTO unique_index_ao_column VALUES (658491); +-- should succeed immediately +6: INSERT INTO unique_index_ao_column VALUES (658492); +INSERT 1 +2: ABORT; +ABORT +3<: <... completed> +INSERT 1 +4<: <... completed> +INSERT 1 +5<: <... completed> +INSERT 1 +1: COMMIT; +COMMIT +DROP TABLE unique_index_ao_column; +DROP + +-- Case 6: Conflict with aborted rows following some committed rows ------------ +CREATE TABLE unique_index_ao_column (a bigint unique) USING ao_column DISTRIBUTED REPLICATED; +CREATE +-- 1. Tx 1 commits rows 1-100. +-- 2. Tx 2 inserts rows 101-200 and then aborts. +-- 3. Tx 3 tries to insert row in range [101,200] and is immediately successful. +-- 4. Tx 4 tries to insert conflicting row in range [1,100] and raises unique +-- constraint violation. +-- 5. Tx 5 tries to insert row in range [201, ) and is immediately successful. +1: INSERT INTO unique_index_ao_column SELECT generate_series(1, 100); +INSERT 100 +2: BEGIN; +BEGIN +2: INSERT INTO unique_index_ao_column SELECT generate_series(101, 200); +INSERT 100 +2: ABORT; +ABORT +3: INSERT INTO unique_index_ao_column VALUES(102); +INSERT 1 +4: INSERT INTO unique_index_ao_column VALUES(2); +ERROR: duplicate key value violates unique constraint "unique_index_ao_column_a_key" (seg0 192.168.0.148:7002 pid=659656) +DETAIL: Key (a)=(2) already exists. 
+5: INSERT INTO unique_index_ao_column VALUES(202); +INSERT 1 +DROP TABLE unique_index_ao_column; +DROP + +-------------------------------------------------------------------------------- +----------------- More concurrent tests with fault injection ------------------ +-------------------------------------------------------------------------------- + +-- Case 7: Conflict with to-be-committed transaction while only a placeholder +-- row exists in the block directory-------------------------------------------- +-- +-- This case highlights the importance of the placeholder row, inserted at the +-- beginning of an INSERT command. +-- +-- 1. Uncommitted Tx 1 has inserted 3 out of its 10 rows and is suspended. +-- 2. Tx 2 inserts a conflicting row and blocks on Tx 1. +-- 3. Tx 3 inserts a non-conflicting row within the range [4,10] and is +-- immediately successful. (Index entries have been written only for [1,3] so +-- far, so conflicts shouldn't arise) +-- 4. Tx 4 inserts a non-conflicting row in range [11, ..) and should be +-- immediately successful. +-- 5. Now Tx 1 resumes and tries to insert a row in range [4,10] and reports a +-- unique constraint violation with Tx 3. +-- 6. Tx 2 succeeds as Tx 1 aborted. + +CREATE TABLE unique_index_ao_column (a bigint unique) USING ao_column DISTRIBUTED REPLICATED; +CREATE +SELECT gp_inject_fault('appendonly_insert', 'suspend', '', '', 'unique_index_ao_column', 4, 4, 0, dbid) FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; + gp_inject_fault +----------------- + Success: + Success: + Success: +(3 rows) +1&: INSERT INTO unique_index_ao_column SELECT * FROM generate_series(1, 10); +-- Wait until 3 rows have been successfully inserted into the index and Tx 1 +-- is just beginning to insert the 4th row. 
+SELECT gp_wait_until_triggered_fault('appendonly_insert', 4, dbid) FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; + gp_wait_until_triggered_fault +------------------------------- + Success: + Success: + Success: +(3 rows) +2&: INSERT INTO unique_index_ao_column VALUES(2); +4: INSERT INTO unique_index_ao_column VALUES(11); +INSERT 1 +3: INSERT INTO unique_index_ao_column VALUES(4); +INSERT 1 +SELECT gp_inject_fault('appendonly_insert', 'reset', dbid) FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; + gp_inject_fault +----------------- + Success: + Success: + Success: +(3 rows) +1<: <... completed> +ERROR: duplicate key value violates unique constraint "unique_index_ao_column_a_key" (seg1 192.168.0.148:7003 pid=828519) +DETAIL: Key (a)=(4) already exists. +2<: <... completed> +INSERT 1 +DROP TABLE unique_index_ao_column; +DROP + +-- Case 8: Conflict with to-be-committed transaction - generalization of case 7 +-- where there are multiple minipages (and block directory rows) in play from +-- the same insert. +-- +-- This justifies why 1 placeholder row is enough and we don't need to flush a +-- placeholder row every time we insert a block directory row (i.e. start a new +-- in-memory minipage) throughout the course of a single insert. +-- +-- 1. Uncommitted Tx 1 has inserted (4090 * (161 * 2 + 1) + 4) = 1321074 rows +-- and is suspended, enough rows to fill 2 entire minipages (covers +-- range [1,658490] and [658491,1321070]) before suspension. +-- 2. Txs 2,3,4 inserts conflicting rows that map to the 1st minipage and block. +-- 3. Txs 5,6,7 inserts conflicting rows that map to the 2nd minipage and block. +-- 4. Tx 8 inserts a conflicting row that maps to the 3rd minipage, which is +-- currently only in-memory and it conflicts on the placeholder row and +-- blocks (showcases why 1 placeholder row is enough) +-- 5. Tx 9 inserts a non-conflicting row for which there is no index entry and +-- and is immediately successful (1321075). 
+-- 6. Now Tx 1 resumes and tries to insert 1321075 and reports a unique +-- constraint violation with Tx 9. +-- 7. All blocked Txs succeed. + +CREATE TABLE unique_index_ao_column (a bigint unique) USING ao_column DISTRIBUTED REPLICATED; +CREATE + +SELECT gp_inject_fault('insert_new_entry_curr_minipage_full', 'suspend', '', '', '', 2, 2, 0, dbid) FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; + gp_inject_fault +----------------- + Success: + Success: + Success: +(3 rows) +1&: INSERT INTO unique_index_ao_column SELECT generate_series(1, 1321075); + +-- Wait until we have inserted (4090 * (161 * 2 + 1) + 3) = 1321073 rows and we +-- are about to insert the 1321074th row. +SELECT gp_wait_until_triggered_fault('insert_new_entry_curr_minipage_full', 2, dbid) FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; + gp_wait_until_triggered_fault +------------------------------- + Success: + Success: + Success: +(3 rows) +SELECT gp_inject_fault('appendonly_insert', 'suspend', '', '', 'unique_index_ao_column', 4, 4, 0, dbid) FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; + gp_inject_fault +----------------- + Success: + Success: + Success: +(3 rows) +SELECT gp_inject_fault('insert_new_entry_curr_minipage_full', 'reset', dbid) FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; + gp_inject_fault +----------------- + Success: + Success: + Success: +(3 rows) +SELECT gp_wait_until_triggered_fault('appendonly_insert', 4, dbid) FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; + gp_wait_until_triggered_fault +------------------------------- + Success: + Success: + Success: +(3 rows) + +-- maps to 1st minipage +2&: INSERT INTO unique_index_ao_column VALUES(1); +3&: INSERT INTO unique_index_ao_column VALUES(300000); +4&: INSERT INTO unique_index_ao_column VALUES(658490); +-- maps to 2nd minipage +5&: INSERT INTO unique_index_ao_column VALUES(658491); +6&: INSERT INTO unique_index_ao_column 
VALUES(700000); +7&: INSERT INTO unique_index_ao_column VALUES(1321070); +-- maps to 3rd minipage +8&: INSERT INTO unique_index_ao_column VALUES(1321071); +-- no index entry exists for it, so should not conflict. +9: INSERT INTO unique_index_ao_column VALUES(1321075); +INSERT 1 + +SELECT gp_inject_fault('appendonly_insert', 'reset', dbid) FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; + gp_inject_fault +----------------- + Success: + Success: + Success: +(3 rows) + +1<: <... completed> +ERROR: duplicate key value violates unique constraint "unique_index_ao_column_a_key" (seg2 192.168.0.148:7004 pid=735802) +DETAIL: Key (a)=(1321075) already exists. +2<: <... completed> +INSERT 1 +3<: <... completed> +INSERT 1 +4<: <... completed> +INSERT 1 +5<: <... completed> +INSERT 1 +6<: <... completed> +INSERT 1 +7<: <... completed> +INSERT 1 +8<: <... completed> +INSERT 1 + +DROP TABLE unique_index_ao_column; +DROP + +-------------------------------------------------------------------------------- +--------------------------- Smoke tests for COPY ------------------------------- +-------------------------------------------------------------------------------- + +CREATE TABLE unique_index_ao_column (a INT unique) USING ao_column DISTRIBUTED REPLICATED; +CREATE + +1: BEGIN; +BEGIN +1: COPY unique_index_ao_column FROM PROGRAM 'seq 1 10'; +COPY 30 +-- concurrent tx inserting conflicting row should block. +2&: COPY unique_index_ao_column FROM PROGRAM 'seq 1 1'; +-- concurrent tx inserting non-conflicting rows should be successful. +3: COPY unique_index_ao_column FROM PROGRAM 'seq 11 20'; +COPY 30 +-- inserting a conflicting row in the same transaction should ERROR out. +1: COPY unique_index_ao_column FROM PROGRAM 'seq 1 1'; +ERROR: duplicate key value violates unique constraint "unique_index_ao_column_a_key" +DETAIL: Key (a)=(1) already exists. +CONTEXT: COPY unique_index_ao_column, line 1 +-- now that tx 1 was aborted, tx 2 is successful. +2<: <... 
completed> +COPY 3 +1: END; +END + +DROP TABLE unique_index_ao_column; +DROP + +-------------------------------------------------------------------------------- +-------------------- Smoke tests for subtransactions --------------------------- +-------------------------------------------------------------------------------- +CREATE TABLE unique_index_ao_column (a INT unique) USING ao_column DISTRIBUTED REPLICATED; +CREATE + +1: BEGIN; +BEGIN +1: SAVEPOINT a; +SAVEPOINT +1: INSERT INTO unique_index_ao_column VALUES(1); +INSERT 1 + +-- concurrent tx inserting conflicting row should block. +2: BEGIN; +BEGIN +2&: INSERT INTO unique_index_ao_column VALUES(1); +-- concurrent tx inserting non-conflicting row should be successful. +3: INSERT INTO unique_index_ao_column VALUES(2); +INSERT 1 + +-- conflict should be detected within the same subtx. +1: INSERT INTO unique_index_ao_column VALUES(1); +ERROR: duplicate key value violates unique constraint "unique_index_ao_column_a_key" (seg1 192.168.0.148:7003 pid=3397768) +DETAIL: Key (a)=(1) already exists. +-- the concurrent tx should now succeed. +2<: <... completed> +INSERT 1 +2: ABORT; +ABORT + +-- after rolling back to the savepoint, we should be able to re-insert the key +1: ROLLBACK TO SAVEPOINT a; +ROLLBACK +1: INSERT INTO unique_index_ao_column VALUES(1); +INSERT 1 +1: COMMIT; +COMMIT + +SELECT * FROM unique_index_ao_column; + a +--- + 1 + 2 +(2 rows) + +DROP TABLE unique_index_ao_column; +DROP + +-------------------------------------------------------------------------------- +-------------------- Smoke tests for repeatable read --------------------------- +-------------------------------------------------------------------------------- + +-- Test that shows that unique index checks transcend transaction isolation +-- boundaries. + +CREATE TABLE unique_index_ao_column (a INT unique) USING ao_column DISTRIBUTED REPLICATED; +CREATE + +-- Begin two txs with tx level snapshot taken early. 
+1: BEGIN ISOLATION LEVEL REPEATABLE READ; +BEGIN +1: SELECT * FROM unique_index_ao_column; + a +--- +(0 rows) +2: BEGIN ISOLATION LEVEL REPEATABLE READ; +BEGIN +2: SELECT * FROM unique_index_ao_column; + a +--- +(0 rows) + +-- Now begin a concurrent transaction which inserts a key. +3: BEGIN; +BEGIN +3: INSERT INTO unique_index_ao_column VALUES(1); +INSERT 1 + +-- And another transaction inserts a key and commits. +INSERT INTO unique_index_ao_column VALUES(2); +INSERT 1 + +-- Tx should block on insert of conflicting key, even though it can't "see" the +-- conflicting key due to its isolation level. +1: SELECT * FROM unique_index_ao_column; + a +--- +(0 rows) +1&: INSERT INTO unique_index_ao_column VALUES(1); + +3: ABORT; +ABORT +1<: <... completed> +INSERT 1 +1: ABORT; +ABORT + +-- Tx should raise a conflict, even though it can't "see" the conflicting key +-- due to its isolation level. +2: SELECT * FROM unique_index_ao_column; + a +--- +(0 rows) +2: INSERT INTO unique_index_ao_column VALUES(2); +ERROR: duplicate key value violates unique constraint "unique_index_ao_column_a_key" (seg0 192.168.0.148:7002 pid=3417500) +DETAIL: Key (a)=(2) already exists. +2: ABORT; +ABORT + +DROP TABLE unique_index_ao_column; +DROP + + +-------------------------------------------------------------------------------- +----------------------- Smoke tests for ADD CONSTRAINT ------------------------ +-------------------------------------------------------------------------------- +CREATE TABLE unique_index_ao_column (a INT) USING ao_column DISTRIBUTED REPLICATED; +CREATE +INSERT INTO unique_index_ao_column SELECT * FROM generate_series(1, 5); +INSERT 5 + +ALTER table unique_index_ao_column ADD CONSTRAINT a_unique UNIQUE(a); +ALTER +-- should conflict +INSERT INTO unique_index_ao_column VALUES (1); +DETAIL: Key (a)=(1) already exists. 
+ERROR: duplicate key value violates unique constraint "a_unique" +ALTER table unique_index_ao_column DROP CONSTRAINT a_unique; +ALTER + +INSERT INTO unique_index_ao_column VALUES (1); +INSERT 1 +-- should failed +ALTER table unique_index_ao_column ADD CONSTRAINT a_unique UNIQUE(a); +DETAIL: Key (a)=(1) is duplicated. +ERROR: could not create unique index "a_unique" + +DROP TABLE unique_index_ao_column; +DROP + + +-------------------------------------------------------------------------------- +----------------------- Smoke tests for Multiple Key --------------------------- +-------------------------------------------------------------------------------- +CREATE TABLE unique_index_ao_column (a INT, b INT) USING ao_column DISTRIBUTED REPLICATED; +CREATE +INSERT INTO unique_index_ao_column SELECT i,i FROM generate_series(1, 5) i; +INSERT 5 + +CREATE UNIQUE INDEX a_b_unique ON unique_index_ao_column(a,b); +CREATE +-- should not conflict +INSERT INTO unique_index_ao_column VALUES (1,2); +INSERT 1 +-- should conflict +INSERT INTO unique_index_ao_column VALUES (1,1); +DETAIL: Key (a, b)=(1, 1) already exists. +ERROR: duplicate key value violates unique constraint "a_b_unique" +DROP TABLE unique_index_ao_column; +DROP diff --git a/src/test/isolation2/expected/fsync_ao.out b/src/test/isolation2/expected/fsync_ao.out index 36ea980932d..5c4bd17f3ed 100644 --- a/src/test/isolation2/expected/fsync_ao.out +++ b/src/test/isolation2/expected/fsync_ao.out @@ -83,13 +83,17 @@ select gp_wait_until_triggered_fault('restartpoint_guts', 2, dbid) from gp_segme (1 row) -- Validate that the number of files fsync'ed by checkpointer (on --- mirror). `num times hit` is corresponding to the number of files --- synced by `ao_fsync_counter` fault. +-- mirror). This should match the number of files for fsync_ao and fsync_co. 
+select gp_wait_until_triggered_fault('ao_fsync_counter', 3, dbid) from gp_segment_configuration where content=0 and role='m'; + gp_wait_until_triggered_fault +------------------------------- + Success: +(1 row) select gp_inject_fault('ao_fsync_counter', 'status', dbid) from gp_segment_configuration where content=0 and role='m'; gp_inject_fault ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ - Success: fault name:'ao_fsync_counter' fault type:'skip' ddl statement:'' database name:'' table name:'' start occurrence:'1' end occurrence:'-1' extra arg:'0' fault injection state:'triggered' num times hit:'3' + Success: fault name:'ao_fsync_counter' fault type:'skip' ddl statement:'' database name:'' table name:'' start occurrence:'1' end occurrence:'-1' extra arg:'0' fault injection state:'triggered' num times hit:'3' (1 row) -- Test vacuum compaction with more than one segment file per table. @@ -186,6 +190,11 @@ select gp_wait_until_triggered_fault('restartpoint_guts', 3, dbid) from gp_segme (1 row) -- Expect the segment files that were updated by vacuum to be fsync'ed. 
+select gp_wait_until_triggered_fault('ao_fsync_counter', 12, dbid) from gp_segment_configuration where content=0 and role='m'; + gp_wait_until_triggered_fault +------------------------------- + Success: +(1 row) select gp_inject_fault('ao_fsync_counter', 'status', dbid) from gp_segment_configuration where content=0 and role='m'; gp_inject_fault ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- @@ -227,6 +236,11 @@ select gp_wait_until_triggered_fault('restartpoint_guts', 4, dbid) from gp_segme -- Expect that fsync is only performed for fsync_ao table (1 file) but -- not for fsync_co table because it was dropped after being updated. +select gp_wait_until_triggered_fault('ao_fsync_counter', 13, dbid) from gp_segment_configuration where content=0 and role='m'; + gp_wait_until_triggered_fault +------------------------------- + Success: +(1 row) select gp_inject_fault('ao_fsync_counter', 'status', dbid) from gp_segment_configuration where content=0 and role='m'; gp_inject_fault ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- diff --git a/src/test/isolation2/input/uao/ao_unique_index_vacuum.source b/src/test/isolation2/input/uao/ao_unique_index_vacuum.source new file mode 100644 index 00000000000..4b1f3ca0b4e --- /dev/null +++ b/src/test/isolation2/input/uao/ao_unique_index_vacuum.source @@ -0,0 +1,67 @@ +-- Test cases to cover VACUUM and concurrent INSERT behavior on append-optimized +-- tables with unique indexes. 
+ +-- Case 1: Basic case with a few deleted tuples--------------------------------- +CREATE TABLE unique_index_vacuum_@amname@(i int UNIQUE) USING @amname@ + DISTRIBUTED REPLICATED; +INSERT INTO unique_index_vacuum_@amname@ SELECT generate_series(1, 5); +DELETE FROM unique_index_vacuum_@amname@ WHERE i = 5; +-- should succeed (and not raise conflicts for rows [1,4] while moving rows [1,4]) +VACUUM unique_index_vacuum_@amname@; +-- There should be 1 visible blkdir row with all 4 visible tuples +SELECT (gp_toolkit.__gp_aoblkdir('unique_index_vacuum_@amname@')).* + FROM gp_dist_random('gp_id') WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; +DROP TABLE unique_index_vacuum_@amname@; + +-- Case 2: Concurrent case showcasing that a placeholder block directory row is +-- not necessary to be inserted for the rows transferred to a new segment by +-- a VACUUM operation. +CREATE TABLE unique_index_vacuum_@amname@(i int UNIQUE) USING @amname@ + DISTRIBUTED REPLICATED; +INSERT INTO unique_index_vacuum_@amname@ SELECT generate_series(1, 5); +DELETE FROM unique_index_vacuum_@amname@ WHERE i = 5; + +SELECT gp_inject_fault('appendonly_insert', 'suspend', '', '', 'unique_index_vacuum_@amname@', 2, 2, 0, dbid) + FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; + +1&: VACUUM unique_index_vacuum_@amname@; + +-- Wait until tuple with key i = 1 has been moved by the vacuum operation +SELECT gp_wait_until_triggered_fault('appendonly_insert', 2, dbid) + FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; +-- Even though a new index entry has been written for the moved tuple with key +-- i = 1, the old index entry (pointing to the old segfile) will still be live +-- and will always be used in detecting the conflict (chosen over the new index +-- entry and its associated block directory entry). +INSERT INTO unique_index_vacuum_@amname@ VALUES(1); + +-- Inserting a key not moved yet should also result in a conflict. 
+INSERT INTO unique_index_vacuum_@amname@ VALUES(2); + +SELECT gp_inject_fault('appendonly_insert', 'reset', dbid) + FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; + +1<: +DROP TABLE unique_index_vacuum_@amname@; + +-- Case 3: Validate the occurrence of vacuum index. +-- just after it has bulk deleted the old index entries. +CREATE TABLE unique_index_vacuum_@amname@(i int UNIQUE) USING @amname@ + DISTRIBUTED REPLICATED; +INSERT INTO unique_index_vacuum_@amname@ SELECT generate_series(1, 5); +DELETE FROM unique_index_vacuum_@amname@ WHERE i = 5; + +select gp_inject_fault_infinite('vacuum_ao_after_index_delete', 'skip', dbid) + from gp_segment_configuration where role = 'p' AND content <> -1; + +VACUUM unique_index_vacuum_@amname@; + +-- Wait until all old index entries have been deleted by the VACUUM. +SELECT gp_wait_until_triggered_fault('vacuum_ao_after_index_delete', 1, dbid) + FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; + + +SELECT gp_inject_fault_infinite('vacuum_ao_after_index_delete', 'reset', dbid) + FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; + +DROP TABLE unique_index_vacuum_@amname@; diff --git a/src/test/isolation2/input/uao/test_pg_appendonly_version.source b/src/test/isolation2/input/uao/test_pg_appendonly_version.source new file mode 100644 index 00000000000..9c613e66a16 --- /dev/null +++ b/src/test/isolation2/input/uao/test_pg_appendonly_version.source @@ -0,0 +1,29 @@ +-- Validate pg_appendonly.version functioning by unique index creation +-- as it requires pg_appendonly.version >= AORelationVersion_CB2. 
+ +create table @amname@_version_tbl (a int) using @amname@; + +-- unique index on AO is supported starting from version 2 (AORelationVersion_CB2) +select version from pg_appendonly where relid = '@amname@_version_tbl'::regclass; +create unique index on @amname@_version_tbl(a); +insert into @amname@_version_tbl select generate_series(1, 10); +create unique index on @amname@_version_tbl(a); +set enable_seqscan = off; +select * from @amname@_version_tbl where a = 2; + +-- modify pg_appendonly.version to older such like 1 (AORelationVersion_GP6) +set allow_system_table_mods = on; +update pg_appendonly set version = 1 where relid = '@amname@_version_tbl'::regclass; + +-- unique index on AO isn't supported on version < AORelationVersion_CB2 +select version from pg_appendonly where relid = '@amname@_version_tbl'::regclass; +create unique index on @amname@_version_tbl(a); + +-- alter table with reorganize to verify pg_appendonly being rewritten +alter table @amname@_version_tbl set with (reorganize = true); +select version from pg_appendonly where relid = '@amname@_version_tbl'::regclass; +create unique index on @amname@_version_tbl(a); +select * from @amname@_version_tbl where a = 3; + +drop table @amname@_version_tbl; +reset allow_system_table_mods; diff --git a/src/test/isolation2/input/uao/vacuum_cleanup.source b/src/test/isolation2/input/uao/vacuum_cleanup.source index 1a7ebef0799..fd967a03195 100644 --- a/src/test/isolation2/input/uao/vacuum_cleanup.source +++ b/src/test/isolation2/input/uao/vacuum_cleanup.source @@ -27,8 +27,7 @@ 1: select age(relfrozenxid), regexp_replace(replace(relname, 'ao_@amname@_vacuum_cleanup2'::regclass::oid::text, ''), 'ao.*seg', '') from gp_dist_random('pg_class') where relkind in ('r','t','o','b','M') and (relname like '%' || 'ao_@amname@_vacuum_cleanup2'::regclass::oid || '%') and gp_segment_id = 0; 2: end; --- Check that drop phase is skipped, but still the cleanup phase is performed --- when there are concurrent serializable 
transactions +-- Check that drop phase is not skipped, when there are concurrent serializable transactions. 1: create table ao_@amname@_vacuum_cleanup3(a int, b int) using @amname@; 1: insert into ao_@amname@_vacuum_cleanup3 select i, i from generate_series(1, 100) i; 1: delete from ao_@amname@_vacuum_cleanup3; @@ -41,9 +40,9 @@ -- Wait till compaction phase is completed and only then start the serializable -- transaction to ensure that only drop phase runs after the serializable --- transaction started. Because the transaction is holding a lock on the table, --- the drop phase cannot upgrade to AccessExclusiveLock and so recycling the --- segfiles is skipped. +-- transaction started. For the new VACUUM strategy, because no need to upgrade +-- to AccessExclusiveLock for drop phase, and the transaction would not access +-- AWAITING_DROP segfiles, dropping the dead segfiles could proceed accordingly. 2: select gp_wait_until_triggered_fault('vacuum_relation_open_relation_during_drop_phase', 1, 1); 2: begin isolation level serializable; 2: select * from ao_@amname@_vacuum_cleanup3; @@ -52,10 +51,145 @@ 1<: 1: select age(relfrozenxid), regexp_replace(replace(relname, 'ao_@amname@_vacuum_cleanup3'::regclass::oid::text, ''), 'ao.*seg', '') from gp_dist_random('pg_class') where relkind in ('r','t','o','b','M') and (relname like '%' || 'ao_@amname@_vacuum_cleanup3'::regclass::oid || '%') and gp_segment_id = 0; --- Validate that the drop phase was skipped. segfile 1 should be in state 2 --- (AWAITING_DROP) +-- Validate that the drop phase wasn't skipped in the new VACUUM strategy, +-- segfile 1 should be in state 1 (AWAITING_DEFAULT). This is because +-- no need to acquire AccessExclusiveLock at dead segments recycling hence +-- the cleanup routine could be performed as is. -- This result is related to data distribution. -- Current hash algorithm is jump-consistent-hash. 
1: SELECT * FROM gp_ao_or_aocs_seg('ao_@amname@_vacuum_cleanup3'); 2: commit; + +1q: +2q: + +-- Test VACUUM with concurrent readers: +-- a) if reader transaction started before VACUUM, VACUUM should not drop the AWAITING_DROP segment +-- which was accessing by the reader; +-- b) if reader transaction started after VACUUM, VACUUM should drop the AWAITING_DROP segments. + +create or replace function show_aoseg(tabname text) returns table + (segno int, tupcount bigint, modcount bigint, formatversion smallint, state smallint) as $$ +declare + tabrelid oid; /* in func */ + tabsegrelid oid; /* in func */ + tabsegrelname text; /* in func */ +begin + select tabname::regclass::oid into tabrelid; /* in func */ + select segrelid from pg_appendonly where relid = tabrelid into tabsegrelid; /* in func */ + select relname from pg_class where oid = tabsegrelid into tabsegrelname; /* in func */ + + return query execute 'select segno,tupcount,modcount,formatversion,state from pg_aoseg.' || tabsegrelname; /* in func */ +end; /* in func */ +$$ language plpgsql; + +create table vacuum_concurrent_test_@amname@ (a int, b int, c int) using @amname@; +insert into vacuum_concurrent_test_@amname@ select 2, b, b from generate_series(1, 11) b; +create index i_b_vacuum_concurrent_reader_@amname@ on vacuum_concurrent_test_@amname@(b); +update vacuum_concurrent_test_@amname@ set b = b + 1; + +-- expect segment state is DEFAULT (state == 1) +0U: select * from show_aoseg('vacuum_concurrent_test_@amname@') order by segno; + +-- start a reader before VACUUM +1: begin; +1: select * from vacuum_concurrent_test_@amname@ where b = 10; + +vacuum vacuum_concurrent_test_@amname@; + +-- expect to see AWAITING_DROP(state == 2) tuple +0U: select * from show_aoseg('vacuum_concurrent_test_@amname@') order by segno; + +1: select * from vacuum_concurrent_test_@amname@ where b = 5; +1: select * from vacuum_concurrent_test_@amname@ order by c; +1: end; + +-- start another reader after VACUUM +2: begin; +2: select * 
from vacuum_concurrent_test_@amname@ where c = 2; + +-- expect to see AWAITING_DROP(state == 2) tuple for unable to drop the dead segment by the first VACUUM +0U: select * from show_aoseg('vacuum_concurrent_test_@amname@') order by segno; + +vacuum vacuum_concurrent_test_@amname@; + +-- expect no AWAITING_DROP(state == 2) tuple because it was dropped by the last VACUUM +0U: select * from show_aoseg('vacuum_concurrent_test_@amname@') order by segno; + +2: select * from vacuum_concurrent_test_@amname@ where b = 7; +2: select * from vacuum_concurrent_test_@amname@ order by c; +2: end; + +1q: +2q: + +-- Test VACUUM with concurrent writer. +-- There was a concurrent insert transaction started prior to VACUUM, VACUUM should not mark +-- the target segment which was also being written by the concurrent writer to AWAITING_DROP, and +-- the corresponding index entries should not be deleted. + +truncate table vacuum_concurrent_test_@amname@; +insert into vacuum_concurrent_test_@amname@ select 2, b, b from generate_series(1, 5) b; +delete from vacuum_concurrent_test_@amname@; + +1: begin; +1: insert into vacuum_concurrent_test_@amname@ select 2, b, b from generate_series(6, 10) b; + +2: vacuum vacuum_concurrent_test_@amname@; + +1: commit; + +set enable_seqscan = on; +-- start_ignore +explain (costs off) select * from vacuum_concurrent_test_@amname@ order by b; +-- end_ignore +select * from vacuum_concurrent_test_@amname@ order by b; + +-- expect all bitmapindexscan results are consistent with above seqscan +set enable_seqscan = off; +-- start_ignore +explain (costs off) select * from vacuum_concurrent_test_@amname@ where b = b order by b; +-- end_ignore +select * from vacuum_concurrent_test_@amname@ where b = b order by b; + +-- expose dead tuples +set gp_select_invisible = true; + +set enable_seqscan = on; +-- start_ignore +explain (costs off) select * from vacuum_concurrent_test_@amname@ order by b; +-- end_ignore +select * from vacuum_concurrent_test_@amname@ order by b; + +--
expect all bitmapindexscan results are same as above seqscan +set enable_seqscan = off; +-- start_ignore +explain (costs off) select * from vacuum_concurrent_test_@amname@ where b = b order by b; +-- end_ignore +select * from vacuum_concurrent_test_@amname@ where b = b order by b; + +-- vacuum again without concurrent reader/writer, expect above dead tuples could be removed +2: vacuum vacuum_concurrent_test_@amname@; + +set enable_seqscan = on; +-- start_ignore +explain (costs off) select * from vacuum_concurrent_test_@amname@ order by b; +-- end_ignore +select * from vacuum_concurrent_test_@amname@ order by b; + +-- expect all bitmapindexscan results are same as seqscan +set enable_seqscan = off; +-- start_ignore +explain (costs off) select * from vacuum_concurrent_test_@amname@ where b = b order by b; +-- end_ignore +select * from vacuum_concurrent_test_@amname@ where b = b order by b; + +1q: +2q: + +reset enable_seqscan; +reset gp_select_invisible; + +drop table vacuum_concurrent_test_@amname@; +drop function show_aoseg; diff --git a/src/test/isolation2/input/uao/vacuum_index_stats.source b/src/test/isolation2/input/uao/vacuum_index_stats.source new file mode 100644 index 00000000000..139ef71849a --- /dev/null +++ b/src/test/isolation2/input/uao/vacuum_index_stats.source @@ -0,0 +1,63 @@ +-- This is intended to test a new behavior of VACUUM AO/CO enhancement. +-- The enhancement introduced a new strategy to improve performance by +-- vacuuming indexes based on the collected AWAITING_DROP segment files, +-- instead of reading AO/CO visibility map catalog for every index tuple. +-- This behavior would lead to the index->reltuples being updated only when +-- AWAITING_DROP segment is greater than 0, which requires compaction during +-- VACUUM. If no compaction happens, even if dead tuples were deleted, +-- index->reltuples wouldn't get updated accordingly, which could generate +-- difference between table->reltuples and index->reltuples.
That is supposed +-- to be fine in most cases since bloating size of indexes is limited in +-- the scope of gp_appendonly_compaction_threshold percentage of total tuples. +-- The new strategy would not impact table->reltuples updates. + +create table vacuum_index_stats_@amname@ (a int, b int, c int) using @amname@; +insert into vacuum_index_stats_@amname@ select 2, b, b from generate_series(1, 11) b; +create index i_b_vacuum_index_stats_@amname@ on vacuum_index_stats_@amname@(b); + +set gp_appendonly_compaction_threshold = 10; +analyze vacuum_index_stats_@amname@; + +-- expect reltuples == 11 +0U: select reltuples from pg_class where relname = 'vacuum_index_stats_@amname@'; +-- expect reltuples == 11 +0U: select reltuples from pg_class where relname = 'i_b_vacuum_index_stats_@amname@'; + +-- delete one tuple +delete from vacuum_index_stats_@amname@ where c = 1; +vacuum vacuum_index_stats_@amname@; + +-- hideRatio = hiddenTupcount / totalTupcount * 100 = 1 / 11 * 100 = 9% +-- less than gp_appendonly_compaction_threshold (10%), no compaction would happen +-- during vacuum, expect no change in reltuples of the index but decrease 1 in +-- reltuples of the table. 
+ +-- expect reltuples == 10 +0U: select reltuples from pg_class where relname = 'vacuum_index_stats_@amname@'; +-- expect reltuples == 11 for no compaction happened +0U: select reltuples from pg_class where relname = 'i_b_vacuum_index_stats_@amname@'; + +analyze vacuum_index_stats_@amname@; + +-- expect reltuples == 10 +0U: select reltuples from pg_class where relname = 'vacuum_index_stats_@amname@'; +-- expect reltuples == 10 +0U: select reltuples from pg_class where relname = 'i_b_vacuum_index_stats_@amname@'; + +-- delete two tuples +delete from vacuum_index_stats_@amname@ where c < 4; +vacuum vacuum_index_stats_@amname@; + +-- hideRatio = hiddenTupcount / totalTupcount * 100 = 2 / 10 * 100 = 20% +-- greater than gp_appendonly_compaction_threshold (10%), compaction would happen +-- during vacuum, expect changes in reltuples for both index and table. + +-- expect reltuples == 8 +0U: select reltuples from pg_class where relname = 'vacuum_index_stats_@amname@'; +-- expect reltuples == 8 for compaction happened +0U: select reltuples from pg_class where relname = 'i_b_vacuum_index_stats_@amname@'; + +0Uq: + +drop table vacuum_index_stats_@amname@; +reset gp_appendonly_compaction_threshold; diff --git a/src/test/isolation2/isolation2_schedule b/src/test/isolation2/isolation2_schedule index e89566e9d67..9ff22c39f82 100644 --- a/src/test/isolation2/isolation2_schedule +++ b/src/test/isolation2/isolation2_schedule @@ -90,6 +90,7 @@ test: invalidated_toast_index test: distributed_snapshot test: gp_collation test: ao_upgrade +test: ao_blkdir test: bitmap_index_concurrent test: bitmap_index_crash test: bitmap_update_words_backup_block @@ -150,8 +151,13 @@ test: uao/vacuum_self_function_row test: uao/vacuum_while_insert_row test: uao/vacuum_while_vacuum_row test: uao/vacuum_cleanup_row +test: uao/vacuum_index_stats_row test: uao/bitmapindex_rescan_row test: uao/limit_indexscan_inits_row +test: uao/test_pg_appendonly_version_row +# Refer to the case comment for why it is 
commented out. +# test: uao/bad_buffer_on_temp_ao_row + test: reorganize_after_ao_vacuum_skip_drop truncate_after_ao_vacuum_skip_drop mark_all_aoseg_await_drop # below test(s) inject faults so each of them need to be in a separate group test: segwalrep/master_wal_switch @@ -201,8 +207,12 @@ test: uao/vacuum_self_function_column test: uao/vacuum_while_insert_column test: uao/vacuum_while_vacuum_column test: uao/vacuum_cleanup_column +test: uao/vacuum_index_stats_column test: uao/bitmapindex_rescan_column test: uao/limit_indexscan_inits_column +test: uao/test_pg_appendonly_version_column +# Refer to the case comment for why it is commented out. +# test: uao/bad_buffer_on_temp_ao_column # this case contains fault injection, must be put in a separate test group test: terminate_in_gang_creation @@ -291,3 +301,9 @@ test: check_gxid # test if GUC is synchronized from the QD to QEs. test: sync_guc + +# Tests for unique indexes on AO/CO tables (uses fault injector) +test: ao_unique_index +test: aocs_unique_index +test: uao/ao_unique_index_vacuum_row +test: uao/ao_unique_index_vacuum_column diff --git a/src/test/isolation2/output/uao/ao_unique_index_vacuum.source b/src/test/isolation2/output/uao/ao_unique_index_vacuum.source new file mode 100644 index 00000000000..74db357c6ab --- /dev/null +++ b/src/test/isolation2/output/uao/ao_unique_index_vacuum.source @@ -0,0 +1,116 @@ +-- Test cases to cover VACUUM and concurrent INSERT behavior on append-optimized +-- tables with unique indexes. 
+ +-- Case 1: Basic case with a few deleted tuples--------------------------------- +CREATE TABLE unique_index_vacuum_@amname@(i int UNIQUE) USING @amname@ DISTRIBUTED REPLICATED; +CREATE +INSERT INTO unique_index_vacuum_@amname@ SELECT generate_series(1, 5); +INSERT 5 +DELETE FROM unique_index_vacuum_@amname@ WHERE i = 5; +DELETE 1 +-- should succeed (and not raise conflicts for rows [1,4] while moving rows [1,4]) +VACUUM unique_index_vacuum_@amname@; +VACUUM +-- There should be 1 visible blkdir row with all 4 visible tuples +SELECT (gp_toolkit.__gp_aoblkdir('unique_index_vacuum_@amname@')).* FROM gp_dist_random('gp_id') WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count +---------+-------+----------------+----------+--------------+-------------+----------- + (0,3) | 2 | 0 | 0 | 1 | 0 | 4 +(1 row) +DROP TABLE unique_index_vacuum_@amname@; +DROP + +-- Case 2: Concurrent case showcasing that a placeholder block directory row is +-- not necessary to be inserted for the rows transferred to a new segment by +-- a VACUUM operation. 
+CREATE TABLE unique_index_vacuum_@amname@(i int UNIQUE) USING @amname@ DISTRIBUTED REPLICATED; +CREATE +INSERT INTO unique_index_vacuum_@amname@ SELECT generate_series(1, 5); +INSERT 5 +DELETE FROM unique_index_vacuum_@amname@ WHERE i = 5; +DELETE 1 + +SELECT gp_inject_fault('appendonly_insert', 'suspend', '', '', 'unique_index_vacuum_@amname@', 2, 2, 0, dbid) FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; + gp_inject_fault +----------------- + Success: + Success: + Success: +(3 rows) + +1&: VACUUM unique_index_vacuum_@amname@; + +-- Wait until tuple with key i = 1 has been moved by the vacuum operation +SELECT gp_wait_until_triggered_fault('appendonly_insert', 2, dbid) FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; + gp_wait_until_triggered_fault +------------------------------- + Success: + Success: + Success: +(3 rows) +-- Even though a new index entry has been written for the moved tuple with key +-- i = 1, the old index entry (pointing to the old segfile) will still be live +-- and will always be used in detecting the conflict (chosen over the new index +-- entry and its associated block directory entry). +INSERT INTO unique_index_vacuum_@amname@ VALUES(1); +ERROR: duplicate key value violates unique constraint "unique_index_vacuum_@amname@_i_key" (seg1 192.168.0.148:7003 pid=3197772) +DETAIL: Key (i)=(1) already exists. + +-- Inserting a key not moved yet should also result in a conflict. +INSERT INTO unique_index_vacuum_@amname@ VALUES(2); +ERROR: duplicate key value violates unique constraint "unique_index_vacuum_@amname@_i_key" (seg1 192.168.0.148:7003 pid=3197772) +DETAIL: Key (i)=(2) already exists. + +SELECT gp_inject_fault('appendonly_insert', 'reset', dbid) FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; + gp_inject_fault +----------------- + Success: + Success: + Success: +(3 rows) + +1<: <... 
completed> +VACUUM +DROP TABLE unique_index_vacuum_@amname@; +DROP + +-- Case 3: Validate the occurrence of vacuum index. +-- just after it has bulk deleted the old index entries. +CREATE TABLE unique_index_vacuum_@amname@(i int UNIQUE) USING @amname@ DISTRIBUTED REPLICATED; +CREATE +INSERT INTO unique_index_vacuum_@amname@ SELECT generate_series(1, 5); +INSERT 5 +DELETE FROM unique_index_vacuum_@amname@ WHERE i = 5; +DELETE 1 + +select gp_inject_fault_infinite('vacuum_ao_after_index_delete', 'skip', dbid) from gp_segment_configuration where role = 'p' AND content <> -1; + gp_inject_fault_infinite +-------------------------- + Success: + Success: + Success: +(3 rows) + +VACUUM unique_index_vacuum_@amname@; +VACUUM + +-- Wait until all old index entries have been deleted by the VACUUM. +SELECT gp_wait_until_triggered_fault('vacuum_ao_after_index_delete', 1, dbid) FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; + gp_wait_until_triggered_fault +------------------------------- + Success: + Success: + Success: +(3 rows) + + +SELECT gp_inject_fault_infinite('vacuum_ao_after_index_delete', 'reset', dbid) FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; + gp_inject_fault_infinite +-------------------------- + Success: + Success: + Success: +(3 rows) + +DROP TABLE unique_index_vacuum_@amname@; +DROP diff --git a/src/test/isolation2/output/uao/test_pg_appendonly_version.source b/src/test/isolation2/output/uao/test_pg_appendonly_version.source new file mode 100644 index 00000000000..2bcf4f25353 --- /dev/null +++ b/src/test/isolation2/output/uao/test_pg_appendonly_version.source @@ -0,0 +1,63 @@ +-- Validate pg_appendonly.version functioning by unique index creation +-- as it requires pg_appendonly.version >= AORelationVersion_CB2. 
+ +create table @amname@_version_tbl (a int) using @amname@; +CREATE + +-- unique index on AO is supported starting from version 2 (AORelationVersion_CB2) +select version from pg_appendonly where relid = '@amname@_version_tbl'::regclass; + version +--------- + 2 +(1 row) +create unique index on @amname@_version_tbl(a); +CREATE +insert into @amname@_version_tbl select generate_series(1, 10); +INSERT 10 +create unique index on @amname@_version_tbl(a); +CREATE +set enable_seqscan = off; +SET +select * from @amname@_version_tbl where a = 2; + a +--- + 2 +(1 row) + +-- modify pg_appendonly.version to older such like 1 (AORelationVersion_GP6) +set allow_system_table_mods = on; +SET +update pg_appendonly set version = 1 where relid = '@amname@_version_tbl'::regclass; +UPDATE 1 + +-- unique index on AO isn't supported on version < AORelationVersion_CB2 +select version from pg_appendonly where relid = '@amname@_version_tbl'::regclass; + version +--------- + 1 +(1 row) +create unique index on @amname@_version_tbl(a); +ERROR: append-only tables with older relation versions do not support unique indexes +DETAIL: version found = 1, minimum version required = 2 +HINT: ALTER TABLE SET WITH (REORGANIZE = true) before creating the unique index + +-- alter table with reorganize to verify pg_appendonly being rewritten +alter table @amname@_version_tbl set with (reorganize = true); +ALTER +select version from pg_appendonly where relid = '@amname@_version_tbl'::regclass; + version +--------- + 2 +(1 row) +create unique index on @amname@_version_tbl(a); +CREATE +select * from @amname@_version_tbl where a = 3; + a +--- + 3 +(1 row) + +drop table @amname@_version_tbl; +DROP +reset allow_system_table_mods; +RESET diff --git a/src/test/isolation2/output/uao/vacuum_cleanup.source b/src/test/isolation2/output/uao/vacuum_cleanup.source index 920ddfaf530..b90c82bfd68 100644 --- a/src/test/isolation2/output/uao/vacuum_cleanup.source +++ b/src/test/isolation2/output/uao/vacuum_cleanup.source @@ 
-46,14 +46,13 @@ VACUUM 1: select age(relfrozenxid), regexp_replace(replace(relname, 'ao_@amname@_vacuum_cleanup2'::regclass::oid::text, ''), 'ao.*seg', '') from gp_dist_random('pg_class') where relkind in ('r','t','o','b','M') and (relname like '%' || 'ao_@amname@_vacuum_cleanup2'::regclass::oid || '%') and gp_segment_id = 0; age | regexp_replace -----+-------------------- - 2 | pg__ - 2 | pg_aovisimap_ + 1 | pg__ + 1 | pg_aovisimap_ (2 rows) 2: end; END --- Check that drop phase is skipped, but still the cleanup phase is performed --- when there are concurrent serializable transactions +-- Check that drop phase is not skipped, when there are concurrent serializable transactions. 1: create table ao_@amname@_vacuum_cleanup3(a int, b int) using @amname@; CREATE 1: insert into ao_@amname@_vacuum_cleanup3 select i, i from generate_series(1, 100) i; @@ -78,9 +77,9 @@ DELETE 100 -- Wait till compaction phase is completed and only then start the serializable -- transaction to ensure that only drop phase runs after the serializable --- transaction started. Because the transaction is holding a lock on the table, --- the drop phase cannot upgrade to AccessExclusiveLock and so recycling the --- segfiles is skipped. +-- transaction started. For the new VACUUM strategy, because no need to upgrade +-- to AccessExclusiveLock for drop phase, and the transaction would not access +-- AWAITING_DROP segfiles, dropping the dead segfiles could proceed accordingly. 2: select gp_wait_until_triggered_fault('vacuum_relation_open_relation_during_drop_phase', 1, 1); gp_wait_until_triggered_fault ------------------------------- @@ -107,20 +106,357 @@ VACUUM 2 | pg_aovisimap_ (2 rows) --- Validate that the drop phase was skipped. segfile 1 should be in state 2 --- (AWAITING_DROP) +-- Validate that the drop phase wasn't skipped in the new VACUUM strategy, +-- segfile 1 should be in state 1 (AWAITING_DEFAULT). 
This is because +-- no need to acquire AccessExclusiveLock at dead segments recycling hence +-- the cleanup routine could be performed as is. -- This result is related to data distribution. -- Current hash algorithm is jump-consistent-hash. 1: SELECT * FROM gp_ao_or_aocs_seg('ao_@amname@_vacuum_cleanup3'); segment_id | segno | tupcount | modcount | formatversion | state ------------+-------+----------+----------+---------------+------- - 2 | 1 | 25 | 2 | 3 | 2 + 0 | 1 | 0 | 2 | 3 | 1 + 0 | 2 | 0 | 0 | 3 | 1 + 2 | 1 | 0 | 2 | 3 | 1 2 | 2 | 0 | 0 | 3 | 1 - 1 | 1 | 37 | 2 | 3 | 2 + 1 | 1 | 0 | 2 | 3 | 1 1 | 2 | 0 | 0 | 3 | 1 - 0 | 1 | 38 | 2 | 3 | 2 - 0 | 2 | 0 | 0 | 3 | 1 (6 rows) 2: commit; COMMIT + +1q: ... +2q: ... + +-- Test VACUUM with concurrent readers: +-- a) if reader transaction started before VACUUM, VACUUM should not drop the AWAITING_DROP segment +-- which was being accessed by the reader; +-- b) if reader transaction started after VACUUM, VACUUM should drop the AWAITING_DROP segments. + +create or replace function show_aoseg(tabname text) returns table (segno int, tupcount bigint, modcount bigint, formatversion smallint, state smallint) as $$ declare tabrelid oid; /* in func */ tabsegrelid oid; /* in func */ tabsegrelname text; /* in func */ begin select tabname::regclass::oid into tabrelid; /* in func */ select segrelid from pg_appendonly where relid = tabrelid into tabsegrelid; /* in func */ select relname from pg_class where oid = tabsegrelid into tabsegrelname; /* in func */ +return query execute 'select segno,tupcount,modcount,formatversion,state from pg_aoseg.'
|| tabsegrelname; /* in func */ end; /* in func */ $$ language plpgsql; +CREATE + +create table vacuum_concurrent_test_@amname@ (a int, b int, c int) using @amname@; +CREATE +insert into vacuum_concurrent_test_@amname@ select 2, b, b from generate_series(1, 11) b; +INSERT 11 +create index i_b_vacuum_concurrent_reader_@amname@ on vacuum_concurrent_test_@amname@(b); +CREATE +update vacuum_concurrent_test_@amname@ set b = b + 1; +UPDATE 11 + +-- expect segment state is DEFAULT (state == 1) +0U: select * from show_aoseg('vacuum_concurrent_test_@amname@') order by segno; + segno | tupcount | modcount | formatversion | state +-------+----------+----------+---------------+------- + 1 | 22 | 2 | 3 | 1 +(1 row) + +-- start a reader before VACUUM +1: begin; +BEGIN +1: select * from vacuum_concurrent_test_@amname@ where b = 10; + a | b | c +---+----+--- + 2 | 10 | 9 +(1 row) + +vacuum vacuum_concurrent_test_@amname@; +VACUUM + +-- expect to see AWAITING_DROP(state == 2) tuple +0U: select * from show_aoseg('vacuum_concurrent_test_@amname@') order by segno; + segno | tupcount | modcount | formatversion | state +-------+----------+----------+---------------+------- + 1 | 22 | 2 | 3 | 2 + 2 | 11 | 0 | 3 | 1 +(2 rows) + +1: select * from vacuum_concurrent_test_@amname@ where b = 5; + a | b | c +---+---+--- + 2 | 5 | 4 +(1 row) +1: select * from vacuum_concurrent_test_@amname@ order by c; + a | b | c +---+----+---- + 2 | 2 | 1 + 2 | 3 | 2 + 2 | 4 | 3 + 2 | 5 | 4 + 2 | 6 | 5 + 2 | 7 | 6 + 2 | 8 | 7 + 2 | 9 | 8 + 2 | 10 | 9 + 2 | 11 | 10 + 2 | 12 | 11 +(11 rows) +1: end; +END + +-- start another reader after VACUUM +2: begin; +BEGIN +2: select * from vacuum_concurrent_test_@amname@ where c = 2; + a | b | c +---+---+--- + 2 | 3 | 2 +(1 row) + +-- expect to see AWAITING_DROP(state == 2) tuple for unable to drop the dead segment by the first VACUUM +0U: select * from show_aoseg('vacuum_concurrent_test_@amname@') order by segno; + segno | tupcount | modcount | formatversion | state 
+-------+----------+----------+---------------+------- + 1 | 22 | 2 | 3 | 2 + 2 | 11 | 0 | 3 | 1 +(2 rows) + +vacuum vacuum_concurrent_test_@amname@; +VACUUM + +-- expect no AWAITING_DROP(state == 2) tuple because it was dropped by the last VACUUM +0U: select * from show_aoseg('vacuum_concurrent_test_@amname@') order by segno; + segno | tupcount | modcount | formatversion | state +-------+----------+----------+---------------+------- + 1 | 0 | 2 | 3 | 1 + 2 | 11 | 0 | 3 | 1 +(2 rows) + +2: select * from vacuum_concurrent_test_@amname@ where b = 7; + a | b | c +---+---+--- + 2 | 7 | 6 +(1 row) +2: select * from vacuum_concurrent_test_@amname@ order by c; + a | b | c +---+----+---- + 2 | 2 | 1 + 2 | 3 | 2 + 2 | 4 | 3 + 2 | 5 | 4 + 2 | 6 | 5 + 2 | 7 | 6 + 2 | 8 | 7 + 2 | 9 | 8 + 2 | 10 | 9 + 2 | 11 | 10 + 2 | 12 | 11 +(11 rows) +2: end; +END + +1q: ... +2q: ... + +-- Test VACUUM with concurrent writer. +-- There was a concurrent insert transaction started prior to VACUUM, VACUUM should not mark +-- the target segment which was also being written by the concurrent writer to AWAITING_DROP, and +-- the corresponding index entries should not be deleted.
+ +truncate table vacuum_concurrent_test_@amname@; +TRUNCATE +insert into vacuum_concurrent_test_@amname@ select 2, b, b from generate_series(1, 5) b; +INSERT 5 +delete from vacuum_concurrent_test_@amname@; +DELETE 5 + +1: begin; +BEGIN +1: insert into vacuum_concurrent_test_@amname@ select 2, b, b from generate_series(6, 10) b; +INSERT 5 + +2: vacuum vacuum_concurrent_test_@amname@; +VACUUM + +1: commit; +COMMIT + +set enable_seqscan = on; +SET +-- start_ignore +explain (costs off) select * from vacuum_concurrent_test_@amname@ order by b; + QUERY PLAN +------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) + Merge Key: b + -> Sort + Sort Key: b + -> Seq Scan on vacuum_concurrent_test_@amname@ + Optimizer: Postgres query optimizer +(6 rows) +-- end_ignore +select * from vacuum_concurrent_test_@amname@ order by b; + a | b | c +---+----+---- + 2 | 6 | 6 + 2 | 7 | 7 + 2 | 8 | 8 + 2 | 9 | 9 + 2 | 10 | 10 +(5 rows) + +-- expect all bitmapindexscan results are consistent with above seqscan +set enable_seqscan = off; +SET +-- start_ignore +explain (costs off) select * from vacuum_concurrent_test_@amname@ where b = b order by b; + QUERY PLAN +---------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) + Merge Key: b + -> Sort + Sort Key: b + -> Bitmap Heap Scan on vacuum_concurrent_test_@amname@ + Recheck Cond: (b IS NOT NULL) + -> Bitmap Index Scan on i_b_vacuum_concurrent_reader_@amname@ + Index Cond: (b IS NOT NULL) + Optimizer: Postgres query optimizer +(9 rows) +-- end_ignore +select * from vacuum_concurrent_test_@amname@ where b = b order by b; + a | b | c +---+----+---- + 2 | 6 | 6 + 2 | 7 | 7 + 2 | 8 | 8 + 2 | 9 | 9 + 2 | 10 | 10 +(5 rows) + +-- expose dead tuples +set gp_select_invisible = true; +SET + +set enable_seqscan = on; +SET +-- start_ignore +explain (costs off) select * from vacuum_concurrent_test_@amname@ order by b; + QUERY PLAN 
+------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) + Merge Key: b + -> Sort + Sort Key: b + -> Seq Scan on vacuum_concurrent_test_@amname@ + Optimizer: Postgres query optimizer +(6 rows) +-- end_ignore +select * from vacuum_concurrent_test_@amname@ order by b; + a | b | c +---+----+---- + 2 | 1 | 1 + 2 | 2 | 2 + 2 | 3 | 3 + 2 | 4 | 4 + 2 | 5 | 5 + 2 | 6 | 6 + 2 | 7 | 7 + 2 | 8 | 8 + 2 | 9 | 9 + 2 | 10 | 10 +(10 rows) + +-- expect all bitmapindexscan results are same as above seqscan +set enable_seqscan = off; +SET +-- start_ignore +explain (costs off) select * from vacuum_concurrent_test_@amname@ where b = b order by b; + QUERY PLAN +---------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) + Merge Key: b + -> Sort + Sort Key: b + -> Bitmap Heap Scan on vacuum_concurrent_test_@amname@ + Recheck Cond: (b IS NOT NULL) + -> Bitmap Index Scan on i_b_vacuum_concurrent_reader_@amname@ + Index Cond: (b IS NOT NULL) + Optimizer: Postgres query optimizer +(9 rows) +-- end_ignore +select * from vacuum_concurrent_test_@amname@ where b = b order by b; + a | b | c +---+----+---- + 2 | 1 | 1 + 2 | 2 | 2 + 2 | 3 | 3 + 2 | 4 | 4 + 2 | 5 | 5 + 2 | 6 | 6 + 2 | 7 | 7 + 2 | 8 | 8 + 2 | 9 | 9 + 2 | 10 | 10 +(10 rows) + +-- vacuum again without concurrent reader/writer, expect above dead tuples could be removed +2: vacuum vacuum_concurrent_test_@amname@; +VACUUM + +set enable_seqscan = on; +SET +-- start_ignore +explain (costs off) select * from vacuum_concurrent_test_@amname@ order by b; + QUERY PLAN +------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) + Merge Key: b + -> Sort + Sort Key: b + -> Seq Scan on vacuum_concurrent_test_@amname@ + Optimizer: Postgres query optimizer +(6 rows) +-- end_ignore +select * from vacuum_concurrent_test_@amname@ order by b; + a | b | c +---+----+---- + 2 | 6 | 6 + 2 | 7 | 7 + 2 | 8 | 8 + 2 | 9 | 9 + 
2 | 10 | 10 +(5 rows) + +-- expect all bitmapindexscan results are same as seqscan +set enable_seqscan = off; +SET +-- start_ignore +explain (costs off) select * from vacuum_concurrent_test_@amname@ where b = b order by b; + QUERY PLAN +---------------------------------------------------------------------------- + Gather Motion 3:1 (slice1; segments: 3) + Merge Key: b + -> Sort + Sort Key: b + -> Bitmap Heap Scan on vacuum_concurrent_test_@amname@ + Recheck Cond: (b IS NOT NULL) + -> Bitmap Index Scan on i_b_vacuum_concurrent_reader_@amname@ + Index Cond: (b IS NOT NULL) + Optimizer: Postgres query optimizer +(9 rows) +-- end_ignore +select * from vacuum_concurrent_test_@amname@ where b = b order by b; + a | b | c +---+----+---- + 2 | 6 | 6 + 2 | 7 | 7 + 2 | 8 | 8 + 2 | 9 | 9 + 2 | 10 | 10 +(5 rows) + +1q: ... +2q: ... + +reset enable_seqscan; +RESET +reset gp_select_invisible; +RESET + +drop table vacuum_concurrent_test_@amname@; +DROP +drop function show_aoseg; +DROP diff --git a/src/test/isolation2/output/uao/vacuum_index_stats.source b/src/test/isolation2/output/uao/vacuum_index_stats.source new file mode 100644 index 00000000000..ac47b47913f --- /dev/null +++ b/src/test/isolation2/output/uao/vacuum_index_stats.source @@ -0,0 +1,107 @@ +-- This is intended to test a new behavior of VACUUM AO/CO enhancement. +-- The enhancement introduced a new strategy to improve performance by +-- vacuuming indexes based on the collected AWAITING_DROP segment files, +-- instead of reading AO/CO visibility map catalog for every index tuple. +-- This behavior would lead to the index->reltuples being updated only when +-- AWAITING_DROP segment is greater than 0, which requires compaction during +-- VACUUM. If no compaction happens, even if dead tuples were deleted, +-- index->reltuples wouldn't get updated accordingly, which could generate +-- difference between table->reltuples and index->reltuples.
That is supposed +-- to be fine in most cases since bloating size of indexes is limited in +-- the scope of gp_appendonly_compaction_threshold percentage of total tuples. +-- The new strategy would not impact table->reltuples updates. + +create table vacuum_index_stats_@amname@ (a int, b int, c int) using @amname@; +CREATE +insert into vacuum_index_stats_@amname@ select 2, b, b from generate_series(1, 11) b; +INSERT 11 +create index i_b_vacuum_index_stats_@amname@ on vacuum_index_stats_@amname@(b); +CREATE + +set gp_appendonly_compaction_threshold = 10; +SET +analyze vacuum_index_stats_@amname@; +ANALYZE + +-- expect reltuples == 11 +0U: select reltuples from pg_class where relname = 'vacuum_index_stats_@amname@'; + reltuples +----------- + 11 +(1 row) +-- expect reltuples == 11 +0U: select reltuples from pg_class where relname = 'i_b_vacuum_index_stats_@amname@'; + reltuples +----------- + 11 +(1 row) + +-- delete one tuple +delete from vacuum_index_stats_@amname@ where c = 1; +DELETE 1 +vacuum vacuum_index_stats_@amname@; +VACUUM + +-- hideRatio = hiddenTupcount / totalTupcount * 100 = 1 / 11 * 100 = 9% +-- less than gp_appendonly_compaction_threshold (10%), no compaction would happen +-- during vacuum, expect no change in reltuples of the index but decrease 1 in +-- reltuples of the table. 
+ +-- expect reltuples == 10 +0U: select reltuples from pg_class where relname = 'vacuum_index_stats_@amname@'; + reltuples +----------- + 10 +(1 row) +-- expect reltuples == 11 for no compaction happened +0U: select reltuples from pg_class where relname = 'i_b_vacuum_index_stats_@amname@'; + reltuples +----------- + 11 +(1 row) + +analyze vacuum_index_stats_@amname@; +ANALYZE + +-- expect reltuples == 10 +0U: select reltuples from pg_class where relname = 'vacuum_index_stats_@amname@'; + reltuples +----------- + 10 +(1 row) +-- expect reltuples == 10 +0U: select reltuples from pg_class where relname = 'i_b_vacuum_index_stats_@amname@'; + reltuples +----------- + 10 +(1 row) + +-- delete two tuples +delete from vacuum_index_stats_@amname@ where c < 4; +DELETE 2 +vacuum vacuum_index_stats_@amname@; +VACUUM + +-- hideRatio = hiddenTupcount / totalTupcount * 100 = 2 / 10 * 100 = 20% +-- greater than gp_appendonly_compaction_threshold (10%), compaction would happen +-- during vacuum, expect changes in reltuples for both index and table. + +-- expect reltuples == 8 +0U: select reltuples from pg_class where relname = 'vacuum_index_stats_@amname@'; + reltuples +----------- + 8 +(1 row) +-- expect reltuples == 8 for compaction happened +0U: select reltuples from pg_class where relname = 'i_b_vacuum_index_stats_@amname@'; + reltuples +----------- + 8 +(1 row) + +0Uq: ... 
+ +drop table vacuum_index_stats_@amname@; +DROP +reset gp_appendonly_compaction_threshold; +RESET diff --git a/src/test/isolation2/sql/add_column_after_vacuum_skip_drop_column.sql b/src/test/isolation2/sql/add_column_after_vacuum_skip_drop_column.sql index f1db7898cfb..2c56d0eea69 100644 --- a/src/test/isolation2/sql/add_column_after_vacuum_skip_drop_column.sql +++ b/src/test/isolation2/sql/add_column_after_vacuum_skip_drop_column.sql @@ -1,5 +1,7 @@ -- @Description Ensures that an ALTER TABLE ADD COLUMN will drop segfiles in --- AOSEG_STATE_AWAITING_DROP state left over by a previous vacuum +-- AOSEG_STATE_AWAITING_DROP state left over by a previous vacuum. +-- We removed recycling dead segfiles from ADD COLUMN workflow, so +-- the test expected result were adjusted accordingly. -- CREATE TABLE aoco_add_column_after_vacuum_skip_drop (a INT, b INT) WITH (appendonly=true, orientation=column); INSERT INTO aoco_add_column_after_vacuum_skip_drop SELECT i as a, i as b FROM generate_series(1, 10) AS i; diff --git a/src/test/isolation2/sql/ao_blkdir.sql b/src/test/isolation2/sql/ao_blkdir.sql new file mode 100644 index 00000000000..f7e3074dba3 --- /dev/null +++ b/src/test/isolation2/sql/ao_blkdir.sql @@ -0,0 +1,201 @@ +-- White-box tests asserting composition of AO/CO block directory entries. +-- All tuples are directed to seg0 and each INSERT has an increasing row count +-- to make their identification easy. 
+ +-------------------------------------------------------------------------------- +-- AO tables +-------------------------------------------------------------------------------- + +CREATE TABLE ao_blkdir_test(i int, j int) USING ao_row DISTRIBUTED BY (j); +CREATE INDEX ao_blkdir_test_idx ON ao_blkdir_test(i); + +1: INSERT INTO ao_blkdir_test SELECT i, 2 FROM generate_series(1, 10) i; +-- There should be 1 block directory row with a single entry covering 10 rows +SELECT (gp_toolkit.__gp_aoblkdir('ao_blkdir_test')).* FROM gp_dist_random('gp_id') + WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + +1: INSERT INTO ao_blkdir_test SELECT i, 2 FROM generate_series(11, 30) i; +-- There should be 2 block directory entries in a new block directory row, and +-- the row from the previous INSERT should not be visible. The entry from the +-- first INSERT should remain unchanged. +SELECT (gp_toolkit.__gp_aoblkdir('ao_blkdir_test')).* FROM gp_dist_random('gp_id') +WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + +1: BEGIN; +1: INSERT INTO ao_blkdir_test SELECT i, 2 FROM generate_series(31, 60) i; +2: BEGIN; +2: INSERT INTO ao_blkdir_test SELECT i, 2 FROM generate_series(71, 110) i; +1: COMMIT; +2: COMMIT; +-- The second INSERT of 40 rows above would have landed in segfile 1 (unlike +-- segfile 0, like the first INSERT of 30 rows above). This should be reflected +-- in the block directory entries for these rows. +SELECT (gp_toolkit.__gp_aoblkdir('ao_blkdir_test')).* FROM gp_dist_random('gp_id') +WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + +TRUNCATE ao_blkdir_test; +-- Insert enough rows to overflow the first block directory minipage by 2. +INSERT INTO ao_blkdir_test SELECT i, 2 FROM generate_series(1, 292700) i; +-- There should be 2 block directory rows, one with 161 entries covering 292698 +-- rows and the other with 1 entry covering the 2 overflow rows. 
+SELECT (gp_toolkit.__gp_aoblkdir('ao_blkdir_test')).* FROM gp_dist_random('gp_id') +WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + +-- Unique index white box tests +DROP TABLE ao_blkdir_test; +CREATE TABLE ao_blkdir_test(i int UNIQUE, j int) USING ao_row DISTRIBUTED BY (i); + +SELECT gp_inject_fault('appendonly_insert', 'suspend', '', '', 'ao_blkdir_test', 1, 1, 0, dbid) +FROM gp_segment_configuration WHERE role = 'p' AND content = 0; +1: BEGIN; +1&: INSERT INTO ao_blkdir_test VALUES (2, 2); + +-- There should be a placeholder row inserted to cover the rows for each INSERT +-- session, before we insert the 1st row in that session, that is only visible +-- to SNAPSHOT_DIRTY. +SELECT gp_wait_until_triggered_fault('appendonly_insert', 1, dbid) +FROM gp_segment_configuration WHERE role = 'p' AND content = 0; +SELECT (gp_toolkit.__gp_aoblkdir('ao_blkdir_test')).* FROM gp_dist_random('gp_id') +WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; +SET gp_select_invisible TO ON; +SELECT (gp_toolkit.__gp_aoblkdir('ao_blkdir_test')).* FROM gp_dist_random('gp_id') +WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; +RESET gp_select_invisible; + +-- The placeholder row is invisible to other transactions (that don't perform a +-- uniqueness check) while the INSERT is in progress. +2: SELECT (gp_toolkit.__gp_aoblkdir('ao_blkdir_test')).* FROM gp_dist_random('gp_id') + WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + +SELECT gp_inject_fault('appendonly_insert', 'reset', dbid) +FROM gp_segment_configuration WHERE role = 'p' AND content = 0; +1<: + +-- The placeholder row is invisible to the INSERTing transaction. Since the +-- INSERT finished, there should be 1 visible blkdir row representing the INSERT. +1: SELECT (gp_toolkit.__gp_aoblkdir('ao_blkdir_test')).* FROM gp_dist_random('gp_id') +WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + +-- The placeholder row is invisible to other transactions (that don't perform a +-- uniqueness check) even after the INSERT finishes. 
The blkdir row representing +-- the INSERT should not be visible as the INSERTing transaction hasn't +-- committed yet. +2: SELECT (gp_toolkit.__gp_aoblkdir('ao_blkdir_test')).* FROM gp_dist_random('gp_id') +WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + +1: COMMIT; + +-- The placeholder row is invisible to other transactions (that don't perform a +-- uniqueness check) even after the INSERTing transaction commits. Since the +-- INSERTing transaction has committed, the blkdir row representing the INSERT +-- should be visible now. +2: SELECT (gp_toolkit.__gp_aoblkdir('ao_blkdir_test')).* FROM gp_dist_random('gp_id') +WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + +DROP TABLE ao_blkdir_test; + +-------------------------------------------------------------------------------- +-- AOCO tables +-------------------------------------------------------------------------------- + +CREATE TABLE aoco_blkdir_test(i int, j int) USING ao_column DISTRIBUTED BY (j); +CREATE INDEX aoco_blkdir_test_idx ON aoco_blkdir_test(i); + +1: INSERT INTO aoco_blkdir_test SELECT i, 2 FROM generate_series(1, 10) i; +-- There should be 2 block directory rows with a single entry covering 10 rows, +-- (1 for each column). +SELECT (gp_toolkit.__gp_aoblkdir('aoco_blkdir_test')).* FROM gp_dist_random('gp_id') +WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + +1: INSERT INTO aoco_blkdir_test SELECT i, 2 FROM generate_series(11, 30) i; +-- There should be 2 block directory rows, carrying 2 entries each. The rows +-- from the previous INSERT should not be visible. The entries from the first +-- INSERT should remain unchanged. 
+SELECT (gp_toolkit.__gp_aoblkdir('aoco_blkdir_test')).* FROM gp_dist_random('gp_id')
+WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5;
+
+1: BEGIN;
+1: INSERT INTO aoco_blkdir_test SELECT i, 2 FROM generate_series(31, 60) i;
+2: BEGIN;
+2: INSERT INTO aoco_blkdir_test SELECT i, 2 FROM generate_series(71, 110) i;
+1: COMMIT;
+2: COMMIT;
+-- The second INSERT of 40 rows above would have landed in segfile 1 (unlike
+-- segfile 0, like the first INSERT of 30 rows above). This should be reflected
+-- in the block directory entries for these rows.
+SELECT (gp_toolkit.__gp_aoblkdir('aoco_blkdir_test')).* FROM gp_dist_random('gp_id')
+WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5;
+
+TRUNCATE aoco_blkdir_test;
+-- Insert enough rows to overflow the first block directory minipage by 2.
+INSERT INTO aoco_blkdir_test SELECT i, 2 FROM generate_series(1, 1317143) i;
+-- There should be 4 block directory rows, 2 for each column: one with 161
+-- entries covering 1317141 rows and the other with 1 entry covering the 2
+-- overflow rows.
+SELECT (gp_toolkit.__gp_aoblkdir('aoco_blkdir_test')).* FROM gp_dist_random('gp_id')
+WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5;
+
+-- Unique index white box tests
+DROP TABLE aoco_blkdir_test;
+CREATE TABLE aoco_blkdir_test(h int, i int UNIQUE, j int) USING ao_column DISTRIBUTED BY (i);
+
+SELECT gp_inject_fault('appendonly_insert', 'suspend', '', '', 'aoco_blkdir_test', 1, 1, 0, dbid)
+FROM gp_segment_configuration WHERE role = 'p' AND content = 0;
+1: BEGIN;
+1&: INSERT INTO aoco_blkdir_test VALUES (2, 2, 2);
+
+-- There should be a placeholder row inserted to cover the rows for each INSERT
+-- session (for the first non-dropped column), before we insert the 1st row in
+-- that session, that is only visible to SNAPSHOT_DIRTY.
+SELECT gp_wait_until_triggered_fault('appendonly_insert', 1, dbid) +FROM gp_segment_configuration WHERE role = 'p' AND content = 0; +SELECT (gp_toolkit.__gp_aoblkdir('aoco_blkdir_test')).* FROM gp_dist_random('gp_id') +WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; +SET gp_select_invisible TO ON; +SELECT (gp_toolkit.__gp_aoblkdir('aoco_blkdir_test')).* FROM gp_dist_random('gp_id') +WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; +RESET gp_select_invisible; + +-- The placeholder row is invisible to other transactions (that don't perform a +-- uniqueness check) while the INSERT is in progress. +2: SELECT (gp_toolkit.__gp_aoblkdir('aoco_blkdir_test')).* FROM gp_dist_random('gp_id') + WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + +-- Before the INSERT commits, if we try to drop column 'h', for which the +-- placeholder row was created, the session will block (locking). So it is +-- perfectly safe to use 1 placeholder row (and not have 1 placeholder/column) +3&: ALTER TABLE aoco_blkdir_test DROP COLUMN h; + +SELECT gp_inject_fault('appendonly_insert', 'reset', dbid) +FROM gp_segment_configuration WHERE role = 'p' AND content = 0; +1<: + +-- The placeholder row is invisible to the INSERTing transaction. Since the +-- INSERT finished, there should be 3 visible blkdir rows representing the +-- INSERT, 1 for each column. +1: SELECT (gp_toolkit.__gp_aoblkdir('aoco_blkdir_test')).* FROM gp_dist_random('gp_id') + WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + +-- The placeholder row is invisible to other transactions (that don't perform a +-- uniqueness check) even after the INSERT finishes. The blkdir rows representing +-- the INSERT should not be visible as the INSERTing transaction hasn't +-- committed yet. 
+2: SELECT (gp_toolkit.__gp_aoblkdir('aoco_blkdir_test')).* FROM gp_dist_random('gp_id') + WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + +1: COMMIT; + +-- The placeholder row is invisible to other transactions (that don't perform a +-- uniqueness check) even after the INSERTing transaction commits. Since the +-- INSERTing transaction has committed, the blkdir rows representing the INSERT +-- should be visible now. +2: SELECT (gp_toolkit.__gp_aoblkdir('aoco_blkdir_test')).* FROM gp_dist_random('gp_id') + WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; + +-- Now even though the DROP COLUMN has finished, we would still be able to +-- properly resolve uniqueness checks (by consulting the first non-dropped +-- column's block directory row). +3<: +4: INSERT INTO aoco_blkdir_test VALUES (2, 2); + +DROP TABLE aoco_blkdir_test; diff --git a/src/test/isolation2/sql/ao_unique_index.sql b/src/test/isolation2/sql/ao_unique_index.sql new file mode 100644 index 00000000000..68c2faad128 --- /dev/null +++ b/src/test/isolation2/sql/ao_unique_index.sql @@ -0,0 +1,378 @@ +-- Tests to ensure that unique indexes work as expected w/ ao_row tables. + +-- We use a replicated table to test each table for ease in testing edge cases +-- where conflicts arise at block directory boundaries. We can treat the table +-- as if it were being populated in utility mode on a single segment, allowing +-- us to predict block directory entries without having to worry about the +-- table's distribution. 
+ +-- Case 1: Conflict with committed transaction---------------------------------- +CREATE TABLE unique_index_ao_row (a INT unique) USING ao_row + DISTRIBUTED REPLICATED; +INSERT INTO unique_index_ao_row SELECT * FROM generate_series(1, 329729); +-- should conflict +INSERT INTO unique_index_ao_row VALUES (1); +INSERT INTO unique_index_ao_row VALUES (329729); +-- should not conflict +INSERT INTO unique_index_ao_row VALUES (329730); +DROP TABLE unique_index_ao_row; + +-- Case 2: Conflict within the same transaction--------------------------------- +CREATE TABLE unique_index_ao_row (a INT unique) USING ao_row + DISTRIBUTED REPLICATED; +BEGIN; +INSERT INTO unique_index_ao_row SELECT * FROM generate_series(1, 329729); +-- should conflict +INSERT INTO unique_index_ao_row VALUES (1); +END; +DROP TABLE unique_index_ao_row; + +CREATE TABLE unique_index_ao_row (a INT unique) USING ao_row + DISTRIBUTED REPLICATED; +BEGIN; +INSERT INTO unique_index_ao_row SELECT * FROM generate_series(1, 329729); +-- should conflict +INSERT INTO unique_index_ao_row VALUES (329729); +END; +DROP TABLE unique_index_ao_row; + +CREATE TABLE unique_index_ao_row (a INT unique) USING ao_row + DISTRIBUTED REPLICATED; +BEGIN; +INSERT INTO unique_index_ao_row SELECT * FROM generate_series(1, 329729); +-- should not conflict +INSERT INTO unique_index_ao_row VALUES (329730); +END; +DROP TABLE unique_index_ao_row; + +-- Case 3: Conflict with aborted transaction is not a conflict------------------ +CREATE TABLE unique_index_ao_row (a INT unique) USING ao_row + DISTRIBUTED REPLICATED; +BEGIN; +INSERT INTO unique_index_ao_row SELECT * FROM generate_series(1, 329729); +ABORT; +-- should not conflict +INSERT INTO unique_index_ao_row VALUES (1); +INSERT INTO unique_index_ao_row VALUES (329729); +INSERT INTO unique_index_ao_row VALUES (329730); +DROP TABLE unique_index_ao_row; + +-- Case 4: Conflict with to-be-committed transaction---------------------------- +-- +-- 1. 
Uncommitted tx 1 has inserted non-conflicting key = 0.
+-- 2. Uncommitted tx 2 has inserted (161 * 2048 + 1 = 329729 rows), which spans
+-- 2 block directory rows (1st row: [1,329728] ; 2nd row: [329729,329729])
+-- 3. Tx 3 tries to insert conflicting key = 1, which maps to the first rownum
+-- covered by the 1st block directory row of seg 1, and blocks on tx 2.
+-- 4. Tx 4 tries to insert conflicting key = 329728, which maps to the last
+-- rownum covered by the 1st block directory row of seg 1, and blocks on tx 2.
+-- 5. Tx 5 tries to insert conflicting key = 329729, which maps to the first
+-- rownum covered by the 2nd block directory row of seg 1, and blocks on tx 2.
+-- 6. Tx 6 tries to insert non-conflicting key = 329730 and is immediately
+-- successful.
+-- 7. Tx 2 commits
+-- 8. Txs 3,4,5 report unique constraint violation
+-- 9. Tx 1 commits
+--
+CREATE TABLE unique_index_ao_row (a INT unique) USING ao_row
+ DISTRIBUTED REPLICATED;
+1: BEGIN;
+1: INSERT INTO unique_index_ao_row VALUES (0);
+2: BEGIN;
+2: INSERT INTO unique_index_ao_row SELECT * FROM generate_series(1, 329729);
+3&: INSERT INTO unique_index_ao_row VALUES (1);
+4&: INSERT INTO unique_index_ao_row VALUES (329728);
+5&: INSERT INTO unique_index_ao_row VALUES (329729);
+-- should succeed immediately
+6: INSERT INTO unique_index_ao_row VALUES (329730);
+2: COMMIT;
+3<:
+4<:
+5<:
+1: COMMIT;
+DROP TABLE unique_index_ao_row;
+
+-- Case 5: Conflict with to-be-aborted transaction------------------------------
+--
+-- 1. Uncommitted tx 1 has inserted non-conflicting key = 0.
+-- 2. Uncommitted tx 2 has inserted (161 * 2048 + 1 = 329729 rows), which spans
+-- 2 block directory rows (1st row: [1,329728] ; 2nd row: [329729,329729])
+-- 3. Tx 3 tries to insert conflicting key = 1, which maps to the first rownum
+-- covered by the 1st block directory row of seg 1, and blocks on tx 2.
+-- 4. 
Tx 4 tries to insert conflicting key = 329728, which maps to the last
+-- rownum covered by the 1st block directory row of seg 1, and blocks on tx 2.
+-- 5. Tx 5 tries to insert conflicting key = 329729, which maps to the first
+-- rownum covered by the 2nd block directory row of seg 1, and blocks on tx 2.
+-- 6. Tx 6 tries to insert non-conflicting key = 329730 and is immediately
+-- successful.
+-- 7. Tx 2 aborts
+-- 8. Txs 3,4,5 succeed, as the conflicting rows from the aborted tx 2 no
+-- longer count as conflicts
+-- 9. Tx 1 commits
+--
+CREATE TABLE unique_index_ao_row (a INT unique) USING ao_row
+ DISTRIBUTED REPLICATED;
+1: BEGIN;
+1: INSERT INTO unique_index_ao_row VALUES (0);
+2: BEGIN;
+2: INSERT INTO unique_index_ao_row SELECT * FROM generate_series(1, 329729);
+3&: INSERT INTO unique_index_ao_row VALUES (1);
+4&: INSERT INTO unique_index_ao_row VALUES (329728);
+5&: INSERT INTO unique_index_ao_row VALUES (329729);
+-- should succeed immediately
+6: INSERT INTO unique_index_ao_row VALUES (329730);
+2: ABORT;
+3<:
+4<:
+5<:
+1: COMMIT;
+DROP TABLE unique_index_ao_row;
+
+-- Case 6: Conflict with aborted rows following some committed rows ------------
+CREATE TABLE unique_index_ao_row (a INT unique) USING ao_row
+ DISTRIBUTED REPLICATED;
+-- 1. Tx 1 commits rows 1-100.
+-- 2. Tx 2 inserts rows 101-200 and then aborts.
+-- 3. Tx 3 tries to insert row in range [101,200] and is immediately successful.
+-- 4. Tx 4 tries to insert conflicting row in range [1,100] and raises unique
+-- constraint violation.
+-- 5. Tx 5 tries to insert row in range [201, ) and is immediately successful.
+1: INSERT INTO unique_index_ao_row SELECT generate_series(1, 100); +2: BEGIN; +2: INSERT INTO unique_index_ao_row SELECT generate_series(101, 200); +2: ABORT; +3: INSERT INTO unique_index_ao_row VALUES(102); +4: INSERT INTO unique_index_ao_row VALUES(2); +5: INSERT INTO unique_index_ao_row VALUES(202); +DROP TABLE unique_index_ao_row; + +-------------------------------------------------------------------------------- +----------------- More concurrent tests with fault injection ------------------ +-------------------------------------------------------------------------------- + +-- Case 7: Conflict with to-be-committed transaction while only 1 placeholder +-- row exists in the block directory-------------------------------------------- +-- +-- This case highlights the importance of the placeholder row, inserted at the +-- beginning of an INSERT command. +-- +-- 1. Uncommitted Tx 1 has inserted 3 out of its 10 rows and is suspended. +-- 2. Tx 2 inserts a conflicting row and blocks on Tx 1. +-- 3. Tx 3 inserts a non-conflicting row within the range [4,10] and is +-- immediately successful. (Index entries have been written only for [1,3] so +-- far, so conflicts shouldn't arise) +-- 4. Tx 4 inserts a non-conflicting row in range [11, ..) and should be +-- immediately successful. +-- 5. Now Tx 1 resumes and tries to insert a row in range [4,10] and reports a +-- unique constraint violation with Tx 3. +-- 6. Tx 2 succeeds as Tx 1 aborted. + +CREATE TABLE unique_index_ao_row (a INT unique) USING ao_row + DISTRIBUTED REPLICATED; +SELECT gp_inject_fault('appendonly_insert', 'suspend', '', '', 'unique_index_ao_row', 4, 4, 0, dbid) + FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; +1&: INSERT INTO unique_index_ao_row SELECT * FROM generate_series(1, 10); +-- Wait until 3 rows have been successfully inserted into the index and Tx 1 +-- is just beginning to insert the 4th row. 
+SELECT gp_wait_until_triggered_fault('appendonly_insert', 4, dbid) + FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; +2&: INSERT INTO unique_index_ao_row VALUES(2); +4: INSERT INTO unique_index_ao_row VALUES(11); +3: INSERT INTO unique_index_ao_row VALUES(4); +SELECT gp_inject_fault('appendonly_insert', 'reset', dbid) + FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; +1<: +2<: +DROP TABLE unique_index_ao_row; + +-- Case 8: Conflict with to-be-committed transaction - generalization of case 7 +-- where there are multiple minipages (and block directory rows) in play from +-- the same insert. +-- +-- This justifies why 1 placeholder row is enough and we don't need to flush a +-- placeholder row every time we insert a block directory row (i.e. start a new +-- in-memory minipage) throughout the course of a single insert. +-- +-- 1. Uncommitted Tx 1 has inserted (2048 * (161 * 2 + 1) + 3) = 661507 rows +-- and is suspended, enough rows to fill 2 entire minipages (covers +-- range [1,329728] and [329729,659456]) before suspension. +-- 2. Txs 2,3,4 inserts conflicting rows that map to the 1st minipage and block. +-- 3. Txs 5,6,7 inserts conflicting rows that map to the 2nd minipage and block. +-- 4. Tx 8 inserts a conflicting row that maps to the 3rd minipage, which is +-- currently only in-memory and it conflicts on the placeholder row and +-- blocks (showcases why 1 placeholder row is enough) +-- 5. Tx 9 inserts a non-conflicting row for which there is no index entry and +-- and is immediately successful (661510). +-- 6. Now Tx 1 resumes and tries to insert 661510 and reports a unique +-- constraint violation with Tx 9. +-- 7. All blocked Txs succeed. 
+ +CREATE TABLE unique_index_ao_row (a INT unique) USING ao_row + DISTRIBUTED REPLICATED; + +SELECT gp_inject_fault('insert_new_entry_curr_minipage_full', 'suspend', '', '', '', 2, 2, 0, dbid) +FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; +1&: INSERT INTO unique_index_ao_row SELECT generate_series(1, 661510); + +-- Wait until we have inserted (2048 * (161 * 2 + 1) + 3) = 661507 rows and we +-- are about to insert the 661508th row. +SELECT gp_wait_until_triggered_fault('insert_new_entry_curr_minipage_full', 2, dbid) +FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; +SELECT gp_inject_fault('appendonly_insert', 'suspend', '', '', 'unique_index_ao_row', 4, 4, 0, dbid) +FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; +SELECT gp_inject_fault('insert_new_entry_curr_minipage_full', 'reset', dbid) +FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; +SELECT gp_wait_until_triggered_fault('appendonly_insert', 4, dbid) +FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; + +-- maps to 1st minipage +2&: INSERT INTO unique_index_ao_row VALUES(1); +3&: INSERT INTO unique_index_ao_row VALUES(300000); +4&: INSERT INTO unique_index_ao_row VALUES(329728); +-- maps to 2nd minipage +5&: INSERT INTO unique_index_ao_row VALUES(329729); +6&: INSERT INTO unique_index_ao_row VALUES(598000); +7&: INSERT INTO unique_index_ao_row VALUES(659456); +-- maps to 3rd minipage +8&: INSERT INTO unique_index_ao_row VALUES(661507); +-- no index entry exists for it, so should not conflict. 
+9: INSERT INTO unique_index_ao_row VALUES(661510); + +SELECT gp_inject_fault('appendonly_insert', 'reset', dbid) +FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; + +1<: +2<: +3<: +4<: +5<: +6<: +7<: +8<: + +DROP TABLE unique_index_ao_row; + +-------------------------------------------------------------------------------- +--------------------------- Smoke tests for COPY ------------------------------- +-------------------------------------------------------------------------------- + +CREATE TABLE unique_index_ao_row (a INT unique) USING ao_row + DISTRIBUTED REPLICATED; + +1: BEGIN; +1: COPY unique_index_ao_row FROM PROGRAM 'seq 1 10'; +-- concurrent tx inserting conflicting row should block. +2&: COPY unique_index_ao_row FROM PROGRAM 'seq 1 1'; +-- concurrent tx inserting non-conflicting rows should be successful. +3: COPY unique_index_ao_row FROM PROGRAM 'seq 11 20'; +-- inserting a conflicting row in the same transaction should ERROR out. +1: COPY unique_index_ao_row FROM PROGRAM 'seq 1 1'; +-- now that tx 1 was aborted, tx 2 is successful. +2<: +1: END; + +DROP TABLE unique_index_ao_row; + +-------------------------------------------------------------------------------- +-------------------- Smoke tests for subtransactions --------------------------- +-------------------------------------------------------------------------------- +CREATE TABLE unique_index_ao_row (a INT unique) USING ao_row + DISTRIBUTED REPLICATED; + +1: BEGIN; +1: SAVEPOINT a; +1: INSERT INTO unique_index_ao_row VALUES(1); + +-- concurrent tx inserting conflicting row should block. +2: BEGIN; +2&: INSERT INTO unique_index_ao_row VALUES(1); +-- concurrent tx inserting non-conflicting row should be successful. +3: INSERT INTO unique_index_ao_row VALUES(2); + +-- conflict should be detected within the same subtx. +1: INSERT INTO unique_index_ao_row VALUES(1); +-- the concurrent tx should now succeed. 
+2<: +2: ABORT; + +-- after rolling back to the savepoint, we should be able to re-insert the key +1: ROLLBACK TO SAVEPOINT a; +1: INSERT INTO unique_index_ao_row VALUES(1); +1: COMMIT; + +SELECT * FROM unique_index_ao_row; + +DROP TABLE unique_index_ao_row; + +-------------------------------------------------------------------------------- +-------------------- Smoke tests for repeatable read --------------------------- +-------------------------------------------------------------------------------- + +-- Test that shows that unique index checks transcend transaction isolation +-- boundaries. + +CREATE TABLE unique_index_ao_row (a INT unique) USING ao_row + DISTRIBUTED REPLICATED; + +-- Begin two txs with tx level snapshot taken early. +1: BEGIN ISOLATION LEVEL REPEATABLE READ; +1: SELECT * FROM unique_index_ao_row; +2: BEGIN ISOLATION LEVEL REPEATABLE READ; +2: SELECT * FROM unique_index_ao_row; + +-- Now begin a concurrent transaction which inserts a key. +3: BEGIN; +3: INSERT INTO unique_index_ao_row VALUES(1); + +-- And another transaction inserts a key and commits. +INSERT INTO unique_index_ao_row VALUES(2); + +-- Tx should block on insert of conflicting key, even though it can't "see" the +-- conflicting key due to its isolation level. +1: SELECT * FROM unique_index_ao_row; +1&: INSERT INTO unique_index_ao_row VALUES(1); + +3: ABORT; +1<: +1: ABORT; + +-- Tx should raise a conflict, even though it can't "see" the conflicting key +-- due to its isolation level. 
+2: SELECT * FROM unique_index_ao_row;
+2: INSERT INTO unique_index_ao_row VALUES(2);
+2: ABORT;
+
+DROP TABLE unique_index_ao_row;
+
+
+--------------------------------------------------------------------------------
+----------------------- Smoke tests for ADD CONSTRAINT ------------------------
+--------------------------------------------------------------------------------
+CREATE TABLE unique_index_ao_row (a INT) USING ao_row
+ DISTRIBUTED REPLICATED;
+INSERT INTO unique_index_ao_row SELECT * FROM generate_series(1, 5);
+
+ALTER table unique_index_ao_row ADD CONSTRAINT a_unique UNIQUE(a);
+-- should conflict
+INSERT INTO unique_index_ao_row VALUES (1);
+ALTER table unique_index_ao_row DROP CONSTRAINT a_unique;
+
+INSERT INTO unique_index_ao_row VALUES (1);
+-- should fail
+ALTER table unique_index_ao_row ADD CONSTRAINT a_unique UNIQUE(a);
+
+DROP TABLE unique_index_ao_row;
+
+
+--------------------------------------------------------------------------------
+----------------------- Smoke tests for Multiple Key ---------------------------
+--------------------------------------------------------------------------------
+CREATE TABLE unique_index_ao_row (a INT, b INT) USING ao_row
+ DISTRIBUTED REPLICATED;
+INSERT INTO unique_index_ao_row SELECT i,i FROM generate_series(1, 5) i;
+
+CREATE UNIQUE INDEX a_b_unique ON unique_index_ao_row(a,b);
+-- should not conflict
+INSERT INTO unique_index_ao_row VALUES (1,2);
+-- should conflict
+INSERT INTO unique_index_ao_row VALUES (1,1);
+DROP TABLE unique_index_ao_row;
\ No newline at end of file
diff --git a/src/test/isolation2/sql/aocs_unique_index.sql b/src/test/isolation2/sql/aocs_unique_index.sql
new file mode 100644
index 00000000000..e2624f6da9f
--- /dev/null
+++ b/src/test/isolation2/sql/aocs_unique_index.sql
@@ -0,0 +1,378 @@
+-- Tests to ensure that unique indexes work as expected w/ ao_column tables.
+ +-- We use a replicated table to test each table for ease in testing edge cases +-- where conflicts arise at block directory boundaries. We can treat the table +-- as if it were being populated in utility mode on a single segment, allowing +-- us to predict block directory entries without having to worry about the +-- table's distribution. + +-- Case 1: Conflict with committed transaction---------------------------------- +CREATE TABLE unique_index_ao_column (a bigint unique) USING ao_column + DISTRIBUTED REPLICATED; +INSERT INTO unique_index_ao_column SELECT * FROM generate_series(1, 658491); +-- should conflict +INSERT INTO unique_index_ao_column VALUES (1); +INSERT INTO unique_index_ao_column VALUES (658491); +-- should not conflict +INSERT INTO unique_index_ao_column VALUES (658492); +DROP TABLE unique_index_ao_column; + +-- Case 2: Conflict within the same transaction--------------------------------- +CREATE TABLE unique_index_ao_column (a bigint unique) USING ao_column + DISTRIBUTED REPLICATED; +BEGIN; +INSERT INTO unique_index_ao_column SELECT * FROM generate_series(1, 658491); +-- should conflict +INSERT INTO unique_index_ao_column VALUES (1); +END; +DROP TABLE unique_index_ao_column; + +CREATE TABLE unique_index_ao_column (a bigint unique) USING ao_column + DISTRIBUTED REPLICATED; +BEGIN; +INSERT INTO unique_index_ao_column SELECT * FROM generate_series(1, 658491); +-- should conflict +INSERT INTO unique_index_ao_column VALUES (658491); +END; +DROP TABLE unique_index_ao_column; + +CREATE TABLE unique_index_ao_column (a bigint unique) USING ao_column + DISTRIBUTED REPLICATED; +BEGIN; +INSERT INTO unique_index_ao_column SELECT * FROM generate_series(1, 658491); +-- should not conflict +INSERT INTO unique_index_ao_column VALUES (658492); +END; +DROP TABLE unique_index_ao_column; + +-- Case 3: Conflict with aborted transaction is not a conflict------------------ +CREATE TABLE unique_index_ao_column (a bigint unique) USING ao_column + DISTRIBUTED REPLICATED; 
+BEGIN;
+INSERT INTO unique_index_ao_column SELECT * FROM generate_series(1, 658491);
+ABORT;
+-- should not conflict
+INSERT INTO unique_index_ao_column VALUES (1);
+INSERT INTO unique_index_ao_column VALUES (658491);
+INSERT INTO unique_index_ao_column VALUES (658492);
+DROP TABLE unique_index_ao_column;
+
+-- Case 4: Conflict with to-be-committed transaction----------------------------
+--
+-- 1. Uncommitted tx 1 has inserted non-conflicting key = 0.
+-- 2. Uncommitted tx 2 has inserted (161 * 4090 + 1 = 658491 rows), which spans
+-- 2 block directory rows (1st row: [1,658490] ; 2nd row: [658491,658491])
+-- 3. Tx 3 tries to insert conflicting key = 1, which maps to the first rownum
+-- covered by the 1st block directory row of seg 1, and blocks on tx 2.
+-- 4. Tx 4 tries to insert conflicting key = 658490, which maps to the last
+-- rownum covered by the 1st block directory row of seg 1, and blocks on tx 2.
+-- 5. Tx 5 tries to insert conflicting key = 658491, which maps to the first
+-- rownum covered by the 2nd block directory row of seg 1, and blocks on tx 2.
+-- 6. Tx 6 tries to insert non-conflicting key = 658492 and is immediately
+-- successful.
+-- 7. Tx 2 commits
+-- 8. Txs 3,4,5 report unique constraint violation
+-- 9. Tx 1 commits
+--
+CREATE TABLE unique_index_ao_column (a bigint unique) USING ao_column
+ DISTRIBUTED REPLICATED;
+1: BEGIN;
+1: INSERT INTO unique_index_ao_column VALUES (0);
+2: BEGIN;
+2: INSERT INTO unique_index_ao_column SELECT * FROM generate_series(1, 658491);
+3&: INSERT INTO unique_index_ao_column VALUES (1);
+4&: INSERT INTO unique_index_ao_column VALUES (658490);
+5&: INSERT INTO unique_index_ao_column VALUES (658491);
+-- should succeed immediately
+6: INSERT INTO unique_index_ao_column VALUES (658492);
+2: COMMIT;
+3<:
+4<:
+5<:
+1: COMMIT;
+DROP TABLE unique_index_ao_column;
+
+-- Case 5: Conflict with to-be-aborted transaction------------------------------
+--
+-- 1. 
Uncommitted tx 1 has inserted non-conflicting key = 0.
+-- 2. Uncommitted tx 2 has inserted (161 * 4090 + 1 = 658491 rows), which spans
+-- 2 block directory rows (1st row: [1,658490] ; 2nd row: [658491,658491])
+-- 3. Tx 3 tries to insert conflicting key = 1, which maps to the first rownum
+-- covered by the 1st block directory row of seg 1, and blocks on tx 2.
+-- 4. Tx 4 tries to insert conflicting key = 658490, which maps to the last
+-- rownum covered by the 1st block directory row of seg 1, and blocks on tx 2.
+-- 5. Tx 5 tries to insert conflicting key = 658491, which maps to the first
+-- rownum covered by the 2nd block directory row of seg 1, and blocks on tx 2.
+-- 6. Tx 6 tries to insert non-conflicting key = 658492 and is immediately
+-- successful.
+-- 7. Tx 2 aborts
+-- 8. Txs 3,4,5 succeed, as the conflicting rows from the aborted tx 2 no
+-- longer count as conflicts
+-- 9. Tx 1 commits
+--
+CREATE TABLE unique_index_ao_column (a bigint unique) USING ao_column
+ DISTRIBUTED REPLICATED;
+1: BEGIN;
+1: INSERT INTO unique_index_ao_column VALUES (0);
+2: BEGIN;
+2: INSERT INTO unique_index_ao_column SELECT * FROM generate_series(1, 658491);
+3&: INSERT INTO unique_index_ao_column VALUES (1);
+4&: INSERT INTO unique_index_ao_column VALUES (658490);
+5&: INSERT INTO unique_index_ao_column VALUES (658491);
+-- should succeed immediately
+6: INSERT INTO unique_index_ao_column VALUES (658492);
+2: ABORT;
+3<:
+4<:
+5<:
+1: COMMIT;
+DROP TABLE unique_index_ao_column;
+
+-- Case 6: Conflict with aborted rows following some committed rows ------------
+CREATE TABLE unique_index_ao_column (a bigint unique) USING ao_column
+ DISTRIBUTED REPLICATED;
+-- 1. Tx 1 commits rows 1-100.
+-- 2. Tx 2 inserts rows 101-200 and then aborts.
+-- 3. Tx 3 tries to insert row in range [101,200] and is immediately successful.
+-- 4. Tx 4 tries to insert conflicting row in range [1,100] and raises unique
+-- constraint violation.
+-- 5. Tx 5 tries to insert row in range [201, ) and is immediately successful.
+1: INSERT INTO unique_index_ao_column SELECT generate_series(1, 100); +2: BEGIN; +2: INSERT INTO unique_index_ao_column SELECT generate_series(101, 200); +2: ABORT; +3: INSERT INTO unique_index_ao_column VALUES(102); +4: INSERT INTO unique_index_ao_column VALUES(2); +5: INSERT INTO unique_index_ao_column VALUES(202); +DROP TABLE unique_index_ao_column; + +-------------------------------------------------------------------------------- +----------------- More concurrent tests with fault injection ------------------ +-------------------------------------------------------------------------------- + +-- Case 7: Conflict with to-be-committed transaction while only a placeholder +-- row exists in the block directory-------------------------------------------- +-- +-- This case highlights the importance of the placeholder row, inserted at the +-- beginning of an INSERT command. +-- +-- 1. Uncommitted Tx 1 has inserted 3 out of its 10 rows and is suspended. +-- 2. Tx 2 inserts a conflicting row and blocks on Tx 1. +-- 3. Tx 3 inserts a non-conflicting row within the range [4,10] and is +-- immediately successful. (Index entries have been written only for [1,3] so +-- far, so conflicts shouldn't arise) +-- 4. Tx 4 inserts a non-conflicting row in range [11, ..) and should be +-- immediately successful. +-- 5. Now Tx 1 resumes and tries to insert a row in range [4,10] and reports a +-- unique constraint violation with Tx 3. +-- 6. Tx 2 succeeds as Tx 1 aborted. + +CREATE TABLE unique_index_ao_column (a bigint unique) USING ao_column + DISTRIBUTED REPLICATED; +SELECT gp_inject_fault('appendonly_insert', 'suspend', '', '', 'unique_index_ao_column', 4, 4, 0, dbid) + FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; +1&: INSERT INTO unique_index_ao_column SELECT * FROM generate_series(1, 10); +-- Wait until 3 rows have been successfully inserted into the index and Tx 1 +-- is just beginning to insert the 4th row. 
+SELECT gp_wait_until_triggered_fault('appendonly_insert', 4, dbid) + FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; +2&: INSERT INTO unique_index_ao_column VALUES(2); +4: INSERT INTO unique_index_ao_column VALUES(11); +3: INSERT INTO unique_index_ao_column VALUES(4); +SELECT gp_inject_fault('appendonly_insert', 'reset', dbid) + FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; +1<: +2<: +DROP TABLE unique_index_ao_column; + +-- Case 8: Conflict with to-be-committed transaction - generalization of case 7 +-- where there are multiple minipages (and block directory rows) in play from +-- the same insert. +-- +-- This justifies why 1 placeholder row is enough and we don't need to flush a +-- placeholder row every time we insert a block directory row (i.e. start a new +-- in-memory minipage) throughout the course of a single insert. +-- +-- 1. Uncommitted Tx 1 has inserted (4090 * (161 * 2 + 1) + 4) = 1321074 rows +-- and is suspended, enough rows to fill 2 entire minipages (covers +-- range [1,658490] and [658491,1321070]) before suspension. +-- 2. Txs 2,3,4 inserts conflicting rows that map to the 1st minipage and block. +-- 3. Txs 5,6,7 inserts conflicting rows that map to the 2nd minipage and block. +-- 4. Tx 8 inserts a conflicting row that maps to the 3rd minipage, which is +-- currently only in-memory and it conflicts on the placeholder row and +-- blocks (showcases why 1 placeholder row is enough) +-- 5. Tx 9 inserts a non-conflicting row for which there is no index entry and +-- and is immediately successful (1321075). +-- 6. Now Tx 1 resumes and tries to insert 1321075 and reports a unique +-- constraint violation with Tx 9. +-- 7. All blocked Txs succeed. 
+ +CREATE TABLE unique_index_ao_column (a bigint unique) USING ao_column + DISTRIBUTED REPLICATED; + +SELECT gp_inject_fault('insert_new_entry_curr_minipage_full', 'suspend', '', '', '', 2, 2, 0, dbid) + FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; +1&: INSERT INTO unique_index_ao_column SELECT generate_series(1, 1321075); + +-- Wait until we have inserted (4090 * (161 * 2 + 1) + 3) = 1321073 rows and we +-- are about to insert the 1321074th row. +SELECT gp_wait_until_triggered_fault('insert_new_entry_curr_minipage_full', 2, dbid) + FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; +SELECT gp_inject_fault('appendonly_insert', 'suspend', '', '', 'unique_index_ao_column', 4, 4, 0, dbid) + FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; +SELECT gp_inject_fault('insert_new_entry_curr_minipage_full', 'reset', dbid) + FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; +SELECT gp_wait_until_triggered_fault('appendonly_insert', 4, dbid) + FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; + +-- maps to 1st minipage +2&: INSERT INTO unique_index_ao_column VALUES(1); +3&: INSERT INTO unique_index_ao_column VALUES(300000); +4&: INSERT INTO unique_index_ao_column VALUES(658490); +-- maps to 2nd minipage +5&: INSERT INTO unique_index_ao_column VALUES(658491); +6&: INSERT INTO unique_index_ao_column VALUES(700000); +7&: INSERT INTO unique_index_ao_column VALUES(1321070); +-- maps to 3rd minipage +8&: INSERT INTO unique_index_ao_column VALUES(1321071); +-- no index entry exists for it, so should not conflict. 
+9: INSERT INTO unique_index_ao_column VALUES(1321075); + +SELECT gp_inject_fault('appendonly_insert', 'reset', dbid) + FROM gp_segment_configuration WHERE role = 'p' AND content <> -1; + +1<: +2<: +3<: +4<: +5<: +6<: +7<: +8<: + +DROP TABLE unique_index_ao_column; + +-------------------------------------------------------------------------------- +--------------------------- Smoke tests for COPY ------------------------------- +-------------------------------------------------------------------------------- + +CREATE TABLE unique_index_ao_column (a INT unique) USING ao_column + DISTRIBUTED REPLICATED; + +1: BEGIN; +1: COPY unique_index_ao_column FROM PROGRAM 'seq 1 10'; +-- concurrent tx inserting conflicting row should block. +2&: COPY unique_index_ao_column FROM PROGRAM 'seq 1 1'; +-- concurrent tx inserting non-conflicting rows should be successful. +3: COPY unique_index_ao_column FROM PROGRAM 'seq 11 20'; +-- inserting a conflicting row in the same transaction should ERROR out. +1: COPY unique_index_ao_column FROM PROGRAM 'seq 1 1'; +-- now that tx 1 was aborted, tx 2 is successful. +2<: +1: END; + +DROP TABLE unique_index_ao_column; + +-------------------------------------------------------------------------------- +-------------------- Smoke tests for subtransactions --------------------------- +-------------------------------------------------------------------------------- +CREATE TABLE unique_index_ao_column (a INT unique) USING ao_column + DISTRIBUTED REPLICATED; + +1: BEGIN; +1: SAVEPOINT a; +1: INSERT INTO unique_index_ao_column VALUES(1); + +-- concurrent tx inserting conflicting row should block. +2: BEGIN; +2&: INSERT INTO unique_index_ao_column VALUES(1); +-- concurrent tx inserting non-conflicting row should be successful. +3: INSERT INTO unique_index_ao_column VALUES(2); + +-- conflict should be detected within the same subtx. +1: INSERT INTO unique_index_ao_column VALUES(1); +-- the concurrent tx should now succeed. 
+2<: +2: ABORT; + +-- after rolling back to the savepoint, we should be able to re-insert the key +1: ROLLBACK TO SAVEPOINT a; +1: INSERT INTO unique_index_ao_column VALUES(1); +1: COMMIT; + +SELECT * FROM unique_index_ao_column; + +DROP TABLE unique_index_ao_column; + +-------------------------------------------------------------------------------- +-------------------- Smoke tests for repeatable read --------------------------- +-------------------------------------------------------------------------------- + +-- Test that shows that unique index checks transcend transaction isolation +-- boundaries. + +CREATE TABLE unique_index_ao_column (a INT unique) USING ao_column + DISTRIBUTED REPLICATED; + +-- Begin two txs with tx level snapshot taken early. +1: BEGIN ISOLATION LEVEL REPEATABLE READ; +1: SELECT * FROM unique_index_ao_column; +2: BEGIN ISOLATION LEVEL REPEATABLE READ; +2: SELECT * FROM unique_index_ao_column; + +-- Now begin a concurrent transaction which inserts a key. +3: BEGIN; +3: INSERT INTO unique_index_ao_column VALUES(1); + +-- And another transaction inserts a key and commits. +INSERT INTO unique_index_ao_column VALUES(2); + +-- Tx should block on insert of conflicting key, even though it can't "see" the +-- conflicting key due to its isolation level. +1: SELECT * FROM unique_index_ao_column; +1&: INSERT INTO unique_index_ao_column VALUES(1); + +3: ABORT; +1<: +1: ABORT; + +-- Tx should raise a conflict, even though it can't "see" the conflicting key +-- due to its isolation level. 
+2: SELECT * FROM unique_index_ao_column; +2: INSERT INTO unique_index_ao_column VALUES(2); +2: ABORT; + +DROP TABLE unique_index_ao_column; + + +-------------------------------------------------------------------------------- +----------------------- Smoke tests for ADD CONSTRAINT ------------------------ +-------------------------------------------------------------------------------- +CREATE TABLE unique_index_ao_column (a INT) USING ao_column + DISTRIBUTED REPLICATED; +INSERT INTO unique_index_ao_column SELECT * FROM generate_series(1, 5); + +ALTER table unique_index_ao_column ADD CONSTRAINT a_unique UNIQUE(a); +-- should conflict +INSERT INTO unique_index_ao_column VALUES (1); +ALTER table unique_index_ao_column DROP CONSTRAINT a_unique; + +INSERT INTO unique_index_ao_column VALUES (1); +-- should failed +ALTER table unique_index_ao_column ADD CONSTRAINT a_unique UNIQUE(a); + +DROP TABLE unique_index_ao_column; + + +-------------------------------------------------------------------------------- +----------------------- Smoke tests for Multiple Key --------------------------- +-------------------------------------------------------------------------------- +CREATE TABLE unique_index_ao_column (a INT, b INT) USING ao_column + DISTRIBUTED REPLICATED; +INSERT INTO unique_index_ao_column SELECT i,i FROM generate_series(1, 5) i; + +CREATE UNIQUE INDEX a_b_unique ON unique_index_ao_column(a,b); +-- should not conflict +INSERT INTO unique_index_ao_column VALUES (1,2); +-- should conflict +INSERT INTO unique_index_ao_column VALUES (1,1); +DROP TABLE unique_index_ao_column; \ No newline at end of file diff --git a/src/test/isolation2/sql/fsync_ao.sql b/src/test/isolation2/sql/fsync_ao.sql index f2ebc6779d6..1e04b3ffdf4 100644 --- a/src/test/isolation2/sql/fsync_ao.sql +++ b/src/test/isolation2/sql/fsync_ao.sql @@ -58,8 +58,9 @@ select gp_wait_until_triggered_fault('restartpoint_guts', 2, dbid) from gp_segment_configuration where content=0 and role='m'; -- Validate that 
the number of files fsync'ed by checkpointer (on --- mirror). `num times hit` is corresponding to the number of files --- synced by `ao_fsync_counter` fault. +-- mirror). This should match the number of files for fsync_ao and fsync_co. +select gp_wait_until_triggered_fault('ao_fsync_counter', 3, dbid) + from gp_segment_configuration where content=0 and role='m'; select gp_inject_fault('ao_fsync_counter', 'status', dbid) from gp_segment_configuration where content=0 and role='m'; @@ -94,6 +95,8 @@ select gp_wait_until_triggered_fault('restartpoint_guts', 3, dbid) from gp_segment_configuration where content=0 and role='m'; -- Expect the segment files that were updated by vacuum to be fsync'ed. +select gp_wait_until_triggered_fault('ao_fsync_counter', 12, dbid) + from gp_segment_configuration where content=0 and role='m'; select gp_inject_fault('ao_fsync_counter', 'status', dbid) from gp_segment_configuration where content=0 and role='m'; @@ -118,6 +121,8 @@ select gp_wait_until_triggered_fault('restartpoint_guts', 4, dbid) -- Expect that fsync is only performed for fsync_ao table (1 file) but -- not for fsync_co table because it was dropped after being updated. 
+select gp_wait_until_triggered_fault('ao_fsync_counter', 13, dbid) + from gp_segment_configuration where content=0 and role='m'; select gp_inject_fault('ao_fsync_counter', 'status', dbid) from gp_segment_configuration where content=0 and role='m'; diff --git a/src/test/regress/expected/alter_table_aocs.out b/src/test/regress/expected/alter_table_aocs.out index cfcccc2da95..091d0f5e250 100644 --- a/src/test/regress/expected/alter_table_aocs.out +++ b/src/test/regress/expected/alter_table_aocs.out @@ -527,11 +527,6 @@ select attstattarget from pg_attribute where attrelid = 'aocs_addcol.addcol1'::r -- test alter distribution policy alter table addcol1 set distributed randomly; alter table addcol1 set distributed by (a); --- test some constraints (unique indexes do not work for unique and pkey) -alter table addcol1 add constraint tunique unique(a); -ERROR: append-only tables do not support unique indexes -alter table addcol1 add constraint tpkey primary key(a); -ERROR: append-only tables do not support unique indexes alter table addcol1 add constraint tcheck check (a is not null); -- test changing the storage type of a column alter table addcol1 alter column f_renamed type varchar(7); diff --git a/src/test/regress/expected/alter_table_set_am.out b/src/test/regress/expected/alter_table_set_am.out new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/test/regress/expected/brin_ao.out b/src/test/regress/expected/brin_ao.out index 7f4b6e592bb..b0d4b234a64 100644 --- a/src/test/regress/expected/brin_ao.out +++ b/src/test/regress/expected/brin_ao.out @@ -466,9 +466,15 @@ CONTEXT: SQL function "brin_summarize_new_values" statement 1 SELECT brin_summarize_new_values('tenk1_unique1'); -- error, not a BRIN index ERROR: "tenk1_unique1" is not a BRIN index CONTEXT: SQL function "brin_summarize_new_values" statement 1 +-- New strategy of VACUUM AO/CO was introduced by PR #13255 for performance enhancement. 
+-- Index dead tuples will not always be cleaned up completely after VACUUM, resulting +-- brin_summarize_new_values() will not always be accurate. So ignore the check to +-- coordinate with the new behavior. +-- start_ignore SELECT brin_summarize_new_values('brinaoidx'); -- ok, no change expected brin_summarize_new_values --------------------------- 0 (1 row) +-- end_ignore \ No newline at end of file diff --git a/src/test/regress/expected/brin_ao_optimizer.out b/src/test/regress/expected/brin_ao_optimizer.out index 02caa9f2c83..7038d21493b 100644 --- a/src/test/regress/expected/brin_ao_optimizer.out +++ b/src/test/regress/expected/brin_ao_optimizer.out @@ -489,9 +489,15 @@ CONTEXT: SQL function "brin_summarize_new_values" statement 1 SELECT brin_summarize_new_values('tenk1_unique1'); -- error, not a BRIN index ERROR: "tenk1_unique1" is not a BRIN index CONTEXT: SQL function "brin_summarize_new_values" statement 1 +-- New strategy of VACUUM AO/CO was introduced by PR #13255 for performance enhancement. +-- Index dead tuples will not always be cleaned up completely after VACUUM, resulting +-- brin_summarize_new_values() will not always be accurate. So ignore the check to +-- coordinate with the new behavior. 
+-- start_ignore SELECT brin_summarize_new_values('brinaoidx'); -- ok, no change expected brin_summarize_new_values --------------------------- 0 (1 row) +-- end_ignore \ No newline at end of file diff --git a/src/test/regress/expected/brin_aocs.out b/src/test/regress/expected/brin_aocs.out index 0fa29bb2581..69555f8297c 100644 --- a/src/test/regress/expected/brin_aocs.out +++ b/src/test/regress/expected/brin_aocs.out @@ -466,9 +466,15 @@ CONTEXT: SQL function "brin_summarize_new_values" statement 1 SELECT brin_summarize_new_values('tenk1_unique1'); -- error, not a BRIN index ERROR: "tenk1_unique1" is not a BRIN index CONTEXT: SQL function "brin_summarize_new_values" statement 1 +-- New strategy of VACUUM AO/CO was introduced by PR #13255 for performance enhancement. +-- Index dead tuples will not always be cleaned up completely after VACUUM, resulting +-- brin_summarize_new_values() will not always be accurate. So ignore the check to +-- coordinate with the new behavior. +-- start_ignore SELECT brin_summarize_new_values('brinaocsidx'); -- ok, no change expected brin_summarize_new_values --------------------------- 0 (1 row) +-- end_ignore \ No newline at end of file diff --git a/src/test/regress/expected/brin_aocs_optimizer.out b/src/test/regress/expected/brin_aocs_optimizer.out index 5a83c375758..3ba2a58bb2a 100644 --- a/src/test/regress/expected/brin_aocs_optimizer.out +++ b/src/test/regress/expected/brin_aocs_optimizer.out @@ -489,9 +489,15 @@ CONTEXT: SQL function "brin_summarize_new_values" statement 1 SELECT brin_summarize_new_values('tenk1_unique1'); -- error, not a BRIN index ERROR: "tenk1_unique1" is not a BRIN index CONTEXT: SQL function "brin_summarize_new_values" statement 1 +-- New strategy of VACUUM AO/CO was introduced by PR #13255 for performance enhancement. +-- Index dead tuples will not always be cleaned up completely after VACUUM, resulting +-- brin_summarize_new_values() will not always be accurate. 
So ignore the check to +-- coordinate with the new behavior. +-- start_ignore SELECT brin_summarize_new_values('brinaocsidx'); -- ok, no change expected brin_summarize_new_values --------------------------- 0 (1 row) +-- end_ignore \ No newline at end of file diff --git a/src/test/regress/expected/gp_toolkit_ao_funcs.out b/src/test/regress/expected/gp_toolkit_ao_funcs.out index 156a57f81ba..91ca5182049 100644 --- a/src/test/regress/expected/gp_toolkit_ao_funcs.out +++ b/src/test/regress/expected/gp_toolkit_ao_funcs.out @@ -10,12 +10,14 @@ DROP TABLE IF EXISTS toolkit_ao_test; CREATE TABLE toolkit_ao_test (a INT, b INT, c INT) WITH (appendonly=true) DISTRIBUTED BY (c); +CREATE INDEX ON toolkit_ao_test(a); INSERT INTO toolkit_ao_test SELECT i as a, i as b, 1 FROM generate_series(1,20) AS i; UPDATE toolkit_ao_test SET b = 0 WHERE a = 1; DELETE FROM toolkit_ao_test WHERE a = 2; DROP TABLE IF EXISTS toolkit_aocs_test; CREATE TABLE toolkit_aocs_test (a INT, b INT, C INT) WITH (appendonly=true, orientation=column) DISTRIBUTED BY (c); +CREATE INDEX ON toolkit_aocs_test(a); INSERT INTO toolkit_aocs_test SELECT i as a, i as b FROM generate_series(1,20) AS i; UPDATE toolkit_aocs_test SET b = 0 WHERE a = 1; DELETE FROM toolkit_aocs_test WHERE a = 2; @@ -66,6 +68,16 @@ SELECT count(*) FROM gp_toolkit.__gp_aoseg('toolkit_ao_test'); 1 (1 row) +SELECT * FROM gp_toolkit.__gp_aoblkdir('toolkit_ao_test'); + tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count +---------+-------+----------------+----------+--------------+-------------+----------- +(0 rows) + +SELECT * FROM gp_toolkit.__gp_aoblkdir('toolkit_aocs_test'); + tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count +---------+-------+----------------+----------+--------------+-------------+----------- +(0 rows) + -- The same, but on the segments. 
SELECT (t).* FROM ( SELECT gp_toolkit.__gp_aovisimap('toolkit_ao_test') AS t FROM gp_dist_random('gp_id') @@ -84,3 +96,25 @@ SELECT (t).segno, (t).first_row_num, (t).hidden_tupcount >= 1 as hidden_tupcount 1 | 0 | t | t (1 row) +SELECT (t).* FROM ( + SELECT gp_toolkit.__gp_aoblkdir('toolkit_ao_test') AS t FROM gp_dist_random('gp_id') +) AS x; + tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count +---------+-------+----------------+----------+--------------+-------------+----------- + (0,2) | 1 | 0 | 0 | 1 | 0 | 20 + (0,2) | 1 | 0 | 1 | 101 | 392 | 1 +(2 rows) + +SELECT (t).* FROM ( + SELECT gp_toolkit.__gp_aoblkdir('toolkit_aocs_test') AS t FROM gp_dist_random('gp_id') +) AS x; + tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count +---------+-------+----------------+----------+--------------+-------------+----------- + (0,4) | 1 | 0 | 0 | 1 | 0 | 20 + (0,4) | 1 | 0 | 1 | 101 | 120 | 1 + (0,5) | 1 | 1 | 0 | 1 | 0 | 20 + (0,5) | 1 | 1 | 1 | 101 | 120 | 1 + (0,6) | 1 | 2 | 0 | 1 | 0 | 20 + (0,6) | 1 | 2 | 1 | 101 | 48 | 1 +(6 rows) + diff --git a/src/test/regress/expected/qp_with_clause.out b/src/test/regress/expected/qp_with_clause.out index 773b213d86a..10c02bae0c4 100644 --- a/src/test/regress/expected/qp_with_clause.out +++ b/src/test/regress/expected/qp_with_clause.out @@ -6392,15 +6392,6 @@ CREATE TABLE countrylanguage_ao ( isofficial boolean NOT NULL, percentage real NOT NULL ) with (appendonly=true) distributed by (countrycode,language); -ALTER TABLE ONLY city_ao - ADD CONSTRAINT city_ao_pkey PRIMARY KEY (id); -ERROR: append-only tables do not support unique indexes -ALTER TABLE ONLY country_ao - ADD CONSTRAINT country_ao_pkey PRIMARY KEY (code); -ERROR: append-only tables do not support unique indexes -ALTER TABLE ONLY countrylanguage_ao - ADD CONSTRAINT countrylanguage_ao_pkey PRIMARY KEY (countrycode, "language"); -ERROR: append-only tables do not support unique indexes create index 
bitmap_city_ao_countrycode on city_ao using bitmap(countrycode); create index bitmap_country_ao_gf on country_ao using bitmap(governmentform); create index bitmap_country_ao_region on country_ao using bitmap(region); @@ -7294,15 +7285,6 @@ CREATE TABLE countrylanguage_co ( isofficial boolean NOT NULL, percentage real NOT NULL ) with (appendonly=true,orientation=column) distributed by (countrycode,language); -ALTER TABLE ONLY city_co - ADD CONSTRAINT city_co_pkey PRIMARY KEY (id); -ERROR: append-only tables do not support unique indexes -ALTER TABLE ONLY country_co - ADD CONSTRAINT country_co_pkey PRIMARY KEY (code); -ERROR: append-only tables do not support unique indexes -ALTER TABLE ONLY countrylanguage_co - ADD CONSTRAINT countrylanguage_co_pkey PRIMARY KEY (countrycode, "language"); -ERROR: append-only tables do not support unique indexes create index bitmap_city_co_countrycode on city_co using bitmap(countrycode); create index bitmap_country_co_gf on country_co using bitmap(governmentform); create index bitmap_country_co_region on country_co using bitmap(region); diff --git a/src/test/regress/expected/uao_compaction/drop_column.out b/src/test/regress/expected/uao_compaction/drop_column.out index 56b9f7dcb94..898b82b172f 100644 --- a/src/test/regress/expected/uao_compaction/drop_column.out +++ b/src/test/regress/expected/uao_compaction/drop_column.out @@ -29,12 +29,18 @@ SELECT relname, reltuples FROM pg_class WHERE relname = 'uao_drop_col'; uao_drop_col | 7 (1 row) +-- New strategy of VACUUM AO/CO was introduced by PR #13255 for performance enhancement. +-- Index dead tuples will not always be cleaned up completely after VACUUM, resulting +-- index stats pg_class->reltuples will not always be accurate. So ignore the stats check +-- for reltuples to coordinate with the new behavior. 
+-- start_ignore SELECT relname, reltuples FROM pg_class WHERE relname = 'uao_drop_col_index'; relname | reltuples --------------------+----------- uao_drop_col_index | 7 (1 row) +-- end_ignore ALTER TABLE uao_drop_col DROP COLUMN c; SELECT * FROM uao_drop_col; a | b diff --git a/src/test/regress/expected/uao_compaction/full_stats.out b/src/test/regress/expected/uao_compaction/full_stats.out index 5530c1065d9..1f9d473aa7a 100644 --- a/src/test/regress/expected/uao_compaction/full_stats.out +++ b/src/test/regress/expected/uao_compaction/full_stats.out @@ -26,9 +26,15 @@ SELECT relname, reltuples FROM pg_class WHERE relname = 'uao_full_stats'; uao_full_stats | 85 (1 row) +-- New strategy of VACUUM AO/CO was introduced by PR #13255 for performance enhancement. +-- Index dead tuples will not always be cleaned up completely after VACUUM, resulting +-- index stats pg_class->reltuples will not always be accurate. So ignore the stats check +-- for reltuples to coordinate with the new behavior. +-- start_ignore SELECT relname, reltuples FROM pg_class WHERE relname = 'uao_full_stats_index'; relname | reltuples ----------------------+----------- uao_full_stats_index | 85 (1 row) +-- end_ignore \ No newline at end of file diff --git a/src/test/regress/expected/uao_compaction/index_stats.out b/src/test/regress/expected/uao_compaction/index_stats.out index e5a9fe4ac69..1c32cfd8caa 100644 --- a/src/test/regress/expected/uao_compaction/index_stats.out +++ b/src/test/regress/expected/uao_compaction/index_stats.out @@ -29,9 +29,15 @@ SELECT relname, reltuples FROM pg_class WHERE relname = 'mytab'; mytab | 2 (1 row) +-- New strategy of VACUUM AO/CO was introduced by PR #13255 for performance enhancement. +-- Index dead tuples will not always be cleaned up completely after VACUUM, resulting +-- index stats pg_class->reltuples will not always be accurate. So ignore the stats check +-- for reltuples to coordinate with the new behavior. 
+-- start_ignore SELECT relname, reltuples FROM pg_class WHERE relname = 'mytab_int_idx1'; relname | reltuples ----------------+----------- mytab_int_idx1 | 2 (1 row) +-- end_ignore \ No newline at end of file diff --git a/src/test/regress/expected/uao_compaction/stats.out b/src/test/regress/expected/uao_compaction/stats.out index d34cfede4fc..a18834d43e3 100644 --- a/src/test/regress/expected/uao_compaction/stats.out +++ b/src/test/regress/expected/uao_compaction/stats.out @@ -27,9 +27,15 @@ SELECT relname, reltuples FROM pg_class WHERE relname = 'uao_stats'; uao_stats | 85 (1 row) +-- New strategy of VACUUM AO/CO was introduced by PR #13255 for performance enhancement. +-- Index dead tuples will not always be cleaned up completely after VACUUM, resulting +-- index stats pg_class->reltuples will not always be accurate. So ignore the stats check +-- for reltuples to coordinate with the new behavior. +-- start_ignore SELECT relname, reltuples FROM pg_class WHERE relname = 'uao_stats_index'; relname | reltuples -----------------+----------- - uao_stats_index | 85 + uao_stats_index | 88 (1 row) +-- end_ignore \ No newline at end of file diff --git a/src/test/regress/expected/uaocs_compaction/drop_column.out b/src/test/regress/expected/uaocs_compaction/drop_column.out index 3b867c5ca15..5d9579e2e0b 100644 --- a/src/test/regress/expected/uaocs_compaction/drop_column.out +++ b/src/test/regress/expected/uaocs_compaction/drop_column.out @@ -29,12 +29,18 @@ SELECT relname, reltuples FROM pg_class WHERE relname = 'uaocs_drop'; uaocs_drop | 7 (1 row) +-- New strategy of VACUUM AO/CO was introduced by PR #13255 for performance enhancement. +-- Index dead tuples will not always be cleaned up completely after VACUUM, resulting +-- index stats pg_class->reltuples will not always be accurate. So ignore the stats check +-- for reltuples to coordinate with the new behavior. 
+-- start_ignore SELECT relname, reltuples FROM pg_class WHERE relname = 'uaocs_drop_index'; relname | reltuples ------------------+----------- uaocs_drop_index | 7 (1 row) +-- end_ignore ALTER TABLE uaocs_drop DROP COLUMN c; SELECT * FROM uaocs_drop; a | b diff --git a/src/test/regress/expected/uaocs_compaction/full_stats.out b/src/test/regress/expected/uaocs_compaction/full_stats.out index fcdad3daf5b..e5d7825ecf1 100644 --- a/src/test/regress/expected/uaocs_compaction/full_stats.out +++ b/src/test/regress/expected/uaocs_compaction/full_stats.out @@ -44,9 +44,15 @@ SELECT relname, reltuples FROM pg_class WHERE relname = 'uaocs_full_stats'; uaocs_full_stats | 85 (1 row) +-- New strategy of VACUUM AO/CO was introduced by PR #13255 for performance enhancement. +-- Index dead tuples will not always be cleaned up completely after VACUUM, resulting +-- index stats pg_class->reltuples will not always be accurate. So ignore the stats check +-- for reltuples to coordinate with the new behavior. +-- start_ignore SELECT relname, reltuples FROM pg_class WHERE relname = 'uaocs_full_stats_index'; relname | reltuples ------------------------+----------- uaocs_full_stats_index | 85 (1 row) +-- end_ignore \ No newline at end of file diff --git a/src/test/regress/expected/uaocs_compaction/index_stats.out b/src/test/regress/expected/uaocs_compaction/index_stats.out index 36bd145d2a8..adc286c0811 100644 --- a/src/test/regress/expected/uaocs_compaction/index_stats.out +++ b/src/test/regress/expected/uaocs_compaction/index_stats.out @@ -34,9 +34,15 @@ SELECT relname, reltuples FROM pg_class WHERE relname = 'uaocs_index_stats'; uaocs_index_stats | 2 (1 row) +-- New strategy of VACUUM AO/CO was introduced by PR #13255 for performance enhancement. +-- Index dead tuples will not always be cleaned up completely after VACUUM, resulting +-- index stats pg_class->reltuples will not always be accurate. So ignore the stats check +-- for reltuples to coordinate with the new behavior. 
+-- start_ignore SELECT relname, reltuples FROM pg_class WHERE relname = 'uaocs_index_stats_int_idx1'; relname | reltuples ----------------------------+----------- uaocs_index_stats_int_idx1 | 2 (1 row) +-- end_ignore \ No newline at end of file diff --git a/src/test/regress/expected/uaocs_compaction/stats.out b/src/test/regress/expected/uaocs_compaction/stats.out index 07678823d00..f643fb4dcc7 100644 --- a/src/test/regress/expected/uaocs_compaction/stats.out +++ b/src/test/regress/expected/uaocs_compaction/stats.out @@ -27,9 +27,15 @@ SELECT relname, reltuples FROM pg_class WHERE relname = 'uaocs_stats'; uaocs_stats | 85 (1 row) +-- New strategy of VACUUM AO/CO was introduced by PR #13255 for performance enhancement. +-- Index dead tuples will not always be cleaned up completely after VACUUM, resulting +-- index stats pg_class->reltuples will not always be accurate. So ignore the stats check +-- for reltuples to coordinate with the new behavior. +-- start_ignore SELECT relname, reltuples FROM pg_class WHERE relname = 'uaocs_stats_index'; relname | reltuples -------------------+----------- uaocs_stats_index | 85 (1 row) +-- end_ignore \ No newline at end of file diff --git a/src/test/regress/greenplum_schedule b/src/test/regress/greenplum_schedule index e66d9266f17..c5bd9fd5b54 100755 --- a/src/test/regress/greenplum_schedule +++ b/src/test/regress/greenplum_schedule @@ -232,7 +232,7 @@ test: uao_ddl/alter_drop_allcol_row uao_ddl/alter_drop_allcol_column uao_ddl/alt # These tests use gp_select_invisible and VACUUM, and will get confused if there are # concurrent transactions holding back the global xmin. 
-test: uao_dml/uao_dml_cursor_row uao_dml/uao_dml_select_row uao_dml/uao_dml_cursor_column uao_dml/uao_dml_select_column +test: uao_dml/uao_dml_cursor_row uao_dml/uao_dml_select_row uao_dml/uao_dml_cursor_column uao_dml/uao_dml_select_column uao_dml/uao_dml_unique_index_delete_row uao_dml/uao_dml_unique_index_delete_column uao_dml/uao_dml_unique_index_update_row uao_dml/uao_dml_unique_index_update_column # disable autovacuum for the test diff --git a/src/test/regress/input/uao_ddl/alter_ao_table_index.source b/src/test/regress/input/uao_ddl/alter_ao_table_index.source index da52b70f89f..2d41b579e3c 100644 --- a/src/test/regress/input/uao_ddl/alter_ao_table_index.source +++ b/src/test/regress/input/uao_ddl/alter_ao_table_index.source @@ -51,6 +51,3 @@ select relfrozenxid from pg_class c, pg_namespace n where select relfrozenxid from gp_dist_random('pg_class') c, pg_namespace n where c.relnamespace = n.oid and relname = 'sto_alt_uao3_idx' and n.nspname = 'alter_ao_table_index_@amname@'; - --- Verify that unique index is not allowed -CREATE UNIQUE INDEX uni_index ON sto_alt_uao3_idx (text_col); diff --git a/src/test/regress/input/uao_dml/ao_unique_index_build.source b/src/test/regress/input/uao_dml/ao_unique_index_build.source new file mode 100644 index 00000000000..30ee292aa00 --- /dev/null +++ b/src/test/regress/input/uao_dml/ao_unique_index_build.source @@ -0,0 +1,86 @@ +-- Test cases to cover CREATE UNIQUE INDEX on AO/CO tables. + +SET default_table_access_method TO @amname@; +-- To force index scans for smoke tests +SET enable_seqscan TO off; +SET optimizer TO off; + +-- Case 1: Build with no conflicting rows. 
+CREATE TABLE unique_index_build_@amname@(i int) DISTRIBUTED REPLICATED; +INSERT INTO unique_index_build_@amname@ VALUES(1); +-- should succeed +CREATE UNIQUE INDEX on unique_index_build_@amname@(i); +-- post-build smoke test +EXPLAIN SELECT * FROM unique_index_build_@amname@ WHERE i = 1; +SELECT * FROM unique_index_build_@amname@ WHERE i = 1; +INSERT INTO unique_index_build_@amname@ VALUES(1); + +DROP TABLE unique_index_build_@amname@; + +-- Case 2: Build with conflicting row. +CREATE TABLE unique_index_build_@amname@(i int) DISTRIBUTED REPLICATED; +INSERT INTO unique_index_build_@amname@ VALUES(1); +INSERT INTO unique_index_build_@amname@ VALUES(1); +-- should ERROR out +CREATE UNIQUE INDEX on unique_index_build_@amname@(i); + +DROP TABLE unique_index_build_@amname@; + +-- Case 3: Build with conflict on aborted row. +CREATE TABLE unique_index_build_@amname@(i int) DISTRIBUTED REPLICATED; +BEGIN; +INSERT INTO unique_index_build_@amname@ VALUES(1); +ABORT; +INSERT INTO unique_index_build_@amname@ VALUES(1); +-- should succeed +CREATE UNIQUE INDEX on unique_index_build_@amname@(i); +-- post-build smoke test +EXPLAIN SELECT * FROM unique_index_build_@amname@ WHERE i = 1; +SELECT * FROM unique_index_build_@amname@ WHERE i = 1; +INSERT INTO unique_index_build_@amname@ VALUES(1); + +DROP TABLE unique_index_build_@amname@; + +-- Case 4: Build with conflict on deleted row. 
+CREATE TABLE unique_index_build_@amname@(i int) DISTRIBUTED REPLICATED; +INSERT INTO unique_index_build_@amname@ VALUES(1); +DELETE FROM unique_index_build_@amname@; +INSERT INTO unique_index_build_@amname@ VALUES(1); +-- should succeed +CREATE UNIQUE INDEX on unique_index_build_@amname@(i); +-- post-build smoke test +EXPLAIN SELECT * FROM unique_index_build_@amname@ WHERE i = 1; +SELECT * FROM unique_index_build_@amname@ WHERE i = 1; +INSERT INTO unique_index_build_@amname@ VALUES(1); + +DROP TABLE unique_index_build_@amname@; + +-- Case 5: Partial unique index build +CREATE TABLE unique_index_build_@amname@(i int) DISTRIBUTED REPLICATED; +INSERT INTO unique_index_build_@amname@ VALUES(1); +INSERT INTO unique_index_build_@amname@ VALUES(1); +INSERT INTO unique_index_build_@amname@ VALUES(2); +INSERT INTO unique_index_build_@amname@ VALUES(6); +INSERT INTO unique_index_build_@amname@ VALUES(6); +-- should fail as conflict lies in indexed portion of data +CREATE UNIQUE INDEX on unique_index_build_@amname@(i) WHERE i < 5; +-- removing conflict should make index build succeed +DELETE FROM unique_index_build_@amname@ WHERE i = 1; +CREATE UNIQUE INDEX on unique_index_build_@amname@(i) WHERE i < 5; +-- post build smoke tests: +-- should succeed as it lies in non-indexed portion +INSERT INTO unique_index_build_@amname@ VALUES(6); +-- should fail as conflict lies in indexed portion of data +INSERT INTO unique_index_build_@amname@ VALUES(2); +-- should succeed as there is no conflicting key that exists +INSERT INTO unique_index_build_@amname@ VALUES(3); +SELECT * FROM unique_index_build_@amname@ WHERE i = 1; +SELECT * FROM unique_index_build_@amname@ WHERE i = 2; +SELECT * FROM unique_index_build_@amname@ WHERE i = 3; +SELECT * FROM unique_index_build_@amname@ WHERE i = 6; + +DROP TABLE unique_index_build_@amname@; + +RESET default_table_access_method; +RESET enable_seqscan; +RESET optimizer; diff --git a/src/test/regress/input/uao_dml/uao_dml.source 
b/src/test/regress/input/uao_dml/uao_dml.source index c1206bea64f..adb9f033b70 100644 --- a/src/test/regress/input/uao_dml/uao_dml.source +++ b/src/test/regress/input/uao_dml/uao_dml.source @@ -479,8 +479,14 @@ update mytab_@amname@ set col_text=' new value' where col_int = 1; select * from mytab_@amname@; vacuum mytab_@amname@; SELECT reltuples FROM pg_class WHERE relname = 'mytab_@amname@'; -SELECT reltuples FROM pg_class WHERE relname = 'mytab_int_idx1_@amname@'; +-- New strategy of VACUUM AO/CO was introduced by PR #13255 for performance enhancement. +-- Index dead tuples will not always be cleaned up completely after VACUUM, resulting +-- index stats pg_class->reltuples will not always be accurate. So ignore the stats check +-- for reltuples to coordinate with the new behavior. +-- start_ignore +SELECT reltuples FROM pg_class WHERE relname = 'mytab_int_idx1_@amname@'; +-- end_ignore -- @Description Checks that deleting works with many AO blocks. -- diff --git a/src/test/regress/input/uao_dml/uao_dml_unique_index_delete.source b/src/test/regress/input/uao_dml/uao_dml_unique_index_delete.source new file mode 100644 index 00000000000..6f88d3224c9 --- /dev/null +++ b/src/test/regress/input/uao_dml/uao_dml_unique_index_delete.source @@ -0,0 +1,111 @@ +create schema uao_dml_unique_index_@amname@; +set search_path=uao_dml_unique_index_@amname@; +set default_table_access_method=@amname@; + +-- Case 1: Inserting tx inserting a deleted key--------------------------------- +CREATE TABLE uao_unique_index_delete (a INT unique); +INSERT INTO uao_unique_index_delete VALUES (1); +DELETE FROM uao_unique_index_delete WHERE a = 1; +-- Should not raise a conflict as the key has been deleted. 
+INSERT INTO uao_unique_index_delete VALUES (1); +SELECT * FROM uao_unique_index_delete; + +DROP TABLE uao_unique_index_delete; + +-- Case 2: Inserting tx inserting a key whose delete was aborted---------------- +CREATE TABLE uao_unique_index_delete (a INT unique); +INSERT INTO uao_unique_index_delete VALUES (1); +BEGIN; +DELETE FROM uao_unique_index_delete WHERE a = 1; +ABORT; +-- Should raise a conflict as the delete of the key was aborted. +INSERT INTO uao_unique_index_delete VALUES (1); +SELECT * FROM uao_unique_index_delete; + +DROP TABLE uao_unique_index_delete; + +-- Case 3: Inserting tx inserting a key deleted in the same tx------------------ +CREATE TABLE uao_unique_index_delete (a INT unique); +INSERT INTO uao_unique_index_delete VALUES (1); +BEGIN; +DELETE FROM uao_unique_index_delete WHERE a = 1; +-- should succeed as the INSERT should see that the conflicting key was deleted. +INSERT INTO uao_unique_index_delete VALUES (1); +COMMIT; +SELECT * FROM uao_unique_index_delete; + +DROP TABLE uao_unique_index_delete; + +-- Case 4: Deleting tx deletes a key already deleted in the same tx------------- +CREATE TABLE uao_unique_index_delete (a INT unique); +INSERT INTO uao_unique_index_delete VALUES (1); +BEGIN; +DELETE FROM uao_unique_index_delete WHERE a = 1; +-- should be a no-op as it should see the prior DELETE. +DELETE FROM uao_unique_index_delete WHERE a = 1; +COMMIT; +SELECT * FROM uao_unique_index_delete; + +DROP TABLE uao_unique_index_delete; + +-- Case 5: Deleting tx deletes a key inserted in the same tx-------------------- +CREATE TABLE uao_unique_index_delete (a INT unique); +BEGIN; +INSERT INTO uao_unique_index_delete VALUES (1); +-- should be able to see and delete the inserted key. 
+DELETE FROM uao_unique_index_delete WHERE a = 1; +COMMIT; +SELECT * FROM uao_unique_index_delete; + +DROP TABLE uao_unique_index_delete; + +-- Case 6: Deleting tx deletes a key absent from the table---------------------- +CREATE TABLE uao_unique_index_delete (a INT unique); +-- should be a no-op. +DELETE FROM uao_unique_index_delete WHERE a = 1; +INSERT INTO uao_unique_index_delete VALUES (1); +-- should be a no-op. +DELETE FROM uao_unique_index_delete WHERE a = 2; +SELECT * FROM uao_unique_index_delete; + +DROP TABLE uao_unique_index_delete; + +-- Case 7: Deleting tx deletes a key inserted in the same subtx----------------- +CREATE TABLE uao_unique_index_delete (a INT unique); +BEGIN; +SAVEPOINT a; +INSERT INTO uao_unique_index_delete VALUES (1); +DELETE FROM uao_unique_index_delete WHERE a = 1; +COMMIT; +SELECT * FROM uao_unique_index_delete; + +DROP TABLE uao_unique_index_delete; + +-- Case 8: Deleting tx deletes a key deleted in the same subtx------------------ +CREATE TABLE uao_unique_index_delete (a INT unique); +BEGIN; +SAVEPOINT a; +INSERT INTO uao_unique_index_delete VALUES (1); +DELETE FROM uao_unique_index_delete WHERE a = 1; +-- should be a no-op +DELETE FROM uao_unique_index_delete WHERE a = 1; +COMMIT; +SELECT * FROM uao_unique_index_delete; + +DROP TABLE uao_unique_index_delete; + +-- Case 9: Deleting tx deletes a key whose earlier delete was rolled back------- +CREATE TABLE uao_unique_index_delete (a INT unique); +INSERT INTO uao_unique_index_delete VALUES (1); +BEGIN; +SAVEPOINT a; +DELETE FROM uao_unique_index_delete WHERE a = 1; +ROLLBACK TO SAVEPOINT a; +-- should be able to delete it again. 
+DELETE FROM uao_unique_index_delete WHERE a = 1; +-- should be able to insert it now since it has been deleted +INSERT INTO uao_unique_index_delete VALUES (1); +COMMIT; +SELECT * FROM uao_unique_index_delete; + +DROP TABLE uao_unique_index_delete; diff --git a/src/test/regress/input/uao_dml/uao_dml_unique_index_update.source b/src/test/regress/input/uao_dml/uao_dml_unique_index_update.source new file mode 100644 index 00000000000..fe7c16e3198 --- /dev/null +++ b/src/test/regress/input/uao_dml/uao_dml_unique_index_update.source @@ -0,0 +1,172 @@ +create schema uao_dml_unique_index_update_@amname@; +set search_path=uao_dml_unique_index_update_@amname@; +set default_table_access_method=@amname@; + +-- Case 1: Inserting tx inserting a key affected by an update-------------------- +CREATE TABLE uao_unique_index_update (a INT unique); +INSERT INTO uao_unique_index_update VALUES (1); +UPDATE uao_unique_index_update SET a = 2 WHERE a = 1; +-- Should not raise a conflict as the key has been deleted by the update. +INSERT INTO uao_unique_index_update VALUES (1); +-- Should raise a conflict as the key has been inserted by the update +INSERT INTO uao_unique_index_update VALUES (2); +SELECT * FROM uao_unique_index_update; + +DROP TABLE uao_unique_index_update; + +-- Case 2: Inserting tx inserting a key whose update was aborted---------------- +CREATE TABLE uao_unique_index_update (a INT unique); +INSERT INTO uao_unique_index_update VALUES (1); +BEGIN; +UPDATE uao_unique_index_update SET a = 2 WHERE a = 1; +ABORT; +-- Should raise a conflict as the update of the key was aborted. +INSERT INTO uao_unique_index_update VALUES (1); +-- Should not raise a conflict as the update of the key was aborted. 
+INSERT INTO uao_unique_index_update VALUES (2); +SELECT * FROM uao_unique_index_update; + +DROP TABLE uao_unique_index_update; + +-- Case 3: Inserting tx inserting a key updated in the same tx------------------ +CREATE TABLE uao_unique_index_update (a INT unique); +INSERT INTO uao_unique_index_update VALUES (1); +BEGIN; +UPDATE uao_unique_index_update SET a = 2 WHERE a = 1; +-- should succeed as the conflicting key was deleted by the update. +INSERT INTO uao_unique_index_update VALUES (1); +-- should raise a conflict as the key was inserted by the update. +INSERT INTO uao_unique_index_update VALUES (2); +COMMIT; +SELECT * FROM uao_unique_index_update; + +DROP TABLE uao_unique_index_update; + +-- Case 4: Updating tx deletes a key already updated in the same tx------------- +CREATE TABLE uao_unique_index_update (a INT unique); +INSERT INTO uao_unique_index_update VALUES (1); +BEGIN; +UPDATE uao_unique_index_update SET a = 2 WHERE a = 1; +-- should be a no-op as the key has already been deleted by the update +UPDATE uao_unique_index_update SET a = 2 WHERE a = 1; +-- should succeed as the key has been inserted by the 1st update +UPDATE uao_unique_index_update SET a = 3 WHERE a = 2; +COMMIT; +SELECT * FROM uao_unique_index_update; + +DROP TABLE uao_unique_index_update; + +-- Case 5: Updating tx updates a key inserted in the same tx-------------------- +CREATE TABLE uao_unique_index_update (a INT unique); +BEGIN; +INSERT INTO uao_unique_index_update VALUES (1); +-- should be able to see and update the inserted key. +UPDATE uao_unique_index_update SET a = 2 WHERE a = 1; +COMMIT; +SELECT * FROM uao_unique_index_update; + +-- Case 6: Updating tx updates a key to a key inserted in the same tx----------- +CREATE TABLE uao_unique_index_update (a INT unique); +INSERT INTO uao_unique_index_update VALUES (1); +BEGIN; +INSERT INTO uao_unique_index_update VALUES (2); +-- should raise a conflict with the key inserted inside the same tx. 
+UPDATE uao_unique_index_update SET a = 2 WHERE a = 1; +END; +SELECT * FROM uao_unique_index_update; + +DROP TABLE uao_unique_index_update; + +-- Case 7: Updating tx tries to update a key absent from the table-------------- +CREATE TABLE uao_unique_index_update (a INT unique); +-- should be a no-op. +UPDATE uao_unique_index_update SET a = 2 WHERE a = 1; +INSERT INTO uao_unique_index_update VALUES (1); +-- should be a no-op. +UPDATE uao_unique_index_update SET a = 3 WHERE a = 2; +SELECT * FROM uao_unique_index_update; + +DROP TABLE uao_unique_index_update; + +-- Case 8: Update where pre-update key = post-update key------------------------ +CREATE TABLE uao_unique_index_update (a INT unique); +INSERT INTO uao_unique_index_update VALUES (1); +-- should succeed even though the pre-update and post-update values are equal. +UPDATE uao_unique_index_update SET a = 1 WHERE a = 1; +SELECT * FROM uao_unique_index_update; + +DROP TABLE uao_unique_index_update; + +-- Case 9: Updating tx inserts a key that already exists------------------------ +CREATE TABLE uao_unique_index_update (a INT unique); +INSERT INTO uao_unique_index_update VALUES (1), (2); +-- should raise a conflict as the target value already exists. +UPDATE uao_unique_index_update SET a = 2 WHERE a = 1; +SELECT * FROM uao_unique_index_update; + +DROP TABLE uao_unique_index_update; + +-- Case 10: Updating command tries to insert the same key more than once-------- +CREATE TABLE uao_unique_index_update (a INT unique); +INSERT INTO uao_unique_index_update SELECT generate_series(1,5); +-- should raise a conflict as the second update will try to insert 6, which was +-- already inserted by the first update. 
+UPDATE uao_unique_index_update SET a=6 WHERE a>2; + +DROP TABLE uao_unique_index_update; + +-- Case 11: Updating tx updates a key inserted in the same subtx---------------- +CREATE TABLE uao_unique_index_update (a INT unique); +BEGIN; +SAVEPOINT a; +INSERT INTO uao_unique_index_update VALUES(1); +UPDATE uao_unique_index_update SET a=2 WHERE a=1; +COMMIT; +SELECT * FROM uao_unique_index_update; + +DROP TABLE uao_unique_index_update; + +-- Case 12: Updating tx updates a key updated in the same subtx----------------- +CREATE TABLE uao_unique_index_update (a INT unique); +BEGIN; +SAVEPOINT a; +INSERT INTO uao_unique_index_update VALUES(1); +UPDATE uao_unique_index_update SET a=2 WHERE a=1; +-- should be a no-op +UPDATE uao_unique_index_update SET a=2 WHERE a=1; +-- should succeed +UPDATE uao_unique_index_update SET a=3 WHERE a=2; +COMMIT; +SELECT * FROM uao_unique_index_update; + +DROP TABLE uao_unique_index_update; + +-- Case 13: Updating tx updates a key whose earlier update was rolled back------ +CREATE TABLE uao_unique_index_update (a INT unique); +INSERT INTO uao_unique_index_update VALUES(1); +BEGIN; +SAVEPOINT a; +UPDATE uao_unique_index_update SET a=2 WHERE a=1; +ROLLBACK TO SAVEPOINT a; +-- should be able to run the update again as we have rolled back. +UPDATE uao_unique_index_update SET a=2 WHERE a=1; +COMMIT; +SELECT * FROM uao_unique_index_update; + +DROP TABLE uao_unique_index_update; + +-- Case 14: Updating tx updates a key to a key inserted in the same subtx------- +CREATE TABLE uao_unique_index_update (a INT unique); +INSERT INTO uao_unique_index_update VALUES (1); +BEGIN; +SAVEPOINT a; +INSERT INTO uao_unique_index_update VALUES (2); +-- should raise a conflict with the key inserted inside the same subtx. +UPDATE uao_unique_index_update SET a = 2 WHERE a = 1; +ROLLBACK TO SAVEPOINT a; +-- should be able to run the update again as we have rolled back. 
+UPDATE uao_unique_index_update SET a = 2 WHERE a = 1; +COMMIT; +SELECT * FROM uao_unique_index_update; + +DROP TABLE uao_unique_index_update; diff --git a/src/test/regress/output/uao_ddl/alter_ao_table_index.source b/src/test/regress/output/uao_ddl/alter_ao_table_index.source index 92c3731b7fb..761b95176ff 100644 --- a/src/test/regress/output/uao_ddl/alter_ao_table_index.source +++ b/src/test/regress/output/uao_ddl/alter_ao_table_index.source @@ -111,6 +111,3 @@ select relfrozenxid from gp_dist_random('pg_class') c, pg_namespace n where 0 (3 rows) --- Verify that unique index is not allowed -CREATE UNIQUE INDEX uni_index ON sto_alt_uao3_idx (text_col); -ERROR: append-only tables do not support unique indexes diff --git a/src/test/regress/output/uao_dml/ao_unique_index_build.source b/src/test/regress/output/uao_dml/ao_unique_index_build.source new file mode 100644 index 00000000000..734d4903c0f --- /dev/null +++ b/src/test/regress/output/uao_dml/ao_unique_index_build.source @@ -0,0 +1,152 @@ +-- Test cases to cover CREATE UNIQUE INDEX on AO/CO tables. +SET default_table_access_method TO @amname@; +-- To force index scans for smoke tests +SET enable_seqscan TO off; +SET optimizer TO off; +-- Case 1: Build with no conflicting rows. 
+CREATE TABLE unique_index_build_@amname@(i int) DISTRIBUTED REPLICATED; +INSERT INTO unique_index_build_@amname@ VALUES(1); +-- should succeed +CREATE UNIQUE INDEX on unique_index_build_@amname@(i); +-- post-build smoke test +EXPLAIN SELECT * FROM unique_index_build_@amname@ WHERE i = 1; + QUERY PLAN +---------------------------------------------------------------------------------------------------- + Gather Motion 1:1 (slice1; segments: 1) (cost=8.19..8.19 rows=1 width=4) + -> Bitmap Heap Scan on unique_index_build_@amname@ (cost=4.18..8.19 rows=1 width=4) + Recheck Cond: (i = 1) + -> Bitmap Index Scan on unique_index_build_@amname@_i_idx (cost=0.00..4.18 rows=1 width=0) + Index Cond: (i = 1) + Optimizer: Postgres query optimizer +(6 rows) + +SELECT * FROM unique_index_build_@amname@ WHERE i = 1; + i +--- + 1 +(1 row) + +INSERT INTO unique_index_build_@amname@ VALUES(1); +ERROR: duplicate key value violates unique constraint "unique_index_build_@amname@_i_idx" (seg0 192.168.0.148:7002 pid=1421591) +DETAIL: Key (i)=(1) already exists. +DROP TABLE unique_index_build_@amname@; +-- Case 2: Build with conflicting row. +CREATE TABLE unique_index_build_@amname@(i int) DISTRIBUTED REPLICATED; +INSERT INTO unique_index_build_@amname@ VALUES(1); +INSERT INTO unique_index_build_@amname@ VALUES(1); +-- should ERROR out +CREATE UNIQUE INDEX on unique_index_build_@amname@(i); +ERROR: could not create unique index "unique_index_build_@amname@_i_idx" (seg0 192.168.0.148:7002 pid=1421591) +DETAIL: Key (i)=(1) is duplicated. +DROP TABLE unique_index_build_@amname@; +-- Case 3: Build with conflict on aborted row. 
+CREATE TABLE unique_index_build_@amname@(i int) DISTRIBUTED REPLICATED; +BEGIN; +INSERT INTO unique_index_build_@amname@ VALUES(1); +ABORT; +INSERT INTO unique_index_build_@amname@ VALUES(1); +-- should succeed +CREATE UNIQUE INDEX on unique_index_build_@amname@(i); +-- post-build smoke test +EXPLAIN SELECT * FROM unique_index_build_@amname@ WHERE i = 1; + QUERY PLAN +---------------------------------------------------------------------------------------------------- + Gather Motion 1:1 (slice1; segments: 1) (cost=8.19..8.19 rows=1 width=4) + -> Bitmap Heap Scan on unique_index_build_@amname@ (cost=4.18..8.19 rows=1 width=4) + Recheck Cond: (i = 1) + -> Bitmap Index Scan on unique_index_build_@amname@_i_idx (cost=0.00..4.18 rows=1 width=0) + Index Cond: (i = 1) + Optimizer: Postgres query optimizer +(6 rows) + +SELECT * FROM unique_index_build_@amname@ WHERE i = 1; + i +--- + 1 +(1 row) + +INSERT INTO unique_index_build_@amname@ VALUES(1); +ERROR: duplicate key value violates unique constraint "unique_index_build_@amname@_i_idx" (seg0 192.168.0.148:7002 pid=1421591) +DETAIL: Key (i)=(1) already exists. +DROP TABLE unique_index_build_@amname@; +-- Case 4: Build with conflict on deleted row. 
+CREATE TABLE unique_index_build_@amname@(i int) DISTRIBUTED REPLICATED; +INSERT INTO unique_index_build_@amname@ VALUES(1); +DELETE FROM unique_index_build_@amname@; +INSERT INTO unique_index_build_@amname@ VALUES(1); +-- should succeed +CREATE UNIQUE INDEX on unique_index_build_@amname@(i); +-- post-build smoke test +EXPLAIN SELECT * FROM unique_index_build_@amname@ WHERE i = 1; + QUERY PLAN +---------------------------------------------------------------------------------------------------- + Gather Motion 1:1 (slice1; segments: 1) (cost=8.19..8.19 rows=1 width=4) + -> Bitmap Heap Scan on unique_index_build_@amname@ (cost=4.18..8.19 rows=1 width=4) + Recheck Cond: (i = 1) + -> Bitmap Index Scan on unique_index_build_@amname@_i_idx (cost=0.00..4.18 rows=1 width=0) + Index Cond: (i = 1) + Optimizer: Postgres query optimizer +(6 rows) + +SELECT * FROM unique_index_build_@amname@ WHERE i = 1; + i +--- + 1 +(1 row) + +INSERT INTO unique_index_build_@amname@ VALUES(1); +ERROR: duplicate key value violates unique constraint "unique_index_build_@amname@_i_idx" (seg0 192.168.0.148:7002 pid=1421591) +DETAIL: Key (i)=(1) already exists. +DROP TABLE unique_index_build_@amname@; +-- Case 5: Partial unique index build +CREATE TABLE unique_index_build_@amname@(i int) DISTRIBUTED REPLICATED; +INSERT INTO unique_index_build_@amname@ VALUES(1); +INSERT INTO unique_index_build_@amname@ VALUES(1); +INSERT INTO unique_index_build_@amname@ VALUES(2); +INSERT INTO unique_index_build_@amname@ VALUES(6); +INSERT INTO unique_index_build_@amname@ VALUES(6); +-- should fail as conflict lies in indexed portion of data +CREATE UNIQUE INDEX on unique_index_build_@amname@(i) WHERE i < 5; +ERROR: could not create unique index "unique_index_build_@amname@_i_idx" (seg0 192.168.0.148:7002 pid=3690142) +DETAIL: Key (i)=(1) is duplicated. 
+-- removing conflict should make index build succeed +DELETE FROM unique_index_build_@amname@ WHERE i = 1; +CREATE UNIQUE INDEX on unique_index_build_@amname@(i) WHERE i < 5; +-- post build smoke tests: +-- should succeed as it lies in non-indexed portion +INSERT INTO unique_index_build_@amname@ VALUES(6); +-- should fail as conflict lies in indexed portion of data +INSERT INTO unique_index_build_@amname@ VALUES(2); +ERROR: duplicate key value violates unique constraint "unique_index_build_@amname@_i_idx" (seg1 192.168.0.148:7003 pid=3690143) +DETAIL: Key (i)=(2) already exists. +-- should succeed as there is no conflicting key that exists +INSERT INTO unique_index_build_@amname@ VALUES(3); +SELECT * FROM unique_index_build_@amname@ WHERE i = 1; + i +--- +(0 rows) + +SELECT * FROM unique_index_build_@amname@ WHERE i = 2; + i +--- + 2 +(1 row) + +SELECT * FROM unique_index_build_@amname@ WHERE i = 3; + i +--- + 3 +(1 row) + +SELECT * FROM unique_index_build_@amname@ WHERE i = 6; + i +--- + 6 + 6 + 6 +(3 rows) + +DROP TABLE unique_index_build_@amname@; +RESET default_table_access_method; +RESET enable_seqscan; +RESET optimizer; diff --git a/src/test/regress/output/uao_dml/uao_dml.source b/src/test/regress/output/uao_dml/uao_dml.source index 9ecf578eb83..f07b1b8de1e 100644 --- a/src/test/regress/output/uao_dml/uao_dml.source +++ b/src/test/regress/output/uao_dml/uao_dml.source @@ -939,12 +939,18 @@ SELECT reltuples FROM pg_class WHERE relname = 'mytab_@amname@'; 2 (1 row) +-- New strategy of VACUUM AO/CO was introduced by PR #13255 for performance enhancement. +-- Index dead tuples will not always be cleaned up completely after VACUUM, resulting +-- index stats pg_class->reltuples will not always be accurate. So ignore the stats check +-- for reltuples to coordinate with the new behavior. 
+-- start_ignore SELECT reltuples FROM pg_class WHERE relname = 'mytab_int_idx1_@amname@'; reltuples ----------- 2 (1 row) +-- end_ignore -- @Description Checks that deleting works with many AO blocks. -- DROP TABLE IF EXISTS foo; diff --git a/src/test/regress/output/uao_dml/uao_dml_unique_index_delete.source b/src/test/regress/output/uao_dml/uao_dml_unique_index_delete.source new file mode 100644 index 00000000000..3d9ce087e0b --- /dev/null +++ b/src/test/regress/output/uao_dml/uao_dml_unique_index_delete.source @@ -0,0 +1,136 @@ +create schema uao_dml_unique_index_@amname@; +set search_path=uao_dml_unique_index_@amname@; +set default_table_access_method=@amname@; +-- Case 1: Inserting tx inserting a deleted key--------------------------------- +CREATE TABLE uao_unique_index_delete (a INT unique); +INSERT INTO uao_unique_index_delete VALUES (1); +DELETE FROM uao_unique_index_delete WHERE a = 1; +-- Should not raise a conflict as the key has been deleted. +INSERT INTO uao_unique_index_delete VALUES (1); +SELECT * FROM uao_unique_index_delete; + a +--- + 1 +(1 row) + +DROP TABLE uao_unique_index_delete; +-- Case 2: Inserting tx inserting a key whose delete was aborted---------------- +CREATE TABLE uao_unique_index_delete (a INT unique); +INSERT INTO uao_unique_index_delete VALUES (1); +BEGIN; +DELETE FROM uao_unique_index_delete WHERE a = 1; +ABORT; +-- Should raise a conflict as the delete of the key was aborted. +INSERT INTO uao_unique_index_delete VALUES (1); +ERROR: duplicate key value violates unique constraint "uao_unique_index_delete_a_key" (seg1 192.168.0.148:7003 pid=1637278) +DETAIL: Key (a)=(1) already exists. 
+SELECT * FROM uao_unique_index_delete; + a +--- + 1 +(1 row) + +DROP TABLE uao_unique_index_delete; +-- Case 3: Inserting tx inserting a key deleted in the same tx------------------ +CREATE TABLE uao_unique_index_delete (a INT unique); +INSERT INTO uao_unique_index_delete VALUES (1); +BEGIN; +DELETE FROM uao_unique_index_delete WHERE a = 1; +-- should succeed as the INSERT should see that the conflicting key was deleted. +INSERT INTO uao_unique_index_delete VALUES (1); +COMMIT; +SELECT * FROM uao_unique_index_delete; + a +--- + 1 +(1 row) + +DROP TABLE uao_unique_index_delete; +-- Case 4: Deleting tx deletes a key already deleted in the same tx------------- +CREATE TABLE uao_unique_index_delete (a INT unique); +INSERT INTO uao_unique_index_delete VALUES (1); +BEGIN; +DELETE FROM uao_unique_index_delete WHERE a = 1; +-- should be a no-op as it should see the prior DELETE. +DELETE FROM uao_unique_index_delete WHERE a = 1; +COMMIT; +SELECT * FROM uao_unique_index_delete; + a +--- +(0 rows) + +DROP TABLE uao_unique_index_delete; +-- Case 5: Deleting tx deletes a key inserted in the same tx-------------------- +CREATE TABLE uao_unique_index_delete (a INT unique); +BEGIN; +INSERT INTO uao_unique_index_delete VALUES (1); +-- should be able to see and delete the inserted key. +DELETE FROM uao_unique_index_delete WHERE a = 1; +COMMIT; +SELECT * FROM uao_unique_index_delete; + a +--- +(0 rows) + +DROP TABLE uao_unique_index_delete; +-- Case 6: Deleting tx deletes a key absent from the table---------------------- +CREATE TABLE uao_unique_index_delete (a INT unique); +-- should be a no-op. +DELETE FROM uao_unique_index_delete WHERE a = 1; +INSERT INTO uao_unique_index_delete VALUES (1); +-- should be a no-op. 
+DELETE FROM uao_unique_index_delete WHERE a = 2; +SELECT * FROM uao_unique_index_delete; + a +--- + 1 +(1 row) + +DROP TABLE uao_unique_index_delete; +-- Case 7: Deleting tx deletes a key inserted in the same subtx----------------- +CREATE TABLE uao_unique_index_delete (a INT unique); +BEGIN; +SAVEPOINT a; +INSERT INTO uao_unique_index_delete VALUES (1); +DELETE FROM uao_unique_index_delete WHERE a = 1; +COMMIT; +SELECT * FROM uao_unique_index_delete; + a +--- +(0 rows) + +DROP TABLE uao_unique_index_delete; +-- Case 8: Deleting tx deletes a key deleted in the same subtx------------------ +CREATE TABLE uao_unique_index_delete (a INT unique); +BEGIN; +SAVEPOINT a; +INSERT INTO uao_unique_index_delete VALUES (1); +DELETE FROM uao_unique_index_delete WHERE a = 1; +-- should be a no-op +DELETE FROM uao_unique_index_delete WHERE a = 1; +COMMIT; +SELECT * FROM uao_unique_index_delete; + a +--- +(0 rows) + +DROP TABLE uao_unique_index_delete; +-- Case 9: Deleting tx deletes a key whose earlier delete was rolled back------- +CREATE TABLE uao_unique_index_delete (a INT unique); +INSERT INTO uao_unique_index_delete VALUES (1); +BEGIN; +SAVEPOINT a; +DELETE FROM uao_unique_index_delete WHERE a = 1; +ROLLBACK TO SAVEPOINT a; +-- should be able to delete it again. 
+DELETE FROM uao_unique_index_delete WHERE a = 1; +-- should be able to insert it now since it has been deleted +INSERT INTO uao_unique_index_delete VALUES (1); +COMMIT; +SELECT * FROM uao_unique_index_delete; + a +--- + 1 +(1 row) + +DROP TABLE uao_unique_index_delete; diff --git a/src/test/regress/output/uao_dml/uao_dml_unique_index_update.source b/src/test/regress/output/uao_dml/uao_dml_unique_index_update.source new file mode 100644 index 00000000000..62000c723bf --- /dev/null +++ b/src/test/regress/output/uao_dml/uao_dml_unique_index_update.source @@ -0,0 +1,230 @@ +create schema uao_dml_unique_index_update_@amname@; +set search_path=uao_dml_unique_index_update_@amname@; +set default_table_access_method=@amname@; +-- Case 1: Inserting tx inserting a key affected by an update-------------------- +CREATE TABLE uao_unique_index_update (a INT unique); +INSERT INTO uao_unique_index_update VALUES (1); +UPDATE uao_unique_index_update SET a = 2 WHERE a = 1; +-- Should not raise a conflict as the key has been deleted by the update. +INSERT INTO uao_unique_index_update VALUES (1); +-- Should raise a conflict as the key has been inserted by the update +INSERT INTO uao_unique_index_update VALUES (2); +ERROR: duplicate key value violates unique constraint "uao_unique_index_update_a_key" (seg0 192.168.0.148:7002 pid=1927811) +DETAIL: Key (a)=(2) already exists. +SELECT * FROM uao_unique_index_update; + a +--- + 2 + 1 +(2 rows) + +DROP TABLE uao_unique_index_update; +-- Case 2: Inserting tx inserting a key whose update was aborted---------------- +CREATE TABLE uao_unique_index_update (a INT unique); +INSERT INTO uao_unique_index_update VALUES (1); +BEGIN; +UPDATE uao_unique_index_update SET a = 2 WHERE a = 1; +ABORT; +-- Should raise a conflict as the update of the key was aborted. 
+INSERT INTO uao_unique_index_update VALUES (1); +ERROR: duplicate key value violates unique constraint "uao_unique_index_update_a_key" (seg1 192.168.0.148:7003 pid=1927812) +DETAIL: Key (a)=(1) already exists. +-- Should not raise a conflict as the update of the key was aborted. +INSERT INTO uao_unique_index_update VALUES (2); +SELECT * FROM uao_unique_index_update; + a +--- + 2 + 1 +(2 rows) + +DROP TABLE uao_unique_index_update; +-- Case 3: Inserting tx inserting a key updated in the same tx------------------ +CREATE TABLE uao_unique_index_update (a INT unique); +INSERT INTO uao_unique_index_update VALUES (1); +BEGIN; +UPDATE uao_unique_index_update SET a = 2 WHERE a = 1; +-- should succeed as the conflicting key was deleted by the update. +INSERT INTO uao_unique_index_update VALUES (1); +-- should raise a conflict as the key was inserted by the update. +INSERT INTO uao_unique_index_update VALUES (2); +ERROR: duplicate key value violates unique constraint "uao_unique_index_update_a_key" (seg0 192.168.0.148:7002 pid=1927811) +DETAIL: Key (a)=(2) already exists. 
+COMMIT; +SELECT * FROM uao_unique_index_update; + a +--- + 1 +(1 row) + +DROP TABLE uao_unique_index_update; +-- Case 4: Updating tx deletes a key already updated in the same tx------------- +CREATE TABLE uao_unique_index_update (a INT unique); +INSERT INTO uao_unique_index_update VALUES (1); +BEGIN; +UPDATE uao_unique_index_update SET a = 2 WHERE a = 1; +-- should be a no-op as the key has already been deleted by the update +UPDATE uao_unique_index_update SET a = 2 WHERE a = 1; +-- should succeed as the key has been inserted by the 1st update +UPDATE uao_unique_index_update SET a = 3 WHERE a = 2; +COMMIT; +SELECT * FROM uao_unique_index_update; + a +--- + 3 +(1 row) + +DROP TABLE uao_unique_index_update; +-- Case 5: Updating tx updates a key inserted in the same tx-------------------- +CREATE TABLE uao_unique_index_update (a INT unique); +BEGIN; +INSERT INTO uao_unique_index_update VALUES (1); +-- should be able to see and update the inserted key. +UPDATE uao_unique_index_update SET a = 2 WHERE a = 1; +COMMIT; +SELECT * FROM uao_unique_index_update; + a +--- + 2 +(1 row) + +-- Case 6: Updating tx updates a key to a key inserted in the same tx----------- +CREATE TABLE uao_unique_index_update (a INT unique); +ERROR: relation "uao_unique_index_update" already exists +INSERT INTO uao_unique_index_update VALUES (1); +BEGIN; +INSERT INTO uao_unique_index_update VALUES (2); +ERROR: duplicate key value violates unique constraint "uao_unique_index_update_a_key" (seg0 192.168.0.148:7002 pid=1927811) +DETAIL: Key (a)=(2) already exists. +-- should raise a conflict with the key inserted inside the same tx. 
+UPDATE uao_unique_index_update SET a = 2 WHERE a = 1; +ERROR: current transaction is aborted, commands ignored until end of transaction block +END; +SELECT * FROM uao_unique_index_update; + a +--- + 2 + 1 +(2 rows) + +DROP TABLE uao_unique_index_update; +-- Case 7: Updating tx tries to update a key absent from the table-------------- +CREATE TABLE uao_unique_index_update (a INT unique); +-- should be a no-op. +UPDATE uao_unique_index_update SET a = 2 WHERE a = 1; +INSERT INTO uao_unique_index_update VALUES (1); +-- should be a no-op. +UPDATE uao_unique_index_update SET a = 3 WHERE a = 2; +SELECT * FROM uao_unique_index_update; + a +--- + 1 +(1 row) + +DROP TABLE uao_unique_index_update; +-- Case 8: Update where pre-update key = post-update key------------------------ +CREATE TABLE uao_unique_index_update (a INT unique); +INSERT INTO uao_unique_index_update VALUES (1); +-- should succeed even though the pre-update and post-update values are equal. +UPDATE uao_unique_index_update SET a = 1 WHERE a = 1; +SELECT * FROM uao_unique_index_update; + a +--- + 1 +(1 row) + +DROP TABLE uao_unique_index_update; +-- Case 9: Updating tx inserts a key that already exists------------------------ +CREATE TABLE uao_unique_index_update (a INT unique); +INSERT INTO uao_unique_index_update VALUES (1), (2); +-- should raise a conflict as the target value already exists. +UPDATE uao_unique_index_update SET a = 2 WHERE a = 1; +ERROR: duplicate key value violates unique constraint "uao_unique_index_update_a_key" (seg0 192.168.0.148:7002 pid=1929685) +DETAIL: Key (a)=(2) already exists. 
+SELECT * FROM uao_unique_index_update; + a +--- + 2 + 1 +(2 rows) + +DROP TABLE uao_unique_index_update; +-- Case 10: Updating command tries to insert the same key more than once-------- +CREATE TABLE uao_unique_index_update (a INT unique); +INSERT INTO uao_unique_index_update SELECT generate_series(1,5); +-- should raise a conflict as the second update will try to insert 6, which was +-- already inserted by the first update. +UPDATE uao_unique_index_update SET a=6 WHERE a>2; +ERROR: duplicate key value violates unique constraint "uao_unique_index_update_a_key" (seg2 192.168.0.148:7004 pid=1669359) +DETAIL: Key (a)=(6) already exists. +DROP TABLE uao_unique_index_update; +-- Case 11: Updating tx updates a key inserted in the same subtx---------------- +CREATE TABLE uao_unique_index_update (a INT unique); +BEGIN; +SAVEPOINT a; +INSERT INTO uao_unique_index_update VALUES(1); +UPDATE uao_unique_index_update SET a=2 WHERE a=1; +COMMIT; +SELECT * FROM uao_unique_index_update; + a +--- + 2 +(1 row) + +DROP TABLE uao_unique_index_update; +-- Case 12: Updating tx updates a key updated in the same subtx----------------- +CREATE TABLE uao_unique_index_update (a INT unique); +BEGIN; +SAVEPOINT a; +INSERT INTO uao_unique_index_update VALUES(1); +UPDATE uao_unique_index_update SET a=2 WHERE a=1; +-- should be a no-op +UPDATE uao_unique_index_update SET a=2 WHERE a=1; +-- should succeed +UPDATE uao_unique_index_update SET a=3 WHERE a=2; +COMMIT; +SELECT * FROM uao_unique_index_update; + a +--- + 3 +(1 row) + +DROP TABLE uao_unique_index_update; +-- Case 13: Updating tx updates a key whose earlier update was rolled back------ +CREATE TABLE uao_unique_index_update (a INT unique); +INSERT INTO uao_unique_index_update VALUES(1); +BEGIN; +SAVEPOINT a; +UPDATE uao_unique_index_update SET a=2 WHERE a=1; +ROLLBACK TO SAVEPOINT a; +-- should be able to run the update again as we have rolled back. 
+UPDATE uao_unique_index_update SET a=2 WHERE a=1; +COMMIT; +SELECT * FROM uao_unique_index_update; + a +--- + 2 +(1 row) + +DROP TABLE uao_unique_index_update; +-- Case 14: Updating tx updates a key to a key inserted in the same subtx------- +CREATE TABLE uao_unique_index_update (a INT unique); +INSERT INTO uao_unique_index_update VALUES (1); +BEGIN; +SAVEPOINT a; +INSERT INTO uao_unique_index_update VALUES (2); +-- should raise a conflict with the key inserted inside the same subtx. +UPDATE uao_unique_index_update SET a = 2 WHERE a = 1; +ERROR: duplicate key value violates unique constraint "uao_unique_index_update_a_key" (seg0 192.168.0.148:7002 pid=3411438) +DETAIL: Key (a)=(2) already exists. +ROLLBACK TO SAVEPOINT a; +-- should be able to run the update again as we have rolled back. +UPDATE uao_unique_index_update SET a = 2 WHERE a = 1; +COMMIT; +SELECT * FROM uao_unique_index_update; + a +--- + 2 +(1 row) + +DROP TABLE uao_unique_index_update; diff --git a/src/test/regress/sql/alter_table_aocs.sql b/src/test/regress/sql/alter_table_aocs.sql index 3a16ca3fa0d..89cff756f5f 100644 --- a/src/test/regress/sql/alter_table_aocs.sql +++ b/src/test/regress/sql/alter_table_aocs.sql @@ -308,9 +308,6 @@ select attstattarget from pg_attribute where attrelid = 'aocs_addcol.addcol1'::r alter table addcol1 set distributed randomly; alter table addcol1 set distributed by (a); --- test some constraints (unique indexes do not work for unique and pkey) -alter table addcol1 add constraint tunique unique(a); -alter table addcol1 add constraint tpkey primary key(a); alter table addcol1 add constraint tcheck check (a is not null); -- test changing the storage type of a column diff --git a/src/test/regress/sql/brin_ao.sql b/src/test/regress/sql/brin_ao.sql index 68057a1fdd9..0c624a08276 100644 --- a/src/test/regress/sql/brin_ao.sql +++ b/src/test/regress/sql/brin_ao.sql @@ -464,4 +464,10 @@ SELECT segment_id, segno, tupcount, state FROM gp_toolkit.__gp_aoseg('brintest_a -- Tests for 
brin_summarize_new_values SELECT brin_summarize_new_values('brintest_ao'); -- error, not an index SELECT brin_summarize_new_values('tenk1_unique1'); -- error, not a BRIN index +-- New strategy of VACUUM AO/CO was introduced by PR #13255 for performance enhancement. +-- Index dead tuples will not always be cleaned up completely after VACUUM, resulting +-- brin_summarize_new_values() will not always be accurate. So ignore the check to +-- coordinate with the new behavior. +-- start_ignore SELECT brin_summarize_new_values('brinaoidx'); -- ok, no change expected +-- end_ignore diff --git a/src/test/regress/sql/brin_aocs.sql b/src/test/regress/sql/brin_aocs.sql index 8e9d4c704b5..ff182c0aaf3 100644 --- a/src/test/regress/sql/brin_aocs.sql +++ b/src/test/regress/sql/brin_aocs.sql @@ -464,4 +464,10 @@ SELECT segment_id, segno, tupcount, state FROM gp_toolkit.__gp_aocsseg('brintest -- Tests for brin_summarize_new_values SELECT brin_summarize_new_values('brintest_aocs'); -- error, not an index SELECT brin_summarize_new_values('tenk1_unique1'); -- error, not a BRIN index +-- New strategy of VACUUM AO/CO was introduced by PR #13255 for performance enhancement. +-- Index dead tuples will not always be cleaned up completely after VACUUM, resulting +-- brin_summarize_new_values() will not always be accurate. So ignore the check to +-- coordinate with the new behavior. 
+-- start_ignore SELECT brin_summarize_new_values('brinaocsidx'); -- ok, no change expected +-- end_ignore diff --git a/src/test/regress/sql/gp_toolkit_ao_funcs.sql b/src/test/regress/sql/gp_toolkit_ao_funcs.sql index 545f575dfa3..0df9943ef45 100644 --- a/src/test/regress/sql/gp_toolkit_ao_funcs.sql +++ b/src/test/regress/sql/gp_toolkit_ao_funcs.sql @@ -12,6 +12,7 @@ DROP TABLE IF EXISTS toolkit_ao_test; CREATE TABLE toolkit_ao_test (a INT, b INT, c INT) WITH (appendonly=true) DISTRIBUTED BY (c); +CREATE INDEX ON toolkit_ao_test(a); INSERT INTO toolkit_ao_test SELECT i as a, i as b, 1 FROM generate_series(1,20) AS i; UPDATE toolkit_ao_test SET b = 0 WHERE a = 1; DELETE FROM toolkit_ao_test WHERE a = 2; @@ -19,6 +20,7 @@ DELETE FROM toolkit_ao_test WHERE a = 2; DROP TABLE IF EXISTS toolkit_aocs_test; CREATE TABLE toolkit_aocs_test (a INT, b INT, C INT) WITH (appendonly=true, orientation=column) DISTRIBUTED BY (c); +CREATE INDEX ON toolkit_aocs_test(a); INSERT INTO toolkit_aocs_test SELECT i as a, i as b FROM generate_series(1,20) AS i; UPDATE toolkit_aocs_test SET b = 0 WHERE a = 1; DELETE FROM toolkit_aocs_test WHERE a = 2; @@ -32,6 +34,8 @@ SELECT * FROM gp_toolkit.__gp_aovisimap('toolkit_ao_test'); SELECT count(*) FROM gp_toolkit.__gp_aovisimap_hidden_info('toolkit_ao_test'); SELECT * FROM gp_toolkit.__gp_aovisimap_entry('toolkit_ao_test'); SELECT count(*) FROM gp_toolkit.__gp_aoseg('toolkit_ao_test'); +SELECT * FROM gp_toolkit.__gp_aoblkdir('toolkit_ao_test'); +SELECT * FROM gp_toolkit.__gp_aoblkdir('toolkit_aocs_test'); -- The same, but on the segments. 
SELECT (t).* FROM ( @@ -40,3 +44,9 @@ SELECT (t).* FROM ( SELECT (t).segno, (t).first_row_num, (t).hidden_tupcount >= 1 as hidden_tupcount_nonzero, (t).bitmap like '01%' as bitmap_starts_with_01 FROM ( SELECT gp_toolkit.__gp_aovisimap_entry('toolkit_ao_test') AS t FROM gp_dist_random('gp_id') ) AS x; +SELECT (t).* FROM ( + SELECT gp_toolkit.__gp_aoblkdir('toolkit_ao_test') AS t FROM gp_dist_random('gp_id') +) AS x; +SELECT (t).* FROM ( + SELECT gp_toolkit.__gp_aoblkdir('toolkit_aocs_test') AS t FROM gp_dist_random('gp_id') +) AS x; diff --git a/src/test/regress/sql/qp_with_clause.sql b/src/test/regress/sql/qp_with_clause.sql index b2eea635d68..ad018427f55 100644 --- a/src/test/regress/sql/qp_with_clause.sql +++ b/src/test/regress/sql/qp_with_clause.sql @@ -8133,16 +8133,6 @@ CREATE TABLE countrylanguage_ao ( percentage real NOT NULL ) with (appendonly=true) distributed by (countrycode,language); -ALTER TABLE ONLY city_ao - ADD CONSTRAINT city_ao_pkey PRIMARY KEY (id); - -ALTER TABLE ONLY country_ao - ADD CONSTRAINT country_ao_pkey PRIMARY KEY (code); - -ALTER TABLE ONLY countrylanguage_ao - ADD CONSTRAINT countrylanguage_ao_pkey PRIMARY KEY (countrycode, "language"); - - create index bitmap_city_ao_countrycode on city_ao using bitmap(countrycode); create index bitmap_country_ao_gf on country_ao using bitmap(governmentform); create index bitmap_country_ao_region on country_ao using bitmap(region); @@ -8620,16 +8610,6 @@ CREATE TABLE countrylanguage_co ( percentage real NOT NULL ) with (appendonly=true,orientation=column) distributed by (countrycode,language); -ALTER TABLE ONLY city_co - ADD CONSTRAINT city_co_pkey PRIMARY KEY (id); - -ALTER TABLE ONLY country_co - ADD CONSTRAINT country_co_pkey PRIMARY KEY (code); - -ALTER TABLE ONLY countrylanguage_co - ADD CONSTRAINT countrylanguage_co_pkey PRIMARY KEY (countrycode, "language"); - - create index bitmap_city_co_countrycode on city_co using bitmap(countrycode); create index bitmap_country_co_gf on country_co using 
bitmap(governmentform); create index bitmap_country_co_region on country_co using bitmap(region); diff --git a/src/test/regress/sql/uao_compaction/drop_column.sql b/src/test/regress/sql/uao_compaction/drop_column.sql index b86ef526af8..83e542be3c9 100644 --- a/src/test/regress/sql/uao_compaction/drop_column.sql +++ b/src/test/regress/sql/uao_compaction/drop_column.sql @@ -11,7 +11,13 @@ SELECT relname, reltuples FROM pg_class WHERE relname = 'uao_drop_col'; SELECT relname, reltuples FROM pg_class WHERE relname = 'uao_drop_col_index'; VACUUM uao_drop_col; SELECT relname, reltuples FROM pg_class WHERE relname = 'uao_drop_col'; +-- New strategy of VACUUM AO/CO was introduced by PR #13255 for performance enhancement. +-- Index dead tuples will not always be cleaned up completely after VACUUM, resulting +-- index stats pg_class->reltuples will not always be accurate. So ignore the stats check +-- for reltuples to coordinate with the new behavior. +-- start_ignore SELECT relname, reltuples FROM pg_class WHERE relname = 'uao_drop_col_index'; +-- end_ignore ALTER TABLE uao_drop_col DROP COLUMN c; SELECT * FROM uao_drop_col; INSERT INTO uao_drop_col VALUES (42, 42); diff --git a/src/test/regress/sql/uao_compaction/full_stats.sql b/src/test/regress/sql/uao_compaction/full_stats.sql index 5299ffd5e9a..f8361ea0a94 100644 --- a/src/test/regress/sql/uao_compaction/full_stats.sql +++ b/src/test/regress/sql/uao_compaction/full_stats.sql @@ -13,4 +13,10 @@ SELECT relname, reltuples FROM pg_class WHERE relname = 'uao_full_stats_index'; DELETE FROM uao_full_stats WHERE a < 16; VACUUM FULL uao_full_stats; SELECT relname, reltuples FROM pg_class WHERE relname = 'uao_full_stats'; +-- New strategy of VACUUM AO/CO was introduced by PR #13255 for performance enhancement. +-- Index dead tuples will not always be cleaned up completely after VACUUM, resulting +-- index stats pg_class->reltuples will not always be accurate. 
So ignore the stats check +-- for reltuples to coordinate with the new behavior. +-- start_ignore SELECT relname, reltuples FROM pg_class WHERE relname = 'uao_full_stats_index'; +-- end_ignore diff --git a/src/test/regress/sql/uao_compaction/index_stats.sql b/src/test/regress/sql/uao_compaction/index_stats.sql index d8c3ea68538..05ad79953ef 100644 --- a/src/test/regress/sql/uao_compaction/index_stats.sql +++ b/src/test/regress/sql/uao_compaction/index_stats.sql @@ -16,4 +16,11 @@ update mytab set col_text=' new value' where col_int = 1; select * from mytab; vacuum mytab; SELECT relname, reltuples FROM pg_class WHERE relname = 'mytab'; + +-- New strategy of VACUUM AO/CO was introduced by PR #13255 for performance enhancement. +-- Index dead tuples will not always be cleaned up completely after VACUUM, resulting +-- index stats pg_class->reltuples will not always be accurate. So ignore the stats check +-- for reltuples to coordinate with the new behavior. +-- start_ignore SELECT relname, reltuples FROM pg_class WHERE relname = 'mytab_int_idx1'; +-- end_ignore diff --git a/src/test/regress/sql/uao_compaction/stats.sql b/src/test/regress/sql/uao_compaction/stats.sql index 8be71e52673..bd2c2ce8c0c 100644 --- a/src/test/regress/sql/uao_compaction/stats.sql +++ b/src/test/regress/sql/uao_compaction/stats.sql @@ -14,4 +14,11 @@ SELECT relname, reltuples FROM pg_class WHERE relname = 'uao_stats_index'; DELETE FROM uao_stats WHERE a < 16; VACUUM uao_stats; SELECT relname, reltuples FROM pg_class WHERE relname = 'uao_stats'; + +-- New strategy of VACUUM AO/CO was introduced by PR #13255 for performance enhancement. +-- Index dead tuples will not always be cleaned up completely after VACUUM, resulting +-- index stats pg_class->reltuples will not always be accurate. So ignore the stats check +-- for reltuples to coordinate with the new behavior. 
+-- start_ignore SELECT relname, reltuples FROM pg_class WHERE relname = 'uao_stats_index'; +-- end_ignore diff --git a/src/test/regress/sql/uaocs_compaction/drop_column.sql b/src/test/regress/sql/uaocs_compaction/drop_column.sql index 8b7240c05ad..3e66ab489f4 100644 --- a/src/test/regress/sql/uaocs_compaction/drop_column.sql +++ b/src/test/regress/sql/uaocs_compaction/drop_column.sql @@ -10,7 +10,13 @@ SELECT relname, reltuples FROM pg_class WHERE relname = 'uaocs_drop'; SELECT relname, reltuples FROM pg_class WHERE relname = 'uaocs_drop_index'; VACUUM uaocs_drop; SELECT relname, reltuples FROM pg_class WHERE relname = 'uaocs_drop'; +-- New strategy of VACUUM AO/CO was introduced by PR #13255 for performance enhancement. +-- Index dead tuples will not always be cleaned up completely after VACUUM, resulting +-- index stats pg_class->reltuples will not always be accurate. So ignore the stats check +-- for reltuples to coordinate with the new behavior. +-- start_ignore SELECT relname, reltuples FROM pg_class WHERE relname = 'uaocs_drop_index'; +-- end_ignore ALTER TABLE uaocs_drop DROP COLUMN c; SELECT * FROM uaocs_drop; INSERT INTO uaocs_drop VALUES (42, 42); diff --git a/src/test/regress/sql/uaocs_compaction/full_stats.sql b/src/test/regress/sql/uaocs_compaction/full_stats.sql index f50d1b25c56..92780646eac 100644 --- a/src/test/regress/sql/uaocs_compaction/full_stats.sql +++ b/src/test/regress/sql/uaocs_compaction/full_stats.sql @@ -15,4 +15,10 @@ SELECT COUNT(*) FROM uaocs_full_stats; VACUUM FULL uaocs_full_stats; SELECT COUNT(*) FROM uaocs_full_stats; SELECT relname, reltuples FROM pg_class WHERE relname = 'uaocs_full_stats'; +-- New strategy of VACUUM AO/CO was introduced by PR #13255 for performance enhancement. +-- Index dead tuples will not always be cleaned up completely after VACUUM, resulting +-- index stats pg_class->reltuples will not always be accurate. So ignore the stats check +-- for reltuples to coordinate with the new behavior. 
+-- start_ignore SELECT relname, reltuples FROM pg_class WHERE relname = 'uaocs_full_stats_index'; +-- end_ignore diff --git a/src/test/regress/sql/uaocs_compaction/index_stats.sql b/src/test/regress/sql/uaocs_compaction/index_stats.sql index 8ea8e30a00a..c93ee37f686 100644 --- a/src/test/regress/sql/uaocs_compaction/index_stats.sql +++ b/src/test/regress/sql/uaocs_compaction/index_stats.sql @@ -16,4 +16,11 @@ update uaocs_index_stats set col_text=' new value' where col_int = 1; select * from uaocs_index_stats; vacuum uaocs_index_stats; SELECT relname, reltuples FROM pg_class WHERE relname = 'uaocs_index_stats'; + +-- New strategy of VACUUM AO/CO was introduced by PR #13255 for performance enhancement. +-- Index dead tuples will not always be cleaned up completely after VACUUM, resulting +-- index stats pg_class->reltuples will not always be accurate. So ignore the stats check +-- for reltuples to coordinate with the new behavior. +-- start_ignore SELECT relname, reltuples FROM pg_class WHERE relname = 'uaocs_index_stats_int_idx1'; +-- end_ignore diff --git a/src/test/regress/sql/uaocs_compaction/stats.sql b/src/test/regress/sql/uaocs_compaction/stats.sql index c916429dd28..9e483641670 100644 --- a/src/test/regress/sql/uaocs_compaction/stats.sql +++ b/src/test/regress/sql/uaocs_compaction/stats.sql @@ -14,4 +14,11 @@ SELECT relname, reltuples FROM pg_class WHERE relname = 'uaocs_stats_index'; DELETE FROM uaocs_stats WHERE a < 16; VACUUM uaocs_stats; SELECT relname, reltuples FROM pg_class WHERE relname = 'uaocs_stats'; + +-- New strategy of VACUUM AO/CO was introduced by PR #13255 for performance enhancement. +-- Index dead tuples will not always be cleaned up completely after VACUUM, resulting +-- index stats pg_class->reltuples will not always be accurate. So ignore the stats check +-- for reltuples to coordinate with the new behavior. +-- start_ignore SELECT relname, reltuples FROM pg_class WHERE relname = 'uaocs_stats_index'; +-- end_ignore