diff --git a/src/VecSim/algorithms/brute_force/brute_force.h b/src/VecSim/algorithms/brute_force/brute_force.h index d47b6dd70..dca727392 100644 --- a/src/VecSim/algorithms/brute_force/brute_force.h +++ b/src/VecSim/algorithms/brute_force/brute_force.h @@ -57,13 +57,14 @@ class BruteForceIndex : public VecSimIndexAbstract { inline vecsim_stl::vector getVectorBlocks() const { return vectorBlocks; } inline const labelType getLabelByInternalId(idType internal_id) const { - return idToLabelMapping[internal_id]; + return idToLabelMapping.at(internal_id); } // Remove a specific vector that is stored under a label from the index by its internal id. virtual int deleteVectorById(labelType label, idType id) = 0; - // Remove a vector and return a map between internal ids and the original internal ids that they - // hold AFTER the removals and swaps. - virtual std::unordered_map deleteVectorAndGetUpdatedIds(labelType label) = 0; + // Remove a vector and return a map between internal ids and the original internal ids of the + // vector that they hold as a result of the overall removals and swaps, along with its label. + virtual std::unordered_map> + deleteVectorAndGetUpdatedIds(labelType label) = 0; // Check if a certain label exists in the index. virtual inline bool isLabelExists(labelType label) = 0; // Return a set of all labels that are stored in the index (helper for computing label count @@ -160,6 +161,7 @@ void BruteForceIndex::appendVector(const void *vector_data, size_t last_block_vectors_count = id % this->blockSize; this->idToLabelMapping.resize( idToLabelMapping_size + this->blockSize - last_block_vectors_count, 0); + this->idToLabelMapping.shrink_to_fit(); } // add label to idToLabelMapping @@ -210,10 +212,11 @@ void BruteForceIndex::removeVector(idType id_to_delete) { // Resize and align the idToLabelMapping. size_t idToLabel_size = idToLabelMapping.size(); // If the new size is smaller by at least one block comparing to the idToLabelMapping - // align to be a multiplication of blocksize and resize by one block. + // align to be a multiplication of block size and resize by one block. if (this->count + this->blockSize <= idToLabel_size) { size_t vector_to_align_count = idToLabel_size % this->blockSize; this->idToLabelMapping.resize(idToLabel_size - this->blockSize - vector_to_align_count); + this->idToLabelMapping.shrink_to_fit(); } } } diff --git a/src/VecSim/algorithms/brute_force/brute_force_multi.h b/src/VecSim/algorithms/brute_force/brute_force_multi.h index d2dc93edf..c2b515a89 100644 --- a/src/VecSim/algorithms/brute_force/brute_force_multi.h +++ b/src/VecSim/algorithms/brute_force/brute_force_multi.h @@ -34,7 +34,8 @@ class BruteForceIndex_Multi : public BruteForceIndex { return std::unique_ptr( new (this->allocator) vecsim_stl::unique_results_container(cap, this->allocator)); } - std::unordered_map deleteVectorAndGetUpdatedIds(labelType label) override; + std::unordered_map> + deleteVectorAndGetUpdatedIds(labelType label) override; #ifdef BUILD_TESTS void getDataByLabel(labelType label, std::vector> &vectors_output) const override { @@ -117,7 +118,7 @@ int BruteForceIndex_Multi::deleteVector(labelType label) { } template -std::unordered_map +std::unordered_map> BruteForceIndex_Multi::deleteVectorAndGetUpdatedIds(labelType label) { // Hold a mapping from ids that are removed and changed to the original ids that were swapped // into it. For example, if we have ids 0, 1, 2, 3, 4 and are about to remove ids 1, 3, 4, we @@ -125,7 +126,7 @@ BruteForceIndex_Multi::deleteVectorAndGetUpdatedIds(labelTyp // Explanation: first we delete 1 and swap it with 4. Then, we remove 3 and have no swap since 3 // is the last id. Lastly, we delete the original 4 which is now in id 1, and swap it with 2. // Eventually, in id 1 we should have the original vector whose id was 2. - std::unordered_map updated_ids; + std::unordered_map> updated_ids; // Find the id to delete. auto deleted_label_ids_pair = this->labelToIdsLookup.find(label); @@ -140,6 +141,7 @@ BruteForceIndex_Multi::deleteVectorAndGetUpdatedIds(labelTyp // The removal take into consideration the current internal id to remove, even if it is not // the original id, and it has swapped into this id after previous swap of another id that // belongs to this label. + labelType last_id_label = this->idToLabelMapping[this->count - 1]; this->removeVector(cur_id_to_delete); // If cur_id_to_delete exists in the map, remove it as it is no longer valid, whether it // will get a new value due to a swap, or it is the last element in the index. @@ -151,7 +153,7 @@ BruteForceIndex_Multi::deleteVectorAndGetUpdatedIds(labelTyp updated_ids.erase(this->count); } else { // Otherwise, the last id now resides where the deleted id was. - updated_ids[cur_id_to_delete] = this->count; + updated_ids[cur_id_to_delete] = {this->count, last_id_label}; } } } diff --git a/src/VecSim/algorithms/brute_force/brute_force_single.h b/src/VecSim/algorithms/brute_force/brute_force_single.h index 48f2582e7..ddf1373fa 100644 --- a/src/VecSim/algorithms/brute_force/brute_force_single.h +++ b/src/VecSim/algorithms/brute_force/brute_force_single.h @@ -33,7 +33,8 @@ class BruteForceIndex_Single : public BruteForceIndex { } inline size_t indexLabelCount() const override { return this->count; } - std::unordered_map deleteVectorAndGetUpdatedIds(labelType label) override; + std::unordered_map> + deleteVectorAndGetUpdatedIds(labelType label) override; // We call this when we KNOW that the label exists in the index. idType getIdOfLabel(labelType label) const { return labelToIdLookup.find(label)->second; } @@ -149,10 +150,10 @@ int BruteForceIndex_Single::deleteVector(labelType label) { } template -std::unordered_map +std::unordered_map> BruteForceIndex_Single::deleteVectorAndGetUpdatedIds(labelType label) { - std::unordered_map updated_ids; + std::unordered_map> updated_ids; // Find the id to delete. auto deleted_label_id_pair = this->labelToIdLookup.find(label); if (deleted_label_id_pair == this->labelToIdLookup.end()) { @@ -165,10 +166,10 @@ BruteForceIndex_Single::deleteVectorAndGetUpdatedIds(labelTy // Remove the pair of the deleted vector. labelToIdLookup.erase(label); - - this->removeVector(id_to_delete); + labelType last_id_label = this->idToLabelMapping[this->count - 1]; + this->removeVector(id_to_delete); // this will decrease this->count and make the swap if (id_to_delete != this->count) { - updated_ids[id_to_delete] = this->count; + updated_ids[id_to_delete] = {this->count, last_id_label}; } return updated_ids; } diff --git a/src/VecSim/algorithms/hnsw/hnsw_tiered.h b/src/VecSim/algorithms/hnsw/hnsw_tiered.h index e6acc3519..88e03dbd3 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_tiered.h +++ b/src/VecSim/algorithms/hnsw/hnsw_tiered.h @@ -100,7 +100,7 @@ class TieredHNSWIndex : public VecSimTieredIndex { // label-to-insert-jobs lookup. Also, since deletion a vector triggers swapping of the // internal last id with the deleted vector id, here we update the pending insert job(s) for the // last id (if needed). This should be called while *flat lock is held* (exclusive lock). - void updateInsertJobInternalId(idType prev_id, idType new_id); + void updateInsertJobInternalId(idType prev_id, idType new_id, labelType label); // Helper function for performing in place mark delete of vector(s) associated with a label // and creating the appropriate repair jobs for the effected connections. This should be called @@ -356,11 +356,11 @@ int TieredHNSWIndex::deleteLabelFromHNSW(labelType label) { } template -void TieredHNSWIndex::updateInsertJobInternalId(idType prev_id, idType new_id) { +void TieredHNSWIndex::updateInsertJobInternalId(idType prev_id, idType new_id, + labelType label) { // Update the pending job id, due to a swap that was caused after the removal of new_id. assert(new_id != INVALID_ID && prev_id != INVALID_ID); - labelType last_idx_label = this->frontendIndex->getLabelByInternalId(prev_id); - auto it = this->labelToInsertJobs.find(last_idx_label); + auto it = this->labelToInsertJobs.find(label); if (it != this->labelToInsertJobs.end()) { // There is a pending job for the label of the swapped last id - update its id. for (HNSWInsertJob *job_it : it->second) { @@ -471,11 +471,17 @@ void TieredHNSWIndex::executeInsertJob(HNSWInsertJob *job) { if (labelToInsertJobs.at(job->label).empty()) { labelToInsertJobs.erase(job->label); } - // Remove the vector from the flat buffer. + // Remove the vector from the flat buffer. This may cause the last vector id to swap with + // the deleted id. Hold the label for the last id, so we can later on update its + // corresponding job id. Note that after calling deleteVectorById, the last id's label + // shouldn't be available, since it is removed from the lookup. + labelType last_vec_label = + this->frontendIndex->getLabelByInternalId(this->frontendIndex->indexSize() - 1); int deleted = this->frontendIndex->deleteVectorById(job->label, job->id); if (deleted && job->id != this->frontendIndex->indexSize()) { // If the vector removal caused a swap with the last id, update the relevant insert job. - this->updateInsertJobInternalId(this->frontendIndex->indexSize(), job->id); + this->updateInsertJobInternalId(this->frontendIndex->indexSize(), job->id, + last_vec_label); } } this->flatIndexGuard.unlock(); @@ -700,7 +706,9 @@ int TieredHNSWIndex::deleteVector(labelType label) { // an example in this function implementation in MULTI index). auto updated_ids = this->frontendIndex->deleteVectorAndGetUpdatedIds(label); for (auto &it : updated_ids) { - this->updateInsertJobInternalId(it.second, it.first); + idType prev_id = it.second.first; + labelType updated_vec_label = it.second.second; + this->updateInsertJobInternalId(prev_id, it.first, updated_vec_label); } } this->flatIndexGuard.unlock(); diff --git a/tests/unit/test_allocator.cpp b/tests/unit/test_allocator.cpp index c016b0883..523ad0b86 100644 --- a/tests/unit/test_allocator.cpp +++ b/tests/unit/test_allocator.cpp @@ -164,8 +164,8 @@ TYPED_TEST(IndexAllocatorTest, test_bf_index_block_size_1) { // collection allocate additional structures for their internal implementation. ASSERT_EQ(allocator->getAllocationSize(), expectedAllocationSize + deleteCommandAllocationDelta); - ASSERT_LE(expectedAllocationSize + expectedAllocationDelta, allocator->getAllocationSize()); - ASSERT_LE(expectedAllocationDelta, deleteCommandAllocationDelta); + ASSERT_GE(expectedAllocationSize + expectedAllocationDelta, allocator->getAllocationSize()); + ASSERT_GE(expectedAllocationDelta, deleteCommandAllocationDelta); info = bfIndex->info(); ASSERT_EQ(allocator->getAllocationSize(), info.bfInfo.memory); diff --git a/tests/unit/test_bruteforce.cpp b/tests/unit/test_bruteforce.cpp index 870485a87..67c0d8a9c 100644 --- a/tests/unit/test_bruteforce.cpp +++ b/tests/unit/test_bruteforce.cpp @@ -108,7 +108,7 @@ TYPED_TEST(BruteForceTest, brute_force_vector_update_test) { TYPED_TEST(BruteForceTest, resize_and_align_index) { size_t dim = 4; - size_t n = 15; + size_t n = 14; size_t blockSize = 10; BFParams params = { @@ -134,24 +134,26 @@ TYPED_TEST(BruteForceTest, resize_and_align_index) { // Add another vector, since index size equals to the capacity, this should cause resizing // (to fit a multiplication of block_size). - GenerateAndAddVector(index, dim, n + 1); + GenerateAndAddVector(index, dim, n); ASSERT_EQ(VecSimIndex_IndexSize(index), n + 1); // Check new capacity size, should be blockSize * 2. ASSERT_EQ(bf_index->idToLabelMapping.size(), 2 * blockSize); + ASSERT_EQ(bf_index->idToLabelMapping.capacity(), 2 * blockSize); - // Now size = n + 1 = 16, capacity = 2* bs = 20. Test capacity overflow again - // to check that it stays aligned with blocksize. + // Now size = n + 1 (= 15), capacity = 2 * bs (= 20). Test capacity overflow again + // to check that it stays aligned with block size. size_t add_vectors_count = 8; for (size_t i = 0; i < add_vectors_count; i++) { GenerateAndAddVector(index, dim, n + 2 + i, i); } - // Size should be n + 1 + 8 = 24. + // Size should be n + 1 + 8 (= 25). ASSERT_EQ(VecSimIndex_IndexSize(index), n + 1 + add_vectors_count); // Check new capacity size, should be blockSize * 3. ASSERT_EQ(bf_index->idToLabelMapping.size(), 3 * blockSize); + ASSERT_EQ(bf_index->idToLabelMapping.capacity(), 3 * blockSize); VecSimIndex_Free(index); } @@ -170,7 +172,7 @@ TYPED_TEST(BruteForceTest, resize_and_align_index_largeInitialCapacity) { BruteForceIndex *bf_index = this->CastToBF(index); ASSERT_EQ(VecSimIndex_IndexSize(index), 0); - // add up to blocksize + 1 = 3 + 1 = 4 + // Add up to block size + 1 = 3 + 1 = 4 for (size_t i = 0; i < bs + 1; i++) { GenerateAndAddVector(index, dim, i, i); } @@ -190,6 +192,7 @@ TYPED_TEST(BruteForceTest, resize_and_align_index_largeInitialCapacity) { // 10 - 3 - 10 % 3 (1) = 6 idToLabelMapping_size = bf_index->idToLabelMapping.size(); ASSERT_EQ(idToLabelMapping_size, n - bs - n % bs); + ASSERT_EQ(idToLabelMapping_size, bf_index->idToLabelMapping.capacity()); // Delete all the vectors to decrease idToLabelMapping size by another bs. size_t i = 0; @@ -198,19 +201,24 @@ TYPED_TEST(BruteForceTest, resize_and_align_index_largeInitialCapacity) { ++i; } ASSERT_EQ(bf_index->idToLabelMapping.size(), bs); + ASSERT_EQ(bf_index->idToLabelMapping.capacity(), bs); + // Add and delete a vector to achieve: // size % block_size == 0 && size + bs <= idToLabelMapping_size(3). // idToLabelMapping_size should be resized to zero. GenerateAndAddVector(index, dim, 0); VecSimIndex_DeleteVector(index, 0); ASSERT_EQ(bf_index->idToLabelMapping.size(), 0); + ASSERT_EQ(bf_index->idToLabelMapping.capacity(), 0); // Do it again. This time after adding a vector idToLabelMapping_size is increased by bs. // Upon deletion it will be resized to zero again. GenerateAndAddVector(index, dim, 0); ASSERT_EQ(bf_index->idToLabelMapping.size(), bs); + ASSERT_EQ(bf_index->idToLabelMapping.capacity(), bs); VecSimIndex_DeleteVector(index, 0); ASSERT_EQ(bf_index->idToLabelMapping.size(), 0); + ASSERT_EQ(bf_index->idToLabelMapping.capacity(), 0); VecSimIndex_Free(index); } diff --git a/tests/unit/test_hnsw_tiered.cpp b/tests/unit/test_hnsw_tiered.cpp index 23ca3ff7c..ccbd40105 100644 --- a/tests/unit/test_hnsw_tiered.cpp +++ b/tests/unit/test_hnsw_tiered.cpp @@ -1601,9 +1601,9 @@ TYPED_TEST(HNSWTieredIndexTestBasic, deleteVectorMultiFromFlatAdvanced) { ASSERT_EQ(tiered_index->labelToInsertJobs.erase(vec_label_second), 1); auto updated_ids = tiered_index->frontendIndex->deleteVectorAndGetUpdatedIds(vec_label_second); ASSERT_EQ(updated_ids.size(), 1); - ASSERT_EQ(updated_ids.at(1), 2); + ASSERT_EQ(updated_ids.at(1).first, 2); for (auto &it : updated_ids) { - tiered_index->updateInsertJobInternalId(it.second, it.first); + tiered_index->updateInsertJobInternalId(it.second.first, it.first, it.second.second); } ASSERT_EQ(tiered_index->labelToInsertJobs.size(), 1); ASSERT_EQ(tiered_index->labelToInsertJobs.at(vec_label_first).size(), 2); @@ -1633,9 +1633,9 @@ TYPED_TEST(HNSWTieredIndexTestBasic, deleteVectorMultiFromFlatAdvanced) { ASSERT_EQ(tiered_index->labelToInsertJobs.erase(vec_label_second), 1); updated_ids = tiered_index->frontendIndex->deleteVectorAndGetUpdatedIds(vec_label_second); ASSERT_EQ(updated_ids.size(), 1); - ASSERT_EQ(updated_ids.at(1), 3); + ASSERT_EQ(updated_ids.at(1).first, 3); for (auto &it : updated_ids) { - tiered_index->updateInsertJobInternalId(it.second, it.first); + tiered_index->updateInsertJobInternalId(it.second.first, it.first, it.second.second); } ASSERT_EQ(tiered_index->labelToInsertJobs.size(), 1); ASSERT_EQ(tiered_index->labelToInsertJobs.at(vec_label_first).size(), 2);