Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions src/VecSim/algorithms/brute_force/brute_force.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,14 @@ class BruteForceIndex : public VecSimIndexAbstract<DistType> {

inline vecsim_stl::vector<VectorBlock *> getVectorBlocks() const { return vectorBlocks; }
inline const labelType getLabelByInternalId(idType internal_id) const {
return idToLabelMapping[internal_id];
return idToLabelMapping.at(internal_id);
}
// Remove a specific vector that is stored under a label from the index by its internal id.
virtual int deleteVectorById(labelType label, idType id) = 0;
// Remove a vector and return a map between internal ids and the original internal ids that they
// hold AFTER the removals and swaps.
virtual std::unordered_map<idType, idType> deleteVectorAndGetUpdatedIds(labelType label) = 0;
// Remove a vector and return a map between internal ids and the original internal ids of the
// vector that they hold as a result of the overall removals and swaps, along with its label.
virtual std::unordered_map<idType, std::pair<idType, labelType>>
deleteVectorAndGetUpdatedIds(labelType label) = 0;
// Check if a certain label exists in the index.
virtual inline bool isLabelExists(labelType label) = 0;
// Return a set of all labels that are stored in the index (helper for computing label count
Expand Down Expand Up @@ -160,6 +161,7 @@ void BruteForceIndex<DataType, DistType>::appendVector(const void *vector_data,
size_t last_block_vectors_count = id % this->blockSize;
this->idToLabelMapping.resize(
idToLabelMapping_size + this->blockSize - last_block_vectors_count, 0);
this->idToLabelMapping.shrink_to_fit();
}

// add label to idToLabelMapping
Expand Down Expand Up @@ -210,10 +212,11 @@ void BruteForceIndex<DataType, DistType>::removeVector(idType id_to_delete) {
// Resize and align the idToLabelMapping.
size_t idToLabel_size = idToLabelMapping.size();
// If the new size is smaller by at least one block comparing to the idToLabelMapping
// align to be a multiplication of blocksize and resize by one block.
// align to be a multiplication of block size and resize by one block.
if (this->count + this->blockSize <= idToLabel_size) {
size_t vector_to_align_count = idToLabel_size % this->blockSize;
this->idToLabelMapping.resize(idToLabel_size - this->blockSize - vector_to_align_count);
this->idToLabelMapping.shrink_to_fit();
}
}
}
Expand Down
10 changes: 6 additions & 4 deletions src/VecSim/algorithms/brute_force/brute_force_multi.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ class BruteForceIndex_Multi : public BruteForceIndex<DataType, DistType> {
return std::unique_ptr<vecsim_stl::abstract_results_container>(
new (this->allocator) vecsim_stl::unique_results_container(cap, this->allocator));
}
std::unordered_map<idType, idType> deleteVectorAndGetUpdatedIds(labelType label) override;
std::unordered_map<idType, std::pair<idType, labelType>>
deleteVectorAndGetUpdatedIds(labelType label) override;
#ifdef BUILD_TESTS
void getDataByLabel(labelType label,
std::vector<std::vector<DataType>> &vectors_output) const override {
Expand Down Expand Up @@ -117,15 +118,15 @@ int BruteForceIndex_Multi<DataType, DistType>::deleteVector(labelType label) {
}

template <typename DataType, typename DistType>
std::unordered_map<idType, idType>
std::unordered_map<idType, std::pair<idType, labelType>>
BruteForceIndex_Multi<DataType, DistType>::deleteVectorAndGetUpdatedIds(labelType label) {
// Hold a mapping from ids that are removed and changed to the original ids that were swapped
// into it. For example, if we have ids 0, 1, 2, 3, 4 and are about to remove ids 1, 3, 4, we
// should get the following scenario: {1->4} => {1->4} => {1->2}.
// Explanation: first we delete 1 and swap it with 4. Then, we remove 3 and have no swap since 3
// is the last id. Lastly, we delete the original 4 which is now in id 1, and swap it with 2.
// Eventually, in id 1 we should have the original vector whose id was 2.
std::unordered_map<idType, idType> updated_ids;
std::unordered_map<idType, std::pair<idType, labelType>> updated_ids;

// Find the id to delete.
auto deleted_label_ids_pair = this->labelToIdsLookup.find(label);
Expand All @@ -140,6 +141,7 @@ BruteForceIndex_Multi<DataType, DistType>::deleteVectorAndGetUpdatedIds(labelTyp
// The removal take into consideration the current internal id to remove, even if it is not
// the original id, and it has swapped into this id after previous swap of another id that
// belongs to this label.
labelType last_id_label = this->idToLabelMapping[this->count - 1];
this->removeVector(cur_id_to_delete);
// If cur_id_to_delete exists in the map, remove it as it is no longer valid, whether it
// will get a new value due to a swap, or it is the last element in the index.
Expand All @@ -151,7 +153,7 @@ BruteForceIndex_Multi<DataType, DistType>::deleteVectorAndGetUpdatedIds(labelTyp
updated_ids.erase(this->count);
} else {
// Otherwise, the last id now resides where the deleted id was.
updated_ids[cur_id_to_delete] = this->count;
updated_ids[cur_id_to_delete] = {this->count, last_id_label};
}
}
}
Expand Down
13 changes: 7 additions & 6 deletions src/VecSim/algorithms/brute_force/brute_force_single.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ class BruteForceIndex_Single : public BruteForceIndex<DataType, DistType> {
}

inline size_t indexLabelCount() const override { return this->count; }
std::unordered_map<idType, idType> deleteVectorAndGetUpdatedIds(labelType label) override;
std::unordered_map<idType, std::pair<idType, labelType>>
deleteVectorAndGetUpdatedIds(labelType label) override;

// We call this when we KNOW that the label exists in the index.
idType getIdOfLabel(labelType label) const { return labelToIdLookup.find(label)->second; }
Expand Down Expand Up @@ -149,10 +150,10 @@ int BruteForceIndex_Single<DataType, DistType>::deleteVector(labelType label) {
}

template <typename DataType, typename DistType>
std::unordered_map<idType, idType>
std::unordered_map<idType, std::pair<idType, labelType>>
BruteForceIndex_Single<DataType, DistType>::deleteVectorAndGetUpdatedIds(labelType label) {

std::unordered_map<idType, idType> updated_ids;
std::unordered_map<idType, std::pair<idType, labelType>> updated_ids;
// Find the id to delete.
auto deleted_label_id_pair = this->labelToIdLookup.find(label);
if (deleted_label_id_pair == this->labelToIdLookup.end()) {
Expand All @@ -165,10 +166,10 @@ BruteForceIndex_Single<DataType, DistType>::deleteVectorAndGetUpdatedIds(labelTy

// Remove the pair of the deleted vector.
labelToIdLookup.erase(label);

this->removeVector(id_to_delete);
labelType last_id_label = this->idToLabelMapping[this->count - 1];
this->removeVector(id_to_delete); // this will decrease this->count and make the swap
if (id_to_delete != this->count) {
updated_ids[id_to_delete] = this->count;
updated_ids[id_to_delete] = {this->count, last_id_label};
}
return updated_ids;
}
Expand Down
22 changes: 15 additions & 7 deletions src/VecSim/algorithms/hnsw/hnsw_tiered.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ class TieredHNSWIndex : public VecSimTieredIndex<DataType, DistType> {
// label-to-insert-jobs lookup. Also, since deletion a vector triggers swapping of the
// internal last id with the deleted vector id, here we update the pending insert job(s) for the
// last id (if needed). This should be called while *flat lock is held* (exclusive lock).
void updateInsertJobInternalId(idType prev_id, idType new_id);
void updateInsertJobInternalId(idType prev_id, idType new_id, labelType label);

// Helper function for performing in place mark delete of vector(s) associated with a label
// and creating the appropriate repair jobs for the effected connections. This should be called
Expand Down Expand Up @@ -356,11 +356,11 @@ int TieredHNSWIndex<DataType, DistType>::deleteLabelFromHNSW(labelType label) {
}

template <typename DataType, typename DistType>
void TieredHNSWIndex<DataType, DistType>::updateInsertJobInternalId(idType prev_id, idType new_id) {
void TieredHNSWIndex<DataType, DistType>::updateInsertJobInternalId(idType prev_id, idType new_id,
labelType label) {
// Update the pending job id, due to a swap that was caused after the removal of new_id.
assert(new_id != INVALID_ID && prev_id != INVALID_ID);
labelType last_idx_label = this->frontendIndex->getLabelByInternalId(prev_id);
auto it = this->labelToInsertJobs.find(last_idx_label);
auto it = this->labelToInsertJobs.find(label);
if (it != this->labelToInsertJobs.end()) {
// There is a pending job for the label of the swapped last id - update its id.
for (HNSWInsertJob *job_it : it->second) {
Expand Down Expand Up @@ -471,11 +471,17 @@ void TieredHNSWIndex<DataType, DistType>::executeInsertJob(HNSWInsertJob *job) {
if (labelToInsertJobs.at(job->label).empty()) {
labelToInsertJobs.erase(job->label);
}
// Remove the vector from the flat buffer.
// Remove the vector from the flat buffer. This may cause the last vector id to swap with
// the deleted id. Hold the label for the last id, so we can later on update its
// corresponding job id. Note that after calling deleteVectorById, the last id's label
// shouldn't be available, since it is removed from the lookup.
labelType last_vec_label =
this->frontendIndex->getLabelByInternalId(this->frontendIndex->indexSize() - 1);
int deleted = this->frontendIndex->deleteVectorById(job->label, job->id);
if (deleted && job->id != this->frontendIndex->indexSize()) {
// If the vector removal caused a swap with the last id, update the relevant insert job.
this->updateInsertJobInternalId(this->frontendIndex->indexSize(), job->id);
this->updateInsertJobInternalId(this->frontendIndex->indexSize(), job->id,
last_vec_label);
}
}
this->flatIndexGuard.unlock();
Expand Down Expand Up @@ -700,7 +706,9 @@ int TieredHNSWIndex<DataType, DistType>::deleteVector(labelType label) {
// an example in this function implementation in MULTI index).
auto updated_ids = this->frontendIndex->deleteVectorAndGetUpdatedIds(label);
for (auto &it : updated_ids) {
this->updateInsertJobInternalId(it.second, it.first);
idType prev_id = it.second.first;
labelType updated_vec_label = it.second.second;
this->updateInsertJobInternalId(prev_id, it.first, updated_vec_label);
}
}
this->flatIndexGuard.unlock();
Expand Down
4 changes: 2 additions & 2 deletions tests/unit/test_allocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,8 +164,8 @@ TYPED_TEST(IndexAllocatorTest, test_bf_index_block_size_1) {
// collection allocate additional structures for their internal implementation.
ASSERT_EQ(allocator->getAllocationSize(),
expectedAllocationSize + deleteCommandAllocationDelta);
ASSERT_LE(expectedAllocationSize + expectedAllocationDelta, allocator->getAllocationSize());
ASSERT_LE(expectedAllocationDelta, deleteCommandAllocationDelta);
ASSERT_GE(expectedAllocationSize + expectedAllocationDelta, allocator->getAllocationSize());
ASSERT_GE(expectedAllocationDelta, deleteCommandAllocationDelta);

info = bfIndex->info();
ASSERT_EQ(allocator->getAllocationSize(), info.bfInfo.memory);
Expand Down
20 changes: 14 additions & 6 deletions tests/unit/test_bruteforce.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ TYPED_TEST(BruteForceTest, brute_force_vector_update_test) {

TYPED_TEST(BruteForceTest, resize_and_align_index) {
size_t dim = 4;
size_t n = 15;
size_t n = 14;
size_t blockSize = 10;

BFParams params = {
Expand All @@ -134,24 +134,26 @@ TYPED_TEST(BruteForceTest, resize_and_align_index) {

// Add another vector, since index size equals to the capacity, this should cause resizing
// (to fit a multiplication of block_size).
GenerateAndAddVector<TEST_DATA_T>(index, dim, n + 1);
GenerateAndAddVector<TEST_DATA_T>(index, dim, n);
ASSERT_EQ(VecSimIndex_IndexSize(index), n + 1);
// Check new capacity size, should be blockSize * 2.
ASSERT_EQ(bf_index->idToLabelMapping.size(), 2 * blockSize);
ASSERT_EQ(bf_index->idToLabelMapping.capacity(), 2 * blockSize);

// Now size = n + 1 = 16, capacity = 2* bs = 20. Test capacity overflow again
// to check that it stays aligned with blocksize.
// Now size = n + 1 (= 15), capacity = 2 * bs (= 20). Test capacity overflow again
// to check that it stays aligned with block size.

size_t add_vectors_count = 8;
for (size_t i = 0; i < add_vectors_count; i++) {
GenerateAndAddVector<TEST_DATA_T>(index, dim, n + 2 + i, i);
}

// Size should be n + 1 + 8 = 24.
// Size should be n + 1 + 8 (= 25).
ASSERT_EQ(VecSimIndex_IndexSize(index), n + 1 + add_vectors_count);

// Check new capacity size, should be blockSize * 3.
ASSERT_EQ(bf_index->idToLabelMapping.size(), 3 * blockSize);
ASSERT_EQ(bf_index->idToLabelMapping.capacity(), 3 * blockSize);

VecSimIndex_Free(index);
}
Expand All @@ -170,7 +172,7 @@ TYPED_TEST(BruteForceTest, resize_and_align_index_largeInitialCapacity) {
BruteForceIndex<TEST_DATA_T, TEST_DIST_T> *bf_index = this->CastToBF(index);
ASSERT_EQ(VecSimIndex_IndexSize(index), 0);

// add up to blocksize + 1 = 3 + 1 = 4
// Add up to block size + 1 = 3 + 1 = 4
for (size_t i = 0; i < bs + 1; i++) {
GenerateAndAddVector<TEST_DATA_T>(index, dim, i, i);
}
Expand All @@ -190,6 +192,7 @@ TYPED_TEST(BruteForceTest, resize_and_align_index_largeInitialCapacity) {
// 10 - 3 - 10 % 3 (1) = 6
idToLabelMapping_size = bf_index->idToLabelMapping.size();
ASSERT_EQ(idToLabelMapping_size, n - bs - n % bs);
ASSERT_EQ(idToLabelMapping_size, bf_index->idToLabelMapping.capacity());

// Delete all the vectors to decrease idToLabelMapping size by another bs.
size_t i = 0;
Expand All @@ -198,19 +201,24 @@ TYPED_TEST(BruteForceTest, resize_and_align_index_largeInitialCapacity) {
++i;
}
ASSERT_EQ(bf_index->idToLabelMapping.size(), bs);
ASSERT_EQ(bf_index->idToLabelMapping.capacity(), bs);

// Add and delete a vector to achieve:
// size % block_size == 0 && size + bs <= idToLabelMapping_size(3).
// idToLabelMapping_size should be resized to zero.
GenerateAndAddVector<TEST_DATA_T>(index, dim, 0);
VecSimIndex_DeleteVector(index, 0);
ASSERT_EQ(bf_index->idToLabelMapping.size(), 0);
ASSERT_EQ(bf_index->idToLabelMapping.capacity(), 0);

// Do it again. This time after adding a vector idToLabelMapping_size is increased by bs.
// Upon deletion it will be resized to zero again.
GenerateAndAddVector<TEST_DATA_T>(index, dim, 0);
ASSERT_EQ(bf_index->idToLabelMapping.size(), bs);
ASSERT_EQ(bf_index->idToLabelMapping.capacity(), bs);
VecSimIndex_DeleteVector(index, 0);
ASSERT_EQ(bf_index->idToLabelMapping.size(), 0);
ASSERT_EQ(bf_index->idToLabelMapping.capacity(), 0);

VecSimIndex_Free(index);
}
Expand Down
8 changes: 4 additions & 4 deletions tests/unit/test_hnsw_tiered.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1601,9 +1601,9 @@ TYPED_TEST(HNSWTieredIndexTestBasic, deleteVectorMultiFromFlatAdvanced) {
ASSERT_EQ(tiered_index->labelToInsertJobs.erase(vec_label_second), 1);
auto updated_ids = tiered_index->frontendIndex->deleteVectorAndGetUpdatedIds(vec_label_second);
ASSERT_EQ(updated_ids.size(), 1);
ASSERT_EQ(updated_ids.at(1), 2);
ASSERT_EQ(updated_ids.at(1).first, 2);
for (auto &it : updated_ids) {
tiered_index->updateInsertJobInternalId(it.second, it.first);
tiered_index->updateInsertJobInternalId(it.second.first, it.first, it.second.second);
}
ASSERT_EQ(tiered_index->labelToInsertJobs.size(), 1);
ASSERT_EQ(tiered_index->labelToInsertJobs.at(vec_label_first).size(), 2);
Expand Down Expand Up @@ -1633,9 +1633,9 @@ TYPED_TEST(HNSWTieredIndexTestBasic, deleteVectorMultiFromFlatAdvanced) {
ASSERT_EQ(tiered_index->labelToInsertJobs.erase(vec_label_second), 1);
updated_ids = tiered_index->frontendIndex->deleteVectorAndGetUpdatedIds(vec_label_second);
ASSERT_EQ(updated_ids.size(), 1);
ASSERT_EQ(updated_ids.at(1), 3);
ASSERT_EQ(updated_ids.at(1).first, 3);
for (auto &it : updated_ids) {
tiered_index->updateInsertJobInternalId(it.second, it.first);
tiered_index->updateInsertJobInternalId(it.second.first, it.first, it.second.second);
}
ASSERT_EQ(tiered_index->labelToInsertJobs.size(), 1);
ASSERT_EQ(tiered_index->labelToInsertJobs.at(vec_label_first).size(), 2);
Expand Down