From 3eb00546ba5c4be8a7218affcd475a1c5584c0b1 Mon Sep 17 00:00:00 2001 From: Neelam Mahapatro Date: Mon, 6 Feb 2023 10:07:45 +0530 Subject: [PATCH 01/13] Fix filter diskann issue - Fix sharded filter build - fix search label parsing --- src/disk_utils.cpp | 2 +- src/pq_flash_index.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/disk_utils.cpp b/src/disk_utils.cpp index 1e1286418..9b7c3ca24 100644 --- a/src/disk_utils.cpp +++ b/src/disk_utils.cpp @@ -1282,7 +1282,7 @@ namespace diskann { gen_random_slice(data_file_to_use.c_str(), sample_base_prefix, sample_sampling_rate); if (use_filters) { - copy_file(mem_labels_file, disk_labels_file); + copy_file(labels_file_to_use, disk_labels_file); std::remove(mem_labels_file.c_str()); if (universal_label != "") { copy_file(mem_univ_label_file, disk_univ_label_file); diff --git a/src/pq_flash_index.cpp b/src/pq_flash_index.cpp index 1db901b92..f6ac79b34 100644 --- a/src/pq_flash_index.cpp +++ b/src/pq_flash_index.cpp @@ -507,7 +507,7 @@ namespace diskann { _u32 &num_lbls_in_cur_pt = _pts_to_labels[counter]; num_lbls_in_cur_pt = 0; counter++; - getline(iss, token, '\t'); // first token contains metadata, not used + //getline(iss, token, '\t'); // first token contains metadata, not used getline(iss, token, '\t'); std::istringstream new_iss(token); while (getline(new_iss, token, ',')) { From 5650917f381bb5937ae46fef7476b07f9a28a867 Mon Sep 17 00:00:00 2001 From: Neelam Mahapatro Date: Thu, 9 Feb 2023 22:20:33 +0530 Subject: [PATCH 02/13] Fix comments --- include/disk_utils.h | 2 +- include/index.h | 2 +- src/disk_utils.cpp | 7 +- src/index.cpp | 158 ++++++++++++---------- src/pq_flash_index.cpp | 195 +++++++++++++++------------- tests/search_disk_index.cpp | 1 + tests/search_memory_index.cpp | 1 + tests/utils/compute_groundtruth.cpp | 3 - tests/utils/stats_label_data.cpp | 17 +-- 9 files changed, 201 insertions(+), 185 deletions(-) diff --git a/include/disk_utils.h b/include/disk_utils.h index dabeeeb0c..6aaa02426 100644 --- a/include/disk_utils.h +++ b/include/disk_utils.h @@ -75,7 +75,7 @@ namespace diskann { const _u64 nshards, unsigned max_degree, const std::string &output_vamana, const std::string &medoids_file, bool use_filters = false, - const std::string &labels_to_medoids_file = std::string("")); + const std::string &labels_to_medoids_file = std::string("")); DISKANN_DLLEXPORT void extract_shard_labels( const std::string &in_label_file, const std::string &shard_ids_bin, diff --git a/include/index.h b/include/index.h index 9025a22f5..871c2ddd7 100644 --- a/include/index.h +++ b/include/index.h @@ -238,7 +238,7 @@ namespace diskann { // determines navigating node of the graph by calculating medoid of datafopt unsigned calculate_entry_point(); - void parse_label_file(const std::string& map_file); + size_t parse_label_file(const std::string &map_file); std::pair iterate_to_fixed_point( const T *node_coords, const unsigned Lindex, diff --git a/src/disk_utils.cpp b/src/disk_utils.cpp index 9b7c3ca24..e6aa02fdb 100644 --- a/src/disk_utils.cpp +++ b/src/disk_utils.cpp @@ -323,6 +323,7 @@ namespace diskann { } std::ofstream mapping_writer(labels_to_medoids_file); + assert(mapping_writer.is_open()); for (auto iter : global_label_to_medoids) { mapping_writer << iter.first << ", "; auto &vec = iter.second; @@ -534,6 +535,7 @@ namespace diskann { diskann::cout << labels_per_point.size() << " is the new number of points" << std::endl; std::ofstream label_writer(out_labels_file); + assert(label_writer.is_open()); for (_u32 i = 0; i < labels_per_point.size(); i++) { for (_u32 j = 0; j < (labels_per_point[i].size() - 1); j++) { label_writer << labels_per_point[i][j] << ","; @@ -551,6 +553,7 @@ namespace diskann { data = (T *) std::realloc((void *) data, labels_per_point.size() * ndims * sizeof(T)); std::ofstream dummy_writer(out_metadata_file); + assert(dummy_writer.is_open()); for (auto i = dummy_pt_ids.begin(); i != dummy_pt_ids.end(); i++) { dummy_writer << i->first << "," << i->second << std::endl; std::memcpy(data + i->first * ndims, data + i->second * ndims, @@ -566,7 +569,7 @@ namespace diskann { const std::string &in_label_file, const std::string &shard_ids_bin, const std::string &shard_label_file) { // assumes ith row is for ith // point in labels file - std::cout << "Extracting labels for shard" << std::endl; + diskann::cout << "Extracting labels for shard" << std::endl; _u32 *ids = nullptr; _u64 num_ids, tmp_dim; @@ -578,6 +581,8 @@ namespace diskann { std::ifstream label_reader(in_label_file); std::ofstream label_writer(shard_label_file); + assert(label_reader.is_open()); + assert(label_writer.is_open()); if (label_reader && label_writer) { while (std::getline(label_reader, cur_line)) { if (shard_counter >= num_ids) { diff --git a/src/index.cpp b/src/index.cpp index f0e5e19a5..5ff702317 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -278,9 +278,9 @@ namespace diskann { if (_filter_to_medoid_id.size() > 0) { std::ofstream medoid_writer(std::string(filename) + "_labels_to_medoids.txt"); + assert(medoid_writer.is_open()); for (auto iter : _filter_to_medoid_id) { medoid_writer << iter.first << ", " << iter.second << std::endl; - // std::cout << iter.first << ", " << iter.second << std::endl; } medoid_writer.close(); } @@ -288,12 +288,14 @@ namespace diskann { if (_use_universal_label) { std::ofstream universal_label_writer(std::string(filename) + "_universal_label.txt"); + assert(universal_label_writer.is_open()); universal_label_writer << _universal_label << std::endl; universal_label_writer.close(); } if (_pts_to_labels.size() > 0) { std::ofstream label_writer(std::string(filename) + "_labels.txt"); + assert(label_writer.is_open()); for (_u32 i = 0; i < _pts_to_labels.size(); i++) { for (_u32 j = 0; j < (_pts_to_labels[i].size() - 1); j++) { label_writer << _pts_to_labels[i][j] << ","; @@ -476,52 +478,12 @@ namespace diskann { _has_built = true; - size_t tags_file_num_pts = 0, graph_num_pts = 0, data_file_num_pts = 0; + size_t tags_file_num_pts = 0, graph_num_pts = 0, data_file_num_pts = 0, label_num_pts = 0; std::string mem_index_file(filename); std::string labels_file = mem_index_file + "_labels.txt"; std::string labels_to_medoids = mem_index_file + "_labels_to_medoids.txt"; - if (file_exists(labels_file)) { - parse_label_file(labels_file); - if (file_exists(labels_to_medoids)) { - std::ifstream medoid_stream(labels_to_medoids); - - std::string line, token; - unsigned line_cnt = 0; - - _filter_to_medoid_id.clear(); - - while (std::getline(medoid_stream, line)) { - std::istringstream iss(line); - _u32 cnt = 0; - _u32 medoid = 0; - label label; - while (std::getline(iss, token, ',')) { - token.erase(std::remove(token.begin(), token.end(), '\n'), - token.end()); - token.erase(std::remove(token.begin(), token.end(), '\r'), - token.end()); - unsigned token_as_num = std::stoul(token); - if (cnt == 0) - label = token_as_num; - else - medoid = token_as_num; - cnt++; - } - _filter_to_medoid_id[label] = medoid; - line_cnt++; - } - } - - std::string universal_label_file(filename); - universal_label_file += "_universal_label.txt"; - if (file_exists(universal_label_file)) { - std::ifstream universal_label_reader(universal_label_file); - universal_label_reader >> _universal_label; - _use_universal_label = true; - universal_label_reader.close(); - } - } + if (!_save_as_one_file) { // For DLVS Store, we will not support saving the index in multiple files. @@ -560,6 +522,53 @@ namespace diskann { throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__, __LINE__); } + if (file_exists(labels_file)) { + label_num_pts = parse_label_file(labels_file); + assert(label_num_pts == data_file_num_pts); + if (file_exists(labels_to_medoids)) { + std::ifstream medoid_stream(labels_to_medoids); + assert(medoid_stream.is_open()); + std::string line, token; + unsigned line_cnt = 0; + + _filter_to_medoid_id.clear(); + try { + while (std::getline(medoid_stream, line)) { + std::istringstream iss(line); + _u32 cnt = 0; + _u32 medoid = 0; + label label; + while (std::getline(iss, token, ',')) { + token.erase(std::remove(token.begin(), token.end(), '\n'), + token.end()); + token.erase(std::remove(token.begin(), token.end(), '\r'), + token.end()); + unsigned token_as_num = std::stoul(token); + if (cnt == 0) + label = token_as_num; + else + medoid = token_as_num; + cnt++; + } + _filter_to_medoid_id[label] = medoid; + line_cnt++; + } + } catch (std::system_error &e) { + throw FileException(labels_to_medoids, e, __FUNCSIG__, __FILE__, + __LINE__); + } + } + + std::string universal_label_file(filename); + universal_label_file += "_universal_label.txt"; + if (file_exists(universal_label_file)) { + std::ifstream universal_label_reader(universal_label_file); + assert(universal_label_reader.is_open()); + universal_label_reader >> _universal_label; + _use_universal_label = true; + universal_label_reader.close(); + } + } _nd = data_file_num_pts - _num_frozen_pts; _empty_slots.clear(); @@ -1645,13 +1654,13 @@ namespace diskann { } template - void Index::parse_label_file(const std::string &map_file) { + size_t Index::parse_label_file(const std::string &label_file) { // Format of Label txt file: filters with comma separators - - std::ifstream infile(map_file); + std::ifstream infile(label_file); + assert(infile.is_open()); std::string line, token; unsigned line_cnt = 0; - + while (std::getline(infile, line)) { line_cnt++; } @@ -1660,28 +1669,35 @@ namespace diskann { infile.clear(); infile.seekg(0, std::ios::beg); line_cnt = 0; - while (std::getline(infile, line)) { - std::istringstream iss(line); - std::vector