// ---------------------------------------------------------------------------
// Patch overview (git unified diff; payload is C++ under server/api and
// server/tests). These leading comment lines sit before the first
// `diff --git` header; the diff text itself is unchanged.
//
// NOTE(review): this copy of the patch has lost its line breaks and the
// contents of every angle-bracket pair (`std::vector>`, bare `#include`,
// `reinterpret_cast(other)`, `cimg_library::CImg image`). Template arguments
// and system header names are therefore not visible here -- recover them from
// the repository before applying or editing this patch.
//
// What the patch adds, as far as the visible text shows:
//  * CsvLoader::load_labels(path): opens `path` (throws std::runtime_error
//    "No such csv file in directory" if the stream is bad), then reads lines
//    until the first empty one, splitting each on ',' into a file name and a
//    label converted with std::stof.
//  * UnshuffledDataLoader: abstract interface (load_data, add_data, size,
//    get_raw, get_appropriate_shape) with two implementations:
//      - UnshuffledCsvLoader: each CSV row's last value is the label, the
//        remaining values are the features.
//      - UnshuffledImgLoader: scans `path` recursively for the first entry
//        ending in ".csv", loads it with load_labels, prefixes each file name
//        with `path + "/"`, and loads pixels lazily in get_raw.
//  * DataLoader: holds a non-owning UnshuffledDataLoader* plus an index
//    permutation (`rearrangement`). get_raw(index) concatenates up to
//    batch_size samples taken at permutation positions index, index+1, ...
//    and returns them with their labels; operator[] wraps the same data in a
//    Blob.
//  * DataMarker: loads a dataset (Csv or Png), permutes it, and copies the
//    first `percentage_for_train` percent into a train loader and the rest
//    into a check loader.
//  * ImageLoader: load_image now takes `const char*`; new get_size(path)
//    returns {width, height}.
//  * tests/DataMarkerTests.cpp and fixtures under tests/data/1.
//
// Review notes (all to be confirmed against the un-mangled sources):
//  * generate_rearrangement builds `std::default_random_engine { 32 }` on
//    every call, so for a given size the permutation is always the same and
//    DataLoader::shuffle() reproduces the previous order. Confirm that this
//    determinism is intended.
//  * DataLoader::operator[] requests the shape for `index`, while get_raw
//    uses `rearrangement[index]`. With samples of differing dimensions
//    (images) the Blob shape may not match the gathered data.
//  * DataLoader::get_raw sizes `data` from the shape of the first sample in
//    the batch and then copies every sample's full `line.first`; a later,
//    larger sample would write past the end of `data`.
//  * DataLoader::operator[] hands `data.first.data()` of a local to
//    Blob::constBlob; whether Blob copies the buffer is not visible here.
//  * `DataLoader() = default` / `DataMarker() = default`: no member
//    initializers are visible, so a default-initialized object has
//    indeterminate raw pointers, and ~DataMarker deletes them.
//  * DataMarker owns two raw pointers and declares a destructor but no
//    copy/move operations; copying a DataMarker would delete them twice.
//    The DataLoader copies returned by get_train_loader/get_check_loader
//    keep pointing at loaders that ~DataMarker deletes.
//  * DataMarker's constructor allocates three loaders with `new` before
//    loading; if loading throws, the destructor does not run and all three
//    leak.
//  * UnshuffledImgLoader::add_data downcasts `other` with reinterpret_cast
//    and no runtime type check.
//  * get_appropriate_shape in both loaders indexes `data[index]` without the
//    bounds check that get_raw performs.
//  * UnshuffledImgLoader::load_data: if no ".csv" entry is found, `data` is
//    left as it was and its paths are prefixed again.
//  * CsvLoader::load_labels: a blank line ends parsing early, and a row with
//    no label text makes std::stof throw std::invalid_argument.
//  * Several loops compare an `int` counter against a `size()` result.
// ---------------------------------------------------------------------------
diff --git a/server/api/CsvLoader.cpp b/server/api/CsvLoader.cpp index bd5970b3..98f75724 100644 --- a/server/api/CsvLoader.cpp +++ b/server/api/CsvLoader.cpp @@ -21,3 +21,23 @@ std::vector> CsvLoader::load_csv(std::string path) { } return result; } + +std::vector> CsvLoader::load_labels(std::string path) { + std::ifstream fin(path); + if (!fin) { + throw std::runtime_error("No such csv file in directory"); + } + std::string line; + std::vector> result; + getline(fin, line); + while (!line.empty()) { + std::stringstream line_stream(line); + std::string file; + std::string label; + getline(line_stream, file, ','); + getline(line_stream, label, ','); + result.push_back({file, std::stof(label)}); + getline(fin, line); + } + return result; +} \ No newline at end of file diff --git a/server/api/CsvLoader.h b/server/api/CsvLoader.h index a9890d89..1d597ef1 100644 --- a/server/api/CsvLoader.h +++ b/server/api/CsvLoader.h @@ -6,4 +6,5 @@ class CsvLoader { public: static std::vector> load_csv(std::string path); + static std::vector> load_labels(std::string path); }; diff --git a/server/api/DataLoader.cpp b/server/api/DataLoader.cpp new file mode 100644 index 00000000..9c643ed4 --- /dev/null +++ b/server/api/DataLoader.cpp @@ -0,0 +1,78 @@ +#include +#include +#include +#include "DataLoader.h" +#include "Blob.h" +#include "Allocator.h" + +void generate_rearrangement(std::vector& rearrangement, std::size_t size) { + rearrangement.resize(size); + for (int i = 0; i < rearrangement.size(); ++i) { + rearrangement[i] = i; + } + // Some shuffle magic from StackOverflow + auto rng = std::default_random_engine { 32 }; + std::shuffle(rearrangement.begin(), rearrangement.end(), rng); +} + +DataLoader::DataLoader(UnshuffledDataLoader* _loader, std::size_t _batch_size): loader(_loader), batch_size(_batch_size) { + generate_rearrangement(rearrangement, loader->size()); +} + +DataLoader::DataLoader(UnshuffledDataLoader* _loader, std::size_t _batch_size, std::string path): loader(_loader), 
batch_size(_batch_size) { + loader->load_data(path); + generate_rearrangement(rearrangement, loader->size()); +} + +void DataLoader::load_data(std::string path) { + loader->load_data(path); +} + +std::pair> DataLoader::operator[](std::size_t index) const { // batch_size lines from index + if (index >= loader->size()) { + throw std::out_of_range("Index out of range"); + } + auto data = get_raw(index); + Shape shape = loader->get_appropriate_shape(index, batch_size); + return {Blob::constBlob(shape, data.first.data()), data.second}; +} + +std::size_t DataLoader::size() const { + return loader->size(); +} + +void DataLoader::add_data(const DataLoader& other, int index) { + loader->add_data(other.loader, index); +} + +std::pair, std::vector> DataLoader::get_raw(std::size_t index) const { // batch_size lines from index + if (index >= loader->size()) { + throw std::out_of_range("Index out of range"); + } + std::vector data; + std::vector res(batch_size, 0); + Shape shape = loader->get_appropriate_shape(rearrangement[index], batch_size); + auto dims = shape.getDims(); + int data_size = 1; + for (int i = 0; i < dims.size(); ++i) { + data_size *= dims[i]; + } + data.resize(data_size, 0); + int cur_data = 0; + for (int i = index; i < index + batch_size; ++i) { + if (i >= loader->size()) { + break; + } + auto line = loader->get_raw(rearrangement[i]); + res[i - index] = line.second; + for (int j = 0; j < line.first.size(); ++j) { + data[cur_data] = line.first[j]; + cur_data++; + } + } + return {data, res}; +} + +void DataLoader::shuffle() { + generate_rearrangement(rearrangement, loader->size()); +} diff --git a/server/api/DataLoader.h b/server/api/DataLoader.h new file mode 100644 index 00000000..a10582cb --- /dev/null +++ b/server/api/DataLoader.h @@ -0,0 +1,23 @@ +#pragma once + +#include "UnshuffledDataLoader.h" +#include + +void generate_rearrangement(std::vector& rearrangement, std::size_t size); + +class DataLoader { +private: + UnshuffledDataLoader* loader; + 
std::vector rearrangement; + std::size_t batch_size; +public: + DataLoader() = default; + DataLoader(UnshuffledDataLoader* _loader, std::size_t _batch_size); + DataLoader(UnshuffledDataLoader* _loader, std::size_t _batch_size, std::string path); + void load_data(std::string path); + std::pair> operator[](std::size_t index) const; + void add_data(const DataLoader& other, int index); + std::size_t size() const; + std::pair, std::vector> get_raw(std::size_t index) const; + void shuffle(); +}; diff --git a/server/api/DataMarker.cpp b/server/api/DataMarker.cpp new file mode 100644 index 00000000..be37d5cd --- /dev/null +++ b/server/api/DataMarker.cpp @@ -0,0 +1,56 @@ +#include +#include "DataMarker.h" +#include "UnshuffledCsvLoader.h" +#include "UnshuffledImgLoader.h" +#include "Blob.h" + +DataMarker::DataMarker(std::string path, FileExtension type, int percentage_for_train, std::size_t batch_size) { + if (percentage_for_train > 100 || percentage_for_train < 0) { + throw std::logic_error("Wrong percentage"); + } + DataLoader file_loader; + UnshuffledDataLoader* file_unshuffled_loader; + if (type == FileExtension::Csv) { + file_unshuffled_loader = new UnshuffledCsvLoader; + train_unshuffled_loader = new UnshuffledCsvLoader; + check_unshuffled_loader = new UnshuffledCsvLoader; + } + else if (type == FileExtension::Png) { + file_unshuffled_loader = new UnshuffledImgLoader; + train_unshuffled_loader = new UnshuffledImgLoader; + check_unshuffled_loader = new UnshuffledImgLoader; + } + else { + throw std::logic_error("Unsupported type"); + } + file_loader = DataLoader(file_unshuffled_loader, batch_size, path); + std::vector rearrangement; + generate_rearrangement(rearrangement, file_loader.size()); + train_loader = DataLoader(train_unshuffled_loader, batch_size); + check_loader = DataLoader(check_unshuffled_loader, batch_size); + int instances_for_train = percentage_for_train * (file_loader.size()) / 100; + for (int i = 0; i < file_loader.size(); ++i) { + if (i < 
instances_for_train) { + train_loader.add_data(file_loader, rearrangement[i]); + } + else { + check_loader.add_data(file_loader, rearrangement[i]); + } + } + train_loader.shuffle(); + check_loader.shuffle(); + delete file_unshuffled_loader; +} + +DataMarker::~DataMarker() { + delete train_unshuffled_loader; + delete check_unshuffled_loader; +} + +DataLoader DataMarker::get_check_loader() { + return check_loader; +} + +DataLoader DataMarker::get_train_loader() { + return train_loader; +} \ No newline at end of file diff --git a/server/api/DataMarker.h b/server/api/DataMarker.h new file mode 100644 index 00000000..4beff113 --- /dev/null +++ b/server/api/DataMarker.h @@ -0,0 +1,21 @@ +#pragma once + +#include +#include "UnshuffledDataLoader.h" +#include "DataLoader.h" + +enum class FileExtension {Csv, Png}; + +class DataMarker { +private: + UnshuffledDataLoader* train_unshuffled_loader; + DataLoader train_loader; + UnshuffledDataLoader* check_unshuffled_loader; + DataLoader check_loader; +public: + DataMarker() = default; + DataMarker(std::string path, FileExtension file_type, int percentage_for_train, std::size_t batch_size); + ~DataMarker(); + DataLoader get_train_loader(); + DataLoader get_check_loader(); +}; diff --git a/server/api/ImageLoader.cpp b/server/api/ImageLoader.cpp index a6201421..2aaa6ac9 100644 --- a/server/api/ImageLoader.cpp +++ b/server/api/ImageLoader.cpp @@ -1,6 +1,6 @@ #include "ImageLoader.h" -std::vector ImageLoader::load_image(char* path) { +std::vector ImageLoader::load_image(const char* path) { cimg_library::CImg image(path); return get_pixels(image); } @@ -22,4 +22,9 @@ std::vector ImageLoader::get_pixels(cimg_library::CImg img } } return ans; +} + +std::pair ImageLoader::get_size(const char *path) { + cimg_library::CImg image(path); + return {image.width(), image.height()}; } \ No newline at end of file diff --git a/server/api/ImageLoader.h b/server/api/ImageLoader.h index ca57561b..8fd09733 100644 --- a/server/api/ImageLoader.h +++ 
b/server/api/ImageLoader.h @@ -7,6 +7,7 @@ class ImageLoader { public: - static std::vector load_image(char* path); + static std::vector load_image(const char* path); static std::vector get_pixels(cimg_library::CImg); + static std::pair get_size(const char* path); }; diff --git a/server/api/UnshuffledCsvLoader.cpp b/server/api/UnshuffledCsvLoader.cpp new file mode 100644 index 00000000..68c6a3f7 --- /dev/null +++ b/server/api/UnshuffledCsvLoader.cpp @@ -0,0 +1,33 @@ +#include +#include "UnshuffledCsvLoader.h" +#include "CsvLoader.h" + +void UnshuffledCsvLoader::load_data(std::string path) { + data.clear(); + auto file_data = CsvLoader::load_csv(path); + data.resize(file_data.size()); + for (int i = 0; i < file_data.size(); ++i) { + float result = file_data[i].back(); + file_data[i].pop_back(); + data[i] = {file_data[i], result}; + } +} + +void UnshuffledCsvLoader::add_data(const UnshuffledDataLoader* other, int index) { + data.push_back(other->get_raw(index)); +} + +std::size_t UnshuffledCsvLoader::size() const { + return data.size(); +} + +std::pair, float> UnshuffledCsvLoader::get_raw(std::size_t index) const { + if (index >= data.size()) { + throw std::out_of_range("Index out of range"); + } + return {data[index].first, data[index].second}; +} + +Shape UnshuffledCsvLoader::get_appropriate_shape(std::size_t index, std::size_t batch_size) const { + return Shape({batch_size, data[index].first.size()}); +} \ No newline at end of file diff --git a/server/api/UnshuffledCsvLoader.h b/server/api/UnshuffledCsvLoader.h new file mode 100644 index 00000000..64b63279 --- /dev/null +++ b/server/api/UnshuffledCsvLoader.h @@ -0,0 +1,18 @@ +#pragma once + +#include +#include +#include "UnshuffledDataLoader.h" + + +class UnshuffledCsvLoader: public UnshuffledDataLoader { +private: + std::vector, float>> data; +public: + UnshuffledCsvLoader() = default; + void load_data(std::string path) override; + void add_data(const UnshuffledDataLoader* other, int index) override; + 
std::size_t size() const override; + std::pair, float> get_raw(std::size_t index) const override; + Shape get_appropriate_shape(std::size_t index, std::size_t batch_size) const override; +}; diff --git a/server/api/UnshuffledDataLoader.h b/server/api/UnshuffledDataLoader.h new file mode 100644 index 00000000..6e938111 --- /dev/null +++ b/server/api/UnshuffledDataLoader.h @@ -0,0 +1,16 @@ +#pragma once + +#include +#include +#include "Allocator.h" + +class UnshuffledDataLoader { +public: + UnshuffledDataLoader() = default; + virtual ~UnshuffledDataLoader() = default; + virtual void load_data(std::string path) = 0; + virtual void add_data(const UnshuffledDataLoader* other, int index) = 0; + virtual std::size_t size() const = 0; + virtual std::pair, float> get_raw(std::size_t index) const = 0; + virtual Shape get_appropriate_shape(std::size_t index, std::size_t batch_size) const = 0; +}; diff --git a/server/api/UnshuffledImgLoader.cpp b/server/api/UnshuffledImgLoader.cpp new file mode 100644 index 00000000..772bf959 --- /dev/null +++ b/server/api/UnshuffledImgLoader.cpp @@ -0,0 +1,42 @@ +#include "UnshuffledImgLoader.h" +#include "CsvLoader.h" +#include "Blob.h" +#include "Allocator.h" +#include "ImageLoader.h" +#include +#include + +void UnshuffledImgLoader::load_data(std::string path) { + for (auto const& dir_entry : std::filesystem::recursive_directory_iterator(path)) { + std::string file_path = dir_entry.path(); + if (file_path.size() >= 4 && file_path.substr(file_path.size() - 4, 4) == ".csv") { + data = CsvLoader::load_labels(file_path.c_str()); + break; + } + } + for (int i = 0; i < data.size(); ++i) { + data[i].first = path + "/" + data[i].first; + } +} + +Shape UnshuffledImgLoader::get_appropriate_shape(std::size_t index, std::size_t batch_size) const { + auto img_size = ImageLoader::get_size(data[index].first.c_str()); + return Shape({batch_size, 3, img_size.first, img_size.second}); +} + +std::pair, float> UnshuffledImgLoader::get_raw(std::size_t index) 
const { + if (index >= data.size()) { + throw std::out_of_range("Index out of range"); + } + std::string file_path = data[index].first; + float ans = data[index].second; + return {ImageLoader::load_image(file_path.c_str()), ans}; +} + +std::size_t UnshuffledImgLoader::size() const { + return data.size(); +} + +void UnshuffledImgLoader::add_data(const UnshuffledDataLoader* other, int index) { + data.push_back(reinterpret_cast(other)->data[index]); +} \ No newline at end of file diff --git a/server/api/UnshuffledImgLoader.h b/server/api/UnshuffledImgLoader.h new file mode 100644 index 00000000..2eb0753c --- /dev/null +++ b/server/api/UnshuffledImgLoader.h @@ -0,0 +1,18 @@ +#pragma once + +#include "UnshuffledDataLoader.h" +#include "Blob.h" +#include +#include + +class UnshuffledImgLoader: public UnshuffledDataLoader { +private: + std::vector> data; +public: + UnshuffledImgLoader() = default; + void load_data(std::string path) override; // path to folder + void add_data(const UnshuffledDataLoader* other, int index) override; + std::size_t size() const override; + std::pair, float> get_raw(std::size_t index) const override; + Shape get_appropriate_shape(std::size_t index, std::size_t batch_size) const override; +}; diff --git a/server/tests/DataMarkerTests.cpp b/server/tests/DataMarkerTests.cpp new file mode 100644 index 00000000..e4ba27a8 --- /dev/null +++ b/server/tests/DataMarkerTests.cpp @@ -0,0 +1,76 @@ +#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN +#include "doctest.h" +#include "DataMarker.h" +#include + +void check_vectors(std::vector, float>>& ans, std::vector, float>>& res) { + CHECK(ans.size() == res.size()); + for (int i = 0; i < ans.size(); ++i) { + CHECK(ans[i].first.size() == res[i].first.size()); + CHECK(ans[i].second == res[i].second); + for (int j = 0; j < ans[i].first.size(); ++j) { + CHECK(ans[i].first[j] == res[i].first[j]); + } + } +} + +TEST_CASE("Csv-test") { + SUBCASE("and-train") { + DataMarker loader = DataMarker("./tests/data/and-train.csv", 
FileExtension::Csv, 50, 1); + DataLoader for_train = loader.get_train_loader(); + DataLoader for_check = loader.get_check_loader(); + std::vector, float>> ans = {{{0, 0}, 0}, {{0, 1}, 0}, {{1, 0}, 0}, {{1, 1}, 1}}; + std::vector, float>> res; + CHECK(for_train.size() == 2); + CHECK(for_check.size() == 2); + for (int i = 0; i < 2; ++i) { + auto line1 = for_train.get_raw(i); + CHECK(line1.second.size() == 1); + res.push_back({line1.first, line1.second[0]}); + auto line2 = for_check.get_raw(i); + CHECK(line2.second.size() == 1); + res.push_back({line2.first, line2.second[0]}); + } + sort(res.begin(), res.end()); + check_vectors(ans, res); + } + SUBCASE("xor-train") { + DataMarker loader = DataMarker("./tests/data/xor-train.csv", FileExtension::Csv, 50, 1); + DataLoader for_train = loader.get_train_loader(); + DataLoader for_check = loader.get_check_loader(); + std::vector, float>> ans = {{{0, 0}, 0}, {{0, 1}, 1}, {{1, 0}, 1}, {{1, 1}, 0}}; + std::vector, float>> res; + CHECK(for_train.size() == 2); + CHECK(for_check.size() == 2); + for (int i = 0; i < 2; ++i) { + auto line1 = for_train.get_raw(i); + CHECK(line1.second.size() == 1); + res.push_back({line1.first, line1.second[0]}); + auto line2 = for_check.get_raw(i); + CHECK(line2.second.size() == 1); + res.push_back({line2.first, line2.second[0]}); + } + sort(res.begin(), res.end()); + check_vectors(ans, res); + } +} + +TEST_CASE("Image-test") { + DataMarker loader = DataMarker("./tests/data/1", FileExtension::Png, 80, 1); + DataLoader for_train = loader.get_train_loader(); + DataLoader for_check = loader.get_check_loader(); + std::vector, float>> ans = {{{255, 255, 255}, 0}, {{0, 0, 0}, 0}, {{159, 252, 253}, 0}, {{255, 255, 0, 0, 255, 255, 0, 0, 0}, 1}, {{0, 255, 100, 153, 136, 255, 0, 174, 100, 217, 0, 255, 0, 201, 100, 234, 21, 255}, 1}}; + std::vector, float>> res; + CHECK(for_train.size() == 4); + CHECK(for_check.size() == 1); + for (int i = 0; i < 4; ++i) { + auto line = for_train.get_raw(i); + 
CHECK(line.second.size() == 1); + res.push_back({line.first, line.second[0]}); + } + CHECK(for_check.get_raw(0).second.size() == 1); + res.push_back({for_check.get_raw(0).first, for_check.get_raw(0).second[0]}); + sort(res.begin(), res.end()); + sort(ans.begin(), ans.end()); + check_vectors(ans, res); +} \ No newline at end of file diff --git a/server/tests/data/1/black_pixel.png b/server/tests/data/1/black_pixel.png new file mode 100644 index 00000000..0279819e Binary files /dev/null and b/server/tests/data/1/black_pixel.png differ diff --git a/server/tests/data/1/labels.csv b/server/tests/data/1/labels.csv new file mode 100644 index 00000000..4d1e5d6a --- /dev/null +++ b/server/tests/data/1/labels.csv @@ -0,0 +1,5 @@ +picture.png, 1 +traffic_light.png, 1 +black_pixel.png, 0 +white_pixel.png, 0 +lazure_pixel.png, 0 diff --git a/server/tests/data/1/lazure_pixel.png b/server/tests/data/1/lazure_pixel.png new file mode 100644 index 00000000..e914eb77 Binary files /dev/null and b/server/tests/data/1/lazure_pixel.png differ diff --git a/server/tests/data/1/picture.png b/server/tests/data/1/picture.png new file mode 100644 index 00000000..7954f390 Binary files /dev/null and b/server/tests/data/1/picture.png differ diff --git a/server/tests/data/1/traffic_light.png b/server/tests/data/1/traffic_light.png new file mode 100644 index 00000000..01542f26 Binary files /dev/null and b/server/tests/data/1/traffic_light.png differ diff --git a/server/tests/data/1/white_pixel.png b/server/tests/data/1/white_pixel.png new file mode 100644 index 00000000..b201b72e Binary files /dev/null and b/server/tests/data/1/white_pixel.png differ