-
Notifications
You must be signed in to change notification settings - Fork 0
GRA-122: Data loader implementation #67
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
1470733
5db37f4
89650b5
d539cdc
866bb3c
30f5e25
aecc66e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,78 @@ | ||
| #include <algorithm> | ||
| #include <random> | ||
| #include <stdexcept> | ||
| #include "DataLoader.h" | ||
| #include "Blob.h" | ||
| #include "Allocator.h" | ||
|
|
||
/// Fills `rearrangement` with a pseudo-random permutation of 0 .. size-1.
///
/// The random engine is re-created with the same fixed seed on every call, so
/// two calls with the same `size` always produce the same permutation.
///
/// @param rearrangement  output vector; resized to `size` and overwritten.
/// @param size           number of indices to permute.
void generate_rearrangement(std::vector<int>& rearrangement, std::size_t size) {
    rearrangement.resize(size);
    // Count with std::size_t: an int counter compared against size() mixes
    // signedness and would overflow before reaching very large sizes.
    for (std::size_t i = 0; i < size; ++i) {
        rearrangement[i] = static_cast<int>(i);
    }
    // Fixed seed keeps the ordering reproducible between runs.
    std::default_random_engine rng{32};
    std::shuffle(rearrangement.begin(), rearrangement.end(), rng);
}
|
|
||
| DataLoader::DataLoader(UnshuffledDataLoader* _loader, std::size_t _batch_size): loader(_loader), batch_size(_batch_size) { | ||
| generate_rearrangement(rearrangement, loader->size()); | ||
| } | ||
|
|
||
| DataLoader::DataLoader(UnshuffledDataLoader* _loader, std::size_t _batch_size, std::string path): loader(_loader), batch_size(_batch_size) { | ||
| loader->load_data(path); | ||
| generate_rearrangement(rearrangement, loader->size()); | ||
| } | ||
|
|
||
| void DataLoader::load_data(std::string path) { | ||
| loader->load_data(path); | ||
| } | ||
|
|
||
| std::pair<Blob, std::vector<float>> DataLoader::operator[](std::size_t index) const { // batch_size lines from index | ||
| if (index >= loader->size()) { | ||
| throw std::out_of_range("Index out of range"); | ||
| } | ||
| auto data = get_raw(index); | ||
| Shape shape = loader->get_appropriate_shape(index, batch_size); | ||
| return {Blob::constBlob(shape, data.first.data()), data.second}; | ||
| } | ||
|
|
||
| std::size_t DataLoader::size() const { | ||
| return loader->size(); | ||
| } | ||
|
|
||
| void DataLoader::add_data(const DataLoader& other, int index) { | ||
| loader->add_data(other.loader, index); | ||
| } | ||
|
|
||
| std::pair<std::vector<float>, std::vector<float>> DataLoader::get_raw(std::size_t index) const { // batch_size lines from index | ||
| if (index >= loader->size()) { | ||
| throw std::out_of_range("Index out of range"); | ||
| } | ||
| std::vector<float> data; | ||
| std::vector<float> res(batch_size, 0); | ||
| Shape shape = loader->get_appropriate_shape(rearrangement[index], batch_size); | ||
| auto dims = shape.getDims(); | ||
| int data_size = 1; | ||
| for (int i = 0; i < dims.size(); ++i) { | ||
| data_size *= dims[i]; | ||
| } | ||
|
Comment on lines
+55
to
+59
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. По-моему, я видел у |
||
| data.resize(data_size, 0); | ||
| int cur_data = 0; | ||
| for (int i = index; i < index + batch_size; ++i) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [ПОФИГ] |
||
| if (i >= loader->size()) { | ||
| break; | ||
| } | ||
| auto line = loader->get_raw(rearrangement[i]); | ||
| res[i - index] = line.second; | ||
| for (int j = 0; j < line.first.size(); ++j) { | ||
| data[cur_data] = line.first[j]; | ||
| cur_data++; | ||
| } | ||
| } | ||
| return {data, res}; | ||
| } | ||
|
|
||
| void DataLoader::shuffle() { | ||
| generate_rearrangement(rearrangement, loader->size()); | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,23 @@ | ||
| #pragma once | ||
|
|
||
| #include "UnshuffledDataLoader.h" | ||
| #include <vector> | ||
|
|
||
| void generate_rearrangement(std::vector<int>& rearrangement, std::size_t size); | ||
|
|
||
| class DataLoader { | ||
| private: | ||
| UnshuffledDataLoader* loader; | ||
| std::vector<int> rearrangement; | ||
| std::size_t batch_size; | ||
| public: | ||
| DataLoader() = default; | ||
| DataLoader(UnshuffledDataLoader* _loader, std::size_t _batch_size); | ||
| DataLoader(UnshuffledDataLoader* _loader, std::size_t _batch_size, std::string path); | ||
| void load_data(std::string path); | ||
| std::pair<Blob, std::vector<float>> operator[](std::size_t index) const; | ||
| void add_data(const DataLoader& other, int index); | ||
| std::size_t size() const; | ||
| std::pair<std::vector<float>, std::vector<float>> get_raw(std::size_t index) const; | ||
| void shuffle(); | ||
| }; |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,56 @@ | ||
| #include <stdexcept> | ||
| #include "DataMarker.h" | ||
| #include "UnshuffledCsvLoader.h" | ||
| #include "UnshuffledImgLoader.h" | ||
| #include "Blob.h" | ||
|
|
||
| DataMarker::DataMarker(std::string path, FileExtension type, int percentage_for_train, std::size_t batch_size) { | ||
| if (percentage_for_train > 100 || percentage_for_train < 0) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. А не лучше ли тут |
||
| throw std::logic_error("Wrong percentage"); | ||
| } | ||
| DataLoader file_loader; | ||
| UnshuffledDataLoader* file_unshuffled_loader; | ||
| if (type == FileExtension::Csv) { | ||
| file_unshuffled_loader = new UnshuffledCsvLoader; | ||
| train_unshuffled_loader = new UnshuffledCsvLoader; | ||
| check_unshuffled_loader = new UnshuffledCsvLoader; | ||
| } | ||
| else if (type == FileExtension::Png) { | ||
| file_unshuffled_loader = new UnshuffledImgLoader; | ||
| train_unshuffled_loader = new UnshuffledImgLoader; | ||
| check_unshuffled_loader = new UnshuffledImgLoader; | ||
| } | ||
| else { | ||
| throw std::logic_error("Unsupported type"); | ||
| } | ||
| file_loader = DataLoader(file_unshuffled_loader, batch_size, path); | ||
| std::vector<int> rearrangement; | ||
| generate_rearrangement(rearrangement, file_loader.size()); | ||
| train_loader = DataLoader(train_unshuffled_loader, batch_size); | ||
| check_loader = DataLoader(check_unshuffled_loader, batch_size); | ||
| int instances_for_train = percentage_for_train * (file_loader.size()) / 100; | ||
| for (int i = 0; i < file_loader.size(); ++i) { | ||
| if (i < instances_for_train) { | ||
| train_loader.add_data(file_loader, rearrangement[i]); | ||
| } | ||
| else { | ||
| check_loader.add_data(file_loader, rearrangement[i]); | ||
| } | ||
| } | ||
| train_loader.shuffle(); | ||
| check_loader.shuffle(); | ||
| delete file_unshuffled_loader; | ||
| } | ||
|
|
||
| DataMarker::~DataMarker() { | ||
| delete train_unshuffled_loader; | ||
| delete check_unshuffled_loader; | ||
| } | ||
|
|
||
| DataLoader DataMarker::get_check_loader() { | ||
| return check_loader; | ||
| } | ||
|
|
||
| DataLoader DataMarker::get_train_loader() { | ||
| return train_loader; | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,21 @@ | ||
| #pragma once | ||
|
|
||
| #include <string> | ||
| #include "UnshuffledDataLoader.h" | ||
| #include "DataLoader.h" | ||
|
|
||
/// Input formats a DataMarker can be asked to load.
enum class FileExtension {
    Csv,
    Png
};
|
|
||
| class DataMarker { | ||
| private: | ||
| UnshuffledDataLoader* train_unshuffled_loader; | ||
| DataLoader train_loader; | ||
| UnshuffledDataLoader* check_unshuffled_loader; | ||
| DataLoader check_loader; | ||
| public: | ||
| DataMarker() = default; | ||
| DataMarker(std::string path, FileExtension file_type, int percentage_for_train, std::size_t batch_size); | ||
| ~DataMarker(); | ||
| DataLoader get_train_loader(); | ||
| DataLoader get_check_loader(); | ||
| }; |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,33 @@ | ||
| #include <stdexcept> | ||
| #include "UnshuffledCsvLoader.h" | ||
| #include "CsvLoader.h" | ||
|
|
||
| void UnshuffledCsvLoader::load_data(std::string path) { | ||
| data.clear(); | ||
| auto file_data = CsvLoader::load_csv(path); | ||
| data.resize(file_data.size()); | ||
| for (int i = 0; i < file_data.size(); ++i) { | ||
| float result = file_data[i].back(); | ||
| file_data[i].pop_back(); | ||
| data[i] = {file_data[i], result}; | ||
| } | ||
| } | ||
|
|
||
| void UnshuffledCsvLoader::add_data(const UnshuffledDataLoader* other, int index) { | ||
| data.push_back(other->get_raw(index)); | ||
| } | ||
|
|
||
| std::size_t UnshuffledCsvLoader::size() const { | ||
| return data.size(); | ||
| } | ||
|
|
||
| std::pair<std::vector<float>, float> UnshuffledCsvLoader::get_raw(std::size_t index) const { | ||
| if (index >= data.size()) { | ||
| throw std::out_of_range("Index out of range"); | ||
| } | ||
| return {data[index].first, data[index].second}; | ||
| } | ||
|
|
||
| Shape UnshuffledCsvLoader::get_appropriate_shape(std::size_t index, std::size_t batch_size) const { | ||
| return Shape({batch_size, data[index].first.size()}); | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,18 @@ | ||
| #pragma once | ||
|
|
||
| #include <vector> | ||
| #include <string> | ||
| #include "UnshuffledDataLoader.h" | ||
|
|
||
|
|
||
| class UnshuffledCsvLoader: public UnshuffledDataLoader { | ||
| private: | ||
| std::vector<std::pair<std::vector<float>, float>> data; | ||
| public: | ||
| UnshuffledCsvLoader() = default; | ||
| void load_data(std::string path) override; | ||
| void add_data(const UnshuffledDataLoader* other, int index) override; | ||
| std::size_t size() const override; | ||
| std::pair<std::vector<float>, float> get_raw(std::size_t index) const override; | ||
| Shape get_appropriate_shape(std::size_t index, std::size_t batch_size) const override; | ||
| }; |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,16 @@ | ||
| #pragma once | ||
|
|
||
| #include <string> | ||
| #include <vector> | ||
| #include "Allocator.h" | ||
|
|
||
| class UnshuffledDataLoader { | ||
| public: | ||
| UnshuffledDataLoader() = default; | ||
| virtual ~UnshuffledDataLoader() = default; | ||
| virtual void load_data(std::string path) = 0; | ||
| virtual void add_data(const UnshuffledDataLoader* other, int index) = 0; | ||
| virtual std::size_t size() const = 0; | ||
| virtual std::pair<std::vector<float>, float> get_raw(std::size_t index) const = 0; | ||
| virtual Shape get_appropriate_shape(std::size_t index, std::size_t batch_size) const = 0; | ||
| }; |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,42 @@ | ||
| #include "UnshuffledImgLoader.h" | ||
| #include "CsvLoader.h" | ||
| #include "Blob.h" | ||
| #include "Allocator.h" | ||
| #include "ImageLoader.h" | ||
| #include <filesystem> | ||
| #include <string> | ||
|
|
||
| void UnshuffledImgLoader::load_data(std::string path) { | ||
| for (auto const& dir_entry : std::filesystem::recursive_directory_iterator(path)) { | ||
| std::string file_path = dir_entry.path(); | ||
| if (file_path.size() >= 4 && file_path.substr(file_path.size() - 4, 4) == ".csv") { | ||
| data = CsvLoader::load_labels(file_path.c_str()); | ||
| break; | ||
| } | ||
| } | ||
| for (int i = 0; i < data.size(); ++i) { | ||
| data[i].first = path + "/" + data[i].first; | ||
| } | ||
| } | ||
|
|
||
| Shape UnshuffledImgLoader::get_appropriate_shape(std::size_t index, std::size_t batch_size) const { | ||
| auto img_size = ImageLoader::get_size(data[index].first.c_str()); | ||
| return Shape({batch_size, 3, img_size.first, img_size.second}); | ||
| } | ||
|
|
||
| std::pair<std::vector<float>, float> UnshuffledImgLoader::get_raw(std::size_t index) const { | ||
| if (index >= data.size()) { | ||
| throw std::out_of_range("Index out of range"); | ||
| } | ||
| std::string file_path = data[index].first; | ||
| float ans = data[index].second; | ||
| return {ImageLoader::load_image(file_path.c_str()), ans}; | ||
| } | ||
|
|
||
| std::size_t UnshuffledImgLoader::size() const { | ||
| return data.size(); | ||
| } | ||
|
|
||
| void UnshuffledImgLoader::add_data(const UnshuffledDataLoader* other, int index) { | ||
| data.push_back(reinterpret_cast<const UnshuffledImgLoader*>(other)->data[index]); | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,18 @@ | ||
| #pragma once | ||
|
|
||
| #include "UnshuffledDataLoader.h" | ||
| #include "Blob.h" | ||
| #include <vector> | ||
| #include <string> | ||
|
|
||
| class UnshuffledImgLoader: public UnshuffledDataLoader { | ||
| private: | ||
| std::vector<std::pair<std::string, float>> data; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Вот здесь и в остальных местах тоже: много раз используется вот эта пара, не лучше ли написать структурку с понятными названиями полей? |
||
| public: | ||
| UnshuffledImgLoader() = default; | ||
| void load_data(std::string path) override; // path to folder | ||
| void add_data(const UnshuffledDataLoader* other, int index) override; | ||
| std::size_t size() const override; | ||
| std::pair<std::vector<float>, float> get_raw(std::size_t index) const override; | ||
| Shape get_appropriate_shape(std::size_t index, std::size_t batch_size) const override; | ||
| }; | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Кажется, методы можно сделать `const`.
Но тут как хочешь, раз уж мы договорились придерживаться принципа ПОФИГ.