Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions server/api/CsvLoader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,23 @@ std::vector<std::vector<float>> CsvLoader::load_csv(std::string path) {
}
return result;
}

/// Reads a two-column "file,label" CSV file.
///
/// Each non-blank line is split on ','; the first field is kept verbatim as the
/// file name and the second field is parsed as a float with std::stof. Any
/// further fields on the line are ignored.
///
/// @param path  Path of the CSV file to read.
/// @return      One (file name, label) pair per non-blank line, in file order.
/// @throws std::runtime_error     if the file cannot be opened.
/// @throws std::invalid_argument  (from std::stof) if a label field is not a number.
/// @throws std::out_of_range      (from std::stof) if a label does not fit a float.
std::vector<std::pair<std::string, float>> CsvLoader::load_labels(std::string path) {
    std::ifstream fin(path);
    if (!fin) {
        throw std::runtime_error("No such csv file in directory");
    }
    std::vector<std::pair<std::string, float>> result;
    std::string line;
    // Loop on the stream state rather than on "line is empty", so a blank line
    // in the middle of the file no longer silently discards everything after it.
    while (std::getline(fin, line)) {
        // Files written on Windows end lines with "\r\n"; drop the stray '\r'
        // so it neither ends up in a file name nor turns a blank line into
        // a malformed record.
        if (!line.empty() && line.back() == '\r') {
            line.pop_back();
        }
        if (line.empty()) {
            continue;
        }
        std::stringstream line_stream(line);
        std::string file;
        std::string label;
        std::getline(line_stream, file, ',');
        std::getline(line_stream, label, ',');
        result.push_back({file, std::stof(label)});
    }
    return result;
}
1 change: 1 addition & 0 deletions server/api/CsvLoader.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@
// Static helpers for reading CSV files from disk.
class CsvLoader {
public:
// Returns the CSV at `path` as rows of floats. Its definition is only partly
// visible in this view, so the parsing rules are not described here.
static std::vector<std::vector<float>> load_csv(std::string path);
// Reads "file,label" lines from the CSV at `path` and returns them as
// (file name, label) pairs; throws std::runtime_error when the file cannot
// be opened.
static std::vector<std::pair<std::string, float>> load_labels(std::string path);
Comment on lines 8 to +9
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Кажется, методы можно сделать const
Но тут как хочешь, раз уж мы договорились придерживаться принципа ПОФИГ

};
78 changes: 78 additions & 0 deletions server/api/DataLoader.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#include <algorithm>
#include <random>
#include <stdexcept>
#include "DataLoader.h"
#include "Blob.h"
#include "Allocator.h"

/// Fills `rearrangement` with a pseudo-random permutation of 0 .. size-1.
///
/// The vector is resized to `size`; any previous contents are discarded.
///
/// The random engine is created once per process with a fixed seed, so a run
/// is reproducible, but successive calls continue the same random sequence
/// and therefore produce different permutations. (Re-seeding on every call
/// made each "shuffle" return exactly the same order.) The first call still
/// yields the same permutation a freshly seeded engine would.
///
/// Concurrent calls from several threads are not synchronized.
///
/// @param rearrangement  Output vector, overwritten with the permutation.
/// @param size           Number of indices to permute.
void generate_rearrangement(std::vector<int>& rearrangement, std::size_t size) {
    rearrangement.resize(size);
    for (std::size_t i = 0; i < size; ++i) {
        rearrangement[i] = static_cast<int>(i);
    }
    static std::default_random_engine rng{32};
    std::shuffle(rearrangement.begin(), rearrangement.end(), rng);
}

/// Wraps an existing loader and builds a shuffled index over its current rows.
///
/// @param _loader      Loader to read from; it is dereferenced immediately.
/// @param _batch_size  Number of rows returned per batch.
DataLoader::DataLoader(UnshuffledDataLoader* _loader, std::size_t _batch_size)
    : loader(_loader),
      batch_size(_batch_size) {
    const std::size_t row_count = loader->size();
    generate_rearrangement(rearrangement, row_count);
}

/// Wraps a loader, makes it load `path`, then builds a shuffled index over
/// the rows that were loaded.
///
/// @param _loader      Loader to read from; it is dereferenced immediately.
/// @param _batch_size  Number of rows returned per batch.
/// @param path         Passed unchanged to the loader's load_data().
DataLoader::DataLoader(UnshuffledDataLoader* _loader, std::size_t _batch_size, std::string path)
    : loader(_loader),
      batch_size(_batch_size) {
    loader->load_data(path);
    const std::size_t row_count = loader->size();
    generate_rearrangement(rearrangement, row_count);
}

void DataLoader::load_data(std::string path) {
loader->load_data(path);
}

/// Returns the batch of `batch_size` rows starting at position `index` of the
/// shuffled order, as a Blob plus the matching labels.
///
/// @param index  Position of the first row of the batch in the shuffled order.
/// @return       (Blob built from the batch data, labels of the batch).
/// @throws std::out_of_range if `index` is not a valid row position.
std::pair<Blob, std::vector<float>> DataLoader::operator[](std::size_t index) const {
    // The second test guards against a shuffled index that is shorter than
    // the data (for example rows added without a following shuffle()).
    if (index >= loader->size() || index >= rearrangement.size()) {
        throw std::out_of_range("Index out of range");
    }
    auto data = get_raw(index);
    // get_raw() sizes its buffer from the shape of row rearrangement[index],
    // so the Blob must be described by that same row. Asking for the shape of
    // the unshuffled row `index` could describe a different amount of data.
    Shape shape = loader->get_appropriate_shape(rearrangement[index], batch_size);
    // NOTE(review): `data` is a local; this assumes Blob::constBlob copies
    // the buffer rather than keeping the pointer - confirm against Blob.
    return {Blob::constBlob(shape, data.first.data()), data.second};
}

/// Number of rows held by the underlying loader (rows, not batches).
std::size_t DataLoader::size() const {
    const std::size_t row_count = loader->size();
    return row_count;
}

void DataLoader::add_data(const DataLoader& other, int index) {
loader->add_data(other.loader, index);
}

// Collects up to batch_size rows, starting at position `index` of the
// shuffled order, into one flat buffer.
// Returns (flattened row data, labels). The labels vector always has
// batch_size entries; entries for rows past the end of the data stay 0.
// Throws std::out_of_range when `index` is not smaller than the loader size.
std::pair<std::vector<float>, std::vector<float>> DataLoader::get_raw(std::size_t index) const { // batch_size lines from index
if (index >= loader->size()) {
throw std::out_of_range("Index out of range");
}
std::vector<float> data;
std::vector<float> res(batch_size, 0);
// The buffer size is derived from the shape reported for the first row of the batch.
Shape shape = loader->get_appropriate_shape(rearrangement[index], batch_size);
auto dims = shape.getDims();
// data_size = product of all dimensions of that shape.
int data_size = 1;
for (int i = 0; i < dims.size(); ++i) {
data_size *= dims[i];
}
Comment on lines +55 to +59
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

По-моему, я видел у Shape метод size(), делающий ровно это

data.resize(data_size, 0);
// cur_data is the next free slot in the flat buffer.
int cur_data = 0;
// NOTE(review): `i` is a signed int compared against unsigned values here.
for (int i = index; i < index + batch_size; ++i) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[ПОФИГ]
Насколько я понимаю, index - это откуда мы читаем батч. Лично мне было бы удобнее подавать номер батча, и тогда цикл выглядел бы как
for (int i = index * batch_size; i < (index + 1) * batch_size; ++i)
Но в целом реально ПОФИГ, лучше оставить как есть, раз работает)

// A short final batch: stop at the end of the data and leave the rest zero-filled.
if (i >= loader->size()) {
break;
}
auto line = loader->get_raw(rearrangement[i]);
res[i - index] = line.second;
// NOTE(review): writes are not bounds-checked; this assumes every row in the
// batch has the same element count as the first one - confirm for image data.
for (int j = 0; j < line.first.size(); ++j) {
data[cur_data] = line.first[j];
cur_data++;
}
}
return {data, res};
}

/// Rebuilds the shuffled index so that it covers every row currently held by
/// the underlying loader.
void DataLoader::shuffle() {
    const std::size_t row_count = loader->size();
    generate_rearrangement(rearrangement, row_count);
}
23 changes: 23 additions & 0 deletions server/api/DataLoader.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#pragma once

#include "UnshuffledDataLoader.h"
#include <vector>

void generate_rearrangement(std::vector<int>& rearrangement, std::size_t size);

/// Batched view over an UnshuffledDataLoader that reads rows in a shuffled
/// order.
///
/// The wrapped loader is not owned: this class never deletes it, so it must
/// outlive the DataLoader and every copy of it.
class DataLoader {
private:
    // Source of the rows. Null for a default-constructed object, which must
    // be assigned a real DataLoader before any other member is called.
    UnshuffledDataLoader* loader = nullptr;
    // rearrangement[i] is the loader row used at position i.
    std::vector<int> rearrangement;
    // Number of rows per batch; given a defined value (0) by default so a
    // default-constructed object can be copied safely.
    std::size_t batch_size = 0;
public:
    DataLoader() = default;
    /// Wraps `_loader` and shuffles the rows it currently holds.
    DataLoader(UnshuffledDataLoader* _loader, std::size_t _batch_size);
    /// Wraps `_loader`, loads `path` into it, then shuffles the rows.
    DataLoader(UnshuffledDataLoader* _loader, std::size_t _batch_size, std::string path);
    /// Loads `path` into the wrapped loader.
    void load_data(std::string path);
    /// Batch starting at position `index`, as a Blob plus labels.
    std::pair<Blob, std::vector<float>> operator[](std::size_t index) const;
    /// Appends row `index` of the loader wrapped by `other`.
    void add_data(const DataLoader& other, int index);
    /// Number of rows (not batches) in the wrapped loader.
    std::size_t size() const;
    /// Batch starting at position `index`, as flat data plus labels.
    std::pair<std::vector<float>, std::vector<float>> get_raw(std::size_t index) const;
    /// Regenerates the shuffled order.
    void shuffle();
};
56 changes: 56 additions & 0 deletions server/api/DataMarker.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#include <stdexcept>
#include "DataMarker.h"
#include "UnshuffledCsvLoader.h"
#include "UnshuffledImgLoader.h"
#include "Blob.h"

// Loads the data set at `path` and splits it into a training part and a
// check part.
//   path                 - forwarded to the loader chosen by `type`
//   type                 - Csv selects UnshuffledCsvLoader, Png selects UnshuffledImgLoader
//   percentage_for_train - share of rows (0..100) that goes to the training loader
//   batch_size           - batch size given to both resulting loaders
// Throws std::logic_error for a percentage outside 0..100 or an unsupported type.
DataMarker::DataMarker(std::string path, FileExtension type, int percentage_for_train, std::size_t batch_size) {
if (percentage_for_train > 100 || percentage_for_train < 0) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

А не лучше ли тут float? Вряд ли, конечно, кому-то нужно именно 20,5% на тест, но как будто бы можно сделать более гибко практически бесплатно (ну и в торче/sklearn так сделано))
НО! Так как у нас ПОФИГ, на это можно забить, так тоже норм)

throw std::logic_error("Wrong percentage");
}
DataLoader file_loader;
// Temporary loader for the whole file; deleted at the end of this constructor.
UnshuffledDataLoader* file_unshuffled_loader;
if (type == FileExtension::Csv) {
file_unshuffled_loader = new UnshuffledCsvLoader;
train_unshuffled_loader = new UnshuffledCsvLoader;
check_unshuffled_loader = new UnshuffledCsvLoader;
}
else if (type == FileExtension::Png) {
file_unshuffled_loader = new UnshuffledImgLoader;
train_unshuffled_loader = new UnshuffledImgLoader;
check_unshuffled_loader = new UnshuffledImgLoader;
}
else {
throw std::logic_error("Unsupported type");
}
// NOTE(review): if anything below throws (for example while loading `path`),
// nothing visible here frees the three loaders allocated above - confirm.
file_loader = DataLoader(file_unshuffled_loader, batch_size, path);
// Random order in which rows of the file are handed out to the two parts.
std::vector<int> rearrangement;
generate_rearrangement(rearrangement, file_loader.size());
train_loader = DataLoader(train_unshuffled_loader, batch_size);
check_loader = DataLoader(check_unshuffled_loader, batch_size);
// Integer division: the training share is rounded down.
int instances_for_train = percentage_for_train * (file_loader.size()) / 100;
// The first instances_for_train rows of the random order go to training, the rest to check.
for (int i = 0; i < file_loader.size(); ++i) {
if (i < instances_for_train) {
train_loader.add_data(file_loader, rearrangement[i]);
}
else {
check_loader.add_data(file_loader, rearrangement[i]);
}
}
// Rebuild each loader's own shuffled index now that it has its rows.
train_loader.shuffle();
check_loader.shuffle();
delete file_unshuffled_loader;
}

/// Frees the two loaders owned by this object, training loader first.
DataMarker::~DataMarker() {
    UnshuffledDataLoader* const owned[] = {train_unshuffled_loader, check_unshuffled_loader};
    for (UnshuffledDataLoader* item : owned) {
        delete item;
    }
}

/// Returns a copy of the loader over the check part of the data.
/// The copy reads from a loader that this DataMarker deletes in its
/// destructor, so it must not be used after the DataMarker is gone.
DataLoader DataMarker::get_check_loader() {
    DataLoader copy = check_loader;
    return copy;
}

/// Returns a copy of the loader over the training part of the data.
/// The copy reads from a loader that this DataMarker deletes in its
/// destructor, so it must not be used after the DataMarker is gone.
DataLoader DataMarker::get_train_loader() {
    DataLoader copy = train_loader;
    return copy;
}
21 changes: 21 additions & 0 deletions server/api/DataMarker.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#pragma once

#include <string>
#include "UnshuffledDataLoader.h"
#include "DataLoader.h"

/// Kind of data set a DataMarker is asked to load.
enum class FileExtension {
    Csv,  ///< a CSV data set
    Png   ///< an image data set
};

class DataMarker {
private:
UnshuffledDataLoader* train_unshuffled_loader;
DataLoader train_loader;
UnshuffledDataLoader* check_unshuffled_loader;
DataLoader check_loader;
public:
DataMarker() = default;
DataMarker(std::string path, FileExtension file_type, int percentage_for_train, std::size_t batch_size);
~DataMarker();
DataLoader get_train_loader();
DataLoader get_check_loader();
};
7 changes: 6 additions & 1 deletion server/api/ImageLoader.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include "ImageLoader.h"

std::vector<float> ImageLoader::load_image(char* path) {
std::vector<float> ImageLoader::load_image(const char* path) {
cimg_library::CImg<unsigned char> image(path);
return get_pixels(image);
}
Expand All @@ -22,4 +22,9 @@ std::vector<float> ImageLoader::get_pixels(cimg_library::CImg<unsigned char> img
}
}
return ans;
}

std::pair<std::size_t, std::size_t> ImageLoader::get_size(const char *path) {
cimg_library::CImg<unsigned char> image(path);
return {image.width(), image.height()};
}
3 changes: 2 additions & 1 deletion server/api/ImageLoader.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

// Static helpers for reading image files through CImg.
class ImageLoader {
public:
// NOTE(review): the next two declarations look like the before/after lines
// of the diff (char* replaced by const char*); only the const char* version
// has a definition in this view.
static std::vector<float> load_image(char* path);
// Opens the image at `path` and returns get_pixels() of it.
static std::vector<float> load_image(const char* path);
// Converts an already opened image to a flat vector of floats.
static std::vector<float> get_pixels(cimg_library::CImg<unsigned char>);
// Opens the image at `path` and returns (width, height).
static std::pair<std::size_t, std::size_t> get_size(const char* path);
};
33 changes: 33 additions & 0 deletions server/api/UnshuffledCsvLoader.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#include <stdexcept>
#include "UnshuffledCsvLoader.h"
#include "CsvLoader.h"

/// Replaces the stored rows with the contents of the CSV file at `path`.
///
/// The last value of every CSV row is taken as the label and the values
/// before it as the features.
///
/// If reading or parsing fails the loader is left empty.
///
/// @param path  Passed to CsvLoader::load_csv.
/// @throws std::runtime_error if a row has no values (nothing to use as label).
///         Exceptions thrown by CsvLoader::load_csv propagate unchanged.
void UnshuffledCsvLoader::load_data(std::string path) {
    data.clear();
    auto file_data = CsvLoader::load_csv(path);

    std::vector<std::pair<std::vector<float>, float>> parsed;
    parsed.reserve(file_data.size());
    for (auto& row : file_data) {
        // back()/pop_back() on an empty row would be undefined behaviour.
        if (row.empty()) {
            throw std::runtime_error("Empty row in csv file");
        }
        const float label = row.back();
        row.pop_back();
        parsed.emplace_back();
        // Steal the row's storage instead of copying every value.
        parsed.back().first.swap(row);
        parsed.back().second = label;
    }
    data.swap(parsed);
}

/// Appends row `index` of `other`, obtained through its get_raw(), to the
/// rows stored here.
void UnshuffledCsvLoader::add_data(const UnshuffledDataLoader* other, int index) {
    data.emplace_back(other->get_raw(index));
}

/// Number of rows currently stored.
std::size_t UnshuffledCsvLoader::size() const {
    const std::size_t row_count = data.size();
    return row_count;
}

/// Returns a copy of row `index` as (features, label).
///
/// @throws std::out_of_range if `index` is not smaller than size().
std::pair<std::vector<float>, float> UnshuffledCsvLoader::get_raw(std::size_t index) const {
    const bool in_range = index < data.size();
    if (!in_range) {
        throw std::out_of_range("Index out of range");
    }
    const auto& row = data[index];
    return {row.first, row.second};
}

/// Shape of a batch whose rows look like row `index`:
/// {batch_size, number of features in that row}.
///
/// @throws std::out_of_range if `index` is not smaller than size(), matching
///         the check done by get_raw().
Shape UnshuffledCsvLoader::get_appropriate_shape(std::size_t index, std::size_t batch_size) const {
    if (index >= data.size()) {
        throw std::out_of_range("Index out of range");
    }
    return Shape({batch_size, data[index].first.size()});
}
18 changes: 18 additions & 0 deletions server/api/UnshuffledCsvLoader.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#pragma once

#include <vector>
#include <string>
#include "UnshuffledDataLoader.h"


/// UnshuffledDataLoader that keeps the rows of a CSV file in memory.
class UnshuffledCsvLoader : public UnshuffledDataLoader {
public:
    UnshuffledCsvLoader() = default;

    /// Replaces the stored rows with the contents of the CSV file at `path`.
    void load_data(std::string path) override;

    /// Appends row `index` of `other`.
    void add_data(const UnshuffledDataLoader* other, int index) override;

    /// Number of stored rows.
    std::size_t size() const override;

    /// Copy of row `index` as (features, label).
    std::pair<std::vector<float>, float> get_raw(std::size_t index) const override;

    /// Shape of a batch of `batch_size` rows shaped like row `index`.
    Shape get_appropriate_shape(std::size_t index, std::size_t batch_size) const override;

private:
    // One entry per row: (feature values, label).
    std::vector<std::pair<std::vector<float>, float>> data;
};
16 changes: 16 additions & 0 deletions server/api/UnshuffledDataLoader.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#pragma once

#include <string>
#include <vector>
#include "Allocator.h"

// Interface of a data source that hands out rows in their stored order.
// A row is a pair (feature values, label).
class UnshuffledDataLoader {
public:
UnshuffledDataLoader() = default;
// Virtual so implementations can be deleted through a base pointer.
virtual ~UnshuffledDataLoader() = default;
// Loads the data found at `path`.
virtual void load_data(std::string path) = 0;
// Appends row `index` of `other` to this loader.
virtual void add_data(const UnshuffledDataLoader* other, int index) = 0;
// Number of rows currently held.
virtual std::size_t size() const = 0;
// Row `index` as (feature values, label).
virtual std::pair<std::vector<float>, float> get_raw(std::size_t index) const = 0;
// Shape of a batch of `batch_size` rows shaped like row `index`.
virtual Shape get_appropriate_shape(std::size_t index, std::size_t batch_size) const = 0;
};
42 changes: 42 additions & 0 deletions server/api/UnshuffledImgLoader.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#include "UnshuffledImgLoader.h"
#include "CsvLoader.h"
#include "Blob.h"
#include "Allocator.h"
#include "ImageLoader.h"
#include <filesystem>
#include <string>

/// Replaces the stored (image path, label) list with the one described by
/// the first ".csv" file found under the folder `path`.
///
/// The folder is searched recursively; the first entry whose path ends in
/// ".csv" is read with CsvLoader::load_labels and the search stops. Every
/// file name from that list is then prefixed with `path` + "/". If no such
/// file exists the loader ends up empty.
///
/// @param path  Folder that contains the labels file and the images.
void UnshuffledImgLoader::load_data(std::string path) {
    // Start from a clean state: without this, a folder with no labels file
    // would leave the previous entries in place and prefix them a second time.
    data.clear();
    for (auto const& dir_entry : std::filesystem::recursive_directory_iterator(path)) {
        // string() is the portable conversion; the implicit one only yields a
        // std::string on platforms whose native path type is char-based.
        const std::string file_path = dir_entry.path().string();
        if (file_path.size() >= 4 && file_path.compare(file_path.size() - 4, 4, ".csv") == 0) {
            data = CsvLoader::load_labels(file_path);
            break;
        }
    }
    // NOTE(review): names are joined to `path` even when the labels file was
    // found in a sub-folder - confirm that is the intended layout.
    for (auto& entry : data) {
        entry.first = path + "/" + entry.first;
    }
}

/// Shape of a batch of images sized like image `index`:
/// {batch_size, 3, width, height}, where (width, height) is what
/// ImageLoader::get_size reports for that image file.
///
/// @throws std::out_of_range if `index` is not smaller than size(), matching
///         the check done by get_raw().
Shape UnshuffledImgLoader::get_appropriate_shape(std::size_t index, std::size_t batch_size) const {
    if (index >= data.size()) {
        throw std::out_of_range("Index out of range");
    }
    // NOTE(review): the order width-then-height and the fixed 3 channels are
    // assumed to match the layout produced by ImageLoader::get_pixels - confirm.
    auto img_size = ImageLoader::get_size(data[index].first.c_str());
    return Shape({batch_size, 3, img_size.first, img_size.second});
}

std::pair<std::vector<float>, float> UnshuffledImgLoader::get_raw(std::size_t index) const {
if (index >= data.size()) {
throw std::out_of_range("Index out of range");
}
std::string file_path = data[index].first;
float ans = data[index].second;
return {ImageLoader::load_image(file_path.c_str()), ans};
}

/// Number of (image path, label) entries currently stored.
std::size_t UnshuffledImgLoader::size() const {
    const std::size_t entry_count = data.size();
    return entry_count;
}

void UnshuffledImgLoader::add_data(const UnshuffledDataLoader* other, int index) {
data.push_back(reinterpret_cast<const UnshuffledImgLoader*>(other)->data[index]);
}
18 changes: 18 additions & 0 deletions server/api/UnshuffledImgLoader.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#pragma once

#include "UnshuffledDataLoader.h"
#include "Blob.h"
#include <vector>
#include <string>

// UnshuffledDataLoader for a folder of images described by a labels file.
// Only paths and labels are kept in memory; pixels are read on request.
class UnshuffledImgLoader: public UnshuffledDataLoader {
private:
// One entry per image: (path of the image file, label).
std::vector<std::pair<std::string, float>> data;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Вот здесь и в остальных местах тоже: много раз используется вот эта пара, не лучше ли написать структурку с понятными названиями полей?

public:
UnshuffledImgLoader() = default;
// Reads the labels file found under the folder and stores its entries.
void load_data(std::string path) override; // path to folder
// Appends entry `index` of `other`.
void add_data(const UnshuffledDataLoader* other, int index) override;
// Number of stored entries.
std::size_t size() const override;
// Loads image `index` from disk and returns (pixel values, label).
std::pair<std::vector<float>, float> get_raw(std::size_t index) const override;
// Shape of a batch of `batch_size` images sized like image `index`.
Shape get_appropriate_shape(std::size_t index, std::size_t batch_size) const override;
};
Loading