aca-kmeans/parallel.cpp at main · A7F/aca-kmeans · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <ctime>
#include <cmath>
#include <chrono>
#include "omp.h"
#include "point.h"
#include "cluster.h"

using namespace std;

// define how many clusters of points we want and the total number of points
const int num_clusters = 4;
const int num_points = 10000;
const int max_iters = 250;
const int threads = 4;

// specify here your absolute path to project folder
const string base_dir = R"(C:\Users\rockt\CLionProjects\aca-kmeans\)";

/**
* specify here which dataset you want to use. Datasets available:
* "points" = random points
* "iris" = iris dataset, 3 clusters known, 150 points
* "blobs" = synthetic clusters generated from sk-learn with function make_blobs. Set cluster and points accordingly
**/
const string dataset = "blobs";


// defining data and functions
int load_dataset(string dataset);
int init_clusters();
double distance(Point point, Cluster cluster);
int random(int min, int max);
void naive_kmeans();
void export_result(int iterations, double serial_time, double convergence_time);
Point points[num_points];
Cluster clusters[num_clusters];


int main(){
    srand(time(NULL));
    // load the dataset. Use flag true if you want to test with iris dataset
    double time_point1 = omp_get_wtime();
    int tot_points = load_dataset(dataset);
    cout << "total points loaded: " << tot_points << endl;
    int tot_clusters = init_clusters();
    cout << "total clusters instantiated: " << tot_clusters << endl;
    printf("total number of processors available: %d\n", omp_get_num_procs());
    for(int i=0; i<num_clusters; i++){
        clusters[i].print();
    }
    double time_point2 = omp_get_wtime();
    double duration = time_point2 - time_point1;
    bool converged = false;

    // algorithm start. Compute distances between each point and centroids
    int iteration = 0;
    while((!converged) && iteration < max_iters){
        iteration++;
        naive_kmeans();
        // reset dimension for all the clusters before starting a new iteration
        for(int i=0; i<num_clusters; i++){
            converged = clusters[i].update_centroid();
            clusters[i].empty_pivot();
        }
        printf(">>> Iteration %d done <<<\n", iteration);
    }

    double time_point3 = omp_get_wtime();
    cout << endl << "======= results after convergence (" << iteration << " iters) =======" << endl;

    // print the clusters once the algorithm has converged
    for(int i=0; i<num_clusters; i++){
        clusters[i].print();
    }

    double serial_time = duration;
    duration = time_point3 - time_point2;

    printf("Number of iterations: %d, total time: %f seconds, time per iteration: %f seconds\n",
           iteration, duration, duration/iteration);

    export_result(iteration, serial_time, duration);
    return 0;
}

// load dataset from file and store each point inside the array declared in the beginning.
// datasets available are "iris", "points" or "blobs".
int load_dataset(string dataset){
    stringstream ss;
    if(dataset=="iris"){
        ss << base_dir << "input/iris.txt";
    }
    if(dataset=="points"){
        ss << base_dir << "input/" << num_points << "points.txt";
    }
    if(dataset=="blobs"){
        ss << base_dir << "input/" << num_points << "-" << num_clusters << "-blob.txt";
    }
    string filepath = ss.str();
    ifstream infile(filepath);
    if(!infile){
        cerr << "Could not open file. Maybe the number of points set is wrong?" << std::endl;
        exit(0);
    }
    double x, y;
    int i=0;
    while(infile >> x >> y){
        Point point((double) x, (double) y);
        points[i] = point;
        i++;
    }
    return i;
}

// initialize random pivoting elements as many as the number of clusters
int init_clusters(){
    int index=0;
    for(index; index<num_clusters; index++){
        int random_point_index = random(0, num_points);
        double x = points[random_point_index].get_x();
        double y = points[random_point_index].get_y();
        Cluster cluster = Cluster(Point((double) x, (double) y, index));
        clusters[index] = cluster;
    }
    return index;
}

//compute the distance between all the points and centroids
void naive_kmeans(){
    double min_distance;
    int min_cluster_index;

#pragma omp parallel private(min_distance, min_cluster_index) num_threads(threads)
    {
#pragma omp for schedule(static)
        for(int i=0; i<num_points; i++){
            min_distance = distance(points[i], clusters[0]);
            min_cluster_index = 0;
            for(int j=0; j<num_clusters; j++){
                double tmp_distance = distance(points[i], clusters[j]);
                if(tmp_distance<min_distance){
                    min_distance = tmp_distance;
                    min_cluster_index = j;
                }
            }
            points[i].set_cluster(min_cluster_index);
#pragma omp critical
            {
                clusters[min_cluster_index].add_point(points[i]);
            }
        }
    }
}

// compute the distance between two 2D points (pythagorean theorem)
double distance(Point point, Cluster cluster){
    double d = sqrt(pow(point.get_x() - cluster.get_pivot().get_x(), 2) + pow(point.get_y() - cluster.get_pivot().get_y(), 2));
    return d;
}

int random(int min, int max){
    return min + rand() % (( max + 1 ) - min);
}

// export algorithm result to a file to process it using python (plotting, mostly)
void export_result(int iterations, double serial_time, double convergence_time){
    stringstream ss;
    ss << base_dir << "output/" << "res.txt";
    string filepath = ss.str();
    ofstream outfile(filepath);
    if(!outfile){
        cout << "file res can't be opened" << endl;
    }
    outfile << num_clusters << endl;
    for(int i=0; i<num_clusters; i++){
        outfile << clusters[i].get_pivot().get_x() << " " << clusters[i].get_pivot().get_y() << " " << clusters[i].get_pivot().get_cluster() << endl;
    }
    for(int i=0; i<num_points; i++){
        outfile << points[i].get_x() << " " << points[i].get_y() << " " << points[i].get_cluster() << endl;
    }
    outfile.close();

    ofstream statfile;
    string dir = base_dir + "output/runs.csv";
    statfile.open(dir, ios_base::app);
    if(!statfile){
        cout << "file runs can't be opened" << endl;
    }
    const auto p1 = std::chrono::system_clock::now();
    long timestamp = std::chrono::duration_cast<std::chrono::seconds>(p1.time_since_epoch()).count();
    statfile <<timestamp<<","<<"p,c,"<<dataset<<","<<num_points<<","<<num_clusters<<","<<threads<<","<<iterations<<","<<serial_time<<","<<convergence_time<<endl;
}