MATLAB-Source-Code-Oversampling-Methods/Mod_AggCluster.m at master · Nekooeimehr/MATLAB-Source-Code-Oversampling-Methods · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
function [min_clusters] = Mod_AggCluster(Majority_features, Minority_features ,CThresh)
% MOD_AGGCLUSTER  Cluster the minority class with the majority clusters as
% obstacles.
%
%   Majority_features - majority-class samples, one sample per row.
%   Minority_features - minority-class samples, one sample per row.
%   CThresh           - threshold multiplier handed to both clustering steps.
%
%   min_clusters      - cluster index of every minority sample.
%
% Adapted from the hierarchical clustering source code by David Ross.
% The original hierarchical clustering code is available at:
% http://www.cs.toronto.edu/~dross/code/

% Every minority sample starts out as its own singleton cluster.
n_minority = size(Minority_features, 1);
initial_labels = (1:n_minority)';

%% Cluster the majority class with plain hierarchical clustering
maj_clusters = Orig_agg_cluster(Majority_features, CThresh);

% Euclidean distances between all samples; the minority samples occupy the
% leading rows/columns and the majority samples follow them.
stacked_samples = [Minority_features; Majority_features];
point_dist_min = squareform(pdist(stacked_samples, 'euclidean'));

%% Cluster the minority samples, taking the majority clusters into account
min_clusters = inside_AggCluster(Minority_features', initial_labels, ...
    maj_clusters, point_dist_min, CThresh);

function labels  = inside_AggCluster(data, same_clusters, other_clusters, point_dist_whole, CThresh)
% INSIDE_AGGCLUSTER  Agglomerative clustering of one class in which a merge
% is refused when a cluster of the other class lies between the candidates.
%
%   data             - matrix with one point per column; only its column
%                      count N is used.
%   same_clusters    - initial cluster label of each of the N points.
%   other_clusters   - cluster label of each point of the other class.
%   point_dist_whole - pairwise distance matrix. Rows/columns 1..N index the
%                      points of 'data'; rows/columns N+1.. index the other
%                      class in the order of 'other_clusters'.
%   CThresh          - multiplier applied to the merge-distance threshold.
%
%   labels           - N-by-1 vector with the final cluster index per point.
%
% Side effect: the number of rejected merge checks is echoed to the command
% window before returning.

Num_Reject = 0;
N = size(data,2);

% Work on column vectors so the cluster counts below do not depend on the
% orientation in which the label vectors were passed.
same_clusters = same_clusters(:);
other_clusters = other_clusters(:);

Exist_Clus = unique(same_clusters);
M = numel(Exist_Clus);

% the distance between each pair of points of this class
point_dist = point_dist_whole(1:N,1:N);

% Copy used only for the threshold: the zero self-distances on the diagonal
% are overwritten with 100 before the column medians are taken.
% NOTE(review): 100 is a magic constant that takes part in the median;
% confirm it suits the scale of the data.
point_dist2 = point_dist;
point_dist2(1:N+1:N*N) = 100;

% Measuring the threshold
thresh = mean(median(point_dist2)).* CThresh;

% clusters is a cell array of row vectors.  Each vector contains the
% indices of the points belonging to that cluster, as given by the
% initial labels in same_clusters.
clusters = cell(M,1);
for cc = 1:M
    clusters{cc} = find(same_clusters == Exist_Clus(cc))';
end

% Membership and size of every other-class cluster never change inside the
% loop, so compute them once.  Member indices are offset by N so that they
% address point_dist_whole directly.
Unique_Other = unique(other_clusters);
num_clus = numel(Unique_Other);
other_members = cell(1, num_clus);
other_sizes = zeros(1, num_clus);
for k = 1:num_clus
    other_members{k} = N + find(other_clusters == Unique_Other(k));
    other_sizes(k) = numel(other_members{k});
end

% until the termination condition is met
mm = 0;
while mm < thresh

    % compute the distances between all pairs of clusters (upper triangle
    % only; everything else stays at inf so it is never selected)
    n_clusters = length(clusters);
    cluster_dist = inf(n_clusters);
    for c1 = 1:n_clusters
        for c2 = (c1+1):n_clusters
            cluster_dist(c1,c2) = cluster_distance(clusters{c1}, clusters{c2}, point_dist, 3);
        end
    end

    % pick the two nearest clusters
    [mm, nearest_lin] = min(cluster_dist(:));
    [first_idx, second_idx] = ind2sub(size(cluster_dist), nearest_lin);
    ii = [first_idx second_idx];

    if mm > thresh || length(clusters) < 3
        break
    end

    % distance from the first candidate to every other-class cluster
    % (preallocated so it is defined even when there are no such clusters)
    MN2other = zeros(1, num_clus);
    for k = 1:num_clus
        MN2other(k) = cluster_distance_maj(clusters{ii(1)}, other_members{k}, point_dist_whole, 3);
    end

    % Other-class clusters with more than 3 members that are closer to the
    % first candidate than the candidates are to each other.
    near_other_ind = find(MN2other < mm & other_sizes > 3);

    flag = 1;
    for t = 1:length(near_other_ind)
        check_dis = cluster_distance_maj(clusters{ii(2)}, other_members{near_other_ind(t)}, point_dist_whole, 3);
        if check_dis < mm
            % This other-class cluster is closer than mm to both
            % candidates: refuse the merge.  Setting one pairwise distance
            % to inf makes the max-linkage distance of this pair inf, so
            % the pair is not selected again.
            flag = 0;
            Num_Reject = Num_Reject + 1;
            A = clusters{ii(1)};
            B = clusters{ii(2)};
            point_dist(A(1,1),B(1,1)) = inf;
            point_dist(B(1,1),A(1,1)) = inf;
        end
    end

    % merge only when no other-class cluster blocked the pair
    if flag == 1
        clusters = merge_clusters(clusters, ii);
    end
end

% The missing semicolon is kept on purpose: the original echoes the
% rejection count to the command window.
Num_Reject

% assign labels to the points, based on their cluster membership
labels = zeros(N,1);
for cc = 1:length(clusters)
    labels(clusters{cc}) = cc;
end


%//////////////////////////////////////////////////////////
% d = Point_Distance(X)
%    Returns the N-by-N matrix of squared Euclidean distances
%    between the N columns of X, using the expansion
%    |a-b|^2 = |a|^2 + |b|^2 - 2*a'*b.  No square root is taken.
%    Not called by the other functions visible in this file.
%----------------------------------------------------------
function d = Point_Distance(X)
n_points = size(X,2);
sq_norms = sum(X.^2,1);
% Entry (i,j) of the sum is sq_norms(j) + sq_norms(i).
norm_sums = repmat(sq_norms, n_points, 1) + repmat(sq_norms', 1, n_points);
d = norm_sums - 2*X'*X;


%//////////////////////////////////////////////////////////
% d = cluster_distance(c1,c2,point_dist,version)
%    Distance between the clusters whose point indices are c1
%    and c2, read from the pairwise matrix point_dist.
%    version selects the linkage: 1 = smallest pairwise
%    distance, 2 = mean pairwise distance, anything else =
%    largest pairwise distance.  The result is multiplied by
%    (size of the larger cluster)^0.04.
%----------------------------------------------------------
function d = cluster_distance(c1,c2,point_dist,version)

% penalty that grows slowly with the size of the larger cluster
size_penalty = max([length(c1), length(c2)])^0.04;

pairwise = point_dist(c1,c2);
pairwise = pairwise(:);

if version == 1
    linkage_value = min(pairwise);
elseif version == 2
    linkage_value = mean(pairwise);
else
    linkage_value = max(pairwise);
end

d = linkage_value*size_penalty;

function d = cluster_distance_maj(c1,c2,point_dist,version)
% CLUSTER_DISTANCE_MAJ  Distance between two index sets c1 and c2, read from
% the pairwise matrix point_dist.  version selects the linkage: 1 = smallest
% pairwise distance, 2 = mean pairwise distance, anything else = largest
% pairwise distance.  No cluster-size factor is applied.
pairwise = point_dist(c1,c2);
pairwise = pairwise(:);

if version == 1
    d = min(pairwise);
elseif version == 2
    d = mean(pairwise);
else
    d = max(pairwise);
end
%//////////////////////////////////////////////////////////
% clusters = merge_clusters(clusters, indicies)
%   Appends the members of cluster indicies(2) to cluster
%   indicies(1) and then removes entry indicies(2) from the
%   cell array 'clusters'.
%----------------------------------------------------------
function clusters = merge_clusters(clusters, indicies)
keep_idx = indicies(1);
drop_idx = indicies(2);
merged_members = horzcat(clusters{keep_idx}, clusters{drop_idx});
clusters{keep_idx} = merged_members;
clusters(drop_idx) = [];