diff --git a/recognition/45616756-GCN/README.md b/recognition/45616756-GCN/README.md
new file mode 100644
index 0000000000..f75eca1e40
--- /dev/null
+++ b/recognition/45616756-GCN/README.md
@@ -0,0 +1,33 @@
+# Multi-layer GCN on Facebook Large Page-Page Network Dataset
+
+## Introduction to GCN:
+A Graph Convolutional Network (GCN) is a neural network that operates on graph data. A graph consists of two components, nodes and edges, and each node carries a feature vector. Our task is node classification: assigning each of the dataset's 22,470 pages to one of 4 categorical classes.
+
+![GCN](./data/GCN.png)
+
+## How GCN Works in General:
+1. Create an N by N adjacency matrix (N is the number of nodes)
+2. Create an N by D feature matrix (D is the number of features per node)
+3. Normalize the adjacency and feature matrices
+4. Create a two-layer Graph Convolutional Network
+5. Train and test the model (see the sketch below)
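+
+A minimal NumPy sketch of these steps on a toy graph (all sizes and weights here are illustrative; the real implementation lives in `driver.py` and `model.py`):
+
+```python
+import numpy as np
+
+# Toy graph: N = 4 nodes, D = 3 features, 2 classes
+A = np.array([[0, 1, 0, 0],
+              [1, 0, 1, 0],
+              [0, 1, 0, 1],
+              [0, 0, 1, 0]], dtype=np.float32)
+X = np.random.rand(4, 3).astype(np.float32)
+
+A_tilde = A + np.eye(4, dtype=np.float32)   # 1. adjacency + self-loops
+D_inv = np.diag(1.0 / A_tilde.sum(axis=1))  # inverse degree matrix
+A_hat = D_inv @ A_tilde                     # 3. row-normalized adjacency
+
+W1 = np.random.rand(3, 8).astype(np.float32)  # layer-1 weights
+W2 = np.random.rand(8, 2).astype(np.float32)  # layer-2 weights
+
+H = np.maximum(A_hat @ X @ W1, 0)  # layer 1 + ReLU
+Z = A_hat @ H @ W2                 # layer 2: per-node class scores
+```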
+
+## Dependencies Required:
+- Python
+- NumPy
+- PyTorch
+- Matplotlib
+- scikit-learn
+- SciPy
+- Pandas
+
+## Results:
+#### Loss Plot
+![GCN](./data/Loss.png)
+#### Accuracy Plot
+![GCN](./data/Accuracy.png)
+#### Node Embeddings
+![GCN](./data/Embedding.png)
+
+## Reference
+[1] T. N. Kipf and M. Welling, "Semi-Supervised Classification with Graph Convolutional Networks," ICLR 2017. https://arxiv.org/abs/1609.02907
diff --git a/recognition/45616756-GCN/data/Accuracy.png b/recognition/45616756-GCN/data/Accuracy.png
new file mode 100644
index 0000000000..394d37e9a1
Binary files /dev/null and b/recognition/45616756-GCN/data/Accuracy.png differ
diff --git a/recognition/45616756-GCN/data/Embedding.png b/recognition/45616756-GCN/data/Embedding.png
new file mode 100644
index 0000000000..1551ec8f9f
Binary files /dev/null and b/recognition/45616756-GCN/data/Embedding.png differ
diff --git a/recognition/45616756-GCN/data/GCN.png b/recognition/45616756-GCN/data/GCN.png
new file mode 100644
index 0000000000..ddc9ccfccf
Binary files /dev/null and b/recognition/45616756-GCN/data/GCN.png differ
diff --git a/recognition/45616756-GCN/data/Loss.png b/recognition/45616756-GCN/data/Loss.png
new file mode 100644
index 0000000000..091f5b4a09
Binary files /dev/null and b/recognition/45616756-GCN/data/Loss.png differ
diff --git a/recognition/45616756-GCN/driver.py b/recognition/45616756-GCN/driver.py
new file mode 100644
index 0000000000..a1a0ce7de4
--- /dev/null
+++ b/recognition/45616756-GCN/driver.py
@@ -0,0 +1,162 @@
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.optim as optim
+from scipy.sparse import coo_matrix, csr_matrix, eye, diags
+from sklearn import preprocessing
+from sklearn.model_selection import train_test_split
+from sklearn.manifold import TSNE
+
+from model import GCN
+
+
+def accuracy(output, labels):
+    """Return the fraction of predictions in `output` that match `labels`."""
+    predicts = output.max(1)[1].type_as(labels)
+    correct = predicts.eq(labels).double()
+    correct = correct.sum()
+    return correct / len(labels)
+
+
+def main():
+    # Load the Facebook Large Page-Page Network dataset
+    data = np.load('./data/facebook.npz')
+
+    # Create an adjacency matrix from the edge list
+    # (the dense 22470 x 22470 intermediate needs roughly 2 GB of RAM)
+    facebook_edges = data['edges']
+    adjacency_matrix = np.zeros((22470, 22470), dtype='float32')
+    for edge in facebook_edges:
+        adjacency_matrix[edge[0]][edge[1]] = 1
+    adjacency_matrix = coo_matrix(adjacency_matrix)
+
+    # Node features
+    facebook_features = data['features']
+    facebook_features = csr_matrix(facebook_features)
+
+    # One-hot encode the categorical target classes
+    facebook_target = data['target']
+    lb = preprocessing.LabelBinarizer()
+    facebook_target = lb.fit_transform(facebook_target)
+
+    # Split the target into train/validation/test sets (effectively 20:16:64,
+    # since the second split takes 20% of the remaining 80%)
+    facebook_train_target, facebook_test_target = train_test_split(
+        facebook_target, train_size=0.20, shuffle=False
+    )
+    facebook_validation_target, facebook_test_target = train_test_split(
+        facebook_test_target, train_size=0.20, shuffle=False
+    )
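+    # A true 20:20:60 split would instead use train_size=0.25 in the second
+    # call, since 0.25 * 0.80 = 0.20 (a sketch, not what produced the
+    # reported results):
+    #
+    #   facebook_validation_target, facebook_test_target = train_test_split(
+    #       facebook_test_target, train_size=0.25, shuffle=False
+    #   )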
+
+    # Normalize the adjacency matrix (row normalization, D^-1 * A_tilde)
+    a_tilde = adjacency_matrix + eye(22470, dtype='float32')  # adjacency matrix + self-loops
+    d = diags(np.array(a_tilde.sum(axis=1)).flatten())  # degree matrix
+    degrees_inverse = np.power(d.diagonal(), -1)
+    d_inverse = diags(degrees_inverse)  # inverse degree matrix
+    adjacency_matrix = d_inverse.dot(a_tilde).tocoo()
+
+    # Normalize the adjacency matrix (Ver.2): the symmetric normalization
+    # D^-1/2 * A_tilde * D^-1/2 from Kipf & Welling, kept as a dense
+    # reference implementation (requires scipy.linalg.sqrtm)
+    # A_tilde = adjacency_matrix + np.eye(22470)
+    # D_tilde = np.matrix(np.diag(np.array(np.sum(A_tilde, axis=0))[0]))
+    # D_tilde_invroot = np.linalg.inv(sqrtm(D_tilde))
+    # A_hat = np.matmul(np.matmul(A_tilde, D_tilde_invroot), D_tilde_invroot)
+
+    # Row-normalize the feature matrix
+    # (assumes every node has at least one nonzero feature)
+    d = diags(np.array(facebook_features.sum(axis=1)).flatten())  # degree matrix
+    degrees_inverse = np.power(d.diagonal(), -1)
+    d_inverse = diags(degrees_inverse)  # inverse degree matrix
+    facebook_features = d_inverse.dot(facebook_features)
+
+    # Convert to tensors (one-hot targets back to class indices)
+    facebook_features = torch.FloatTensor(np.array(facebook_features.todense()))
+    facebook_train_target = torch.LongTensor(np.where(facebook_train_target)[1])
+    facebook_validation_target = torch.LongTensor(np.where(facebook_validation_target)[1])
+    facebook_test_target = torch.LongTensor(np.where(facebook_test_target)[1])
+    adjacency_matrix = torch.sparse_coo_tensor(
+        torch.LongTensor(np.vstack((adjacency_matrix.row, adjacency_matrix.col))),
+        torch.FloatTensor(adjacency_matrix.data),
+        torch.Size(adjacency_matrix.shape)
+    )
+
+    # Sanity-check the inputs
+    print('facebook_features:', facebook_features)
+    print('facebook_train_target:', facebook_train_target.size())
+    print('facebook_validation_target:', facebook_validation_target.size())
+    print('facebook_test_target:', facebook_test_target.size())
+    print('adjacency_matrix:', adjacency_matrix)
+
+    # Create the model and optimizer
+    model = GCN(input_size=facebook_features.shape[1],
+                hidden_size=16,
+                num_classes=4,
+                dropout=0.5)
+    optimizer = optim.Adam(model.parameters(),
+                           lr=0.01,
+                           weight_decay=5e-4)
+
+    train_size = facebook_train_target.size()[0]
+    validation_size = facebook_validation_target.size()[0]
+    test_size = facebook_test_target.size()[0]
+    total_target_size = train_size + validation_size + test_size
+
+    train_losses = []
+    train_accuracies = []
+    validation_losses = []
+    validation_accuracies = []
+
+    # Training
+    for epoch in range(200):
+        model.train()
+        optimizer.zero_grad()
+        output = model(facebook_features, adjacency_matrix)
+        train_loss = F.nll_loss(output[:train_size],
+                                facebook_train_target)
+        train_losses.append(train_loss.item())
+        train_accuracy = accuracy(output[:train_size],
+                                  facebook_train_target)
+        train_accuracies.append(train_accuracy.item())
+        train_loss.backward()
+        optimizer.step()
+
+        # Validation (no gradients needed)
+        model.eval()
+        with torch.no_grad():
+            output = model(facebook_features, adjacency_matrix)
+            validation_loss = F.nll_loss(output[train_size:train_size + validation_size],
+                                         facebook_validation_target)
+            validation_losses.append(validation_loss.item())
+            validation_accuracy = accuracy(output[train_size:train_size + validation_size],
+                                           facebook_validation_target)
+            validation_accuracies.append(validation_accuracy.item())
+
+        print('Epoch: {:04d}'.format(epoch + 1),
+              'Train loss: {:.4f}'.format(train_loss.item()),
+              'Train accuracy: {:.4f}'.format(train_accuracy.item()),
+              'Validation loss: {:.4f}'.format(validation_loss.item()),
+              'Validation accuracy: {:.4f}'.format(validation_accuracy.item()))
+
+    # Test
+    model.eval()
+    with torch.no_grad():
+        output = model(facebook_features, adjacency_matrix)
+        test_loss = F.nll_loss(output[train_size + validation_size:total_target_size],
+                               facebook_test_target)
+        test_accuracy = accuracy(output[train_size + validation_size:total_target_size],
+                                 facebook_test_target)
+
+    print('Test set results:',
+          'Test loss: {:.4f}'.format(test_loss.item()),
+          'Test accuracy: {:.4f}'.format(test_accuracy.item()))
+
+    np.save('train_losses', train_losses)
+    np.save('train_accuracies', train_accuracies)
+    np.save('validation_losses', validation_losses)
+    np.save('validation_accuracies', validation_accuracies)
+
+    # Node embeddings: t-SNE of the final-layer outputs
+    model.eval()
+    with torch.no_grad():
+        x = model(facebook_features, adjacency_matrix)
+        x_embedded = TSNE(n_components=2).fit_transform(x.numpy())
+
+    np.save('x_embedded', x_embedded)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/recognition/45616756-GCN/model.py b/recognition/45616756-GCN/model.py
new file mode 100644
index 0000000000..64dbabe8cd
--- /dev/null
+++ b/recognition/45616756-GCN/model.py
@@ -0,0 +1,43 @@
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.parameter import Parameter
+
+
+class GraphConvolution(nn.Module):
+    """
+    A single GCN layer: x -> A_hat @ x @ W + b
+    """
+
+    def __init__(self, in_features, out_features):
+        super(GraphConvolution, self).__init__()
+        self.weight = Parameter(torch.FloatTensor(in_features, out_features))
+        self.bias = Parameter(torch.FloatTensor(out_features))
+        # Uniform initialization scaled by the layer's output width
+        stdv = 1. / math.sqrt(self.weight.size(1))
+        self.weight.data.uniform_(-stdv, stdv)
+        self.bias.data.uniform_(-stdv, stdv)
+
+    def forward(self, x, adjacency_matrix):
+        x = torch.mm(x, self.weight)         # feature transform
+        x = torch.spmm(adjacency_matrix, x)  # sparse neighborhood aggregation
+        return x + self.bias
+
+
+class GCN(nn.Module):
+    """
+    Two-layer GCN for node classification.
+    """
+
+    def __init__(self, input_size, hidden_size, num_classes, dropout=0.5):
+        super(GCN, self).__init__()
+        self.gconv1 = GraphConvolution(input_size, hidden_size)
+        self.gconv2 = GraphConvolution(hidden_size, num_classes)
+        self.dropout = dropout
+
+    def forward(self, x, adjacency_matrix):
+        x = self.gconv1(x, adjacency_matrix)
+        x = F.relu(x)
+        x = F.dropout(x, self.dropout, training=self.training)
+        x = self.gconv2(x, adjacency_matrix)
+        return F.log_softmax(x, dim=1)
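+
+# Usage (a sketch; sizes are illustrative, and `adj` must be the sparse
+# [N, N] normalized adjacency tensor built in driver.py):
+#
+#   model = GCN(input_size=128, hidden_size=16, num_classes=4)
+#   log_probs = model(x, adj)  # x: [N, 128] float tensor -> log_probs: [N, 4]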
diff --git a/recognition/45616756-GCN/plot.py b/recognition/45616756-GCN/plot.py
new file mode 100644
index 0000000000..5101a15fb5
--- /dev/null
+++ b/recognition/45616756-GCN/plot.py
@@ -0,0 +1,41 @@
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+
+
+# Load the metrics and embeddings saved by driver.py
+train_losses = np.load('train_losses.npy')
+train_accuracies = np.load('train_accuracies.npy')
+validation_losses = np.load('validation_losses.npy')
+validation_accuracies = np.load('validation_accuracies.npy')
+x_embedded = np.load('x_embedded.npy')
+
+plt.figure(figsize=(10, 5))
+plt.title('Training and Validation Loss')
+plt.plot(train_losses, label='Train')
+plt.plot(validation_losses, label='Validation')
+plt.xlabel('Epoch')
+plt.ylabel('Loss')
+plt.legend()
+plt.show()
+
+plt.figure(figsize=(10, 5))
+plt.title('Training and Validation Accuracy')
+plt.plot(train_accuracies, label='Train')
+plt.plot(validation_accuracies, label='Validation')
+plt.xlabel('Epoch')
+plt.ylabel('Accuracy')
+plt.legend()
+plt.show()
+
+# Color the t-SNE embedding by ground-truth class
+data = np.load('./data/facebook.npz')
+df = pd.DataFrame(data['target'])
+
+fig, ax = plt.subplots(figsize=(7, 7))
+ax.scatter(x_embedded[:, 0],
+           x_embedded[:, 1],
+           c=df[0].astype('category').cat.codes)
+ax.set(aspect="equal",
+       xlabel="$X_1$",
+       ylabel="$X_2$",
+       title="Visualization of GCN embeddings for Facebook dataset")
+plt.show()
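+
+# The figures above are shown interactively. To write one to disk instead
+# (a sketch; the filename is illustrative), call before plt.show():
+#
+#   plt.savefig('./data/Embedding.png', dpi=200, bbox_inches='tight')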