From db3ec236da0a2848e3adb09b1f7c13847be1dc06 Mon Sep 17 00:00:00 2001 From: Jesper Jaxing Date: Mon, 3 Apr 2017 13:28:45 +0200 Subject: [PATCH 01/60] merge --- config.template.yaml | 2 ++ main.py | 2 ++ model/model.py | 46 ++++++++++++++++++++++++++++++++++++++++ model/util/csv_reader.py | 2 +- model/util/data.py | 5 +++++ 5 files changed, 56 insertions(+), 1 deletion(-) diff --git a/config.template.yaml b/config.template.yaml index 115f562..3ff473a 100644 --- a/config.template.yaml +++ b/config.template.yaml @@ -11,6 +11,7 @@ network: validation_data: 'validation_data_top_n_single.csv' training_data: 'training_data_top_n_single.csv' testing_data: 'testing_data_top_n_single.csv' + pre_train_data: 'pre-train_data_top_n.csv' # Embedding matrix configs: embedding_size: 150 # Make sure to match pretrained matrix dimensions trainable_matrix: true @@ -24,6 +25,7 @@ network: hidden_layers: 0 hidden_neurons: 300 use_concat_input: false + pre_train_subreddit: false # Regularisation configs: use_l2_loss: false l2_factor: 0.01 diff --git a/main.py b/main.py index dd048e3..d2c06d6 100644 --- a/main.py +++ b/main.py @@ -39,6 +39,8 @@ def main(): with tf.Session() as sess: builder = ModelBuilder(config_file, sess) network_model = builder.build() + if config_file["pre-train-subreddit"]: + network_model.pre_train() network_model.train() network_model.close_writers() tf.reset_default_graph() diff --git a/model/model.py b/model/model.py index 10fe2ea..9d5b0e4 100644 --- a/model/model.py +++ b/model/model.py @@ -231,6 +231,52 @@ def train(self): epoch_top=self.epoch_top, prec_valid=self.prec_valid, prec_train=self.prec_train, recall_valid=self.recall_valid, recall_train=self.recall_train) + def pre_train(self): + """ Pre-trains the model on the pre-training dataset """ + print("Starting pre-training...") + + old_epoch = 0 + + if self.use_pretrained: + self._session.run(self.embedding_init, feed_dict={ + self.embedding_placeholder: + self.data.embedding_matrix}) + + tmp_data, 
tmp_labels = self.data.get_training() + tmp_size = self.data.train_size + + self.data.train_data = self.data.pre_train_data + self.data.train_labels = self.data.pre_train_data + self.data.train_size = self.data.pre_train_size + + # Train for a specified amount of epochs + for i in self.data.for_n_train_epochs(self.training_epochs, + self.batch_size): + # Debug print out + epoch = self.data.completed_training_epochs + training_error = self.train_batch() + # validation_error = self.validate_batch() + + # Don't validate so often + if i % (self.data.train_size // self.batch_size // 10) == 0 and i: + done = self.data.percent_of_epoch + print( + "Validation error: {:f} | Training error: {:f} | Done: {:.0%}" + .format(0, training_error, done)) + + # Do a full evaluation once an epoch is complete + if epoch != old_epoch: + self._session.run(self.epoch.assign_add(1)) + print("Epoch complete...old ", old_epoch) + self.save_checkpoint() + # self.validate() + old_epoch = epoch + + self.data.train_data = tmp_data + self.data.train_labels = tmp_labels + self.data.train_size = tmp_size + self.data.completed_training_epochs = 0 + def train_batch(self): """ Trains for one batch and returns cross entropy error """ with tf.device("/cpu:0"): diff --git a/model/util/csv_reader.py b/model/util/csv_reader.py index c5b7526..9913c68 100644 --- a/model/util/csv_reader.py +++ b/model/util/csv_reader.py @@ -34,7 +34,7 @@ class Dataenum(Enum): TESTING = "testing_data" TRAINING = "training_data" VALIDATION = "validation_data" - + PRE_TRAINING = "pre_training_data" class CsvReader: def __init__(self, netcfg): diff --git a/model/util/data.py b/model/util/data.py index 4d9ac54..b50fb31 100644 --- a/model/util/data.py +++ b/model/util/data.py @@ -56,6 +56,11 @@ def __init__(self, networkconfig): def _read_data(self): """ Reads all the data from specified path """ + logging.debug("Reading pre-training data...") + + self.pre_train_data, self.pre_train_labels = 
self.reader.get_data(Dataenum.PRE_TRAINING) + self.pre_train_size = len(self.pre_train_data) + logging.debug("Reading training data...") self.train_data, self.train_subreddits, self.train_labels = \ From bc9154d45ebc502249b612d3ec0cc40b55044c46 Mon Sep 17 00:00:00 2001 From: Jesper Jaxing Date: Tue, 4 Apr 2017 10:55:16 +0200 Subject: [PATCH 02/60] added method to add secondary output layer --- model/model_builder.py | 56 +++++++++++++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 9 deletions(-) diff --git a/model/model_builder.py b/model/model_builder.py index a1b4f39..74ffa7a 100644 --- a/model/model_builder.py +++ b/model/model_builder.py @@ -52,6 +52,10 @@ def add_input_layer(self): tf.placeholder(tf.float64, [None, self._model.user_count], name="target") + self._model.sec_target = \ + tf.placeholder(tf.float64, + [None, self._model.data.subreddit_count], + name="sec_target") self._model.keep_prob = tf.placeholder(tf.float64, name="keep_prob") @@ -178,6 +182,49 @@ def add_output_layer(self): return self + def add_secondary_output(self): + """Adds a layer that can be used to train the network on data that is + labeled in a different way than the final data""" + # Output layer + # Feed the output of the previous layer to a sigmoid layer + sigmoid_weights = tf.Variable(tf.random_normal( + [self._model.latest_layer.get_shape()[1].value, + self._model.subreddit_count], + stddev=0.35, + dtype=tf.float64), + name="secondary_output_weights") + + sigmoid_bias = tf.Variable(tf.random_normal([self._model.subreddit_count], + stddev=0.35, + dtype=tf.float64), + name="secondary_output_biases") + + logits = tf.add(tf.matmul(self._model.latest_layer, sigmoid_weights), + sigmoid_bias) + # Training + + # Defne error function + error = tf.nn.sigmoid_cross_entropy_with_logits( + labels=self._model.sec_target, + logits=logits) + + if self._model.use_l2_loss: + cross_entropy = \ + tf.reduce_mean(tf.add( + tf.add(error, + tf.multiply(self._model.l2_factor, + 
tf.nn.l2_loss(sigmoid_weights))), + tf.add(tf.multiply(self._model.l2_factor, + tf.nn.l2_loss(sigmoid_bias)), + tf.multiply(self._model.l2_factor, + self._model.l2_term)))) + else: + cross_entropy = tf.reduce_mean(error) + + self._model.pre_train_op = tf.train.AdamOptimizer( + self._model.learning_rate).minimize(cross_entropy) + return self + def add_precision_operations(self): """Adds precision operation and tensorboard operations""" # Determine which prediction function to use. Casts a tensor to @@ -267,15 +314,6 @@ def add_precision_operations(self): def build(self): """Adds saver and init operation and returns the model""" - self.add_input_layer() - - # Add a number of hidden layers - for _ in range(self._model.hidden_layers): - self.add_layer(self._model.hidden_neurons) - - self.add_output_layer() - - self.add_precision_operations() self._model.init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) From 1afbf9f75a37c3f6616d82c22b0b4d535ad3f088 Mon Sep 17 00:00:00 2001 From: Jesper Jaxing Date: Tue, 4 Apr 2017 10:58:53 +0200 Subject: [PATCH 03/60] implemented pre-training on subreddits --- main.py | 16 ++++++- model/model.py | 114 +++++++++++++++++++------------------------------ 2 files changed, 57 insertions(+), 73 deletions(-) diff --git a/main.py b/main.py index d2c06d6..68694b2 100644 --- a/main.py +++ b/main.py @@ -23,6 +23,7 @@ # ============================================================================== import argparse import tensorflow as tf +from definitions import * from model.util.networkconfig import yamlconfig as networkconfig from model.model_builder import ModelBuilder @@ -38,9 +39,20 @@ def main(): config_file = networkconfig[conf] with tf.Session() as sess: builder = ModelBuilder(config_file, sess) + builder.add_input_layer() + + # Add a number of hidden layers + for _ in range(config_file[HIDDEN_LAYERS]): + builder.add_layer(config_file[HIDDEN_NEURONS]) + + builder.add_output_layer()\ + 
.add_secondary_output()\ + .add_precision_operations() + network_model = builder.build() - if config_file["pre-train-subreddit"]: - network_model.pre_train() + if config_file[USE_PRETRAINED_NET]: + network_model.train(USE_PRETRAINED_NET) + network_model.data.completed_training_epochs = 0 network_model.train() network_model.close_writers() tf.reset_default_graph() diff --git a/model/model.py b/model/model.py index 9d5b0e4..4c11fa2 100644 --- a/model/model.py +++ b/model/model.py @@ -67,13 +67,17 @@ def __init__(self, config, session): self.use_constant_limit = config[USE_CONSTANT_LIMIT] self.constant_prediction_limit = config[CONSTANT_PREDICTION_LIMIT] self.use_concat_input = config[USE_CONCAT_INPUT] + self.use_pretrained_net = config[USE_PRETRAINED_NET] + self.subreddit_count = 0 # Will be set in build_graph self.input = None self.subreddit_input = None self.target = None + self.sec_target = None self.sigmoid = None self.train_op = None + self.pre_train_op = None self.error = None self.init_op = None self.saver = None @@ -105,6 +109,7 @@ def __init__(self, config, session): with tf.device("/cpu:0"): self.data = data.Data(config) + self.subreddit_count = self.data.subreddit_count if self.use_pretrained: self.vocabulary_size = len(self.data.embedding_matrix) @@ -184,7 +189,7 @@ def validate_batch(self): # TODO funktionen gör alldeles för mycket, # dela upp utskrift, beräkning och träning - def train(self): + def train(self, use_pretrained_net=False): """ Trains the model on the dataset """ print("Starting training...") @@ -200,29 +205,33 @@ def train(self): old_epoch = 0 - if self.epoch.eval(self._session) == 0: + if self.epoch.eval(self._session) == 0 and not use_pretrained_net: self.validate() # Train for a specified amount of epochs - for i in self.data.for_n_train_epochs(self.training_epochs, - self.batch_size): + for i in self.data.for_n_train_epochs(self.batch_size, self.training_epochs): # Debug print out epoch = self.data.completed_training_epochs - 
training_error = self.train_batch() - validation_error = self.validate_batch() - # Don't validate so often - if i % (self.data.train_size // self.batch_size // 10) == 0 and i: - done = self.data.percent_of_epoch - print("Validation error: {:f} | Training error: {:f} | Done: {:.0%}" - .format(validation_error, training_error, done)) + if not use_pretrained_net: + training_error = self.train_batch() + validation_error = self.validate_batch() + + # Don't validate so often + if i % (self.data.train_size // self.batch_size // 10) == 0 and i: + done = self.data.percent_of_epoch + print("Validation error: {:f} | Training error: {:f} | Done: {:.0%}" + .format(validation_error, training_error, done)) + else: + self.train_batch(True) # Do a full evaluation once an epoch is complete if epoch != old_epoch: self._session.run(self.epoch.assign_add(1)) print("Epoch complete...old ", old_epoch) self.save_checkpoint() - self.validate() + if not self.use_pretrained_net: + self.validate() old_epoch = epoch # Save model when done training @@ -231,67 +240,30 @@ def train(self): epoch_top=self.epoch_top, prec_valid=self.prec_valid, prec_train=self.prec_train, recall_valid=self.recall_valid, recall_train=self.recall_train) - def pre_train(self): - """ Pre-trains the model on the pre-training dataset """ - print("Starting pre-training...") - - old_epoch = 0 - - if self.use_pretrained: - self._session.run(self.embedding_init, feed_dict={ - self.embedding_placeholder: - self.data.embedding_matrix}) - - tmp_data, tmp_labels = self.data.get_training() - tmp_size = self.data.train_size - - self.data.train_data = self.data.pre_train_data - self.data.train_labels = self.data.pre_train_data - self.data.train_size = self.data.pre_train_size - - # Train for a specified amount of epochs - for i in self.data.for_n_train_epochs(self.training_epochs, - self.batch_size): - # Debug print out - epoch = self.data.completed_training_epochs - training_error = self.train_batch() - # validation_error = 
self.validate_batch() - - # Don't validate so often - if i % (self.data.train_size // self.batch_size // 10) == 0 and i: - done = self.data.percent_of_epoch - print( - "Validation error: {:f} | Training error: {:f} | Done: {:.0%}" - .format(0, training_error, done)) - - # Do a full evaluation once an epoch is complete - if epoch != old_epoch: - self._session.run(self.epoch.assign_add(1)) - print("Epoch complete...old ", old_epoch) - self.save_checkpoint() - # self.validate() - old_epoch = epoch - - self.data.train_data = tmp_data - self.data.train_labels = tmp_labels - self.data.train_size = tmp_size - self.data.completed_training_epochs = 0 - - def train_batch(self): + def train_batch(self, pre_train_net=False): """ Trains for one batch and returns cross entropy error """ with tf.device("/cpu:0"): - batch_input, batch_sub, batch_label = \ - self.data.next_train_batch() - - self._session.run(self.train_op, - {self.input: batch_input, - self.subreddit_input: batch_sub, - self.target: batch_label}) - - return self._session.run(self.error, - feed_dict={self.input: batch_input, - self.subreddit_input: batch_sub, - self.target: batch_label}) + if not pre_train_net: + batch_input, batch_sub, batch_label = \ + self.data.next_train_batch() + else: + batch_input, batch_label = \ + self.data.next_pre_train_batch() + + if pre_train_net: + self._session.run(self.pre_train_op, + {self.input: batch_input, + self.sec_target: batch_label}) + else: + self._session.run(self.train_op, + {self.input: batch_input, + self.subreddit_input: batch_sub, + self.target: batch_label}) + + return self._session.run(self.error, + feed_dict={self.input: batch_input, + self.subreddit_input: batch_sub, + self.target: batch_label}) def close_writers(self): """ Close tensorboard writers """ self.train_writer.close() From 95b779e619e7e6ce58c0882595517f8a263b4edc Mon Sep 17 00:00:00 2001 From: Jesper Jaxing Date: Tue, 4 Apr 2017 10:59:30 +0200 Subject: [PATCH 04/60] create methods for handling 
pre-training data --- definitions.py | 1 + model/util/data.py | 45 +++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/definitions.py b/definitions.py index fd70b22..b44179e 100644 --- a/definitions.py +++ b/definitions.py @@ -55,6 +55,7 @@ TRAINABLE_MATRIX = 'trainable_matrix' PRE_TRAINED_MATRIX = 'pre_trained_matrix' USE_PRETRAINED = 'use_pretrained' +USE_PRETRAINED_NET = 'pre_train_subreddit' VALIDATION_DATA = 'validation_data' TRAINING_DATA = 'training_data' TESTING_DATA = 'testing_data' diff --git a/model/util/data.py b/model/util/data.py index b50fb31..7300967 100644 --- a/model/util/data.py +++ b/model/util/data.py @@ -35,8 +35,11 @@ def __init__(self, networkconfig): self._current_train_index = 0 self._current_valid_index = 0 self._current_test_index = 0 + self._current_pre_train_index = 0 self.completed_training_epochs = 0 + self.completed_pre_training_epochs = 0 self.percent_of_epoch = 0.0 + self.subreddit_count = 0 self.title_length = networkconfig['max_title_length'] self.batch_size = self.netcfg['batch_size'] self.reader = CsvReader(networkconfig) @@ -58,8 +61,8 @@ def _read_data(self): """ Reads all the data from specified path """ logging.debug("Reading pre-training data...") - self.pre_train_data, self.pre_train_labels = self.reader.get_data(Dataenum.PRE_TRAINING) - self.pre_train_size = len(self.pre_train_data) + # self.pre_train_data, self.pre_train_labels = self.reader.get_data(Dataenum.TRAINING, label_column=1) + # self.pre_train_size = len(self.pre_train_data) logging.debug("Reading training data...") @@ -97,6 +100,7 @@ def _build_dict(self): subreddits = " ".join(self.train_subreddits).split() self.subreddit_dict = helper.build_subreddit_dict(subreddits) + self.subreddit_count = len(self.subreddit_dict) def next_train_batch(self, batch_size=None): """ Get the next batch of training data """ @@ -136,6 +140,39 @@ def next_train_batch(self, batch_size=None): self.percent_of_epoch = 
self._current_train_index / self.train_size return batch_x, batch_sub, batch_y + def next_pre_train_batch(self, batch_size=None): + """ Get the next batch of training data """ + batch_size = batch_size or self.batch_size + batch_x = [] + batch_y = [] + for _ in range(0, batch_size): + sentence = self.train_data[self._current_train_index] + label = self.train_labels[self._current_train_index] + self._current_pre_train_index += 1 + # Support multiple epochs + if self._current_pre_train_index >= self.train_size: + self._current_pre_train_index = 0 + self.completed_pre_training_epochs += 1 + self.percent_of_epoch = 0.0 + # TODO Detta ska inte ligga i funktionen som generar ny data + + # Turn sentences and labels into vector representations + sentence_vec, present, absent = \ + helper.get_indicies(sentence, + self.word_dict, + self.max_title_length) + self.train_present += present + self.train_absent += absent + + label_vec = helper.label_vector(label, + self.subreddit_dict, + self.subreddit_count) + batch_x.append(sentence_vec) + batch_y.append(label_vec) + + self.percent_of_epoch = self._current_train_index / self.train_size + return batch_x, batch_y + def get_validation(self): """ Get the whole validation set in a vectorized form """ old_ind = self._current_valid_index @@ -220,10 +257,10 @@ def next_test_batch(self, batch_size=None): batch_y.append(label_vec) return batch_x, batch_sub, batch_y - def for_n_train_epochs(self, num_epochs=1, batch_size=25): + def for_n_train_epochs(self, batch_size=25, num_epochs=1): # TODO Ta bort parameterar """ Calculates how many training iterations to do for num_epochs - number of epochs with a batch size of batch_size """ + number of epochs with a batch size of batch_size""" return range((self.train_size * num_epochs) // batch_size) def get_training(self): From 4afbf982ef56d345b55677662fcae6ea421d9ccc Mon Sep 17 00:00:00 2001 From: Jesper Jaxing Date: Tue, 4 Apr 2017 14:10:06 +0200 Subject: [PATCH 05/60] non-working commit to allow 
for debugg help. --- main.py | 1 - model/model.py | 11 ++++------- model/model_builder.py | 6 ++++++ model/util/data.py | 2 +- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/main.py b/main.py index 68694b2..78e9255 100644 --- a/main.py +++ b/main.py @@ -52,7 +52,6 @@ def main(): network_model = builder.build() if config_file[USE_PRETRAINED_NET]: network_model.train(USE_PRETRAINED_NET) - network_model.data.completed_training_epochs = 0 network_model.train() network_model.close_writers() tf.reset_default_graph() diff --git a/model/model.py b/model/model.py index 4c11fa2..ce61b13 100644 --- a/model/model.py +++ b/model/model.py @@ -193,15 +193,12 @@ def train(self, use_pretrained_net=False): """ Trains the model on the dataset """ print("Starting training...") - if self.use_pretrained: + if self.use_pretrained and \ + (self.use_pretrained_net and use_pretrained_net) or \ + (not self.use_pretrained_net and not use_pretrained_net): self._session.run(self.embedding_init, feed_dict={self.embedding_placeholder: self.data.embedding_matrix}) - self.train_writer = \ - tf.summary.FileWriter(self.logging_dir + '/' + TENSOR_DIR_TRAIN, - self._session.graph) - self.valid_writer = \ - tf.summary.FileWriter(self.logging_dir + '/' + TENSOR_DIR_VALID) old_epoch = 0 @@ -209,7 +206,7 @@ def train(self, use_pretrained_net=False): self.validate() # Train for a specified amount of epochs - for i in self.data.for_n_train_epochs(self.batch_size, self.training_epochs): + for i in self.data.for_n_train_epochs(self.training_epochs, self.batch_size): # Debug print out epoch = self.data.completed_training_epochs diff --git a/model/model_builder.py b/model/model_builder.py index 74ffa7a..39d27cb 100644 --- a/model/model_builder.py +++ b/model/model_builder.py @@ -24,6 +24,7 @@ import tensorflow as tf from model.model import Model +from definitions import * class ModelBuilder(object): """A class following the builder pattern to create a model""" @@ -315,6 +316,11 @@ def 
add_precision_operations(self): def build(self): """Adds saver and init operation and returns the model""" + self._model.train_writer = \ + tf.summary.FileWriter(self._model.logging_dir + '/' + TENSOR_DIR_TRAIN, + self._model._session.graph) + self._model.valid_writer = \ + tf.summary.FileWriter(self._model.logging_dir + '/' + TENSOR_DIR_VALID) self._model.init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) self._model.saver = tf.train.Saver() diff --git a/model/util/data.py b/model/util/data.py index 7300967..3da2866 100644 --- a/model/util/data.py +++ b/model/util/data.py @@ -257,7 +257,7 @@ def next_test_batch(self, batch_size=None): batch_y.append(label_vec) return batch_x, batch_sub, batch_y - def for_n_train_epochs(self, batch_size=25, num_epochs=1): + def for_n_train_epochs(self, num_epochs=1, batch_size=25): # TODO Ta bort parameterar """ Calculates how many training iterations to do for num_epochs number of epochs with a batch size of batch_size""" From 193811de992c561a8c88e09d62c1777ff6a23ec6 Mon Sep 17 00:00:00 2001 From: Jesper Jaxing Date: Wed, 5 Apr 2017 11:09:52 +0200 Subject: [PATCH 06/60] changed which variable was used to check if pre-training --- model/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model/model.py b/model/model.py index ce61b13..998099e 100644 --- a/model/model.py +++ b/model/model.py @@ -227,7 +227,7 @@ def train(self, use_pretrained_net=False): self._session.run(self.epoch.assign_add(1)) print("Epoch complete...old ", old_epoch) self.save_checkpoint() - if not self.use_pretrained_net: + if not use_pretrained_net: self.validate() old_epoch = epoch From e5be63835298f1640f5f7bd7ddd2bf126a84263e Mon Sep 17 00:00:00 2001 From: Jesper Jaxing Date: Wed, 5 Apr 2017 11:12:18 +0200 Subject: [PATCH 07/60] only add sec_output when pre-train and add an extra layer after --- main.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index 
78e9255..2b63eaa 100644 --- a/main.py +++ b/main.py @@ -45,8 +45,11 @@ def main(): for _ in range(config_file[HIDDEN_LAYERS]): builder.add_layer(config_file[HIDDEN_NEURONS]) + if builder._model.use_pretrained_net: + builder.add_secondary_output() + builder.add_layer(config_file[HIDDEN_NEURONS]) + builder.add_output_layer()\ - .add_secondary_output()\ .add_precision_operations() network_model = builder.build() From d94d187f9dff5da850fdf53a3a767edce6971d84 Mon Sep 17 00:00:00 2001 From: Jesper Jaxing Date: Wed, 5 Apr 2017 17:38:52 +0200 Subject: [PATCH 08/60] changed config to not include unnecissary parameters --- config.template.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/config.template.yaml b/config.template.yaml index 3ff473a..4b68e69 100644 --- a/config.template.yaml +++ b/config.template.yaml @@ -11,7 +11,6 @@ network: validation_data: 'validation_data_top_n_single.csv' training_data: 'training_data_top_n_single.csv' testing_data: 'testing_data_top_n_single.csv' - pre_train_data: 'pre-train_data_top_n.csv' # Embedding matrix configs: embedding_size: 150 # Make sure to match pretrained matrix dimensions trainable_matrix: true From fa9859cdee00e07cc730718ef0f854cad8ce515a Mon Sep 17 00:00:00 2001 From: Jesper Jaxing Date: Wed, 5 Apr 2017 17:46:21 +0200 Subject: [PATCH 09/60] cleaned main.py by adding it to builder --- main.py | 14 +------------- model/model_builder.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/main.py b/main.py index 2b63eaa..6780e31 100644 --- a/main.py +++ b/main.py @@ -39,20 +39,8 @@ def main(): config_file = networkconfig[conf] with tf.Session() as sess: builder = ModelBuilder(config_file, sess) - builder.add_input_layer() - # Add a number of hidden layers - for _ in range(config_file[HIDDEN_LAYERS]): - builder.add_layer(config_file[HIDDEN_NEURONS]) - - if builder._model.use_pretrained_net: - builder.add_secondary_output() - builder.add_layer(config_file[HIDDEN_NEURONS]) - - 
builder.add_output_layer()\ - .add_precision_operations() - - network_model = builder.build() + network_model = builder.add_as_conf().build() if config_file[USE_PRETRAINED_NET]: network_model.train(USE_PRETRAINED_NET) network_model.train() diff --git a/model/model_builder.py b/model/model_builder.py index 39d27cb..14b243b 100644 --- a/model/model_builder.py +++ b/model/model_builder.py @@ -326,3 +326,17 @@ def build(self): self._model.saver = tf.train.Saver() self._model.load_checkpoint() return self._model + + def add_as_conf(self): + self.add_input_layer() + + # Add a number of hidden layers + for _ in range(self._model.hidden_layers): + self.add_layer(self._model.hidden_neurons) + + if self._model.use_pretrained_net: + self.add_secondary_output() + + self.add_output_layer() \ + .add_precision_operations() + return self From 1de1133b19762257809e361d5066135f50e2166f Mon Sep 17 00:00:00 2001 From: Jesper Jaxing Date: Wed, 5 Apr 2017 17:50:16 +0200 Subject: [PATCH 10/60] sec_output uses softmax since a title only have one subreddit --- model/model_builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model/model_builder.py b/model/model_builder.py index 14b243b..c210b8a 100644 --- a/model/model_builder.py +++ b/model/model_builder.py @@ -205,7 +205,7 @@ def add_secondary_output(self): # Training # Defne error function - error = tf.nn.sigmoid_cross_entropy_with_logits( + error = tf.nn.softmax_cross_entropy_with_logits( labels=self._model.sec_target, logits=logits) From 5c2db6cfb133b7639c366b0087c8fe47a01e53dd Mon Sep 17 00:00:00 2001 From: Jesper Jaxing Date: Wed, 5 Apr 2017 18:15:10 +0200 Subject: [PATCH 11/60] refactored add output method to one method --- model/model_builder.py | 77 ++++++++++++------------------------------ 1 file changed, 22 insertions(+), 55 deletions(-) diff --git a/model/model_builder.py b/model/model_builder.py index c210b8a..b16fc2d 100644 --- a/model/model_builder.py +++ b/model/model_builder.py @@ -138,77 +138,38 @@ 
def add_layer(self, number_of_neurons): return self - def add_output_layer(self): + def add_output_layer(self, output_size, secondary_output=False): """Adds an output layer, including error and optimisation functions. After this method no new layers should be added.""" # Output layer # Feed the output of the previous layer to a sigmoid layer sigmoid_weights = tf.Variable(tf.random_normal( - [self._model.latest_layer.get_shape()[1].value, self._model.user_count], + [self._model.latest_layer.get_shape()[1].value, output_size], stddev=0.35, dtype=tf.float64), name="output_weights") - sigmoid_bias = tf.Variable(tf.random_normal([self._model.user_count], + sigmoid_bias = tf.Variable(tf.random_normal([output_size], stddev=0.35, dtype=tf.float64), name="output_biases") logits = tf.add(tf.matmul(self._model.latest_layer, sigmoid_weights), sigmoid_bias) - self._model.sigmoid = tf.nn.sigmoid(logits) - # Training - - # Defne error function - error = tf.nn.sigmoid_cross_entropy_with_logits(labels=self._model.target, - logits=logits) - - if self._model.use_l2_loss: - cross_entropy = \ - tf.reduce_mean(tf.add( - tf.add(error, - tf.multiply(self._model.l2_factor, - tf.nn.l2_loss(sigmoid_weights))), - tf.add(tf.multiply(self._model.l2_factor, - tf.nn.l2_loss(sigmoid_bias)), - tf.multiply(self._model.l2_factor, - self._model.l2_term)))) + if secondary_output: + error = tf.nn.softmax_cross_entropy_with_logits( + labels=self._model.sec_target, + logits=logits) else: - cross_entropy = tf.reduce_mean(error) - - self._model.error = cross_entropy - self._model.train_op = tf.train.AdamOptimizer( - self._model.learning_rate).minimize(cross_entropy) + self._model.sigmoid = tf.nn.sigmoid(logits) + # Defne error function + error = tf.nn.sigmoid_cross_entropy_with_logits( + labels=self._model.target, + logits=logits) - return self - - def add_secondary_output(self): - """Adds a layer that can be used to train the network on data that is - labeled in a different way than the final data""" - # 
Output layer - # Feed the output of the previous layer to a sigmoid layer - sigmoid_weights = tf.Variable(tf.random_normal( - [self._model.latest_layer.get_shape()[1].value, - self._model.subreddit_count], - stddev=0.35, - dtype=tf.float64), - name="secondary_output_weights") - - sigmoid_bias = tf.Variable(tf.random_normal([self._model.subreddit_count], - stddev=0.35, - dtype=tf.float64), - name="secondary_output_biases") - - logits = tf.add(tf.matmul(self._model.latest_layer, sigmoid_weights), - sigmoid_bias) # Training - # Defne error function - error = tf.nn.softmax_cross_entropy_with_logits( - labels=self._model.sec_target, - logits=logits) - if self._model.use_l2_loss: cross_entropy = \ tf.reduce_mean(tf.add( @@ -222,8 +183,14 @@ def add_secondary_output(self): else: cross_entropy = tf.reduce_mean(error) - self._model.pre_train_op = tf.train.AdamOptimizer( - self._model.learning_rate).minimize(cross_entropy) + if secondary_output: + self._model.pre_train_op = tf.train.AdamOptimizer( + self._model.learning_rate).minimize(cross_entropy) + else: + self._model.error = cross_entropy + self._model.train_op = tf.train.AdamOptimizer( + self._model.learning_rate).minimize(cross_entropy) + return self def add_precision_operations(self): @@ -335,8 +302,8 @@ def add_as_conf(self): self.add_layer(self._model.hidden_neurons) if self._model.use_pretrained_net: - self.add_secondary_output() + self.add_output_layer(self._model.subreddit_count, secondary_output=True) - self.add_output_layer() \ + self.add_output_layer(self._model.user_count) \ .add_precision_operations() return self From 2374ed40c6ea268f213231f0cacc3ad202bd6cb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20H=C3=A5kansson?= Date: Wed, 5 Apr 2017 11:40:56 +0200 Subject: [PATCH 12/60] Adds the option to choose between GRU and LSTM units --- config.template.yaml | 3 ++- definitions.py | 3 ++- model/model.py | 3 ++- model/model_builder.py | 10 ++++++++-- model/util/writer.py | 2 +- 5 files changed, 15 
insertions(+), 6 deletions(-) diff --git a/config.template.yaml b/config.template.yaml index 115f562..1199a43 100644 --- a/config.template.yaml +++ b/config.template.yaml @@ -20,7 +20,8 @@ network: learning_rate: 0.5 training_epochs: 5 batch_size: 25 - lstm_neurons: 200 + rnn_neurons: 200 + rnn_unit: 'lstm' # Can be 'gru' or 'lstm', default: 'lstm' hidden_layers: 0 hidden_neurons: 300 use_concat_input: false diff --git a/definitions.py b/definitions.py index fd70b22..efdcf9e 100644 --- a/definitions.py +++ b/definitions.py @@ -40,7 +40,8 @@ LEARN_RATE = 'learning_rate' EMBEDD_SIZE = 'embedding_size' MAX_TITLE_LENGTH = 'max_title_length' -LSTM_NEURONS = 'lstm_neurons' +RNN_NEURONS = 'rnn_neurons' +RNN_UNIT = "rnn_unit" HIDDEN_NEURONS = 'hidden_neurons' HIDDEN_LAYERS = 'hidden_layers' USE_CONCAT_INPUT = 'use_concat_input' diff --git a/model/model.py b/model/model.py index 10fe2ea..c028882 100644 --- a/model/model.py +++ b/model/model.py @@ -53,7 +53,8 @@ def __init__(self, config, session): self.learning_rate = config[LEARN_RATE] self.embedding_size = config[EMBEDD_SIZE] self.max_title_length = config[MAX_TITLE_LENGTH] - self.lstm_neurons = config[LSTM_NEURONS] + self.rnn_neurons = config[RNN_NEURONS] + self.rnn_unit = config[RNN_UNIT] self.batch_size = config[BATCH_SIZE] self.training_epochs = config[TRAINING_EPOCHS] self.use_l2_loss = config[USE_L2_LOSS] diff --git a/model/model_builder.py b/model/model_builder.py index a1b4f39..02a5e5b 100644 --- a/model/model_builder.py +++ b/model/model_builder.py @@ -55,7 +55,13 @@ def add_input_layer(self): self._model.keep_prob = tf.placeholder(tf.float64, name="keep_prob") - lstm_layer = tf.contrib.rnn.LSTMCell(self._model.lstm_neurons, state_is_tuple=True) + if self._model.rnn_unit == 'lstm': + rnn_layer = tf.contrib.rnn.LSTMCell(self._model.rnn_neurons) + elif self._model.rnn_unit == 'gru': + rnn_layer = tf.contrib.rnn.GRUCell(self._model.rnn_neurons) + else: + print("Incorrect RNN unit, defaulting to LSTM") + rnn_layer = 
tf.contrib.rnn.LSTMCell(self._model.rnn_neurons) # Embedding matrix for the words embedding_matrix = tf.Variable( @@ -76,7 +82,7 @@ def add_input_layer(self): embedded_input = tf.nn.embedding_lookup(embedding_matrix, self._model.input) # Run the LSTM layer with the embedded input - outputs, _ = tf.nn.dynamic_rnn(lstm_layer, embedded_input, + outputs, _ = tf.nn.dynamic_rnn(rnn_layer, embedded_input, dtype=tf.float64) outputs = tf.transpose(outputs, [1, 0, 2]) diff --git a/model/util/writer.py b/model/util/writer.py index 09b8152..9b452ae 100644 --- a/model/util/writer.py +++ b/model/util/writer.py @@ -33,7 +33,7 @@ def log_samefile(config, f1_score_valid, f1_score_train, epoch_top, prec_valid, if not os.path.exists(LOGS_DIR): raise FileNotFoundError('Can not write because no directory is created') - config_headers = [NET_TYPE, NET_NAME, VOC_SIZE, USER_COUNT, LEARN_RATE, EMBEDD_SIZE, MAX_TITLE_LENGTH, LSTM_NEURONS, + config_headers = [NET_TYPE, NET_NAME, VOC_SIZE, USER_COUNT, LEARN_RATE, EMBEDD_SIZE, MAX_TITLE_LENGTH, RNN_NEURONS, RNN_UNIT, HIDDEN_NEURONS, HIDDEN_LAYERS, USE_CONCAT_INPUT, BATCH_SIZE, TRAINING_EPOCHS, USE_L2_LOSS, L2_FACTOR, USE_DROPOUT, DROPOUT_PROB, USE_CONSTANT_LIMIT, CONSTANT_PREDICTION_LIMIT, TRAINABLE_MATRIX, PRE_TRAINED_MATRIX, USE_PRETRAINED, VALIDATION_DATA, TRAINING_DATA, TESTING_DATA] From f46c8ce7e1688ccfcd735212111c2d6554e3bd57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20H=C3=A5kansson?= Date: Wed, 5 Apr 2017 13:40:25 +0200 Subject: [PATCH 13/60] Adds a missing variable name change --- model/model_builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model/model_builder.py b/model/model_builder.py index 02a5e5b..1121a33 100644 --- a/model/model_builder.py +++ b/model/model_builder.py @@ -101,7 +101,7 @@ def add_layer(self, number_of_neurons): if not self.added_layers: self.added_layers = True weights = tf.Variable(tf.random_normal( - [self._model.lstm_neurons + + [self._model.rnn_neurons + (1 if 
self._model.use_concat_input else 0), number_of_neurons], stddev=0.35, From 634d79b864b931c30e50c5e3fd52dfeaa3c7d9b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20H=C3=A5kansson?= Date: Wed, 5 Apr 2017 17:47:13 +0200 Subject: [PATCH 14/60] Changes default dataset in config template --- config.template.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/config.template.yaml b/config.template.yaml index 1199a43..12d9cb9 100644 --- a/config.template.yaml +++ b/config.template.yaml @@ -8,9 +8,9 @@ network: vocabulary_size: 10000 user_count: 6 max_title_length: 30 - validation_data: 'validation_data_top_n_single.csv' - training_data: 'training_data_top_n_single.csv' - testing_data: 'testing_data_top_n_single.csv' + validation_data: 'validation_data_top_5_subreddit_allvotes.csv' + training_data: 'training_data_top_5_subreddit_allvotes.csv' + testing_data: 'testing_data_top_5_subreddit_allvotes.csv' # Embedding matrix configs: embedding_size: 150 # Make sure to match pretrained matrix dimensions trainable_matrix: true From d73a5c48bacb30c4e8d192cb275fd0b0be1c35b3 Mon Sep 17 00:00:00 2001 From: Jesper Jaxing Date: Mon, 3 Apr 2017 13:28:45 +0200 Subject: [PATCH 15/60] merge --- config.template.yaml | 1 + main.py | 2 ++ model/model.py | 46 ++++++++++++++++++++++++++++++++++++++++ model/util/csv_reader.py | 2 +- model/util/data.py | 5 +++++ 5 files changed, 55 insertions(+), 1 deletion(-) diff --git a/config.template.yaml b/config.template.yaml index 12d9cb9..d70c202 100644 --- a/config.template.yaml +++ b/config.template.yaml @@ -25,6 +25,7 @@ network: hidden_layers: 0 hidden_neurons: 300 use_concat_input: false + pre_train_subreddit: false # Regularisation configs: use_l2_loss: false l2_factor: 0.01 diff --git a/main.py b/main.py index dd048e3..d2c06d6 100644 --- a/main.py +++ b/main.py @@ -39,6 +39,8 @@ def main(): with tf.Session() as sess: builder = ModelBuilder(config_file, sess) network_model = builder.build() + if 
config_file["pre-train-subreddit"]: + network_model.pre_train() network_model.train() network_model.close_writers() tf.reset_default_graph() diff --git a/model/model.py b/model/model.py index c028882..83b2758 100644 --- a/model/model.py +++ b/model/model.py @@ -232,6 +232,52 @@ def train(self): epoch_top=self.epoch_top, prec_valid=self.prec_valid, prec_train=self.prec_train, recall_valid=self.recall_valid, recall_train=self.recall_train) + def pre_train(self): + """ Pre-trains the model on the pre-training dataset """ + print("Starting pre-training...") + + old_epoch = 0 + + if self.use_pretrained: + self._session.run(self.embedding_init, feed_dict={ + self.embedding_placeholder: + self.data.embedding_matrix}) + + tmp_data, tmp_labels = self.data.get_training() + tmp_size = self.data.train_size + + self.data.train_data = self.data.pre_train_data + self.data.train_labels = self.data.pre_train_data + self.data.train_size = self.data.pre_train_size + + # Train for a specified amount of epochs + for i in self.data.for_n_train_epochs(self.training_epochs, + self.batch_size): + # Debug print out + epoch = self.data.completed_training_epochs + training_error = self.train_batch() + # validation_error = self.validate_batch() + + # Don't validate so often + if i % (self.data.train_size // self.batch_size // 10) == 0 and i: + done = self.data.percent_of_epoch + print( + "Validation error: {:f} | Training error: {:f} | Done: {:.0%}" + .format(0, training_error, done)) + + # Do a full evaluation once an epoch is complete + if epoch != old_epoch: + self._session.run(self.epoch.assign_add(1)) + print("Epoch complete...old ", old_epoch) + self.save_checkpoint() + # self.validate() + old_epoch = epoch + + self.data.train_data = tmp_data + self.data.train_labels = tmp_labels + self.data.train_size = tmp_size + self.data.completed_training_epochs = 0 + def train_batch(self): """ Trains for one batch and returns cross entropy error """ with tf.device("/cpu:0"): diff --git 
a/model/util/csv_reader.py b/model/util/csv_reader.py index c5b7526..9913c68 100644 --- a/model/util/csv_reader.py +++ b/model/util/csv_reader.py @@ -34,7 +34,7 @@ class Dataenum(Enum): TESTING = "testing_data" TRAINING = "training_data" VALIDATION = "validation_data" - + PRE_TRAINING = "pre_training_data" class CsvReader: def __init__(self, netcfg): diff --git a/model/util/data.py b/model/util/data.py index 4d9ac54..b50fb31 100644 --- a/model/util/data.py +++ b/model/util/data.py @@ -56,6 +56,11 @@ def __init__(self, networkconfig): def _read_data(self): """ Reads all the data from specified path """ + logging.debug("Reading pre-training data...") + + self.pre_train_data, self.pre_train_labels = self.reader.get_data(Dataenum.PRE_TRAINING) + self.pre_train_size = len(self.pre_train_data) + logging.debug("Reading training data...") self.train_data, self.train_subreddits, self.train_labels = \ From 468d07dacd3855707937b88777cc46966ec5c086 Mon Sep 17 00:00:00 2001 From: Jesper Jaxing Date: Tue, 4 Apr 2017 10:55:16 +0200 Subject: [PATCH 16/60] added method to add secondary output layer --- model/model_builder.py | 56 +++++++++++++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 9 deletions(-) diff --git a/model/model_builder.py b/model/model_builder.py index 1121a33..e2e196e 100644 --- a/model/model_builder.py +++ b/model/model_builder.py @@ -52,6 +52,10 @@ def add_input_layer(self): tf.placeholder(tf.float64, [None, self._model.user_count], name="target") + self._model.sec_target = \ + tf.placeholder(tf.float64, + [None, self._model.data.subreddit_count], + name="sec_target") self._model.keep_prob = tf.placeholder(tf.float64, name="keep_prob") @@ -184,6 +188,49 @@ def add_output_layer(self): return self + def add_secondary_output(self): + """Adds a layer that can be used to train the network on data that is + labeled in a different way than the final data""" + # Output layer + # Feed the output of the previous layer to a sigmoid layer + sigmoid_weights 
= tf.Variable(tf.random_normal( + [self._model.latest_layer.get_shape()[1].value, + self._model.subreddit_count], + stddev=0.35, + dtype=tf.float64), + name="secondary_output_weights") + + sigmoid_bias = tf.Variable(tf.random_normal([self._model.subreddit_count], + stddev=0.35, + dtype=tf.float64), + name="secondary_output_biases") + + logits = tf.add(tf.matmul(self._model.latest_layer, sigmoid_weights), + sigmoid_bias) + # Training + + # Defne error function + error = tf.nn.sigmoid_cross_entropy_with_logits( + labels=self._model.sec_target, + logits=logits) + + if self._model.use_l2_loss: + cross_entropy = \ + tf.reduce_mean(tf.add( + tf.add(error, + tf.multiply(self._model.l2_factor, + tf.nn.l2_loss(sigmoid_weights))), + tf.add(tf.multiply(self._model.l2_factor, + tf.nn.l2_loss(sigmoid_bias)), + tf.multiply(self._model.l2_factor, + self._model.l2_term)))) + else: + cross_entropy = tf.reduce_mean(error) + + self._model.pre_train_op = tf.train.AdamOptimizer( + self._model.learning_rate).minimize(cross_entropy) + return self + def add_precision_operations(self): """Adds precision operation and tensorboard operations""" # Determine which prediction function to use. 
Casts a tensor to @@ -273,15 +320,6 @@ def add_precision_operations(self): def build(self): """Adds saver and init operation and returns the model""" - self.add_input_layer() - - # Add a number of hidden layers - for _ in range(self._model.hidden_layers): - self.add_layer(self._model.hidden_neurons) - - self.add_output_layer() - - self.add_precision_operations() self._model.init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) From 2e957a375bfee31df6d981ed0c2b99d0def405b9 Mon Sep 17 00:00:00 2001 From: Jesper Jaxing Date: Tue, 4 Apr 2017 10:58:53 +0200 Subject: [PATCH 17/60] implemented pre-training on subreddits --- main.py | 16 ++++++- model/model.py | 114 +++++++++++++++++++------------------------------ 2 files changed, 57 insertions(+), 73 deletions(-) diff --git a/main.py b/main.py index d2c06d6..68694b2 100644 --- a/main.py +++ b/main.py @@ -23,6 +23,7 @@ # ============================================================================== import argparse import tensorflow as tf +from definitions import * from model.util.networkconfig import yamlconfig as networkconfig from model.model_builder import ModelBuilder @@ -38,9 +39,20 @@ def main(): config_file = networkconfig[conf] with tf.Session() as sess: builder = ModelBuilder(config_file, sess) + builder.add_input_layer() + + # Add a number of hidden layers + for _ in range(config_file[HIDDEN_LAYERS]): + builder.add_layer(config_file[HIDDEN_NEURONS]) + + builder.add_output_layer()\ + .add_secondary_output()\ + .add_precision_operations() + network_model = builder.build() - if config_file["pre-train-subreddit"]: - network_model.pre_train() + if config_file[USE_PRETRAINED_NET]: + network_model.train(USE_PRETRAINED_NET) + network_model.data.completed_training_epochs = 0 network_model.train() network_model.close_writers() tf.reset_default_graph() diff --git a/model/model.py b/model/model.py index 83b2758..18f8a0a 100644 --- a/model/model.py +++ b/model/model.py @@ -68,13 +68,17 @@ 
def __init__(self, config, session): self.use_constant_limit = config[USE_CONSTANT_LIMIT] self.constant_prediction_limit = config[CONSTANT_PREDICTION_LIMIT] self.use_concat_input = config[USE_CONCAT_INPUT] + self.use_pretrained_net = config[USE_PRETRAINED_NET] + self.subreddit_count = 0 # Will be set in build_graph self.input = None self.subreddit_input = None self.target = None + self.sec_target = None self.sigmoid = None self.train_op = None + self.pre_train_op = None self.error = None self.init_op = None self.saver = None @@ -106,6 +110,7 @@ def __init__(self, config, session): with tf.device("/cpu:0"): self.data = data.Data(config) + self.subreddit_count = self.data.subreddit_count if self.use_pretrained: self.vocabulary_size = len(self.data.embedding_matrix) @@ -185,7 +190,7 @@ def validate_batch(self): # TODO funktionen gör alldeles för mycket, # dela upp utskrift, beräkning och träning - def train(self): + def train(self, use_pretrained_net=False): """ Trains the model on the dataset """ print("Starting training...") @@ -201,29 +206,33 @@ def train(self): old_epoch = 0 - if self.epoch.eval(self._session) == 0: + if self.epoch.eval(self._session) == 0 and not use_pretrained_net: self.validate() # Train for a specified amount of epochs - for i in self.data.for_n_train_epochs(self.training_epochs, - self.batch_size): + for i in self.data.for_n_train_epochs(self.batch_size, self.training_epochs): # Debug print out epoch = self.data.completed_training_epochs - training_error = self.train_batch() - validation_error = self.validate_batch() - # Don't validate so often - if i % (self.data.train_size // self.batch_size // 10) == 0 and i: - done = self.data.percent_of_epoch - print("Validation error: {:f} | Training error: {:f} | Done: {:.0%}" - .format(validation_error, training_error, done)) + if not use_pretrained_net: + training_error = self.train_batch() + validation_error = self.validate_batch() + + # Don't validate so often + if i % (self.data.train_size // 
self.batch_size // 10) == 0 and i: + done = self.data.percent_of_epoch + print("Validation error: {:f} | Training error: {:f} | Done: {:.0%}" + .format(validation_error, training_error, done)) + else: + self.train_batch(True) # Do a full evaluation once an epoch is complete if epoch != old_epoch: self._session.run(self.epoch.assign_add(1)) print("Epoch complete...old ", old_epoch) self.save_checkpoint() - self.validate() + if not self.use_pretrained_net: + self.validate() old_epoch = epoch # Save model when done training @@ -232,67 +241,30 @@ def train(self): epoch_top=self.epoch_top, prec_valid=self.prec_valid, prec_train=self.prec_train, recall_valid=self.recall_valid, recall_train=self.recall_train) - def pre_train(self): - """ Pre-trains the model on the pre-training dataset """ - print("Starting pre-training...") - - old_epoch = 0 - - if self.use_pretrained: - self._session.run(self.embedding_init, feed_dict={ - self.embedding_placeholder: - self.data.embedding_matrix}) - - tmp_data, tmp_labels = self.data.get_training() - tmp_size = self.data.train_size - - self.data.train_data = self.data.pre_train_data - self.data.train_labels = self.data.pre_train_data - self.data.train_size = self.data.pre_train_size - - # Train for a specified amount of epochs - for i in self.data.for_n_train_epochs(self.training_epochs, - self.batch_size): - # Debug print out - epoch = self.data.completed_training_epochs - training_error = self.train_batch() - # validation_error = self.validate_batch() - - # Don't validate so often - if i % (self.data.train_size // self.batch_size // 10) == 0 and i: - done = self.data.percent_of_epoch - print( - "Validation error: {:f} | Training error: {:f} | Done: {:.0%}" - .format(0, training_error, done)) - - # Do a full evaluation once an epoch is complete - if epoch != old_epoch: - self._session.run(self.epoch.assign_add(1)) - print("Epoch complete...old ", old_epoch) - self.save_checkpoint() - # self.validate() - old_epoch = epoch - - 
self.data.train_data = tmp_data - self.data.train_labels = tmp_labels - self.data.train_size = tmp_size - self.data.completed_training_epochs = 0 - - def train_batch(self): + def train_batch(self, pre_train_net=False): """ Trains for one batch and returns cross entropy error """ with tf.device("/cpu:0"): - batch_input, batch_sub, batch_label = \ - self.data.next_train_batch() - - self._session.run(self.train_op, - {self.input: batch_input, - self.subreddit_input: batch_sub, - self.target: batch_label}) - - return self._session.run(self.error, - feed_dict={self.input: batch_input, - self.subreddit_input: batch_sub, - self.target: batch_label}) + if not pre_train_net: + batch_input, batch_sub, batch_label = \ + self.data.next_train_batch() + else: + batch_input, batch_label = \ + self.data.next_pre_train_batch() + + if pre_train_net: + self._session.run(self.pre_train_op, + {self.input: batch_input, + self.sec_target: batch_label}) + else: + self._session.run(self.train_op, + {self.input: batch_input, + self.subreddit_input: batch_sub, + self.target: batch_label}) + + return self._session.run(self.error, + feed_dict={self.input: batch_input, + self.subreddit_input: batch_sub, + self.target: batch_label}) def close_writers(self): """ Close tensorboard writers """ self.train_writer.close() From 796a81ded518244e0fa01162ad4f1d9358b34717 Mon Sep 17 00:00:00 2001 From: Jesper Jaxing Date: Tue, 4 Apr 2017 10:59:30 +0200 Subject: [PATCH 18/60] create methods for handling pre-training data --- definitions.py | 1 + model/util/data.py | 45 +++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/definitions.py b/definitions.py index efdcf9e..6ff5dac 100644 --- a/definitions.py +++ b/definitions.py @@ -56,6 +56,7 @@ TRAINABLE_MATRIX = 'trainable_matrix' PRE_TRAINED_MATRIX = 'pre_trained_matrix' USE_PRETRAINED = 'use_pretrained' +USE_PRETRAINED_NET = 'pre_train_subreddit' VALIDATION_DATA = 'validation_data' TRAINING_DATA = 
'training_data' TESTING_DATA = 'testing_data' diff --git a/model/util/data.py b/model/util/data.py index b50fb31..7300967 100644 --- a/model/util/data.py +++ b/model/util/data.py @@ -35,8 +35,11 @@ def __init__(self, networkconfig): self._current_train_index = 0 self._current_valid_index = 0 self._current_test_index = 0 + self._current_pre_train_index = 0 self.completed_training_epochs = 0 + self.completed_pre_training_epochs = 0 self.percent_of_epoch = 0.0 + self.subreddit_count = 0 self.title_length = networkconfig['max_title_length'] self.batch_size = self.netcfg['batch_size'] self.reader = CsvReader(networkconfig) @@ -58,8 +61,8 @@ def _read_data(self): """ Reads all the data from specified path """ logging.debug("Reading pre-training data...") - self.pre_train_data, self.pre_train_labels = self.reader.get_data(Dataenum.PRE_TRAINING) - self.pre_train_size = len(self.pre_train_data) + # self.pre_train_data, self.pre_train_labels = self.reader.get_data(Dataenum.TRAINING, label_column=1) + # self.pre_train_size = len(self.pre_train_data) logging.debug("Reading training data...") @@ -97,6 +100,7 @@ def _build_dict(self): subreddits = " ".join(self.train_subreddits).split() self.subreddit_dict = helper.build_subreddit_dict(subreddits) + self.subreddit_count = len(self.subreddit_dict) def next_train_batch(self, batch_size=None): """ Get the next batch of training data """ @@ -136,6 +140,39 @@ def next_train_batch(self, batch_size=None): self.percent_of_epoch = self._current_train_index / self.train_size return batch_x, batch_sub, batch_y + def next_pre_train_batch(self, batch_size=None): + """ Get the next batch of training data """ + batch_size = batch_size or self.batch_size + batch_x = [] + batch_y = [] + for _ in range(0, batch_size): + sentence = self.train_data[self._current_train_index] + label = self.train_labels[self._current_train_index] + self._current_pre_train_index += 1 + # Support multiple epochs + if self._current_pre_train_index >= self.train_size: + 
self._current_pre_train_index = 0 + self.completed_pre_training_epochs += 1 + self.percent_of_epoch = 0.0 + # TODO Detta ska inte ligga i funktionen som generar ny data + + # Turn sentences and labels into vector representations + sentence_vec, present, absent = \ + helper.get_indicies(sentence, + self.word_dict, + self.max_title_length) + self.train_present += present + self.train_absent += absent + + label_vec = helper.label_vector(label, + self.subreddit_dict, + self.subreddit_count) + batch_x.append(sentence_vec) + batch_y.append(label_vec) + + self.percent_of_epoch = self._current_train_index / self.train_size + return batch_x, batch_y + def get_validation(self): """ Get the whole validation set in a vectorized form """ old_ind = self._current_valid_index @@ -220,10 +257,10 @@ def next_test_batch(self, batch_size=None): batch_y.append(label_vec) return batch_x, batch_sub, batch_y - def for_n_train_epochs(self, num_epochs=1, batch_size=25): + def for_n_train_epochs(self, batch_size=25, num_epochs=1): # TODO Ta bort parameterar """ Calculates how many training iterations to do for num_epochs - number of epochs with a batch size of batch_size """ + number of epochs with a batch size of batch_size""" return range((self.train_size * num_epochs) // batch_size) def get_training(self): From 81c149d4fd035012eb1b00c30dbe2b2f661f9039 Mon Sep 17 00:00:00 2001 From: Jesper Jaxing Date: Tue, 4 Apr 2017 14:10:06 +0200 Subject: [PATCH 19/60] non-working commit to allow for debugg help. 
--- main.py | 1 - model/model.py | 11 ++++------- model/model_builder.py | 6 ++++++ model/util/data.py | 2 +- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/main.py b/main.py index 68694b2..78e9255 100644 --- a/main.py +++ b/main.py @@ -52,7 +52,6 @@ def main(): network_model = builder.build() if config_file[USE_PRETRAINED_NET]: network_model.train(USE_PRETRAINED_NET) - network_model.data.completed_training_epochs = 0 network_model.train() network_model.close_writers() tf.reset_default_graph() diff --git a/model/model.py b/model/model.py index 18f8a0a..726e8d5 100644 --- a/model/model.py +++ b/model/model.py @@ -194,15 +194,12 @@ def train(self, use_pretrained_net=False): """ Trains the model on the dataset """ print("Starting training...") - if self.use_pretrained: + if self.use_pretrained and \ + (self.use_pretrained_net and use_pretrained_net) or \ + (not self.use_pretrained_net and not use_pretrained_net): self._session.run(self.embedding_init, feed_dict={self.embedding_placeholder: self.data.embedding_matrix}) - self.train_writer = \ - tf.summary.FileWriter(self.logging_dir + '/' + TENSOR_DIR_TRAIN, - self._session.graph) - self.valid_writer = \ - tf.summary.FileWriter(self.logging_dir + '/' + TENSOR_DIR_VALID) old_epoch = 0 @@ -210,7 +207,7 @@ def train(self, use_pretrained_net=False): self.validate() # Train for a specified amount of epochs - for i in self.data.for_n_train_epochs(self.batch_size, self.training_epochs): + for i in self.data.for_n_train_epochs(self.training_epochs, self.batch_size): # Debug print out epoch = self.data.completed_training_epochs diff --git a/model/model_builder.py b/model/model_builder.py index e2e196e..4f96ef3 100644 --- a/model/model_builder.py +++ b/model/model_builder.py @@ -24,6 +24,7 @@ import tensorflow as tf from model.model import Model +from definitions import * class ModelBuilder(object): """A class following the builder pattern to create a model""" @@ -321,6 +322,11 @@ def 
add_precision_operations(self): def build(self): """Adds saver and init operation and returns the model""" + self._model.train_writer = \ + tf.summary.FileWriter(self._model.logging_dir + '/' + TENSOR_DIR_TRAIN, + self._model._session.graph) + self._model.valid_writer = \ + tf.summary.FileWriter(self._model.logging_dir + '/' + TENSOR_DIR_VALID) self._model.init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) self._model.saver = tf.train.Saver() diff --git a/model/util/data.py b/model/util/data.py index 7300967..3da2866 100644 --- a/model/util/data.py +++ b/model/util/data.py @@ -257,7 +257,7 @@ def next_test_batch(self, batch_size=None): batch_y.append(label_vec) return batch_x, batch_sub, batch_y - def for_n_train_epochs(self, batch_size=25, num_epochs=1): + def for_n_train_epochs(self, num_epochs=1, batch_size=25): # TODO Ta bort parameterar """ Calculates how many training iterations to do for num_epochs number of epochs with a batch size of batch_size""" From f5c49ed5ea037808e0354d3de22ba80a0695fed4 Mon Sep 17 00:00:00 2001 From: Jesper Jaxing Date: Wed, 5 Apr 2017 11:09:52 +0200 Subject: [PATCH 20/60] changed which variable was used to check if pre-training --- model/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model/model.py b/model/model.py index 726e8d5..4316c82 100644 --- a/model/model.py +++ b/model/model.py @@ -228,7 +228,7 @@ def train(self, use_pretrained_net=False): self._session.run(self.epoch.assign_add(1)) print("Epoch complete...old ", old_epoch) self.save_checkpoint() - if not self.use_pretrained_net: + if not use_pretrained_net: self.validate() old_epoch = epoch From 8c5cb78929a2555d5bd25ff139239ad4aa2f200e Mon Sep 17 00:00:00 2001 From: Jesper Jaxing Date: Wed, 5 Apr 2017 11:12:18 +0200 Subject: [PATCH 21/60] only add sec_output when pre-train and add an extra layer after --- main.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index 
78e9255..2b63eaa 100644 --- a/main.py +++ b/main.py @@ -45,8 +45,11 @@ def main(): for _ in range(config_file[HIDDEN_LAYERS]): builder.add_layer(config_file[HIDDEN_NEURONS]) + if builder._model.use_pretrained_net: + builder.add_secondary_output() + builder.add_layer(config_file[HIDDEN_NEURONS]) + builder.add_output_layer()\ - .add_secondary_output()\ .add_precision_operations() network_model = builder.build() From 7383a0b4fd6db1c9133362520ece9acaa040b2d5 Mon Sep 17 00:00:00 2001 From: Jesper Jaxing Date: Wed, 5 Apr 2017 17:46:21 +0200 Subject: [PATCH 22/60] cleaned main.py by adding it to builder --- main.py | 14 +------------- model/model_builder.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/main.py b/main.py index 2b63eaa..6780e31 100644 --- a/main.py +++ b/main.py @@ -39,20 +39,8 @@ def main(): config_file = networkconfig[conf] with tf.Session() as sess: builder = ModelBuilder(config_file, sess) - builder.add_input_layer() - # Add a number of hidden layers - for _ in range(config_file[HIDDEN_LAYERS]): - builder.add_layer(config_file[HIDDEN_NEURONS]) - - if builder._model.use_pretrained_net: - builder.add_secondary_output() - builder.add_layer(config_file[HIDDEN_NEURONS]) - - builder.add_output_layer()\ - .add_precision_operations() - - network_model = builder.build() + network_model = builder.add_as_conf().build() if config_file[USE_PRETRAINED_NET]: network_model.train(USE_PRETRAINED_NET) network_model.train() diff --git a/model/model_builder.py b/model/model_builder.py index 4f96ef3..bcb411b 100644 --- a/model/model_builder.py +++ b/model/model_builder.py @@ -332,3 +332,17 @@ def build(self): self._model.saver = tf.train.Saver() self._model.load_checkpoint() return self._model + + def add_as_conf(self): + self.add_input_layer() + + # Add a number of hidden layers + for _ in range(self._model.hidden_layers): + self.add_layer(self._model.hidden_neurons) + + if self._model.use_pretrained_net: + self.add_secondary_output() + 
+ self.add_output_layer() \ + .add_precision_operations() + return self From 157987d005a011c757462faaf67121ae47fbadb1 Mon Sep 17 00:00:00 2001 From: Jesper Jaxing Date: Wed, 5 Apr 2017 17:50:16 +0200 Subject: [PATCH 23/60] sec_output uses softmax since a title only have one subreddit --- model/model_builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model/model_builder.py b/model/model_builder.py index bcb411b..895f983 100644 --- a/model/model_builder.py +++ b/model/model_builder.py @@ -211,7 +211,7 @@ def add_secondary_output(self): # Training # Defne error function - error = tf.nn.sigmoid_cross_entropy_with_logits( + error = tf.nn.softmax_cross_entropy_with_logits( labels=self._model.sec_target, logits=logits) From 90086abcd19526e7ab62727a0d1ec7efdf6bde99 Mon Sep 17 00:00:00 2001 From: Jesper Jaxing Date: Wed, 5 Apr 2017 18:15:10 +0200 Subject: [PATCH 24/60] refactored add output method to one method --- model/model_builder.py | 77 ++++++++++++------------------------------ 1 file changed, 22 insertions(+), 55 deletions(-) diff --git a/model/model_builder.py b/model/model_builder.py index 895f983..f6010bb 100644 --- a/model/model_builder.py +++ b/model/model_builder.py @@ -144,77 +144,38 @@ def add_layer(self, number_of_neurons): return self - def add_output_layer(self): + def add_output_layer(self, output_size, secondary_output=False): """Adds an output layer, including error and optimisation functions. 
After this method no new layers should be added.""" # Output layer # Feed the output of the previous layer to a sigmoid layer sigmoid_weights = tf.Variable(tf.random_normal( - [self._model.latest_layer.get_shape()[1].value, self._model.user_count], + [self._model.latest_layer.get_shape()[1].value, output_size], stddev=0.35, dtype=tf.float64), name="output_weights") - sigmoid_bias = tf.Variable(tf.random_normal([self._model.user_count], + sigmoid_bias = tf.Variable(tf.random_normal([output_size], stddev=0.35, dtype=tf.float64), name="output_biases") logits = tf.add(tf.matmul(self._model.latest_layer, sigmoid_weights), sigmoid_bias) - self._model.sigmoid = tf.nn.sigmoid(logits) - # Training - - # Defne error function - error = tf.nn.sigmoid_cross_entropy_with_logits(labels=self._model.target, - logits=logits) - - if self._model.use_l2_loss: - cross_entropy = \ - tf.reduce_mean(tf.add( - tf.add(error, - tf.multiply(self._model.l2_factor, - tf.nn.l2_loss(sigmoid_weights))), - tf.add(tf.multiply(self._model.l2_factor, - tf.nn.l2_loss(sigmoid_bias)), - tf.multiply(self._model.l2_factor, - self._model.l2_term)))) + if secondary_output: + error = tf.nn.softmax_cross_entropy_with_logits( + labels=self._model.sec_target, + logits=logits) else: - cross_entropy = tf.reduce_mean(error) - - self._model.error = cross_entropy - self._model.train_op = tf.train.AdamOptimizer( - self._model.learning_rate).minimize(cross_entropy) + self._model.sigmoid = tf.nn.sigmoid(logits) + # Defne error function + error = tf.nn.sigmoid_cross_entropy_with_logits( + labels=self._model.target, + logits=logits) - return self - - def add_secondary_output(self): - """Adds a layer that can be used to train the network on data that is - labeled in a different way than the final data""" - # Output layer - # Feed the output of the previous layer to a sigmoid layer - sigmoid_weights = tf.Variable(tf.random_normal( - [self._model.latest_layer.get_shape()[1].value, - self._model.subreddit_count], - 
stddev=0.35, - dtype=tf.float64), - name="secondary_output_weights") - - sigmoid_bias = tf.Variable(tf.random_normal([self._model.subreddit_count], - stddev=0.35, - dtype=tf.float64), - name="secondary_output_biases") - - logits = tf.add(tf.matmul(self._model.latest_layer, sigmoid_weights), - sigmoid_bias) # Training - # Defne error function - error = tf.nn.softmax_cross_entropy_with_logits( - labels=self._model.sec_target, - logits=logits) - if self._model.use_l2_loss: cross_entropy = \ tf.reduce_mean(tf.add( @@ -228,8 +189,14 @@ def add_secondary_output(self): else: cross_entropy = tf.reduce_mean(error) - self._model.pre_train_op = tf.train.AdamOptimizer( - self._model.learning_rate).minimize(cross_entropy) + if secondary_output: + self._model.pre_train_op = tf.train.AdamOptimizer( + self._model.learning_rate).minimize(cross_entropy) + else: + self._model.error = cross_entropy + self._model.train_op = tf.train.AdamOptimizer( + self._model.learning_rate).minimize(cross_entropy) + return self def add_precision_operations(self): @@ -341,8 +308,8 @@ def add_as_conf(self): self.add_layer(self._model.hidden_neurons) if self._model.use_pretrained_net: - self.add_secondary_output() + self.add_output_layer(self._model.subreddit_count, secondary_output=True) - self.add_output_layer() \ + self.add_output_layer(self._model.user_count) \ .add_precision_operations() return self From 65cf80c4510392016e79dcac3b49adbbacc7c76b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20H=C3=A5kansson?= Date: Wed, 5 Apr 2017 18:48:29 +0200 Subject: [PATCH 25/60] Refactors away redundant function in model builder --- main.py | 2 +- model/model.py | 7 ++++--- model/model_builder.py | 33 ++++++++++++++++++--------------- 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/main.py b/main.py index 6780e31..030cb35 100644 --- a/main.py +++ b/main.py @@ -40,7 +40,7 @@ def main(): with tf.Session() as sess: builder = ModelBuilder(config_file, sess) - network_model = 
builder.add_as_conf().build() + network_model = builder.build() if config_file[USE_PRETRAINED_NET]: network_model.train(USE_PRETRAINED_NET) network_model.train() diff --git a/model/model.py b/model/model.py index 4316c82..78beb16 100644 --- a/model/model.py +++ b/model/model.py @@ -188,11 +188,12 @@ def validate_batch(self): self.subreddit_input: batch_sub, self.target: batch_label}) - # TODO funktionen gör alldeles för mycket, - # dela upp utskrift, beräkning och träning def train(self, use_pretrained_net=False): """ Trains the model on the dataset """ - print("Starting training...") + if use_pretrained_net: + print("Pre-training on subreddits...") + else: + print("Starting training...") if self.use_pretrained and \ (self.use_pretrained_net and use_pretrained_net) or \ diff --git a/model/model_builder.py b/model/model_builder.py index f6010bb..bf9f19d 100644 --- a/model/model_builder.py +++ b/model/model_builder.py @@ -165,8 +165,8 @@ def add_output_layer(self, output_size, secondary_output=False): if secondary_output: error = tf.nn.softmax_cross_entropy_with_logits( - labels=self._model.sec_target, - logits=logits) + labels=self._model.sec_target, + logits=logits) else: self._model.sigmoid = tf.nn.sigmoid(logits) # Defne error function @@ -289,27 +289,30 @@ def add_precision_operations(self): def build(self): """Adds saver and init operation and returns the model""" - self._model.train_writer = \ - tf.summary.FileWriter(self._model.logging_dir + '/' + TENSOR_DIR_TRAIN, - self._model._session.graph) - self._model.valid_writer = \ - tf.summary.FileWriter(self._model.logging_dir + '/' + TENSOR_DIR_VALID) - self._model.init_op = tf.group(tf.global_variables_initializer(), - tf.local_variables_initializer()) - self._model.saver = tf.train.Saver() - self._model.load_checkpoint() - return self._model - - def add_as_conf(self): + # Add input layer self.add_input_layer() # Add a number of hidden layers for _ in range(self._model.hidden_layers): 
self.add_layer(self._model.hidden_neurons) + # Add output layer for pretraining, if used if self._model.use_pretrained_net: self.add_output_layer(self._model.subreddit_count, secondary_output=True) + # Add output layer for users self.add_output_layer(self._model.user_count) \ .add_precision_operations() - return self + + # Initialize + self._model.train_writer = \ + tf.summary.FileWriter(self._model.logging_dir + '/' + TENSOR_DIR_TRAIN, + self._model._session.graph) + self._model.valid_writer = \ + tf.summary.FileWriter(self._model.logging_dir + '/' + TENSOR_DIR_VALID) + + self._model.init_op = tf.group(tf.global_variables_initializer(), + tf.local_variables_initializer()) + self._model.saver = tf.train.Saver() + self._model.load_checkpoint() + return self._model From fbe203f58d90302929cabdd85c2217daf938167d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20H=C3=A5kansson?= Date: Wed, 5 Apr 2017 19:24:57 +0200 Subject: [PATCH 26/60] Removes un-used constant --- model/util/csv_reader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/model/util/csv_reader.py b/model/util/csv_reader.py index 9913c68..17c8357 100644 --- a/model/util/csv_reader.py +++ b/model/util/csv_reader.py @@ -34,7 +34,6 @@ class Dataenum(Enum): TESTING = "testing_data" TRAINING = "training_data" VALIDATION = "validation_data" - PRE_TRAINING = "pre_training_data" class CsvReader: def __init__(self, netcfg): From ac40573c0b292970cccf0a5cf8336deec12e0b06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20H=C3=A5kansson?= Date: Wed, 5 Apr 2017 19:43:00 +0200 Subject: [PATCH 27/60] Removes unused epoch counting for pre-training --- model/model.py | 3 +-- model/util/data.py | 9 --------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/model/model.py b/model/model.py index 78beb16..a0f3494 100644 --- a/model/model.py +++ b/model/model.py @@ -229,8 +229,7 @@ def train(self, use_pretrained_net=False): self._session.run(self.epoch.assign_add(1)) print("Epoch complete...old ", old_epoch) 
self.save_checkpoint() - if not use_pretrained_net: - self.validate() + self.validate() old_epoch = epoch # Save model when done training diff --git a/model/util/data.py b/model/util/data.py index 3da2866..7672e32 100644 --- a/model/util/data.py +++ b/model/util/data.py @@ -37,7 +37,6 @@ def __init__(self, networkconfig): self._current_test_index = 0 self._current_pre_train_index = 0 self.completed_training_epochs = 0 - self.completed_pre_training_epochs = 0 self.percent_of_epoch = 0.0 self.subreddit_count = 0 self.title_length = networkconfig['max_title_length'] @@ -59,10 +58,6 @@ def __init__(self, networkconfig): def _read_data(self): """ Reads all the data from specified path """ - logging.debug("Reading pre-training data...") - - # self.pre_train_data, self.pre_train_labels = self.reader.get_data(Dataenum.TRAINING, label_column=1) - # self.pre_train_size = len(self.pre_train_data) logging.debug("Reading training data...") @@ -152,9 +147,6 @@ def next_pre_train_batch(self, batch_size=None): # Support multiple epochs if self._current_pre_train_index >= self.train_size: self._current_pre_train_index = 0 - self.completed_pre_training_epochs += 1 - self.percent_of_epoch = 0.0 - # TODO Detta ska inte ligga i funktionen som generar ny data # Turn sentences and labels into vector representations sentence_vec, present, absent = \ @@ -170,7 +162,6 @@ def next_pre_train_batch(self, batch_size=None): batch_x.append(sentence_vec) batch_y.append(label_vec) - self.percent_of_epoch = self._current_train_index / self.train_size return batch_x, batch_y def get_validation(self): From 5f2d82a06ab63d9deaf81505059b5b2e2fc91ccd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20H=C3=A5kansson?= Date: Wed, 5 Apr 2017 19:55:55 +0200 Subject: [PATCH 28/60] Removes redundant printing of cross entropy error This removes the printing of cross entropy for training and validation error to the terminal. This is since the errors can still be visualised using tensorboard.
It also makes the code more efficient as it doesn't have to compute the error twice. --- model/model.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/model/model.py b/model/model.py index a0f3494..7230843 100644 --- a/model/model.py +++ b/model/model.py @@ -178,6 +178,7 @@ def validate(self): epoch, get_val_summary_tensor(val_prec), get_val_summary_tensor(train_prec), \ get_val_summary_tensor(val_recall), get_val_summary_tensor(train_recall) + # Currently not used. Saving for now. Might come in handy later def validate_batch(self): """ Validates a batch of data and returns cross entropy error """ with tf.device("/cpu:0"): @@ -213,14 +214,12 @@ def train(self, use_pretrained_net=False): epoch = self.data.completed_training_epochs if not use_pretrained_net: - training_error = self.train_batch() - validation_error = self.validate_batch() + self.train_batch() - # Don't validate so often + # Don't print so often if i % (self.data.train_size // self.batch_size // 10) == 0 and i: done = self.data.percent_of_epoch - print("Validation error: {:f} | Training error: {:f} | Done: {:.0%}" - .format(validation_error, training_error, done)) + print("Epoch comletion: {:.0%}".format(done)) else: self.train_batch(True) @@ -258,10 +257,6 @@ def train_batch(self, pre_train_net=False): self.subreddit_input: batch_sub, self.target: batch_label}) - return self._session.run(self.error, - feed_dict={self.input: batch_input, - self.subreddit_input: batch_sub, - self.target: batch_label}) def close_writers(self): """ Close tensorboard writers """ self.train_writer.close() From 1606e4f74027573ca5d315905def8db871cc0e38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20H=C3=A5kansson?= Date: Wed, 5 Apr 2017 20:10:21 +0200 Subject: [PATCH 29/60] Removes bug with incorrect matrix assignment A bug occurred if both use_pretrained and pre_train_subreddit were set to false. The incorrect if-statement would try to assign a non-existent pre-trained embeddings matrix.
--- model/model.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/model/model.py b/model/model.py index 7230843..7c98bc8 100644 --- a/model/model.py +++ b/model/model.py @@ -196,9 +196,7 @@ def train(self, use_pretrained_net=False): else: print("Starting training...") - if self.use_pretrained and \ - (self.use_pretrained_net and use_pretrained_net) or \ - (not self.use_pretrained_net and not use_pretrained_net): + if self.use_pretrained: self._session.run(self.embedding_init, feed_dict={self.embedding_placeholder: self.data.embedding_matrix}) From 7e702911c6869dcb22dab881b21f3b2e13b6db2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20H=C3=A5kansson?= Date: Wed, 5 Apr 2017 20:23:30 +0200 Subject: [PATCH 30/60] Uses a uniformly random embedding matrix as default if no pre-trained available --- model/model_builder.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/model/model_builder.py b/model/model_builder.py index bf9f19d..f8ac89b 100644 --- a/model/model_builder.py +++ b/model/model_builder.py @@ -70,10 +70,10 @@ def add_input_layer(self): # Embedding matrix for the words embedding_matrix = tf.Variable( - tf.constant(0.0, - shape=[self._model.vocabulary_size, - self._model.embedding_size], - dtype=tf.float64), + tf.random_uniform( + [self._model.vocabulary_size, + self._model.embedding_size], + -1.0, 1.0, dtype=tf.float64), trainable=self._model.is_trainable_matrix, name="embedding_matrix", dtype=tf.float64) From 961b30c782181ae36568853da37cbdc4b8c798e5 Mon Sep 17 00:00:00 2001 From: Jesper Jaxing Date: Thu, 6 Apr 2017 17:20:49 +0200 Subject: [PATCH 31/60] changed subreddit input to be a one hot vector --- config.template.yaml | 1 + definitions.py | 1 + model/model.py | 1 + model/model_builder.py | 24 +++++++++++++++++++++--- model/util/data.py | 15 +++++++++------ model/util/writer.py | 2 +- 6 files changed, 34 insertions(+), 10 deletions(-) diff --git a/config.template.yaml b/config.template.yaml index 
d70c202..9b6e1dd 100644 --- a/config.template.yaml +++ b/config.template.yaml @@ -24,6 +24,7 @@ network: rnn_unit: 'lstm' # Can be 'gru' or 'lstm', default: 'lstm' hidden_layers: 0 hidden_neurons: 300 + subreddit_input_neurons: 10 #Probebly not the best default value use_concat_input: false pre_train_subreddit: false # Regularisation configs: diff --git a/definitions.py b/definitions.py index 6ff5dac..0fb1b88 100644 --- a/definitions.py +++ b/definitions.py @@ -44,6 +44,7 @@ RNN_UNIT = "rnn_unit" HIDDEN_NEURONS = 'hidden_neurons' HIDDEN_LAYERS = 'hidden_layers' +SUB_INPUT_NEURONS = 'subreddit_input_neurons' USE_CONCAT_INPUT = 'use_concat_input' BATCH_SIZE = 'batch_size' TRAINING_EPOCHS = 'training_epochs' diff --git a/model/model.py b/model/model.py index 7c98bc8..412f6b5 100644 --- a/model/model.py +++ b/model/model.py @@ -63,6 +63,7 @@ def __init__(self, config, session): self.dropout_prob = config[DROPOUT_PROB] # Only used for train op self.hidden_layers = config[HIDDEN_LAYERS] self.hidden_neurons = config[HIDDEN_NEURONS] + self.subreddit_input_neurons = config[SUB_INPUT_NEURONS] self.is_trainable_matrix = config[TRAINABLE_MATRIX] self.use_pretrained = config[USE_PRETRAINED] self.use_constant_limit = config[USE_CONSTANT_LIMIT] diff --git a/model/model_builder.py b/model/model_builder.py index f8ac89b..0c8eb7a 100644 --- a/model/model_builder.py +++ b/model/model_builder.py @@ -47,7 +47,7 @@ def add_input_layer(self): name="input") self._model.subreddit_input = \ tf.placeholder(tf.float64, - [None, 1], + [None, self._model.subreddit_count], name="subreddit_input") self._model.target = \ tf.placeholder(tf.float64, @@ -94,7 +94,23 @@ def add_input_layer(self): output = outputs[-1] if self._model.use_concat_input: # Add subreddit to end of input - output = tf.concat([output, self._model.subreddit_input], 1) + subreddit_weights = tf.Variable(tf.random_normal( + [self._model.subreddit_count, + self._model.subreddit_input_neurons], + stddev=0.35, + dtype=tf.float64), + 
name="sub_input_weights") + + subreddit_bias = tf.Variable(tf.random_normal( + [self._model.subreddit_input_neurons], + stddev=0.35, + dtype=tf.float64), + name="sub_input_bias") + + logit_subreddit = tf.add( + tf.matmul(self._model.subreddit_input, subreddit_weights), + subreddit_bias) + output = tf.concat([output, logit_subreddit], 1) self._model.latest_layer = output @@ -107,7 +123,9 @@ def add_layer(self, number_of_neurons): self.added_layers = True weights = tf.Variable(tf.random_normal( [self._model.rnn_neurons + - (1 if self._model.use_concat_input else 0), + (self._model.subreddit_input_neurons + if self._model.use_concat_input + else 0), number_of_neurons], stddev=0.35, dtype=tf.float64), diff --git a/model/util/data.py b/model/util/data.py index 7672e32..49d047a 100644 --- a/model/util/data.py +++ b/model/util/data.py @@ -123,8 +123,9 @@ def next_train_batch(self, batch_size=None): self.train_present += present self.train_absent += absent - subreddit_vec = helper.subreddit_index(subreddit, - self.subreddit_dict) + subreddit_vec = helper.label_vector(subreddit, + self.subreddit_dict, + self.subreddit_count) label_vec = helper.label_vector(label.split(), self.users_dict, self.user_count) @@ -198,8 +199,9 @@ def next_valid_batch(self, batch_size=None): self.valid_present += pres self.valid_absent += absent - subreddit_vec = helper.subreddit_index(subreddit, - self.subreddit_dict) + subreddit_vec = helper.label_vector(subreddit, + self.subreddit_dict, + self.subreddit_count) label_vec = helper.label_vector(label.split(), self.users_dict, self.user_count) @@ -238,8 +240,9 @@ def next_test_batch(self, batch_size=None): self.word_dict, self.max_title_length) - subreddit_vec = helper.subreddit_index(subreddit, - self.subreddit_dict) + subreddit_vec = helper.label_vector(subreddit, + self.subreddit_dict, + self.subreddit_count) label_vec = helper.label_vector(label.split(), self.users_dict, self.user_count) diff --git a/model/util/writer.py b/model/util/writer.py 
index 9b452ae..7d5bb81 100644 --- a/model/util/writer.py +++ b/model/util/writer.py @@ -34,7 +34,7 @@ def log_samefile(config, f1_score_valid, f1_score_train, epoch_top, prec_valid, raise FileNotFoundError('Can not write because no directory is created') config_headers = [NET_TYPE, NET_NAME, VOC_SIZE, USER_COUNT, LEARN_RATE, EMBEDD_SIZE, MAX_TITLE_LENGTH, RNN_NEURONS, RNN_UNIT, - HIDDEN_NEURONS, HIDDEN_LAYERS, USE_CONCAT_INPUT, BATCH_SIZE, TRAINING_EPOCHS, USE_L2_LOSS, L2_FACTOR, USE_DROPOUT, + HIDDEN_NEURONS, HIDDEN_LAYERS, SUB_INPUT_NEURONS, USE_CONCAT_INPUT, BATCH_SIZE, TRAINING_EPOCHS, USE_L2_LOSS, L2_FACTOR, USE_DROPOUT, DROPOUT_PROB, USE_CONSTANT_LIMIT, CONSTANT_PREDICTION_LIMIT, TRAINABLE_MATRIX, PRE_TRAINED_MATRIX, USE_PRETRAINED, VALIDATION_DATA, TRAINING_DATA, TESTING_DATA] From 7a2e256b78bc2d7ddb1bb2e7c6932a737ab32b70 Mon Sep 17 00:00:00 2001 From: Jesper Jaxing Date: Thu, 6 Apr 2017 17:24:27 +0200 Subject: [PATCH 32/60] removed bias from l2_reg --- model/model_builder.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/model/model_builder.py b/model/model_builder.py index 0c8eb7a..c05ae8b 100644 --- a/model/model_builder.py +++ b/model/model_builder.py @@ -151,9 +151,7 @@ def add_layer(self, number_of_neurons): logits, name="hidden_layer-" + str(self.number_of_layers)) if self._model.use_l2_loss: - self._model.l2_term = tf.add( - tf.add(self._model.l2_term, tf.nn.l2_loss(weights)), - tf.nn.l2_loss(bias)) + self._model.l2_term = tf.add(self._model.l2_term, tf.nn.l2_loss(weights)) if self._model.use_dropout: self._model.latest_layer = \ tf.nn.dropout(self._model.latest_layer, From 761905146dda51de28eb8df63d8d707174d9a074 Mon Sep 17 00:00:00 2001 From: Jesper Jaxing Date: Thu, 6 Apr 2017 17:30:05 +0200 Subject: [PATCH 33/60] main handles exceptions --- main.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/main.py b/main.py index 030cb35..b460f71 100644 --- a/main.py +++ b/main.py @@ -36,16
+36,20 @@ def main(): args = parser.parse_args() for conf in args.configs: - config_file = networkconfig[conf] - with tf.Session() as sess: - builder = ModelBuilder(config_file, sess) + try: + print("Starting config ", conf) + config_file = networkconfig[conf] + with tf.Session() as sess: + builder = ModelBuilder(config_file, sess) - network_model = builder.build() - if config_file[USE_PRETRAINED_NET]: - network_model.train(USE_PRETRAINED_NET) - network_model.train() - network_model.close_writers() - tf.reset_default_graph() + network_model = builder.build() + if config_file[USE_PRETRAINED_NET]: + network_model.train(USE_PRETRAINED_NET) + network_model.train() + network_model.close_writers() + tf.reset_default_graph() + except: + print("Config ", config_file["name"], "failed to complete") if __name__ == "__main__": main() From fc8a0e2c24b12c1e517fc2a5795a54065a1d718d Mon Sep 17 00:00:00 2001 From: Jesper Jaxing Date: Thu, 6 Apr 2017 17:53:41 +0200 Subject: [PATCH 34/60] chande to actual variable --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index b460f71..2d45dbd 100644 --- a/main.py +++ b/main.py @@ -49,7 +49,7 @@ def main(): network_model.close_writers() tf.reset_default_graph() except: - print("Config ", config_file["name"], "failed to complete") + print("Config ", networkconfig[conf]["name"], "failed to complete") if __name__ == "__main__": main() From ee1a015ce1090c963d81ac810c69b824facc5400 Mon Sep 17 00:00:00 2001 From: Maxim Goretskyy <13maxim13@live.se> Date: Tue, 11 Apr 2017 16:44:58 +0200 Subject: [PATCH 35/60] Fixes bug when we would log twice for pretraining and actual training --- model/model.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/model/model.py b/model/model.py index 412f6b5..7581d1f 100644 --- a/model/model.py +++ b/model/model.py @@ -223,7 +223,7 @@ def train(self, use_pretrained_net=False): self.train_batch(True) # Do a full evaluation once an epoch is 
complete - if epoch != old_epoch: + if epoch != old_epoch and not use_pretrained_net: self._session.run(self.epoch.assign_add(1)) print("Epoch complete...old ", old_epoch) self.save_checkpoint() @@ -232,9 +232,10 @@ def train(self, use_pretrained_net=False): # Save model when done training self.save_checkpoint() - log_samefile(config=self.config, f1_score_valid=self.f1_score_valid, f1_score_train=self.f1_score_train, - epoch_top=self.epoch_top, prec_valid=self.prec_valid, prec_train=self.prec_train, - recall_valid=self.recall_valid, recall_train=self.recall_train) + if not use_pretrained_net: + log_samefile(config=self.config, f1_score_valid=self.f1_score_valid, f1_score_train=self.f1_score_train, + epoch_top=self.epoch_top, prec_valid=self.prec_valid, prec_train=self.prec_train, + recall_valid=self.recall_valid, recall_train=self.recall_train) def train_batch(self, pre_train_net=False): """ Trains for one batch and returns cross entropy error """ From 3358a0e8165ed902004c78f0955c29f199272ef1 Mon Sep 17 00:00:00 2001 From: Maxim Goretskyy <13maxim13@live.se> Date: Wed, 12 Apr 2017 11:25:02 +0200 Subject: [PATCH 36/60] Fixes bug when we couldnt use concat-input AND pretraining for a model --- model/model.py | 9 +++++++-- model/util/data.py | 14 +++++++++++--- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/model/model.py b/model/model.py index 7581d1f..1f7f313 100644 --- a/model/model.py +++ b/model/model.py @@ -244,10 +244,15 @@ def train_batch(self, pre_train_net=False): batch_input, batch_sub, batch_label = \ self.data.next_train_batch() else: - batch_input, batch_label = \ + batch_input, batch_sub, batch_label = \ self.data.next_pre_train_batch() - if pre_train_net: + if pre_train_net and self.use_concat_input: + self._session.run(self.pre_train_op, + {self.input: batch_input, + self.subreddit_input: batch_sub, + self.sec_target: batch_label}) + elif pre_train_net: self._session.run(self.pre_train_op, {self.input: batch_input, self.sec_target: 
batch_label}) diff --git a/model/util/data.py b/model/util/data.py index 49d047a..b2c23ad 100644 --- a/model/util/data.py +++ b/model/util/data.py @@ -141,9 +141,12 @@ def next_pre_train_batch(self, batch_size=None): batch_size = batch_size or self.batch_size batch_x = [] batch_y = [] + batch_sub = [] + for _ in range(0, batch_size): - sentence = self.train_data[self._current_train_index] - label = self.train_labels[self._current_train_index] + sentence = self.train_data[self._current_pre_train_index] + subreddit = self.train_subreddits[self._current_pre_train_index] + label = self.train_labels[self._current_pre_train_index] self._current_pre_train_index += 1 # Support multiple epochs if self._current_pre_train_index >= self.train_size: @@ -160,10 +163,15 @@ def next_pre_train_batch(self, batch_size=None): label_vec = helper.label_vector(label, self.subreddit_dict, self.subreddit_count) + subreddit_vec = helper.label_vector(subreddit, + self.subreddit_dict, + self.subreddit_count) batch_x.append(sentence_vec) batch_y.append(label_vec) + batch_sub.append(subreddit_vec) - return batch_x, batch_y + + return batch_x, batch_sub, batch_y def get_validation(self): """ Get the whole validation set in a vectorized form """ From 515489ec9be8cac1cf33e08059c8ac1e1b4144da Mon Sep 17 00:00:00 2001 From: Jesper Jaxing Date: Wed, 12 Apr 2017 13:07:10 +0200 Subject: [PATCH 37/60] only make one call to label_vector --- model/util/data.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/model/util/data.py b/model/util/data.py index b2c23ad..14d755c 100644 --- a/model/util/data.py +++ b/model/util/data.py @@ -160,14 +160,11 @@ def next_pre_train_batch(self, batch_size=None): self.train_present += present self.train_absent += absent - label_vec = helper.label_vector(label, - self.subreddit_dict, - self.subreddit_count) subreddit_vec = helper.label_vector(subreddit, self.subreddit_dict, self.subreddit_count) batch_x.append(sentence_vec) - batch_y.append(label_vec) + 
batch_y.append(subreddit_vec) batch_sub.append(subreddit_vec) From b856b66b9c2e1482112bf388d086fe416a3d1593 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonatan=20Alm=C3=A9n?= Date: Thu, 13 Apr 2017 14:10:59 +0200 Subject: [PATCH 38/60] Adds docker support adds readme on how to run docker. --- Dockerfile | 5 +++++ README.md | 5 +++++ 2 files changed, 10 insertions(+) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..ae616e7 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,5 @@ +from gcr.io/tensorflow/tensorflow:latest-gpu-py3 +WORKDIR /app +COPY ./project /app +RUN pip install -r requirements.txt + diff --git a/README.md b/README.md index b10bfa0..ef22a06 100644 --- a/README.md +++ b/README.md @@ -26,3 +26,8 @@ For more details, take a look at the [dataset repository](https://github.com/kan ## Configuration To edit configs, take a look at the `config.yaml` file. Please prefer making new configs instead of editing old (for academic purposes). If implementing a new model, make sure to add support for it in the `main.py` file so its configs can be automatically parsed. +## Build/Run with Docker + +Build with ```docker build -t YOURTAG .``` + +Run with ```nvidia-docker run [-v YOURLOGDIR:/app/logs] -t -rm YOURTAG python -u ./YOURENTRYPOINT.py From 8319891cf9043120a9e2587ed7152cc4e060390d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20H=C3=A5kansson?= Date: Thu, 13 Apr 2017 14:30:13 +0200 Subject: [PATCH 39/60] Automatically run all configs if none is specified --- main.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/main.py b/main.py index 2d45dbd..52d0f8d 100644 --- a/main.py +++ b/main.py @@ -21,6 +21,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
# ============================================================================== +import sys import argparse import tensorflow as tf from definitions import * @@ -31,11 +32,10 @@ def main(): """ A main method that creates the model and starts training it """ # Parse arguments parser = argparse.ArgumentParser(add_help=True) - parser.add_argument('configs', metavar='C', type=int, nargs='+', + parser.add_argument('configs', metavar='C', type=int, nargs='*', help='Config number to use (can be multiple)') args = parser.parse_args() - - for conf in args.configs: + for conf in args.configs if args.configs else range(len(networkconfig)): try: print("Starting config ", conf) config_file = networkconfig[conf] @@ -48,8 +48,9 @@ def main(): network_model.train() network_model.close_writers() tf.reset_default_graph() - except: - print("Config ", networkconfig[conf]["name"], "failed to complete") + except Exception as e: + print("Config ", networkconfig[conf]["name"], "failed to complete", file=sys.stderr) + print(e, file=sys.stderr) if __name__ == "__main__": main() From c1726bb550446d5161eff7d693385743a36852ab Mon Sep 17 00:00:00 2001 From: axel olivecrona Date: Thu, 6 Apr 2017 13:16:49 +0200 Subject: [PATCH 40/60] still need to fill in all ranges --- template.py | 154 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 154 insertions(+) create mode 100644 template.py diff --git a/template.py b/template.py new file mode 100644 index 0000000..6df40db --- /dev/null +++ b/template.py @@ -0,0 +1,154 @@ +template = """ -type: {0} + name: {1} + # Data related configs: + vocabulary_size: {2} + user_count: {3} + max_title_length: {4} + validation_data: {5} + training_data: {6} + testing_data: {7} + # Embedding matrix configs: + embedding_size: {8} # Make sure to match pretrained matrix dimensions + trainable_matrix: {9} + use_pretrained: {10} + pre_trained_matrix: {11} + # Learning configs: + learning_rate: {12} + training_epochs: {13} + batch_size: {14} + 
lstm_neurons: {15} + hidden_layers: {16} + hidden_neurons: {17} + use_concat_input: {18} + # Regularisation configs: + use_l2_loss: {19} + l2_factor: {20} + use_dropout: {21} + dropout_prob: {22} + # Validation configs: + use_constant_limit: {23} + constant_prediction_limit: {24}""" + +def rtype(): + return "model-builder" + +def name(): + name = "network" + x = 0 + while True: + yield name + str(x) + x+=1 + +def vocabulary_size(): + pass + +def user_count(): + return input("Enter number of users > ") + +def max_title_length(): + pass + +def validation_data(): + pass + +def training_data(): + pass + +def testing_data(): + pass + +def embedding_size(): + pass + +def trainable_matrix(): + pass + +def use_pretrained(): + pass + +def pre_trained_matrix(): + pass + +def learning_rate(): + pass + +def training_epochs(): + pass + +def batch_size(): + pass + +def lstm_neurons(): + pass + +def hidden_layers(): + pass + +def hidden_neurons(): + pass + +def use_concat_input(): + pass + +def use_l2_loss(): + pass + +def l2_factor(): + pass + +def use_dropout(): + pass + +def dropout_prob(): + pass + +def use_constant_limit(): + pass + +def constant_prediction_limit(): + pass + + +name_generator = name() +users = user_count() + +def get_random_config(): + config = template.format(rtype(), + name_generator.__next__(), + vocabulary_size(), + users, + max_title_length(), + validation_data(), + training_data(), + testing_data(), + embedding_size(), + trainable_matrix(), + use_pretrained(), + pre_trained_matrix(), + learning_rate(), + training_epochs(), + batch_size(), + lstm_neurons(), + hidden_layers(), + hidden_neurons(), + use_concat_input(), + use_l2_loss(), + l2_factor(), + use_dropout(), + dropout_prob(), + use_constant_limit(), + constant_prediction_limit()) + return config + +nbr_of_configs = int(input("Enter number of configs to generate > ")) +configs = "\n".join( (get_random_config() for x in range(nbr_of_configs)) ) + +header = """data: + path: 'resources/datasets/' + 
+network:\n""" + +f = open("config.yaml","w") +f.write(header + configs) +f.close() + From fcd3bb86074ef6acee7ed3b00e1f4b534e45d916 Mon Sep 17 00:00:00 2001 From: axel olivecrona Date: Sun, 9 Apr 2017 16:09:56 +0200 Subject: [PATCH 41/60] all but the datasets --- template.py | 78 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 48 insertions(+), 30 deletions(-) diff --git a/template.py b/template.py index 6df40db..3c7e426 100644 --- a/template.py +++ b/template.py @@ -1,3 +1,5 @@ +from random import choice + template = """ -type: {0} name: {1} # Data related configs: @@ -40,91 +42,107 @@ def name(): x+=1 def vocabulary_size(): - pass + possibilites = ["10000","12000","15000","17000","19000"] + return choice(possibilites) def user_count(): - return input("Enter number of users > ") + pass def max_title_length(): - pass + possibilites = ["20","25","30","35","40"] + return choice(possibilites) -def validation_data(): +def validation_data(nbr_of_users): pass -def training_data(): +def training_data(nbr_of_users): pass -def testing_data(): +def testing_data(nbr_of_users): pass def embedding_size(): - pass + return choice("100","150","300") def trainable_matrix(): - pass + return choice(["true","false"]) def use_pretrained(): - pass + return choice(["true","false"]) -def pre_trained_matrix(): - pass +def pre_trained_matrix(dim): + if dim == "100": + return "vectors.txt" + elif dim == "150": + return "vectors150d.txt" + elif dim == "300": + return "vectors300d.txt" def learning_rate(): - pass + possibilites = ["0.15","0.2","0.3","0.35","0.4","0.5"] + return choice(possibilites) def training_epochs(): - pass + possibilites = ["5","6","7","8","9"] + return choice(possibilites) def batch_size(): - pass + possibilites = ["20","25","30","35","40"] + return choice(possibilites) def lstm_neurons(): - pass + possibilites = ["100","150","175","200","250","300"] + return choice(possibilites) def hidden_layers(): - pass + return choice(["1","2","3","4","5"]) def 
hidden_neurons(): - pass + possibilites = ["150","200","250","300","350"] + return choice(possibilites) def use_concat_input(): - pass + return choice(["true","false"]) def use_l2_loss(): - pass + return choice(["true","false"]) def l2_factor(): - pass + possibilites = [] + return choice(possibilites) def use_dropout(): - pass + return choice(["true","false"]) def dropout_prob(): - pass + possibilites = ["0.65","0.70","0.75","0.80"] + return choice(possibilites) def use_constant_limit(): - pass + return choice(["true","false"]) def constant_prediction_limit(): - pass + possibilites = ["0.25","0.3","0.35","0.40"] + return choice(possibilites) name_generator = name() -users = user_count() def get_random_config(): + embed_size = embedding_size() + nbr_of_users = user_count() config = template.format(rtype(), name_generator.__next__(), vocabulary_size(), - users, + nbr_of_users, max_title_length(), - validation_data(), - training_data(), - testing_data(), + validation_data(nbr_of_users), + training_data(nbr_of_users), + testing_data(nbr_of_users), embedding_size(), trainable_matrix(), use_pretrained(), - pre_trained_matrix(), + pre_trained_matrix(embed_size), learning_rate(), training_epochs(), batch_size(), From 112a5d11a6e52c36cdd8dbfbdcd38d21a413a389 Mon Sep 17 00:00:00 2001 From: axel olivecrona Date: Mon, 10 Apr 2017 14:34:11 +0200 Subject: [PATCH 42/60] added dataset --- template.py | 62 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 20 deletions(-) diff --git a/template.py b/template.py index 3c7e426..860a8a5 100644 --- a/template.py +++ b/template.py @@ -6,14 +6,14 @@ vocabulary_size: {2} user_count: {3} max_title_length: {4} - validation_data: {5} - training_data: {6} - testing_data: {7} + validation_data: '{5}' + training_data: '{6}' + testing_data: '{7}' # Embedding matrix configs: embedding_size: {8} # Make sure to match pretrained matrix dimensions trainable_matrix: {9} use_pretrained: {10} - pre_trained_matrix: 
{11} + pre_trained_matrix: '{11}' # Learning configs: learning_rate: {12} training_epochs: {13} @@ -46,23 +46,45 @@ def vocabulary_size(): return choice(possibilites) def user_count(): - pass + return choice(["50","5"]) def max_title_length(): possibilites = ["20","25","30","35","40"] return choice(possibilites) -def validation_data(nbr_of_users): - pass - -def training_data(nbr_of_users): - pass - -def testing_data(nbr_of_users): - pass +def data_set(nbr_of_users): + possibilites_5_users = [("validation_data_top_5_subreddit_allvotes.csv" + ,"training_data_top_5_subreddit_allvotes.csv" + ,"testing_data_top_5_subreddit_allvotes.csv"), + ("validation_data_top_5_subreddit_allvotes_binarycontent.csv", + "training_data_top_5_subreddit_allvotes_binarycontent.csv", + "testing_data_top_5_subreddit_allvotes_binarycontent.csv"), + ("validation_data_top_n.csv", + "training_data_top_n.csv", + "testing_data_top_n.csv") + ] + possibilites_50_users = [("validation_data_top_50_subreddit_allvotes.csv" + ,"training_data_top_50_subreddit_allvotes.csv" + ,"testing_data_top_50_subreddit_allvotes.csv"), + ("validation_data_top_50_subreddit_allvotes_binarycontent.csv", + "training_data_top_50_subreddit_allvotes_binarycontent.csv", + "testing_data_top_50_subreddit_allvotes_binarycontent.csv"), + ("validation_data_top_50_subreddit.csv", + "training_data_top_50_subreddit.csv", + "testing_data_top_50_subreddit.csv") + ] + + val = None + train = None + test = None + if nbr_of_users == "5": + val,train,test = choice(possibilites_5_users) + elif nbr_of_users == "50": + val,train,test = choice(possibilites_50_users) + return val,train,test def embedding_size(): - return choice("100","150","300") + return choice(["100","150","300"]) def trainable_matrix(): return choice(["true","false"]) @@ -83,8 +105,7 @@ def learning_rate(): return choice(possibilites) def training_epochs(): - possibilites = ["5","6","7","8","9"] - return choice(possibilites) + return choice(["50"]) def batch_size(): 
possibilites = ["20","25","30","35","40"] @@ -108,7 +129,7 @@ def use_l2_loss(): return choice(["true","false"]) def l2_factor(): - possibilites = [] + possibilites = ["0.01"] return choice(possibilites) def use_dropout(): @@ -131,14 +152,15 @@ def constant_prediction_limit(): def get_random_config(): embed_size = embedding_size() nbr_of_users = user_count() + val,train,test = data_set(nbr_of_users) config = template.format(rtype(), name_generator.__next__(), vocabulary_size(), nbr_of_users, max_title_length(), - validation_data(nbr_of_users), - training_data(nbr_of_users), - testing_data(nbr_of_users), + val, + train, + test, embedding_size(), trainable_matrix(), use_pretrained(), From f600402d01feb8051529f6968851018d70aecbe3 Mon Sep 17 00:00:00 2001 From: Maxim Goretskyy <13maxim13@live.se> Date: Mon, 10 Apr 2017 16:10:16 +0200 Subject: [PATCH 43/60] Removes wrong files and adds two more learning rates --- template.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/template.py b/template.py index 860a8a5..1f4ef45 100644 --- a/template.py +++ b/template.py @@ -56,9 +56,6 @@ def data_set(nbr_of_users): possibilites_5_users = [("validation_data_top_5_subreddit_allvotes.csv" ,"training_data_top_5_subreddit_allvotes.csv" ,"testing_data_top_5_subreddit_allvotes.csv"), - ("validation_data_top_5_subreddit_allvotes_binarycontent.csv", - "training_data_top_5_subreddit_allvotes_binarycontent.csv", - "testing_data_top_5_subreddit_allvotes_binarycontent.csv"), ("validation_data_top_n.csv", "training_data_top_n.csv", "testing_data_top_n.csv") @@ -66,9 +63,6 @@ def data_set(nbr_of_users): possibilites_50_users = [("validation_data_top_50_subreddit_allvotes.csv" ,"training_data_top_50_subreddit_allvotes.csv" ,"testing_data_top_50_subreddit_allvotes.csv"), - ("validation_data_top_50_subreddit_allvotes_binarycontent.csv", - "training_data_top_50_subreddit_allvotes_binarycontent.csv", - "testing_data_top_50_subreddit_allvotes_binarycontent.csv"), 
("validation_data_top_50_subreddit.csv", "training_data_top_50_subreddit.csv", "testing_data_top_50_subreddit.csv") @@ -101,7 +95,7 @@ def pre_trained_matrix(dim): return "vectors300d.txt" def learning_rate(): - possibilites = ["0.15","0.2","0.3","0.35","0.4","0.5"] + possibilites = ["0.05", "0.1", "0.15","0.2","0.3","0.35","0.4","0.5"] return choice(possibilites) def training_epochs(): From a8f23e608f9710c1c36351e9071e9a1351cce8f3 Mon Sep 17 00:00:00 2001 From: axel olivecrona Date: Mon, 10 Apr 2017 16:54:06 +0200 Subject: [PATCH 44/60] fixed bugg --- template.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/template.py b/template.py index 1f4ef45..8f2e13d 100644 --- a/template.py +++ b/template.py @@ -155,7 +155,7 @@ def get_random_config(): val, train, test, - embedding_size(), + embed_size, trainable_matrix(), use_pretrained(), pre_trained_matrix(embed_size), From 97997d24e4208079200098f08dc3b6e159ef6a0b Mon Sep 17 00:00:00 2001 From: hsson Date: Mon, 10 Apr 2017 17:12:14 +0200 Subject: [PATCH 45/60] Changes user count to take UNK user into account --- template.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/template.py b/template.py index 8f2e13d..9a2d5b0 100644 --- a/template.py +++ b/template.py @@ -46,7 +46,7 @@ def vocabulary_size(): return choice(possibilites) def user_count(): - return choice(["50","5"]) + return choice(["51","6"]) def max_title_length(): possibilites = ["20","25","30","35","40"] @@ -71,9 +71,9 @@ def data_set(nbr_of_users): val = None train = None test = None - if nbr_of_users == "5": + if nbr_of_users == "6": val,train,test = choice(possibilites_5_users) - elif nbr_of_users == "50": + elif nbr_of_users == "51": val,train,test = choice(possibilites_50_users) return val,train,test From 762e8f04071acb6b07fe97bf76a1c0583d916a26 Mon Sep 17 00:00:00 2001 From: hsson Date: Mon, 10 Apr 2017 17:15:09 +0200 Subject: [PATCH 46/60] Removes dataset that doesn't exist --- template.py | 5 +---- 1 file 
changed, 1 insertion(+), 4 deletions(-) diff --git a/template.py b/template.py index 9a2d5b0..8c44490 100644 --- a/template.py +++ b/template.py @@ -55,10 +55,7 @@ def max_title_length(): def data_set(nbr_of_users): possibilites_5_users = [("validation_data_top_5_subreddit_allvotes.csv" ,"training_data_top_5_subreddit_allvotes.csv" - ,"testing_data_top_5_subreddit_allvotes.csv"), - ("validation_data_top_n.csv", - "training_data_top_n.csv", - "testing_data_top_n.csv") + ,"testing_data_top_5_subreddit_allvotes.csv") ] possibilites_50_users = [("validation_data_top_50_subreddit_allvotes.csv" ,"training_data_top_50_subreddit_allvotes.csv" From d01a68393ca538b403a4da3100dd8f0660bee066 Mon Sep 17 00:00:00 2001 From: hsson Date: Mon, 10 Apr 2017 17:18:11 +0200 Subject: [PATCH 47/60] Increases interval between hidden neuron sizes --- template.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/template.py b/template.py index 8c44490..bccb909 100644 --- a/template.py +++ b/template.py @@ -110,7 +110,7 @@ def hidden_layers(): return choice(["1","2","3","4","5"]) def hidden_neurons(): - possibilites = ["150","200","250","300","350"] + possibilites = ["150","300","450","600","750"] return choice(possibilites) def use_concat_input(): From c2bc3efd2197ad76c8cef3a2f3b5daaf456759d3 Mon Sep 17 00:00:00 2001 From: hsson Date: Mon, 10 Apr 2017 17:20:06 +0200 Subject: [PATCH 48/60] Increases prediction limit intervals --- template.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/template.py b/template.py index bccb909..f9b0a0a 100644 --- a/template.py +++ b/template.py @@ -134,7 +134,7 @@ def use_constant_limit(): return choice(["true","false"]) def constant_prediction_limit(): - possibilites = ["0.25","0.3","0.35","0.40"] + possibilites = ["0.2","0.3","0.4","0.5", "0.6", "0.7", "0.8"] return choice(possibilites) From 4bed82e8b84190542fee66a11714a3a08b58ecb2 Mon Sep 17 00:00:00 2001 From: hsson Date: Mon, 10 Apr 2017 17:21:37 +0200 Subject: [PATCH 
49/60] Tries more common batch sizes --- template.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/template.py b/template.py index f9b0a0a..47cba25 100644 --- a/template.py +++ b/template.py @@ -99,7 +99,7 @@ def training_epochs(): return choice(["50"]) def batch_size(): - possibilites = ["20","25","30","35","40"] + possibilites = ["25", "50", "100"] return choice(possibilites) def lstm_neurons(): From 8883e782e136e86be5446a2fb6fe7d84431b2555 Mon Sep 17 00:00:00 2001 From: hsson Date: Mon, 10 Apr 2017 17:22:39 +0200 Subject: [PATCH 50/60] USe consistent step between lstm neurons --- template.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/template.py b/template.py index 47cba25..9b1f3ab 100644 --- a/template.py +++ b/template.py @@ -103,7 +103,7 @@ def batch_size(): return choice(possibilites) def lstm_neurons(): - possibilites = ["100","150","175","200","250","300"] + possibilites = ["100","150","200","250","300"] return choice(possibilites) def hidden_layers(): From 16acae6a5abfe02486e4bac75f296fc9bbc90790 Mon Sep 17 00:00:00 2001 From: hsson Date: Mon, 10 Apr 2017 17:31:09 +0200 Subject: [PATCH 51/60] Uses dropout probabilities from original dropout paper --- template.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/template.py b/template.py index 9b1f3ab..31dd4f9 100644 --- a/template.py +++ b/template.py @@ -127,7 +127,7 @@ def use_dropout(): return choice(["true","false"]) def dropout_prob(): - possibilites = ["0.65","0.70","0.75","0.80"] + possibilites = ["0.5", "0.75", "0.9"] return choice(possibilites) def use_constant_limit(): From 51bde47745ee83ef6f901841175f832f66a9de6b Mon Sep 17 00:00:00 2001 From: hsson Date: Mon, 10 Apr 2017 17:45:42 +0200 Subject: [PATCH 52/60] Uses new hyperparameters from dev --- template.py | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/template.py b/template.py index 31dd4f9..2c1bd29 100644 --- a/template.py +++ 
b/template.py @@ -18,18 +18,21 @@ learning_rate: {12} training_epochs: {13} batch_size: {14} - lstm_neurons: {15} - hidden_layers: {16} - hidden_neurons: {17} - use_concat_input: {18} + rnn_neurons: {15} + rnn_unit: '{16}' # Can be 'gru' or 'lstm', default: 'lstm' + hidden_layers: {17} + hidden_neurons: {18} + subreddit_input_neurons: {19} + use_concat_input: {20} + pre_train_subreddit: {21} # Regularisation configs: - use_l2_loss: {19} - l2_factor: {20} - use_dropout: {21} - dropout_prob: {22} + use_l2_loss: {22} + l2_factor: {23} + use_dropout: {24} + dropout_prob: {25} # Validation configs: - use_constant_limit: {23} - constant_prediction_limit: {24}""" + use_constant_limit: {26} + constant_prediction_limit: {27}""" def rtype(): return "model-builder" @@ -106,6 +109,15 @@ def lstm_neurons(): possibilites = ["100","150","200","250","300"] return choice(possibilites) +def rnn_unit(): + return choice(["gru","lstm"]) + +def subreddit_input_neurons(): + return choice(["50", "100", "200"]) + +def use_pretrain_subreddit(): + return choice(["true", "false"]) + def hidden_layers(): return choice(["1","2","3","4","5"]) @@ -160,9 +172,12 @@ def get_random_config(): training_epochs(), batch_size(), lstm_neurons(), + rnn_unit(), hidden_layers(), hidden_neurons(), + subreddit_input_neurons(), use_concat_input(), + use_pretrain_subreddit(), use_l2_loss(), l2_factor(), use_dropout(), From 72fdb3bbb53e217a09f298c6d0c0c2ce8dc60b23 Mon Sep 17 00:00:00 2001 From: hsson Date: Mon, 10 Apr 2017 17:57:45 +0200 Subject: [PATCH 53/60] Adds more possible l2 factors --- template.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/template.py b/template.py index 2c1bd29..69c846a 100644 --- a/template.py +++ b/template.py @@ -132,7 +132,7 @@ def use_l2_loss(): return choice(["true","false"]) def l2_factor(): - possibilites = ["0.01"] + possibilites = ["0.01", "0.05", "0.1"] return choice(possibilites) def use_dropout(): From 9e6855724a52a8c4c5e00c920cc6d615e64b507a Mon Sep 
17 00:00:00 2001 From: Maxim Goretskyy <13maxim13@live.se> Date: Wed, 12 Apr 2017 10:27:16 +0200 Subject: [PATCH 54/60] Adds 0 for hidden layer, adds 400 for lstm_neurons and removes 100 from lstm_neurons --- template.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/template.py b/template.py index 69c846a..09e9a16 100644 --- a/template.py +++ b/template.py @@ -106,7 +106,7 @@ def batch_size(): return choice(possibilites) def lstm_neurons(): - possibilites = ["100","150","200","250","300"] + possibilites = ["150","200","250","300","400"] return choice(possibilites) def rnn_unit(): @@ -119,7 +119,7 @@ def use_pretrain_subreddit(): return choice(["true", "false"]) def hidden_layers(): - return choice(["1","2","3","4","5"]) + return choice(["0","1","2","3","4","5"]) def hidden_neurons(): possibilites = ["150","300","450","600","750"] From 50c14aa05dd5afc27213185545b720007ec210fa Mon Sep 17 00:00:00 2001 From: Maxim Goretskyy <13maxim13@live.se> Date: Wed, 12 Apr 2017 10:27:59 +0200 Subject: [PATCH 55/60] Fixes indentation --- template.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/template.py b/template.py index 09e9a16..813bfe7 100644 --- a/template.py +++ b/template.py @@ -1,6 +1,6 @@ from random import choice -template = """ -type: {0} +template = """ - type: {0} name: {1} # Data related configs: vocabulary_size: {2} From 45d63702d32da52b53bd4f49fe85c327cd909891 Mon Sep 17 00:00:00 2001 From: Jesper Jaxing Date: Thu, 13 Apr 2017 14:07:20 +0200 Subject: [PATCH 56/60] network name is guaranteed unique --- template.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/template.py b/template.py index 813bfe7..125beda 100644 --- a/template.py +++ b/template.py @@ -1,4 +1,5 @@ from random import choice +from datetime import datetime template = """ - type: {0} name: {1} @@ -32,13 +33,13 @@ dropout_prob: {25} # Validation configs: use_constant_limit: {26} - constant_prediction_limit: {27}""" + 
constant_prediction_limit: {27}\n""" def rtype(): return "model-builder" def name(): - name = "network" + name = "network-" + str(datetime.now()) + "-" x = 0 while True: yield name + str(x) From d479982e5cd636032b6d1f13f7538cdeab31287d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20H=C3=A5kansson?= Date: Thu, 13 Apr 2017 15:12:45 +0200 Subject: [PATCH 57/60] Adds a script that automatically downloads the data needed --- download_data.sh | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100755 download_data.sh diff --git a/download_data.sh b/download_data.sh new file mode 100755 index 0000000..78f2a6a --- /dev/null +++ b/download_data.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +mkdir -p resources/datasets +cd resources/datasets + +# Download embedding matrices +wget https://github.com/kandidat-highlights/data/raw/master/Glove/vectors100d.tar.gz +tar -xzf vectors100d.tar.gz +rm vectors100d.tar.gz + +wget https://github.com/kandidat-highlights/data/raw/master/Glove/vectors150d.tar.gz +tar -xzf vectors150d.tar.gz +rm vectors150d.tar.gz + +# Download datasets +wget https://github.com/kandidat-highlights/data/raw/master/allVotes/data_top50_users_subreddit_title_all_votes.tar.gz +tar -xzf data_top50_users_subreddit_title_all_votes.tar.gz +rm data_top50_users_subreddit_title_all_votes.tar.gz + +wget https://github.com/kandidat-highlights/data/raw/master/allVotes/data_top5_users_subreddit_title_all_votes.tar.gz +tar -xzf data_top5_users_subreddit_title_all_votes.tar.gz +rm data_top5_users_subreddit_title_all_votes.tar.gz + +wget https://github.com/kandidat-highlights/data/raw/master/top50/data_top50_users_subreddit_title.tar.gz +tar -xzf data_top50_users_subreddit_title.tar.gz +rm data_top50_users_subreddit_title.tar.gz + +cd ../../ From 97f0ffbf77dac4d6a7ce28b41a469cfe4be3a9d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20H=C3=A5kansson?= Date: Thu, 13 Apr 2017 19:35:56 +0200 Subject: [PATCH 58/60] Refactors all double precision floating 
points to single precision to save memory --- model/model.py | 2 +- model/model_builder.py | 32 ++++++++++++++++---------------- model/util/config.py | 2 +- model/util/csv_reader.py | 30 ++++++++++++++---------------- model/util/data.py | 2 +- 5 files changed, 33 insertions(+), 35 deletions(-) diff --git a/model/model.py b/model/model.py index 1f7f313..1358564 100644 --- a/model/model.py +++ b/model/model.py @@ -46,7 +46,7 @@ def __init__(self, config, session): self.latest_layer = None self.output_weights = None self.output_bias = None - self.l2_term = tf.constant(0, dtype=tf.float64) + self.l2_term = tf.constant(0, dtype=tf.float32) self.vocabulary_size = config[VOC_SIZE] self.user_count = config[USER_COUNT] diff --git a/model/model_builder.py b/model/model_builder.py index c05ae8b..0a62bc2 100644 --- a/model/model_builder.py +++ b/model/model_builder.py @@ -46,19 +46,19 @@ def add_input_layer(self): [None, self._model.max_title_length], name="input") self._model.subreddit_input = \ - tf.placeholder(tf.float64, + tf.placeholder(tf.float32, [None, self._model.subreddit_count], name="subreddit_input") self._model.target = \ - tf.placeholder(tf.float64, + tf.placeholder(tf.float32, [None, self._model.user_count], name="target") self._model.sec_target = \ - tf.placeholder(tf.float64, + tf.placeholder(tf.float32, [None, self._model.data.subreddit_count], name="sec_target") - self._model.keep_prob = tf.placeholder(tf.float64, name="keep_prob") + self._model.keep_prob = tf.placeholder(tf.float32, name="keep_prob") if self._model.rnn_unit == 'lstm': rnn_layer = tf.contrib.rnn.LSTMCell(self._model.rnn_neurons) @@ -73,13 +73,13 @@ def add_input_layer(self): tf.random_uniform( [self._model.vocabulary_size, self._model.embedding_size], - -1.0, 1.0, dtype=tf.float64), + -1.0, 1.0, dtype=tf.float32), trainable=self._model.is_trainable_matrix, name="embedding_matrix", - dtype=tf.float64) + dtype=tf.float32) self._model.embedding_placeholder = \ - tf.placeholder(tf.float64, + 
tf.placeholder(tf.float32, [self._model.vocabulary_size, self._model.embedding_size]) self._model.embedding_init = \ embedding_matrix.assign(self._model.embedding_placeholder) @@ -88,7 +88,7 @@ def add_input_layer(self): self._model.input) # Run the LSTM layer with the embedded input outputs, _ = tf.nn.dynamic_rnn(rnn_layer, embedded_input, - dtype=tf.float64) + dtype=tf.float32) outputs = tf.transpose(outputs, [1, 0, 2]) output = outputs[-1] @@ -98,13 +98,13 @@ def add_input_layer(self): [self._model.subreddit_count, self._model.subreddit_input_neurons], stddev=0.35, - dtype=tf.float64), + dtype=tf.float32), name="sub_input_weights") subreddit_bias = tf.Variable(tf.random_normal( [self._model.subreddit_input_neurons], stddev=0.35, - dtype=tf.float64), + dtype=tf.float32), name="sub_input_bias") logit_subreddit = tf.add( @@ -128,22 +128,22 @@ def add_layer(self, number_of_neurons): else 0), number_of_neurons], stddev=0.35, - dtype=tf.float64), + dtype=tf.float32), name="weights" + str(self.number_of_layers)) bias = tf.Variable(tf.random_normal([number_of_neurons], stddev=0.35, - dtype=tf.float64), + dtype=tf.float32), name="biases" + str(self.number_of_layers)) else: weights = tf.Variable(tf.random_normal( [self._model.latest_layer.get_shape()[1].value, number_of_neurons], stddev=0.35, - dtype=tf.float64), + dtype=tf.float32), name="weights" + str(self.number_of_layers)) bias = tf.Variable(tf.random_normal([number_of_neurons], stddev=0.35, - dtype=tf.float64), + dtype=tf.float32), name="biases" + str(self.number_of_layers)) logits = tf.add(tf.matmul(self._model.latest_layer, weights), bias) @@ -169,12 +169,12 @@ def add_output_layer(self, output_size, secondary_output=False): sigmoid_weights = tf.Variable(tf.random_normal( [self._model.latest_layer.get_shape()[1].value, output_size], stddev=0.35, - dtype=tf.float64), + dtype=tf.float32), name="output_weights") sigmoid_bias = tf.Variable(tf.random_normal([output_size], stddev=0.35, - dtype=tf.float64), + 
dtype=tf.float32), name="output_biases") logits = tf.add(tf.matmul(self._model.latest_layer, sigmoid_weights), sigmoid_bias) diff --git a/model/util/config.py b/model/util/config.py index c2cfbfc..6bdf4ca 100644 --- a/model/util/config.py +++ b/model/util/config.py @@ -29,6 +29,6 @@ for conf in cfg['network']: for (key, value) in conf.items(): if type(value) is float: - conf[key] = np.float64(value) + conf[key] = np.float32(value) diff --git a/model/util/csv_reader.py b/model/util/csv_reader.py index 17c8357..b6894c7 100644 --- a/model/util/csv_reader.py +++ b/model/util/csv_reader.py @@ -74,33 +74,31 @@ def get_data(self, datatype, data_column=[0], sub_column=1, label_column=2): subreddit_full.append(subreddit) return data_full, subreddit_full, label_full + def load_pretrained_embeddings(self, file_name, dimension_size=50): + file_path = os.path.join(DATASETS_PATH, file_name) + matrix = [np.random.rand(dimension_size).astype(np.float32)] + word_dict = dict() + word_dict['UNK'] = 0 + count = 1 - def test_load_pretrained_embeddings(self, fileName, dimension_size=50): - file_path = os.path.join(DATASETS_PATH, fileName) with open(file_path, 'r', encoding='UTF-8') as csvfile: reader = csv.reader(csvfile, delimiter=' ', quoting=csv.QUOTE_NONE) - word_dict = dict() - matrix = [] - - word_dict['UNK'] = len(matrix) - matrix.append(np.random.rand(1, dimension_size)[0].tolist()) for row in reader: first_col = row[0] - if first_col in ['!', '?', '-', '_', '.', ',', '\'', '\"', ':', ';', '%', '(', ')']: continue - if first_col[0] == '<': #some words are tokens for usernames like or etc, ignore them. + # Some words are tokens for usernames like or + # etc, ignore them. 
+ if first_col[0] == '<': continue - word_dict[first_col] = len(matrix) - matrix.append(row[1:]) - embed_matrix = np.array(matrix) - embed_matrix = embed_matrix.astype(np.float64) - return word_dict, embed_matrix - - + word_dict[first_col] = count + row_array = np.array(row[1:], dtype=np.float32) + matrix.append(row_array) + count += 1 + return word_dict, np.array(matrix) diff --git a/model/util/data.py b/model/util/data.py index 14d755c..22bbc17 100644 --- a/model/util/data.py +++ b/model/util/data.py @@ -86,7 +86,7 @@ def _build_dict(self): helper.build_dataset(vocab, vocabulary_size=self.vocabulary_size) else: self.word_dict, self.embedding_matrix = \ - self.reader.test_load_pretrained_embeddings( + self.reader.load_pretrained_embeddings( self.pre_trained_matrix, self.embedding_size) users = " ".join(self.train_labels).split() From 233b090d51a82e1e53ef331e7cee790e22d11a81 Mon Sep 17 00:00:00 2001 From: Maxim Goretskyy <13maxim13@live.se> Date: Sun, 16 Apr 2017 08:48:07 +0200 Subject: [PATCH 59/60] Resets graph even if it failed --- main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/main.py b/main.py index 52d0f8d..d1b7f1f 100644 --- a/main.py +++ b/main.py @@ -51,6 +51,7 @@ def main(): except Exception as e: print("Config ", networkconfig[conf]["name"], "failed to complete", file=sys.stderr) print(e, file=sys.stderr) + tf.reset_default_graph() if __name__ == "__main__": main() From 0a3509219d9c62e2d0d2186b668db833318ab9ea Mon Sep 17 00:00:00 2001 From: Jesper Jaxing Date: Mon, 17 Apr 2017 14:58:26 +0200 Subject: [PATCH 60/60] Changed the header to include pre_train_subreddit --- model/util/writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model/util/writer.py b/model/util/writer.py index 7d5bb81..9db1ccd 100644 --- a/model/util/writer.py +++ b/model/util/writer.py @@ -36,7 +36,7 @@ def log_samefile(config, f1_score_valid, f1_score_train, epoch_top, prec_valid, config_headers = [NET_TYPE, NET_NAME, VOC_SIZE, USER_COUNT, 
LEARN_RATE, EMBEDD_SIZE, MAX_TITLE_LENGTH, RNN_NEURONS, RNN_UNIT, HIDDEN_NEURONS, HIDDEN_LAYERS, SUB_INPUT_NEURONS, USE_CONCAT_INPUT, BATCH_SIZE, TRAINING_EPOCHS, USE_L2_LOSS, L2_FACTOR, USE_DROPOUT, DROPOUT_PROB, USE_CONSTANT_LIMIT, CONSTANT_PREDICTION_LIMIT, TRAINABLE_MATRIX, - PRE_TRAINED_MATRIX, USE_PRETRAINED, VALIDATION_DATA, TRAINING_DATA, TESTING_DATA] + PRE_TRAINED_MATRIX, USE_PRETRAINED, USE_PRETRAINED_NET, VALIDATION_DATA, TRAINING_DATA, TESTING_DATA] additional_headers = [F1_SCORE_TOP_VALID, F1_SCORE_TRAIN, EPOCH_WHEN_F1_TOP, PRECISION_VALIDATION, PRECISION_TRAINING, RECALL_VALIDATION, RECALL_TRAINING, DATE]