diff --git a/config.template.yaml b/config.template.yaml
index 115f562..d70c202 100644
--- a/config.template.yaml
+++ b/config.template.yaml
@@ -8,9 +8,9 @@ network:
     vocabulary_size: 10000
     user_count: 6
     max_title_length: 30
-    validation_data: 'validation_data_top_n_single.csv'
-    training_data: 'training_data_top_n_single.csv'
-    testing_data: 'testing_data_top_n_single.csv'
+    validation_data: 'validation_data_top_5_subreddit_allvotes.csv'
+    training_data: 'training_data_top_5_subreddit_allvotes.csv'
+    testing_data: 'testing_data_top_5_subreddit_allvotes.csv'
     # Embedding matrix configs:
     embedding_size: 150 # Make sure to match pretrained matrix dimensions
     trainable_matrix: true
@@ -20,10 +20,12 @@ network:
     learning_rate: 0.5
     training_epochs: 5
     batch_size: 25
-    lstm_neurons: 200
+    rnn_neurons: 200
+    rnn_unit: 'lstm' # Can be 'gru' or 'lstm', default: 'lstm'
     hidden_layers: 0
     hidden_neurons: 300
     use_concat_input: false
+    pre_train_subreddit: false
     # Regularisation configs:
     use_l2_loss: false
     l2_factor: 0.01
diff --git a/definitions.py b/definitions.py
index fd70b22..6ff5dac 100644
--- a/definitions.py
+++ b/definitions.py
@@ -40,7 +40,8 @@
 LEARN_RATE = 'learning_rate'
 EMBEDD_SIZE = 'embedding_size'
 MAX_TITLE_LENGTH = 'max_title_length'
-LSTM_NEURONS = 'lstm_neurons'
+RNN_NEURONS = 'rnn_neurons'
+RNN_UNIT = "rnn_unit"
 HIDDEN_NEURONS = 'hidden_neurons'
 HIDDEN_LAYERS = 'hidden_layers'
 USE_CONCAT_INPUT = 'use_concat_input'
@@ -55,6 +56,7 @@
 TRAINABLE_MATRIX = 'trainable_matrix'
 PRE_TRAINED_MATRIX = 'pre_trained_matrix'
 USE_PRETRAINED = 'use_pretrained'
+USE_PRETRAINED_NET = 'pre_train_subreddit'
 VALIDATION_DATA = 'validation_data'
 TRAINING_DATA = 'training_data'
 TESTING_DATA = 'testing_data'
diff --git a/main.py b/main.py
index dd048e3..030cb35 100644
--- a/main.py
+++ b/main.py
@@ -23,6 +23,7 @@
 # ==============================================================================
 import argparse
 import tensorflow as tf
+from definitions import *
 from model.util.networkconfig import yamlconfig as networkconfig
 from model.model_builder import ModelBuilder
@@ -38,7 +39,10 @@ def main():
         config_file = networkconfig[conf]
         with tf.Session() as sess:
             builder = ModelBuilder(config_file, sess)
+
             network_model = builder.build()
+            if config_file[USE_PRETRAINED_NET]:
+                network_model.train(use_pretrained_net=True)
             network_model.train()
             network_model.close_writers()
         tf.reset_default_graph()
diff --git a/model/model.py b/model/model.py
index 10fe2ea..a0f3494 100644
--- a/model/model.py
+++ b/model/model.py
@@ -53,7 +53,8 @@ def __init__(self, config, session):
         self.learning_rate = config[LEARN_RATE]
         self.embedding_size = config[EMBEDD_SIZE]
         self.max_title_length = config[MAX_TITLE_LENGTH]
-        self.lstm_neurons = config[LSTM_NEURONS]
+        self.rnn_neurons = config[RNN_NEURONS]
+        self.rnn_unit = config[RNN_UNIT]
         self.batch_size = config[BATCH_SIZE]
         self.training_epochs = config[TRAINING_EPOCHS]
         self.use_l2_loss = config[USE_L2_LOSS]
@@ -67,13 +68,17 @@ def __init__(self, config, session):
         self.use_constant_limit = config[USE_CONSTANT_LIMIT]
         self.constant_prediction_limit = config[CONSTANT_PREDICTION_LIMIT]
         self.use_concat_input = config[USE_CONCAT_INPUT]
+        self.use_pretrained_net = config[USE_PRETRAINED_NET]
+        self.subreddit_count = 0  # Will be set in build_graph
 
         self.input = None
         self.subreddit_input = None
         self.target = None
+        self.sec_target = None
         self.sigmoid = None
         self.train_op = None
+        self.pre_train_op = None
         self.error = None
         self.init_op = None
         self.saver = None
@@ -105,6 +110,7 @@ def __init__(self, config, session):
         with tf.device("/cpu:0"):
             self.data = data.Data(config)
+            self.subreddit_count = self.data.subreddit_count
 
             if self.use_pretrained:
                 self.vocabulary_size = len(self.data.embedding_matrix)
@@ -182,40 +188,41 @@ def validate_batch(self):
                           self.subreddit_input: batch_sub,
                           self.target: batch_label})
 
-    # TODO funktionen gör alldeles för mycket,
-    # dela upp utskrift, beräkning och träning
-    def train(self):
+    def train(self, use_pretrained_net=False):
         """ Trains the model on the dataset """
-        print("Starting training...")
+        if use_pretrained_net:
+            print("Pre-training on subreddits...")
+        else:
+            print("Starting training...")
 
-        if self.use_pretrained:
+        if self.use_pretrained and \
+                ((self.use_pretrained_net and use_pretrained_net) or
+                 (not self.use_pretrained_net and not use_pretrained_net)):
             self._session.run(self.embedding_init,
                              feed_dict={self.embedding_placeholder: self.data.embedding_matrix})
 
-        self.train_writer = \
-            tf.summary.FileWriter(self.logging_dir + '/' + TENSOR_DIR_TRAIN,
-                                  self._session.graph)
-        self.valid_writer = \
-            tf.summary.FileWriter(self.logging_dir + '/' + TENSOR_DIR_VALID)
         old_epoch = 0
 
-        if self.epoch.eval(self._session) == 0:
+        if self.epoch.eval(self._session) == 0 and not use_pretrained_net:
             self.validate()
 
         # Train for a specified amount of epochs
-        for i in self.data.for_n_train_epochs(self.training_epochs,
-                                              self.batch_size):
+        for i in self.data.for_n_train_epochs(self.training_epochs, self.batch_size):
             # Debug print out
             epoch = self.data.completed_training_epochs
-            training_error = self.train_batch()
-            validation_error = self.validate_batch()
 
-            # Don't validate so often
-            if i % (self.data.train_size // self.batch_size // 10) == 0 and i:
-                done = self.data.percent_of_epoch
-                print("Validation error: {:f} | Training error: {:f} | Done: {:.0%}"
-                      .format(validation_error, training_error, done))
+            if not use_pretrained_net:
+                training_error = self.train_batch()
+                validation_error = self.validate_batch()
+
+                # Don't validate so often
+                if i % (self.data.train_size // self.batch_size // 10) == 0 and i:
+                    done = self.data.percent_of_epoch
+                    print("Validation error: {:f} | Training error: {:f} | Done: {:.0%}"
+                          .format(validation_error, training_error, done))
+            else:
+                self.train_batch(True)
 
             # Do a full evaluation once an epoch is complete
             if epoch != old_epoch:
@@ -231,21 +238,30 @@ def train(self):
                 epoch_top=self.epoch_top, prec_valid=self.prec_valid,
                 prec_train=self.prec_train, recall_valid=self.recall_valid,
                 recall_train=self.recall_train)
 
-    def train_batch(self):
+    def train_batch(self, pre_train_net=False):
         """ Trains for one batch and returns cross entropy error """
         with tf.device("/cpu:0"):
-            batch_input, batch_sub, batch_label = \
-                self.data.next_train_batch()
-
-            self._session.run(self.train_op,
-                              {self.input: batch_input,
-                               self.subreddit_input: batch_sub,
-                               self.target: batch_label})
-
-            return self._session.run(self.error,
-                                     feed_dict={self.input: batch_input,
-                                                self.subreddit_input: batch_sub,
-                                                self.target: batch_label})
+            if pre_train_net:
+                batch_input, batch_label = \
+                    self.data.next_pre_train_batch()
+                self._session.run(self.pre_train_op,
+                                  {self.input: batch_input,
+                                   self.sec_target: batch_label})
+                # self.error belongs to the user head; there is no error
+                # tensor for the pre-training head, so nothing to report.
+                return None
+
+            # Normal joint training on the user head
+            batch_input, batch_sub, batch_label = \
+                self.data.next_train_batch()
+            self._session.run(self.train_op,
+                              {self.input: batch_input,
+                               self.subreddit_input: batch_sub,
+                               self.target: batch_label})
+
+            return self._session.run(self.error,
+                                     feed_dict={self.input: batch_input,
+                                                self.subreddit_input: batch_sub,
+                                                self.target: batch_label})
 
     def close_writers(self):
         """ Close tensorboard writers """
         self.train_writer.close()
diff --git a/model/model_builder.py b/model/model_builder.py
index a1b4f39..bf9f19d 100644
--- a/model/model_builder.py
+++ b/model/model_builder.py
@@ -24,6 +24,7 @@
 import tensorflow as tf
 
 from model.model import Model
+from definitions import *
 
 class ModelBuilder(object):
     """A class following the builder pattern to create a model"""
@@ -52,10 +53,20 @@ def add_input_layer(self):
             tf.placeholder(tf.float64,
                            [None, self._model.user_count],
                            name="target")
+        self._model.sec_target = \
+            tf.placeholder(tf.float64,
+                           [None, self._model.data.subreddit_count],
+                           name="sec_target")
         self._model.keep_prob = tf.placeholder(tf.float64, name="keep_prob")
 
-        lstm_layer = tf.contrib.rnn.LSTMCell(self._model.lstm_neurons, state_is_tuple=True)
+        if self._model.rnn_unit == 'lstm':
+            rnn_layer = tf.contrib.rnn.LSTMCell(self._model.rnn_neurons)
+        elif self._model.rnn_unit == 'gru':
+            rnn_layer = tf.contrib.rnn.GRUCell(self._model.rnn_neurons)
+        else:
+            print("Incorrect RNN unit, defaulting to LSTM")
+            rnn_layer = tf.contrib.rnn.LSTMCell(self._model.rnn_neurons)
 
         # Embedding matrix for the words
         embedding_matrix = tf.Variable(
@@ -76,7 +87,7 @@ def add_input_layer(self):
         embedded_input = tf.nn.embedding_lookup(embedding_matrix, self._model.input)
 
         # Run the LSTM layer with the embedded input
-        outputs, _ = tf.nn.dynamic_rnn(lstm_layer, embedded_input,
+        outputs, _ = tf.nn.dynamic_rnn(rnn_layer, embedded_input,
                                        dtype=tf.float64)
 
         outputs = tf.transpose(outputs, [1, 0, 2])
@@ -95,7 +106,7 @@ def add_layer(self, number_of_neurons):
         if not self.added_layers:
             self.added_layers = True
             weights = tf.Variable(tf.random_normal(
-                [self._model.lstm_neurons +
+                [self._model.rnn_neurons +
                  (1 if self._model.use_concat_input else 0),
                  number_of_neurons],
                 stddev=0.35,
@@ -133,31 +144,37 @@ def add_layer(self, number_of_neurons):
 
         return self
 
-    def add_output_layer(self):
+    def add_output_layer(self, output_size, secondary_output=False):
         """Adds an output layer, including error and optimisation functions.
         After this method no new layers should be added."""
         # Output layer
         # Feed the output of the previous layer to a sigmoid layer
         sigmoid_weights = tf.Variable(tf.random_normal(
-            [self._model.latest_layer.get_shape()[1].value, self._model.user_count],
+            [self._model.latest_layer.get_shape()[1].value, output_size],
             stddev=0.35,
             dtype=tf.float64),
             name="output_weights")
 
-        sigmoid_bias = tf.Variable(tf.random_normal([self._model.user_count],
+        sigmoid_bias = tf.Variable(tf.random_normal([output_size],
                                                     stddev=0.35,
                                                     dtype=tf.float64),
                                    name="output_biases")
 
         logits = tf.add(tf.matmul(self._model.latest_layer, sigmoid_weights),
                         sigmoid_bias)
-        self._model.sigmoid = tf.nn.sigmoid(logits)
 
-        # Training
+        if secondary_output:
+            error = tf.nn.softmax_cross_entropy_with_logits(
+                labels=self._model.sec_target,
+                logits=logits)
+        else:
+            self._model.sigmoid = tf.nn.sigmoid(logits)
+            # Define error function
+            error = tf.nn.sigmoid_cross_entropy_with_logits(
+                labels=self._model.target,
+                logits=logits)
 
-        # Defne error function
-        error = tf.nn.sigmoid_cross_entropy_with_logits(labels=self._model.target,
-                                                        logits=logits)
+        # Training
 
         if self._model.use_l2_loss:
             cross_entropy = \
@@ -172,9 +189,13 @@ def add_output_layer(self):
         else:
             cross_entropy = tf.reduce_mean(error)
 
-        self._model.error = cross_entropy
-        self._model.train_op = tf.train.AdamOptimizer(
-            self._model.learning_rate).minimize(cross_entropy)
+        if secondary_output:
+            self._model.pre_train_op = tf.train.AdamOptimizer(
+                self._model.learning_rate).minimize(cross_entropy)
+        else:
+            self._model.error = cross_entropy
+            self._model.train_op = tf.train.AdamOptimizer(
+                self._model.learning_rate).minimize(cross_entropy)
 
         return self
 
@@ -267,15 +288,28 @@ def add_precision_operations(self):
 
     def build(self):
         """Adds saver and init operation and returns the model"""
+
+        # Add input layer
         self.add_input_layer()
 
         # Add a number of hidden layers
         for _ in range(self._model.hidden_layers):
             self.add_layer(self._model.hidden_neurons)
 
-        self.add_output_layer()
+        # Add output layer for pretraining, if used
+        if self._model.use_pretrained_net:
+            self.add_output_layer(self._model.subreddit_count, secondary_output=True)
+
+        # Add output layer for users
+        self.add_output_layer(self._model.user_count) \
+            .add_precision_operations()
 
-        self.add_precision_operations()
+        # Initialize
+        self._model.train_writer = \
+            tf.summary.FileWriter(self._model.logging_dir + '/' + TENSOR_DIR_TRAIN,
+                                  self._model._session.graph)
+        self._model.valid_writer = \
+            tf.summary.FileWriter(self._model.logging_dir + '/' + TENSOR_DIR_VALID)
 
         self._model.init_op = tf.group(tf.global_variables_initializer(),
                                        tf.local_variables_initializer())
diff --git a/model/util/csv_reader.py b/model/util/csv_reader.py
index c5b7526..17c8357 100644
--- a/model/util/csv_reader.py
+++ b/model/util/csv_reader.py
@@ -35,7 +35,6 @@ class Dataenum(Enum):
     TRAINING = "training_data"
     VALIDATION = "validation_data"
 
-
 class CsvReader:
     def __init__(self, netcfg):
         self.netcfg = netcfg
diff --git a/model/util/data.py b/model/util/data.py
index 4d9ac54..7672e32 100644
--- a/model/util/data.py
+++ b/model/util/data.py
@@ -35,8 +35,10 @@ def __init__(self, networkconfig):
         self._current_train_index = 0
         self._current_valid_index = 0
         self._current_test_index = 0
+        self._current_pre_train_index = 0
         self.completed_training_epochs = 0
         self.percent_of_epoch = 0.0
+        self.subreddit_count = 0
         self.title_length = networkconfig['max_title_length']
         self.batch_size = self.netcfg['batch_size']
         self.reader = CsvReader(networkconfig)
@@ -56,6 +58,7 @@ def __init__(self, networkconfig):
 
     def _read_data(self):
         """ Reads all the data from specified path """
+
         logging.debug("Reading training data...")
         self.train_data, self.train_subreddits, self.train_labels = \
@@ -92,6 +95,7 @@ def _build_dict(self):
         subreddits = " ".join(self.train_subreddits).split()
         self.subreddit_dict = helper.build_subreddit_dict(subreddits)
+        self.subreddit_count = len(self.subreddit_dict)
 
     def next_train_batch(self, batch_size=None):
         """ Get the next batch of training data """
@@ -131,6 +135,35 @@ def next_train_batch(self, batch_size=None):
         self.percent_of_epoch = self._current_train_index / self.train_size
         return batch_x, batch_sub, batch_y
 
+    def next_pre_train_batch(self, batch_size=None):
+        """ Get the next batch of pre-training data (subreddit targets) """
+        batch_size = batch_size or self.batch_size
+        batch_x = []
+        batch_y = []
+        for _ in range(0, batch_size):
+            sentence = self.train_data[self._current_pre_train_index]
+            # NOTE(review): target is vectorized via subreddit_dict below,
+            # so it must be the subreddit, not the user label — confirm.
+            label = self.train_subreddits[self._current_pre_train_index]
+            self._current_pre_train_index += 1
+            # Support multiple epochs
+            if self._current_pre_train_index >= self.train_size:
+                self._current_pre_train_index = 0
+
+            # Turn sentences and labels into vector representations
+            sentence_vec, present, absent = \
+                helper.get_indicies(sentence,
+                                    self.word_dict,
+                                    self.max_title_length)
+            self.train_present += present
+            self.train_absent += absent
+
+            label_vec = helper.label_vector(label,
+                                            self.subreddit_dict,
+                                            self.subreddit_count)
+            batch_x.append(sentence_vec)
+            batch_y.append(label_vec)
+
+        return batch_x, batch_y
+
     def get_validation(self):
         """ Get the whole validation set in a vectorized form """
         old_ind = self._current_valid_index
@@ -218,7 +251,7 @@ def next_test_batch(self, batch_size=None):
     def for_n_train_epochs(self, num_epochs=1, batch_size=25):
         # TODO Ta bort parameterar
         """ Calculates how many training iterations to do for num_epochs
-        number of epochs with a batch size of batch_size """
+        number of epochs with a batch size of batch_size"""
         return range((self.train_size * num_epochs) // batch_size)
 
     def get_training(self):
diff --git a/model/util/writer.py b/model/util/writer.py
index 09b8152..9b452ae 100644
--- a/model/util/writer.py
+++ b/model/util/writer.py
@@ -33,7 +33,7 @@ def log_samefile(config, f1_score_valid, f1_score_train, epoch_top, prec_valid,
     if not os.path.exists(LOGS_DIR):
         raise FileNotFoundError('Can not write because no directory is created')
 
-    config_headers = [NET_TYPE, NET_NAME, VOC_SIZE, USER_COUNT, LEARN_RATE, EMBEDD_SIZE, MAX_TITLE_LENGTH, LSTM_NEURONS,
+    config_headers = [NET_TYPE, NET_NAME, VOC_SIZE, USER_COUNT, LEARN_RATE, EMBEDD_SIZE, MAX_TITLE_LENGTH, RNN_NEURONS, RNN_UNIT,
                       HIDDEN_NEURONS, HIDDEN_LAYERS, USE_CONCAT_INPUT, BATCH_SIZE, TRAINING_EPOCHS, USE_L2_LOSS,
                       L2_FACTOR, USE_DROPOUT, DROPOUT_PROB, USE_CONSTANT_LIMIT, CONSTANT_PREDICTION_LIMIT,
                       TRAINABLE_MATRIX, PRE_TRAINED_MATRIX, USE_PRETRAINED, VALIDATION_DATA, TRAINING_DATA, TESTING_DATA]