diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..ae616e7
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,5 @@
+FROM gcr.io/tensorflow/tensorflow:latest-gpu-py3
+WORKDIR /app
+COPY ./project /app
+RUN pip install -r requirements.txt
+
diff --git a/README.md b/README.md
index b10bfa0..ef22a06 100644
--- a/README.md
+++ b/README.md
@@ -26,3 +26,8 @@ For more details, take a look at the [dataset repository](https://github.com/kan
 ## Configuration
 To edit configs, take a look at the `config.yaml` file. Please prefer making new configs instead of editing old (for academic purposes). If implementing a new model, make sure to add support for it in the `main.py` file so its configs can be automatically parsed.
+## Build/Run with Docker
+
+Build with ```docker build -t YOURTAG .```
+
+Run with ```nvidia-docker run [-v YOURLOGDIR:/app/logs] -t --rm YOURTAG python -u ./YOURENTRYPOINT.py```
diff --git a/config.template.yaml b/config.template.yaml
index 115f562..9b6e1dd 100644
--- a/config.template.yaml
+++ b/config.template.yaml
@@ -8,9 +8,9 @@ network:
     vocabulary_size: 10000
     user_count: 6
     max_title_length: 30
-    validation_data: 'validation_data_top_n_single.csv'
-    training_data: 'training_data_top_n_single.csv'
-    testing_data: 'testing_data_top_n_single.csv'
+    validation_data: 'validation_data_top_5_subreddit_allvotes.csv'
+    training_data: 'training_data_top_5_subreddit_allvotes.csv'
+    testing_data: 'testing_data_top_5_subreddit_allvotes.csv'
     # Embedding matrix configs:
     embedding_size: 150 # Make sure to match pretrained matrix dimensions
     trainable_matrix: true
@@ -20,10 +20,13 @@ network:
     learning_rate: 0.5
     training_epochs: 5
     batch_size: 25
-    lstm_neurons: 200
+    rnn_neurons: 200
+    rnn_unit: 'lstm' # Can be 'gru' or 'lstm', default: 'lstm'
     hidden_layers: 0
     hidden_neurons: 300
+    subreddit_input_neurons: 10 # Probably not the best default value
     use_concat_input: false
+    pre_train_subreddit: false
     # Regularisation configs:
     use_l2_loss: false
     l2_factor: 0.01
diff --git a/definitions.py b/definitions.py
index fd70b22..0fb1b88 100644
--- a/definitions.py
+++ b/definitions.py
@@ -40,9 +40,11 @@
 LEARN_RATE = 'learning_rate'
 EMBEDD_SIZE = 'embedding_size'
 MAX_TITLE_LENGTH = 'max_title_length'
-LSTM_NEURONS = 'lstm_neurons'
+RNN_NEURONS = 'rnn_neurons'
+RNN_UNIT = 'rnn_unit'
 HIDDEN_NEURONS = 'hidden_neurons'
 HIDDEN_LAYERS = 'hidden_layers'
+SUB_INPUT_NEURONS = 'subreddit_input_neurons'
 USE_CONCAT_INPUT = 'use_concat_input'
 BATCH_SIZE = 'batch_size'
 TRAINING_EPOCHS = 'training_epochs'
@@ -55,6 +57,7 @@
 TRAINABLE_MATRIX = 'trainable_matrix'
 PRE_TRAINED_MATRIX = 'pre_trained_matrix'
 USE_PRETRAINED = 'use_pretrained'
+USE_PRETRAINED_NET = 'pre_train_subreddit'
 VALIDATION_DATA = 'validation_data'
 TRAINING_DATA = 'training_data'
 TESTING_DATA = 'testing_data'
diff --git a/download_data.sh b/download_data.sh
new file mode 100755
index 0000000..78f2a6a
--- /dev/null
+++ b/download_data.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+mkdir -p resources/datasets
+cd resources/datasets
+
+# Download embedding matrices
+wget https://github.com/kandidat-highlights/data/raw/master/Glove/vectors100d.tar.gz
+tar -xzf vectors100d.tar.gz
+rm vectors100d.tar.gz
+
+wget https://github.com/kandidat-highlights/data/raw/master/Glove/vectors150d.tar.gz
+tar -xzf vectors150d.tar.gz
+rm vectors150d.tar.gz
+
+# Download datasets
+wget https://github.com/kandidat-highlights/data/raw/master/allVotes/data_top50_users_subreddit_title_all_votes.tar.gz
+tar -xzf data_top50_users_subreddit_title_all_votes.tar.gz
+rm data_top50_users_subreddit_title_all_votes.tar.gz
+
+wget https://github.com/kandidat-highlights/data/raw/master/allVotes/data_top5_users_subreddit_title_all_votes.tar.gz
+tar -xzf data_top5_users_subreddit_title_all_votes.tar.gz
+rm data_top5_users_subreddit_title_all_votes.tar.gz
+
+wget https://github.com/kandidat-highlights/data/raw/master/top50/data_top50_users_subreddit_title.tar.gz
+tar -xzf data_top50_users_subreddit_title.tar.gz
+rm data_top50_users_subreddit_title.tar.gz
+
+cd ../../
diff --git a/main.py b/main.py
index dd048e3..d1b7f1f 100644
--- a/main.py
+++ b/main.py
@@ -21,8 +21,10 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 # ==============================================================================
+import sys
 import argparse
 import tensorflow as tf
+from definitions import *
 from model.util.networkconfig import yamlconfig as networkconfig
 from model.model_builder import ModelBuilder
@@ -30,18 +32,26 @@ def main():
     """ A main method that creates the model and starts training it """
     # Parse arguments
     parser = argparse.ArgumentParser(add_help=True)
-    parser.add_argument('configs', metavar='C', type=int, nargs='+',
+    parser.add_argument('configs', metavar='C', type=int, nargs='*',
                         help='Config number to use (can be multiple)')
     args = parser.parse_args()
+    for conf in args.configs if args.configs else range(len(networkconfig)):
+        try:
+            print("Starting config ", conf)
+            config_file = networkconfig[conf]
+            with tf.Session() as sess:
+                builder = ModelBuilder(config_file, sess)
-    for conf in args.configs:
-        config_file = networkconfig[conf]
-        with tf.Session() as sess:
-            builder = ModelBuilder(config_file, sess)
-            network_model = builder.build()
-            network_model.train()
-            network_model.close_writers()
-            tf.reset_default_graph()
+                network_model = builder.build()
+                if config_file[USE_PRETRAINED_NET]:
+                    # Pre-train on subreddit labels before the main training run
+                    network_model.train(use_pretrained_net=True)
+                network_model.train()
+                network_model.close_writers()
+                tf.reset_default_graph()
+        except Exception as e:
+            print("Config ", networkconfig[conf]["name"], "failed to complete", file=sys.stderr)
+            print(e, file=sys.stderr)
+            tf.reset_default_graph()
 if __name__ == "__main__":
     main()
diff --git a/model/model.py b/model/model.py
index 10fe2ea..1358564 100644
--- a/model/model.py
+++ b/model/model.py
@@ -46,14 +46,15 @@ def __init__(self, config, session):
         self.latest_layer = None
         self.output_weights = None
         self.output_bias = None
-        self.l2_term = tf.constant(0, dtype=tf.float64)
+        self.l2_term = tf.constant(0, dtype=tf.float32)
         self.vocabulary_size = config[VOC_SIZE]
         self.user_count = config[USER_COUNT]
         self.learning_rate = config[LEARN_RATE]
         self.embedding_size = config[EMBEDD_SIZE]
         self.max_title_length = config[MAX_TITLE_LENGTH]
-        self.lstm_neurons = config[LSTM_NEURONS]
+        self.rnn_neurons = config[RNN_NEURONS]
+        self.rnn_unit = config[RNN_UNIT]
         self.batch_size = config[BATCH_SIZE]
         self.training_epochs = config[TRAINING_EPOCHS]
         self.use_l2_loss = config[USE_L2_LOSS]
@@ -62,18 +63,23 @@ def __init__(self, config, session):
         self.dropout_prob = config[DROPOUT_PROB] # Only used for train op
         self.hidden_layers = config[HIDDEN_LAYERS]
         self.hidden_neurons = config[HIDDEN_NEURONS]
+        self.subreddit_input_neurons = config[SUB_INPUT_NEURONS]
         self.is_trainable_matrix = config[TRAINABLE_MATRIX]
         self.use_pretrained = config[USE_PRETRAINED]
         self.use_constant_limit = config[USE_CONSTANT_LIMIT]
         self.constant_prediction_limit = config[CONSTANT_PREDICTION_LIMIT]
         self.use_concat_input = config[USE_CONCAT_INPUT]
+        self.use_pretrained_net = config[USE_PRETRAINED_NET]
+        self.subreddit_count = 0 # Will be set in build_graph
         self.input = None
         self.subreddit_input = None
         self.target = None
+        self.sec_target = None
         self.sigmoid = None
         self.train_op = None
+        self.pre_train_op = None
         self.error = None
         self.init_op = None
         self.saver = None
@@ -105,6 +111,7 @@ def __init__(self, config, session):
         with tf.device("/cpu:0"):
             self.data = data.Data(config)
+            self.subreddit_count = self.data.subreddit_count
         if self.use_pretrained:
             self.vocabulary_size = len(self.data.embedding_matrix)
@@ -172,6 +179,7 @@ def validate(self):
             epoch, get_val_summary_tensor(val_prec), get_val_summary_tensor(train_prec), \
             get_val_summary_tensor(val_recall), get_val_summary_tensor(train_recall)
+    # Currently not used. Saving for now. Might come in handy later.
     def validate_batch(self):
         """ Validates a batch of data and returns cross entropy error """
         with tf.device("/cpu:0"):
@@ -182,43 +190,40 @@ def validate_batch(self):
                                      self.subreddit_input: batch_sub,
                                      self.target: batch_label})
-    # TODO this function does far too much; split up
-    # printing, computation and training
-    def train(self):
+    def train(self, use_pretrained_net=False):
         """ Trains the model on the dataset """
-        print("Starting training...")
+        if use_pretrained_net:
+            print("Pre-training on subreddits...")
+        else:
+            print("Starting training...")
         if self.use_pretrained:
             self._session.run(self.embedding_init,
                               feed_dict={self.embedding_placeholder: self.data.embedding_matrix})
-        self.train_writer = \
-            tf.summary.FileWriter(self.logging_dir + '/' + TENSOR_DIR_TRAIN,
-                                  self._session.graph)
-        self.valid_writer = \
-            tf.summary.FileWriter(self.logging_dir + '/' + TENSOR_DIR_VALID)
         old_epoch = 0
-        if self.epoch.eval(self._session) == 0:
+        if self.epoch.eval(self._session) == 0 and not use_pretrained_net:
             self.validate()
         # Train for a specified amount of epochs
-        for i in self.data.for_n_train_epochs(self.training_epochs,
-                                              self.batch_size):
+        for i in self.data.for_n_train_epochs(self.training_epochs, self.batch_size):
             # Debug print out
             epoch = self.data.completed_training_epochs
-            training_error = self.train_batch()
-            validation_error = self.validate_batch()
-            # Don't validate so often
-            if i % (self.data.train_size // self.batch_size // 10) == 0 and i:
-                done = self.data.percent_of_epoch
-                print("Validation error: {:f} | Training error: {:f} | Done: {:.0%}"
-                      .format(validation_error, training_error, done))
+            if not use_pretrained_net:
+                self.train_batch()
+
+                # Don't print so often
+                if i % (self.data.train_size // self.batch_size // 10) == 0 and i:
+                    done = self.data.percent_of_epoch
+                    print("Epoch completion: {:.0%}".format(done))
+            else:
+                self.train_batch(True)
             # Do a full evaluation once an epoch is complete
-            if epoch != old_epoch:
+            if epoch != old_epoch and not use_pretrained_net:
                 self._session.run(self.epoch.assign_add(1))
                 print("Epoch complete...old ", old_epoch)
                 self.save_checkpoint()
@@ -227,25 +232,36 @@ def train(self):
         # Save model when done training
         self.save_checkpoint()
-        log_samefile(config=self.config, f1_score_valid=self.f1_score_valid, f1_score_train=self.f1_score_train,
-                     epoch_top=self.epoch_top, prec_valid=self.prec_valid, prec_train=self.prec_train,
-                     recall_valid=self.recall_valid, recall_train=self.recall_train)
+        if not use_pretrained_net:
+            log_samefile(config=self.config, f1_score_valid=self.f1_score_valid, f1_score_train=self.f1_score_train,
+                         epoch_top=self.epoch_top, prec_valid=self.prec_valid, prec_train=self.prec_train,
+                         recall_valid=self.recall_valid, recall_train=self.recall_train)
-    def train_batch(self):
-        """ Trains for one batch and returns cross entropy error """
+    def train_batch(self, pre_train_net=False):
+        """ Trains the model for one batch """
         with tf.device("/cpu:0"):
-            batch_input, batch_sub, batch_label = \
-                self.data.next_train_batch()
-
-            self._session.run(self.train_op,
-                              {self.input: batch_input,
-                               self.subreddit_input: batch_sub,
-                               self.target: batch_label})
+            if not pre_train_net:
+                batch_input, batch_sub, batch_label = \
+                    self.data.next_train_batch()
+            else:
+                batch_input, batch_sub, batch_label = \
+                    self.data.next_pre_train_batch()
+
+            if pre_train_net and self.use_concat_input:
+                self._session.run(self.pre_train_op,
+                                  {self.input: batch_input,
+                                   self.subreddit_input: batch_sub,
+                                   self.sec_target: batch_label})
+            elif pre_train_net:
+                self._session.run(self.pre_train_op,
+                                  {self.input: batch_input,
+                                   self.sec_target: batch_label})
+            else:
+                self._session.run(self.train_op,
+                                  {self.input: batch_input,
+                                   self.subreddit_input: batch_sub,
+                                   self.target: batch_label})
-            return self._session.run(self.error,
-                                     feed_dict={self.input: batch_input,
-                                                self.subreddit_input: batch_sub,
-                                                self.target: batch_label})
     def close_writers(self):
         """ Close tensorboard writers """
         self.train_writer.close()
diff --git a/model/model_builder.py b/model/model_builder.py
index a1b4f39..0a62bc2 100644
--- a/model/model_builder.py
+++ b/model/model_builder.py
@@ -24,6 +24,7 @@
 import tensorflow as tf
 from model.model import Model
+from definitions import *
 class ModelBuilder(object):
     """A class following the builder pattern to create a model"""
@@ -45,30 +46,40 @@ def add_input_layer(self):
                            [None, self._model.max_title_length],
                            name="input")
         self._model.subreddit_input = \
-            tf.placeholder(tf.float64,
-                           [None, 1],
+            tf.placeholder(tf.float32,
+                           [None, self._model.subreddit_count],
                            name="subreddit_input")
         self._model.target = \
-            tf.placeholder(tf.float64,
+            tf.placeholder(tf.float32,
                            [None, self._model.user_count],
                            name="target")
+        self._model.sec_target = \
+            tf.placeholder(tf.float32,
+                           [None, self._model.data.subreddit_count],
+                           name="sec_target")
-        self._model.keep_prob = tf.placeholder(tf.float64, name="keep_prob")
+        self._model.keep_prob = tf.placeholder(tf.float32, name="keep_prob")
-        lstm_layer = tf.contrib.rnn.LSTMCell(self._model.lstm_neurons, state_is_tuple=True)
+        if self._model.rnn_unit == 'lstm':
+            rnn_layer = tf.contrib.rnn.LSTMCell(self._model.rnn_neurons)
+        elif self._model.rnn_unit == 'gru':
+            rnn_layer = tf.contrib.rnn.GRUCell(self._model.rnn_neurons)
+        else:
+            print("Incorrect RNN unit, defaulting to LSTM")
+            rnn_layer = tf.contrib.rnn.LSTMCell(self._model.rnn_neurons)
         # Embedding matrix for the words
         embedding_matrix = tf.Variable(
-            tf.constant(0.0,
-                        shape=[self._model.vocabulary_size,
-                               self._model.embedding_size],
-                        dtype=tf.float64),
+            tf.random_uniform(
+                [self._model.vocabulary_size,
+                 self._model.embedding_size],
+                -1.0, 1.0, dtype=tf.float32),
             trainable=self._model.is_trainable_matrix,
             name="embedding_matrix",
-            dtype=tf.float64)
+            dtype=tf.float32)
         self._model.embedding_placeholder = \
-            tf.placeholder(tf.float64,
+            tf.placeholder(tf.float32,
                            [self._model.vocabulary_size,
                             self._model.embedding_size])
         self._model.embedding_init = \
             embedding_matrix.assign(self._model.embedding_placeholder)
@@ -76,14 +87,30 @@ def add_input_layer(self):
         embedded_input = tf.nn.embedding_lookup(embedding_matrix,
                                                 self._model.input)
         # Run the LSTM layer with the embedded input
-        outputs, _ = tf.nn.dynamic_rnn(lstm_layer, embedded_input,
-                                       dtype=tf.float64)
+        outputs, _ = tf.nn.dynamic_rnn(rnn_layer, embedded_input,
+                                       dtype=tf.float32)
         outputs = tf.transpose(outputs, [1, 0, 2])
         output = outputs[-1]
         if self._model.use_concat_input:
             # Add subreddit to end of input
-            output = tf.concat([output, self._model.subreddit_input], 1)
+            subreddit_weights = tf.Variable(tf.random_normal(
+                [self._model.subreddit_count,
+                 self._model.subreddit_input_neurons],
+                stddev=0.35,
+                dtype=tf.float32),
+                name="sub_input_weights")
+
+            subreddit_bias = tf.Variable(tf.random_normal(
+                [self._model.subreddit_input_neurons],
+                stddev=0.35,
+                dtype=tf.float32),
+                name="sub_input_bias")
+
+            logit_subreddit = tf.add(
+                tf.matmul(self._model.subreddit_input, subreddit_weights),
+                subreddit_bias)
+            output = tf.concat([output, logit_subreddit], 1)
         self._model.latest_layer = output
@@ -95,26 +122,28 @@ def add_layer(self, number_of_neurons):
         if not self.added_layers:
             self.added_layers = True
             weights = tf.Variable(tf.random_normal(
-                [self._model.lstm_neurons +
-                 (1 if self._model.use_concat_input else 0),
+                [self._model.rnn_neurons +
+                 (self._model.subreddit_input_neurons
+                  if self._model.use_concat_input
+                  else 0),
                  number_of_neurons],
                 stddev=0.35,
-                dtype=tf.float64),
+                dtype=tf.float32),
                 name="weights" + str(self.number_of_layers))
             bias = tf.Variable(tf.random_normal([number_of_neurons],
                                                 stddev=0.35,
-                                                dtype=tf.float64),
+                                                dtype=tf.float32),
                                name="biases" + str(self.number_of_layers))
         else:
             weights = tf.Variable(tf.random_normal(
                 [self._model.latest_layer.get_shape()[1].value,
                  number_of_neurons],
                 stddev=0.35,
-                dtype=tf.float64),
+                dtype=tf.float32),
                 name="weights" + str(self.number_of_layers))
             bias = tf.Variable(tf.random_normal([number_of_neurons],
                                                 stddev=0.35,
-                                                dtype=tf.float64),
+                                                dtype=tf.float32),
                                name="biases" + str(self.number_of_layers))
         logits = tf.add(tf.matmul(self._model.latest_layer, weights), bias)
@@ -122,9 +151,7 @@ def add_layer(self, number_of_neurons):
             logits,
             name="hidden_layer-" + str(self.number_of_layers))
         if self._model.use_l2_loss:
-            self._model.l2_term = tf.add(
-                tf.add(self._model.l2_term, tf.nn.l2_loss(weights)),
-                tf.nn.l2_loss(bias))
+            self._model.l2_term = tf.add(self._model.l2_term, tf.nn.l2_loss(weights))
         if self._model.use_dropout:
             self._model.latest_layer = \
                 tf.nn.dropout(self._model.latest_layer,
@@ -133,31 +160,37 @@ def add_layer(self, number_of_neurons):
         return self
-    def add_output_layer(self):
+    def add_output_layer(self, output_size, secondary_output=False):
         """Adds an output layer, including error and optimisation functions.
         After this method no new layers should be added."""
         # Output layer
         # Feed the output of the previous layer to a sigmoid layer
         sigmoid_weights = tf.Variable(tf.random_normal(
-            [self._model.latest_layer.get_shape()[1].value, self._model.user_count],
+            [self._model.latest_layer.get_shape()[1].value, output_size],
             stddev=0.35,
-            dtype=tf.float64),
+            dtype=tf.float32),
             name="output_weights")
-        sigmoid_bias = tf.Variable(tf.random_normal([self._model.user_count],
+        sigmoid_bias = tf.Variable(tf.random_normal([output_size],
                                                     stddev=0.35,
-                                                    dtype=tf.float64),
+                                                    dtype=tf.float32),
                                    name="output_biases")
         logits = tf.add(tf.matmul(self._model.latest_layer, sigmoid_weights),
                         sigmoid_bias)
-        self._model.sigmoid = tf.nn.sigmoid(logits)
-        # Training
+        if secondary_output:
+            error = tf.nn.softmax_cross_entropy_with_logits(
+                labels=self._model.sec_target,
+                logits=logits)
+        else:
+            self._model.sigmoid = tf.nn.sigmoid(logits)
+            # Define error function
+            error = tf.nn.sigmoid_cross_entropy_with_logits(
+                labels=self._model.target,
+                logits=logits)
-        # Defne error function
-        error = tf.nn.sigmoid_cross_entropy_with_logits(labels=self._model.target,
-                                                        logits=logits)
+        # Training
         if self._model.use_l2_loss:
             cross_entropy = \
@@ -172,9 +205,13 @@ def add_output_layer(self):
         else:
             cross_entropy = tf.reduce_mean(error)
-        self._model.error = cross_entropy
-        self._model.train_op = tf.train.AdamOptimizer(
-            self._model.learning_rate).minimize(cross_entropy)
+        if secondary_output:
+            self._model.pre_train_op = tf.train.AdamOptimizer(
+                self._model.learning_rate).minimize(cross_entropy)
+        else:
+            self._model.error = cross_entropy
+            self._model.train_op = tf.train.AdamOptimizer(
+                self._model.learning_rate).minimize(cross_entropy)
         return self
@@ -267,15 +304,28 @@ def add_precision_operations(self):
     def build(self):
         """Adds saver and init operation and returns the model"""
+
+        # Add input layer
         self.add_input_layer()
         # Add a number of hidden layers
         for _ in range(self._model.hidden_layers):
             self.add_layer(self._model.hidden_neurons)
-        self.add_output_layer()
+        # Add output layer for pretraining, if used
+        if self._model.use_pretrained_net:
+            self.add_output_layer(self._model.subreddit_count, secondary_output=True)
+
+        # Add output layer for users
+        self.add_output_layer(self._model.user_count) \
+            .add_precision_operations()
-        self.add_precision_operations()
+        # Create the tensorboard summary writers
+        self._model.train_writer = \
+            tf.summary.FileWriter(self._model.logging_dir + '/' + TENSOR_DIR_TRAIN,
+                                  self._model._session.graph)
+        self._model.valid_writer = \
+            tf.summary.FileWriter(self._model.logging_dir + '/' + TENSOR_DIR_VALID)
         self._model.init_op = tf.group(tf.global_variables_initializer(),
                                        tf.local_variables_initializer())
diff --git a/model/util/config.py b/model/util/config.py
index c2cfbfc..6bdf4ca 100644
--- a/model/util/config.py
+++ b/model/util/config.py
@@ -29,6 +29,6 @@
 for conf in cfg['network']:
     for (key, value) in conf.items():
         if type(value) is float:
-            conf[key] = np.float64(value)
+            conf[key] = np.float32(value)
diff --git a/model/util/csv_reader.py b/model/util/csv_reader.py
index c5b7526..b6894c7 100644
--- a/model/util/csv_reader.py
+++ b/model/util/csv_reader.py
@@ -35,7 +35,6 @@ class Dataenum(Enum):
     TRAINING = "training_data"
     VALIDATION = "validation_data"
-
 class CsvReader:
     def __init__(self, netcfg):
         self.netcfg = netcfg
@@ -75,33 +74,31 @@ def get_data(self, datatype, data_column=[0], sub_column=1, label_column=2):
                 subreddit_full.append(subreddit)
         return data_full, subreddit_full, label_full
+    def load_pretrained_embeddings(self, file_name, dimension_size=50):
+        file_path = os.path.join(DATASETS_PATH, file_name)
+        matrix = [np.random.rand(dimension_size).astype(np.float32)]
+        word_dict = dict()
+        word_dict['UNK'] = 0
+        count = 1
-    def test_load_pretrained_embeddings(self, fileName, dimension_size=50):
-        file_path = os.path.join(DATASETS_PATH, fileName)
         with open(file_path, 'r', encoding='UTF-8') as csvfile:
             reader = csv.reader(csvfile, delimiter=' ', quoting=csv.QUOTE_NONE)
-            word_dict = dict()
-            matrix = []
-
-            word_dict['UNK'] = len(matrix)
-            matrix.append(np.random.rand(1, dimension_size)[0].tolist())
             for row in reader:
                 first_col = row[0]
-
                 if first_col in ['!', '?', '-', '_', '.', ',', '\'', '\"', ':', ';', '%', '(', ')']:
                     continue
-                if first_col[0] == '<': #some words are placeholder tokens, e.g. for usernames, ignore them.
+                # Some words are placeholder tokens starting with '<',
+                # e.g. for usernames; ignore them.
+                if first_col[0] == '<':
                     continue
-                word_dict[first_col] = len(matrix)
-                matrix.append(row[1:])
-            embed_matrix = np.array(matrix)
-            embed_matrix = embed_matrix.astype(np.float64)
-            return word_dict, embed_matrix
-
-
+                word_dict[first_col] = count
+                row_array = np.array(row[1:], dtype=np.float32)
+                matrix.append(row_array)
+                count += 1
+        return word_dict, np.array(matrix)
diff --git a/model/util/data.py b/model/util/data.py
index 4d9ac54..22bbc17 100644
--- a/model/util/data.py
+++ b/model/util/data.py
@@ -35,8 +35,10 @@ def __init__(self, networkconfig):
         self._current_train_index = 0
         self._current_valid_index = 0
         self._current_test_index = 0
+        self._current_pre_train_index = 0
         self.completed_training_epochs = 0
         self.percent_of_epoch = 0.0
+        self.subreddit_count = 0
         self.title_length = networkconfig['max_title_length']
         self.batch_size = self.netcfg['batch_size']
         self.reader = CsvReader(networkconfig)
@@ -56,6 +58,7 @@ def __init__(self, networkconfig):
     def _read_data(self):
         """ Reads all the data from specified path """
+
         logging.debug("Reading training data...")
         self.train_data, self.train_subreddits, self.train_labels = \
@@ -83,7 +86,7 @@ def _build_dict(self):
                 helper.build_dataset(vocab, vocabulary_size=self.vocabulary_size)
         else:
             self.word_dict, self.embedding_matrix = \
-                self.reader.test_load_pretrained_embeddings(
+                self.reader.load_pretrained_embeddings(
                     self.pre_trained_matrix, self.embedding_size)
         users = " ".join(self.train_labels).split()
@@ -92,6 +95,7 @@ def _build_dict(self):
         subreddits = " ".join(self.train_subreddits).split()
         self.subreddit_dict = helper.build_subreddit_dict(subreddits)
+        self.subreddit_count = len(self.subreddit_dict)
     def next_train_batch(self, batch_size=None):
         """ Get the next batch of training data """
@@ -119,8 +123,9 @@ def next_train_batch(self, batch_size=None):
             self.train_present += present
             self.train_absent += absent
-            subreddit_vec = helper.subreddit_index(subreddit,
-                                                   self.subreddit_dict)
+            subreddit_vec = helper.label_vector(subreddit,
+                                                self.subreddit_dict,
+                                                self.subreddit_count)
             label_vec = helper.label_vector(label.split(),
                                             self.users_dict,
                                             self.user_count)
@@ -131,6 +136,40 @@ def next_train_batch(self, batch_size=None):
         self.percent_of_epoch = self._current_train_index / self.train_size
         return batch_x, batch_sub, batch_y
+    def next_pre_train_batch(self, batch_size=None):
+        """ Get the next batch of pre-training data; the subreddit
+        vector serves as the target """
+        batch_size = batch_size or self.batch_size
+        batch_x = []
+        batch_y = []
+        batch_sub = []
+
+        for _ in range(0, batch_size):
+            sentence = self.train_data[self._current_pre_train_index]
+            subreddit = self.train_subreddits[self._current_pre_train_index]
+            label = self.train_labels[self._current_pre_train_index]
+            self._current_pre_train_index += 1
+            # Support multiple epochs
+            if self._current_pre_train_index >= self.train_size:
+                self._current_pre_train_index = 0
+
+            # Turn sentences and labels into vector representations
+            sentence_vec, present, absent = \
+                helper.get_indicies(sentence,
+                                    self.word_dict,
+                                    self.max_title_length)
+            self.train_present += present
+            self.train_absent += absent
+
+            subreddit_vec = helper.label_vector(subreddit,
+                                                self.subreddit_dict,
+                                                self.subreddit_count)
+            # The subreddit vector doubles as auxiliary input and target
+            batch_x.append(sentence_vec)
+            batch_y.append(subreddit_vec)
+            batch_sub.append(subreddit_vec)
+
+        return batch_x, batch_sub, batch_y
     def get_validation(self):
         """ Get the whole validation set in a vectorized form """
         old_ind = self._current_valid_index
@@ -165,8 +204,9 @@ def next_valid_batch(self, batch_size=None):
             self.valid_present += pres
             self.valid_absent += absent
-            subreddit_vec = helper.subreddit_index(subreddit,
-                                                   self.subreddit_dict)
+            subreddit_vec = helper.label_vector(subreddit,
+                                                self.subreddit_dict,
+                                                self.subreddit_count)
             label_vec = helper.label_vector(label.split(),
                                             self.users_dict,
                                             self.user_count)
@@ -205,8 +245,9 @@ def next_test_batch(self, batch_size=None):
                                     self.word_dict,
                                     self.max_title_length)
-            subreddit_vec = helper.subreddit_index(subreddit,
-                                                   self.subreddit_dict)
+            subreddit_vec = helper.label_vector(subreddit,
+                                                self.subreddit_dict,
+                                                self.subreddit_count)
             label_vec = helper.label_vector(label.split(),
                                             self.users_dict,
                                             self.user_count)
@@ -218,7 +259,7 @@ def next_test_batch(self, batch_size=None):
     def for_n_train_epochs(self, num_epochs=1, batch_size=25): # TODO Remove the parameters
         """ Calculates how many training iterations to do for num_epochs
-        number of epochs with a batch size of batch_size """
+        number of epochs with a batch size of batch_size"""
         return range((self.train_size * num_epochs) // batch_size)
     def get_training(self):
diff --git a/model/util/writer.py b/model/util/writer.py
index 09b8152..9db1ccd 100644
--- a/model/util/writer.py
+++ b/model/util/writer.py
@@ -33,10 +33,10 @@ def log_samefile(config, f1_score_valid, f1_score_train, epoch_top, prec_valid,
     if not os.path.exists(LOGS_DIR):
         raise FileNotFoundError('Can not write because no directory is created')
-    config_headers = [NET_TYPE, NET_NAME, VOC_SIZE, USER_COUNT, LEARN_RATE, EMBEDD_SIZE, MAX_TITLE_LENGTH, LSTM_NEURONS,
-                      HIDDEN_NEURONS, HIDDEN_LAYERS, USE_CONCAT_INPUT, BATCH_SIZE, TRAINING_EPOCHS, USE_L2_LOSS, L2_FACTOR, USE_DROPOUT,
+    config_headers = [NET_TYPE, NET_NAME, VOC_SIZE, USER_COUNT, LEARN_RATE, EMBEDD_SIZE, MAX_TITLE_LENGTH, RNN_NEURONS, RNN_UNIT,
+                      HIDDEN_NEURONS, HIDDEN_LAYERS, SUB_INPUT_NEURONS, USE_CONCAT_INPUT, BATCH_SIZE, TRAINING_EPOCHS, USE_L2_LOSS, L2_FACTOR, USE_DROPOUT,
                       DROPOUT_PROB, USE_CONSTANT_LIMIT, CONSTANT_PREDICTION_LIMIT, TRAINABLE_MATRIX,
-                      PRE_TRAINED_MATRIX, USE_PRETRAINED, VALIDATION_DATA, TRAINING_DATA, TESTING_DATA]
+                      PRE_TRAINED_MATRIX, USE_PRETRAINED, USE_PRETRAINED_NET, VALIDATION_DATA, TRAINING_DATA, TESTING_DATA]
     additional_headers = [F1_SCORE_TOP_VALID, F1_SCORE_TRAIN, EPOCH_WHEN_F1_TOP, PRECISION_VALIDATION,
                           PRECISION_TRAINING, RECALL_VALIDATION, RECALL_TRAINING, DATE]
diff --git a/template.py b/template.py
new file mode 100644
index 0000000..125beda
--- /dev/null
+++ b/template.py
@@ -0,0 +1,201 @@
+from random import choice
+from datetime import datetime
+
+template = """  - type: {0}
+    name: {1}
+    # Data related configs:
+    vocabulary_size: {2}
+    user_count: {3}
+    max_title_length: {4}
+    validation_data: '{5}'
+    training_data: '{6}'
+    testing_data: '{7}'
+    # Embedding matrix configs:
+    embedding_size: {8} # Make sure to match pretrained matrix dimensions
+    trainable_matrix: {9}
+    use_pretrained: {10}
+    pre_trained_matrix: '{11}'
+    # Learning configs:
+    learning_rate: {12}
+    training_epochs: {13}
+    batch_size: {14}
+    rnn_neurons: {15}
+    rnn_unit: '{16}' # Can be 'gru' or 'lstm', default: 'lstm'
+    hidden_layers: {17}
+    hidden_neurons: {18}
+    subreddit_input_neurons: {19}
+    use_concat_input: {20}
+    pre_train_subreddit: {21}
+    # Regularisation configs:
+    use_l2_loss: {22}
+    l2_factor: {23}
+    use_dropout: {24}
+    dropout_prob: {25}
+    # Validation configs:
+    use_constant_limit: {26}
+    constant_prediction_limit: {27}\n"""
+
+def rtype():
+    return "model-builder"
+
+def name():
+    name = "network-" + str(datetime.now()) + "-"
+    x = 0
+    while True:
+        yield name + str(x)
+        x += 1
+
+def vocabulary_size():
+    possibilities = ["10000", "12000", "15000", "17000", "19000"]
+    return choice(possibilities)
+
+def user_count():
+    return choice(["51", "6"])
+
+def max_title_length():
+    possibilities = ["20", "25", "30", "35", "40"]
+    return choice(possibilities)
+
+def data_set(nbr_of_users):
+    possibilities_5_users = [("validation_data_top_5_subreddit_allvotes.csv",
+                              "training_data_top_5_subreddit_allvotes.csv",
+                              "testing_data_top_5_subreddit_allvotes.csv")
+                             ]
+    possibilities_50_users = [("validation_data_top_50_subreddit_allvotes.csv",
+                               "training_data_top_50_subreddit_allvotes.csv",
+                               "testing_data_top_50_subreddit_allvotes.csv"),
+                              ("validation_data_top_50_subreddit.csv",
+                               "training_data_top_50_subreddit.csv",
+                               "testing_data_top_50_subreddit.csv")
+                              ]
+
+    val = None
+    train = None
+    test = None
+    if nbr_of_users == "6":
+        val, train, test = choice(possibilities_5_users)
+    elif nbr_of_users == "51":
+        val, train, test = choice(possibilities_50_users)
+    return val, train, test
+
+def embedding_size():
+    return choice(["100", "150", "300"])
+
+def trainable_matrix():
+    return choice(["true", "false"])
+
+def use_pretrained():
+    return choice(["true", "false"])
+
+def pre_trained_matrix(dim):
+    if dim == "100":
+        return "vectors.txt"
+    elif dim == "150":
+        return "vectors150d.txt"
+    elif dim == "300":
+        return "vectors300d.txt"
+
+def learning_rate():
+    possibilities = ["0.05", "0.1", "0.15", "0.2", "0.3", "0.35", "0.4", "0.5"]
+    return choice(possibilities)
+
+def training_epochs():
+    return choice(["50"])
+
+def batch_size():
+    possibilities = ["25", "50", "100"]
+    return choice(possibilities)
+
+def rnn_neurons():
+    possibilities = ["150", "200", "250", "300", "400"]
+    return choice(possibilities)
+
+def rnn_unit():
+    return choice(["gru", "lstm"])
+
+def subreddit_input_neurons():
+    return choice(["50", "100", "200"])
+
+def use_pretrain_subreddit():
+    return choice(["true", "false"])
+
+def hidden_layers():
+    return choice(["0", "1", "2", "3", "4", "5"])
+
+def hidden_neurons():
+    possibilities = ["150", "300", "450", "600", "750"]
+    return choice(possibilities)
+
+def use_concat_input():
+    return choice(["true", "false"])
+
+def use_l2_loss():
+    return choice(["true", "false"])
+
+def l2_factor():
+    possibilities = ["0.01", "0.05", "0.1"]
+    return choice(possibilities)
+
+def use_dropout():
+    return choice(["true", "false"])
+
+def dropout_prob():
+    possibilities = ["0.5", "0.75", "0.9"]
+    return choice(possibilities)
+
+def use_constant_limit():
+    return choice(["true", "false"])
+
+def constant_prediction_limit():
+    possibilities = ["0.2", "0.3", "0.4", "0.5", "0.6", "0.7", "0.8"]
+    return choice(possibilities)
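+
+# For reference, get_random_config() below fills the template above with one
+# random draw from each option list; e.g. (values purely illustrative):
+#   - type: model-builder
+#     name: network-<timestamp>-0
+#     rnn_unit: 'gru'
+#     rnn_neurons: 250
+#     pre_train_subreddit: true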
+
+
+name_generator = name()
+
+def get_random_config():
+    embed_size = embedding_size()
+    nbr_of_users = user_count()
+    val, train, test = data_set(nbr_of_users)
+    config = template.format(rtype(),
+                             next(name_generator),
+                             vocabulary_size(),
+                             nbr_of_users,
+                             max_title_length(),
+                             val,
+                             train,
+                             test,
+                             embed_size,
+                             trainable_matrix(),
+                             use_pretrained(),
+                             pre_trained_matrix(embed_size),
+                             learning_rate(),
+                             training_epochs(),
+                             batch_size(),
+                             rnn_neurons(),
+                             rnn_unit(),
+                             hidden_layers(),
+                             hidden_neurons(),
+                             subreddit_input_neurons(),
+                             use_concat_input(),
+                             use_pretrain_subreddit(),
+                             use_l2_loss(),
+                             l2_factor(),
+                             use_dropout(),
+                             dropout_prob(),
+                             use_constant_limit(),
+                             constant_prediction_limit())
+    return config
+
+nbr_of_configs = int(input("Enter number of configs to generate > "))
+configs = "\n".join(get_random_config() for _ in range(nbr_of_configs))
+
+header = """data:
+  path: 'resources/datasets/'
+
+network:\n"""
+
+with open("config.yaml", "w") as f:
+    f.write(header + configs)
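
Note on the subreddit encoding: the `model/util/data.py` hunks above swap `helper.subreddit_index(...)` for `helper.label_vector(...)`, so the subreddit fed to the network is now a vector of length `subreddit_count` rather than a single index, matching the resized `subreddit_input` placeholder. `helper.py` itself is not touched by this diff, so the sketch below is only a minimal assumption of the contract `label_vector` is expected to fulfil (the function body and the string/list handling are inferred, not confirmed):

```python
# Minimal sketch (assumption, not the project's actual helper.py):
# map one label or a list of labels onto a fixed-length multi-hot vector
# using a {label: index} dictionary such as subreddit_dict or users_dict.
def label_vector(labels, label_dict, vector_size):
    vector = [0.0] * vector_size
    if isinstance(labels, str):
        labels = [labels]  # data.py passes a single subreddit string
    for label in labels:
        index = label_dict.get(label)
        if index is not None:  # unknown labels are simply skipped
            vector[index] = 1.0
    return vector

# Example: with subreddit_dict = {'askreddit': 0, 'news': 1, 'funny': 2},
# label_vector('news', subreddit_dict, 3) returns [0.0, 1.0, 0.0], a one-hot
# row compatible with the [None, subreddit_count] placeholder.
```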