From 97f0ffbf77dac4d6a7ce28b41a469cfe4be3a9d4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20H=C3=A5kansson?= <al3xand3r.h@gmail.com>
Date: Thu, 13 Apr 2017 19:35:56 +0200
Subject: [PATCH] Convert all double-precision floating-point values to single
 precision to save memory

---
 model/model.py           |  2 +-
 model/model_builder.py   | 32 ++++++++++++++++----------------
 model/util/config.py     |  2 +-
 model/util/csv_reader.py | 30 ++++++++++++++----------------
 model/util/data.py       |  2 +-
 5 files changed, 33 insertions(+), 35 deletions(-)

diff --git a/model/model.py b/model/model.py
index 1f7f313..1358564 100644
--- a/model/model.py
+++ b/model/model.py
@@ -46,7 +46,7 @@ def __init__(self, config, session):
         self.latest_layer = None
         self.output_weights = None
         self.output_bias = None
-        self.l2_term = tf.constant(0, dtype=tf.float64)
+        self.l2_term = tf.constant(0, dtype=tf.float32)
 
         self.vocabulary_size = config[VOC_SIZE]
         self.user_count = config[USER_COUNT]
diff --git a/model/model_builder.py b/model/model_builder.py
index c05ae8b..0a62bc2 100644
--- a/model/model_builder.py
+++ b/model/model_builder.py
@@ -46,19 +46,19 @@ def add_input_layer(self):
                            [None, self._model.max_title_length],
                            name="input")
         self._model.subreddit_input = \
-            tf.placeholder(tf.float64,
+            tf.placeholder(tf.float32,
                            [None, self._model.subreddit_count],
                            name="subreddit_input")
         self._model.target = \
-            tf.placeholder(tf.float64,
+            tf.placeholder(tf.float32,
                            [None, self._model.user_count],
                            name="target")
         self._model.sec_target = \
-            tf.placeholder(tf.float64,
+            tf.placeholder(tf.float32,
                            [None, self._model.data.subreddit_count],
                            name="sec_target")
 
-        self._model.keep_prob = tf.placeholder(tf.float64, name="keep_prob")
+        self._model.keep_prob = tf.placeholder(tf.float32, name="keep_prob")
 
         if self._model.rnn_unit == 'lstm':
             rnn_layer = tf.contrib.rnn.LSTMCell(self._model.rnn_neurons)
@@ -73,13 +73,13 @@ def add_input_layer(self):
             tf.random_uniform(
                 [self._model.vocabulary_size,
                  self._model.embedding_size],
-                -1.0, 1.0, dtype=tf.float64),
+                -1.0, 1.0, dtype=tf.float32),
             trainable=self._model.is_trainable_matrix,
             name="embedding_matrix",
-            dtype=tf.float64)
+            dtype=tf.float32)
 
         self._model.embedding_placeholder = \
-            tf.placeholder(tf.float64,
+            tf.placeholder(tf.float32,
                            [self._model.vocabulary_size, self._model.embedding_size])
         self._model.embedding_init = \
             embedding_matrix.assign(self._model.embedding_placeholder)
@@ -88,7 +88,7 @@ def add_input_layer(self):
                                                 self._model.input)
         # Run the LSTM layer with the embedded input
         outputs, _ = tf.nn.dynamic_rnn(rnn_layer, embedded_input,
-                                       dtype=tf.float64)
+                                       dtype=tf.float32)
 
         outputs = tf.transpose(outputs, [1, 0, 2])
         output = outputs[-1]
@@ -98,13 +98,13 @@ def add_input_layer(self):
                     [self._model.subreddit_count,
                      self._model.subreddit_input_neurons],
                     stddev=0.35,
-                    dtype=tf.float64),
+                    dtype=tf.float32),
                 name="sub_input_weights")
 
             subreddit_bias = tf.Variable(tf.random_normal(
                     [self._model.subreddit_input_neurons],
                     stddev=0.35,
-                    dtype=tf.float64),
+                    dtype=tf.float32),
                 name="sub_input_bias")
 
             logit_subreddit = tf.add(
@@ -128,22 +128,22 @@ def add_layer(self, number_of_neurons):
                   else 0),
                  number_of_neurons],
                 stddev=0.35,
-                dtype=tf.float64),
+                dtype=tf.float32),
                                   name="weights" + str(self.number_of_layers))
             bias = tf.Variable(tf.random_normal([number_of_neurons],
                                                 stddev=0.35,
-                                                dtype=tf.float64),
+                                                dtype=tf.float32),
                                name="biases" + str(self.number_of_layers))
 
         else:
             weights = tf.Variable(tf.random_normal(
                 [self._model.latest_layer.get_shape()[1].value, number_of_neurons],
                 stddev=0.35,
-                dtype=tf.float64),
+                dtype=tf.float32),
                                   name="weights" + str(self.number_of_layers))
             bias = tf.Variable(tf.random_normal([number_of_neurons],
                                                 stddev=0.35,
-                                                dtype=tf.float64),
+                                                dtype=tf.float32),
                                name="biases" + str(self.number_of_layers))
 
         logits = tf.add(tf.matmul(self._model.latest_layer, weights), bias)
@@ -169,12 +169,12 @@ def add_output_layer(self, output_size, secondary_output=False):
         sigmoid_weights = tf.Variable(tf.random_normal(
             [self._model.latest_layer.get_shape()[1].value, output_size],
             stddev=0.35,
-            dtype=tf.float64),
+            dtype=tf.float32),
                                       name="output_weights")
 
         sigmoid_bias = tf.Variable(tf.random_normal([output_size],
                                                     stddev=0.35,
-                                                    dtype=tf.float64),
+                                                    dtype=tf.float32),
                                    name="output_biases")
 
         logits = tf.add(tf.matmul(self._model.latest_layer, sigmoid_weights), sigmoid_bias)
diff --git a/model/util/config.py b/model/util/config.py
index c2cfbfc..6bdf4ca 100644
--- a/model/util/config.py
+++ b/model/util/config.py
@@ -29,6 +29,6 @@
     for conf in cfg['network']:
         for (key, value) in conf.items():
             if type(value) is float:
-                conf[key] = np.float64(value)
+                conf[key] = np.float32(value)
 
 
diff --git a/model/util/csv_reader.py b/model/util/csv_reader.py
index 17c8357..b6894c7 100644
--- a/model/util/csv_reader.py
+++ b/model/util/csv_reader.py
@@ -74,33 +74,31 @@ def get_data(self, datatype, data_column=[0], sub_column=1, label_column=2):
                 subreddit_full.append(subreddit)
             return data_full, subreddit_full, label_full
 
+    def load_pretrained_embeddings(self, file_name, dimension_size=50):
+        file_path = os.path.join(DATASETS_PATH, file_name)
+        matrix = [np.random.rand(dimension_size).astype(np.float32)]
+        word_dict = dict()
+        word_dict['UNK'] = 0
+        count = 1
 
-    def test_load_pretrained_embeddings(self, fileName, dimension_size=50):
-        file_path = os.path.join(DATASETS_PATH, fileName)
         with open(file_path, 'r', encoding='UTF-8') as csvfile:
             reader = csv.reader(csvfile, delimiter=' ', quoting=csv.QUOTE_NONE)
-            word_dict = dict()
-            matrix = []
-
-            word_dict['UNK'] = len(matrix)
-            matrix.append(np.random.rand(1, dimension_size)[0].tolist())
 
             for row in reader:
                 first_col = row[0]
-
                 if first_col in ['!', '?', '-', '_', '.', ',', '\'', '\"', ':', ';', '%', '(', ')']:
                     continue
 
-                if first_col[0] == '<': #some words are tokens for usernames like <user> or <caps> etc, ignore them.
+                # Some entries are placeholder tokens such as <user> or
+                # <caps> rather than real words; skip them.
+                if first_col[0] == '<':
                     continue
-                word_dict[first_col] = len(matrix)
-                matrix.append(row[1:])
-        embed_matrix = np.array(matrix)
-        embed_matrix = embed_matrix.astype(np.float64)
-        return word_dict, embed_matrix
-
-
 
+                word_dict[first_col] = count
+                row_array = np.array(row[1:], dtype=np.float32)
+                matrix.append(row_array)
+                count += 1
+        return word_dict, np.array(matrix)
 
 
 
diff --git a/model/util/data.py b/model/util/data.py
index 14d755c..22bbc17 100644
--- a/model/util/data.py
+++ b/model/util/data.py
@@ -86,7 +86,7 @@ def _build_dict(self):
                 helper.build_dataset(vocab, vocabulary_size=self.vocabulary_size)
         else:
             self.word_dict, self.embedding_matrix = \
-                self.reader.test_load_pretrained_embeddings(
+                self.reader.load_pretrained_embeddings(
                     self.pre_trained_matrix,
                     self.embedding_size)
         users = " ".join(self.train_labels).split()