apache
diff --git a/‎ci/docker/runtime_functions.sh‎
Lines changed: 4 additions & 2 deletions b/‎ci/docker/runtime_functions.sh‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎python/mxnet/kvstore/kvstore.py‎
Lines changed: 4 additions & 1 deletion b/‎python/mxnet/kvstore/kvstore.py‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎src/kvstore/gradient_compression-inl.h‎
Lines changed: 120 additions & 19 deletions b/‎src/kvstore/gradient_compression-inl.h‎
Lines changed: 120 additions & 19 deletions
diff --git a/‎src/kvstore/gradient_compression.cc‎
Lines changed: 62 additions & 18 deletions b/‎src/kvstore/gradient_compression.cc‎
Lines changed: 62 additions & 18 deletions
diff --git a/‎src/kvstore/gradient_compression.cu‎
Lines changed: 10 additions & 0 deletions b/‎src/kvstore/gradient_compression.cu‎
Lines changed: 10 additions & 0 deletions
@@ -1315,8 +1315,10 @@ integrationtest_ubuntu_cpu_dist_kvstore() {
     python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=gluon_type_cpu
     python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py
     python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --no-multiprecision
-    python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=compressed_cpu
-    python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=compressed_cpu --no-multiprecision
+    python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=compressed_cpu_1bit
+    python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=compressed_cpu_1bit --no-multiprecision
+    python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=compressed_cpu_2bit
+    python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=compressed_cpu_2bit --no-multiprecision
     python3 ../../tools/launch.py -n 3 --launcher local python3 test_server_profiling.py
     popd
 }
 
@@ -498,6 +498,9 @@ def set_gradient_compression(self, compression_params):
         """ Specifies type of low-bit quantization for gradient compression \
          and additional arguments depending on the type of compression being used.
 
+        The 1bit compression works as follows: values in the gradient which are above the
+        threshold will be set to +1, whereas values at or below the threshold will be set to -1.
+
         2bit Gradient Compression takes a positive float `threshold`.
         The technique works by thresholding values such that positive values in the
         gradient above threshold will be set to threshold. Negative values whose absolute
@@ -538,7 +541,7 @@ def set_gradient_compression(self, compression_params):
             A dictionary specifying the type and parameters for gradient compression.
             The key `type` in this dictionary is a
             required string argument and specifies the type of gradient compression.
-            Currently `type` can be only `2bit`
+            Currently `type` can be either `1bit` or `2bit`
             Other keys in this dictionary are optional and specific to the type
             of gradient compression.
         """
 
@@ -32,48 +32,137 @@ namespace mxnet {
 namespace kvstore {
 
 // these gpu functions are defined in gradient_compression.cu
+void Quantize1BitImpl(mshadow::Stream<mshadow::gpu> *s, const std::vector<mxnet::TBlob> &inputs,
+                      const float threshold);
+void Dequantize1BitImpl(mshadow::Stream<mshadow::gpu> *s, const std::vector<mxnet::TBlob> &inputs,
+                        const float threshold);
 void Quantize2BitImpl(mshadow::Stream<mshadow::gpu> *s, const std::vector<mxnet::TBlob> &inputs,
                       const float threshold);
 void Dequantize2BitImpl(mshadow::Stream<mshadow::gpu> *s, const std::vector<mxnet::TBlob> &inputs,
                         const float threshold);
 
+struct quantize_1bit {
+  MSHADOW_XINLINE static void Map(int out_byte_id,
+                                  int original_size,
+                                  float *out,
+                                  float *grad,
+                                  float *residual,
+                                  const float threshold) {
+    // the byte at (char*)out + out_byte_id holds the compressed representation
+    // of up to 8 values, starting at index (out_byte_id * 8) of the original array
+    char *compr_byte = reinterpret_cast<char *>(out) + out_byte_id;
+
+    // init to 0
+    *compr_byte = 0;
+    // start and end are indices in original grad array
+    const int start = out_byte_id << 3;
+    const int end = (start + 8 <= original_size) ? start + 8 : original_size;
+
+    // masks used to quantize data
+    const uint8_t bits[] = {0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01};
+    for (int i = start; i < end; ++i) {
+      // adds gradient to existing residual to get updated grad
+      residual[i] += grad[i];
+      if (residual[i] > threshold) {
+        // set data to 1
+        *compr_byte |= bits[(i & 7)];
+        // reduce residual by 1
+        residual[i] -= 1;
+      } else {
+        // leave the bit in compr_byte untouched because the byte was initialized to 0
+        // increase residual by 1
+        // because the current position will be dequantized to -1
+        residual[i] += 1;
+      }
+    }
+  }
+};
+
+template<typename xpu>
+void Quantize1BitKernelLaunch(mshadow::Stream<xpu> *s, const std::vector<mxnet::TBlob> &inputs,
+                              const float threshold) {
+  mxnet::op::mxnet_op::Kernel<quantize_1bit, xpu>
+    ::Launch(s,
+            inputs[2].Size() * 4,         // compressed array byte size
+            inputs[0].Size(),             // original size
+            inputs[2].dptr<float>(),      // compressed array
+            inputs[0].dptr<float>(),      // original array
+            inputs[1].dptr<float>(),      // residual array
+            threshold);                   // threshold
+}
+
+struct dequantize_1bit {
+  MSHADOW_XINLINE static void Map(int i,
+                                  float *out,
+                                  float *in,
+                                  const float threshold) {
+    // get position of dequantized value to fill
+    float *outval = out + i;
+    // gets byte which holds quantized value for this position
+    char *ch_ptr = reinterpret_cast < char * > (in + (i >> 5));
+    ch_ptr += ((i & 31) >> 3);
+    // masks used to extract the quantized bit for each position within a byte
+    const uint8_t bits[] = {0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01};
+    // col denotes which bit of a byte is set for this value
+    // col=0 implies the first bit, col=1 implies the second bit,...
+    const int col = i & 7;
+    const uint8_t mask = bits[col];
+    const uint8_t masked = *ch_ptr & mask;
+    if (masked == mask) {
+      *outval = +1;
+    } else {
+      // if the bit at the current position of the byte is 0,
+      // dequantize it to -1
+      *outval = -1;
+    }
+  }
+};
+
+template<typename xpu>
+void Dequantize1BitKernelLaunch(mshadow::Stream<xpu> *s, const std::vector<mxnet::TBlob> &inputs,
+                                const float threshold) {
+  mxnet::op::mxnet_op::Kernel<dequantize_1bit, xpu>
+  ::Launch(s,
+          inputs[1].Size(),         // original size
+          inputs[1].dptr<float>(),  // out array
+          inputs[0].dptr<float>(),  // compressed array
+          threshold);               // threshold
+}
+
 struct quantize_2bit {
-  MSHADOW_XINLINE static void Map(int out_block_id,
+  MSHADOW_XINLINE static void Map(int out_byte_id,
                                   int original_size,
                                   float *out,
                                   float *grad,
                                   float *residual,
                                   const float neg_threshold,
                                   const float pos_threshold) {
     // this block contains the compressed representation of
-    // upto 16 values starting from out_block_id*16
-    float *compr_block = out + out_block_id;
+    // up to 4 values starting from (char*)out + out_byte_id
+    char *compr_byte = reinterpret_cast<char *>(out) + out_byte_id;
     // init to 0
-    *compr_block = 0;
+    *compr_byte = 0;
     // start and end are indices in original grad array
-    const int start = out_block_id << 4;
-    const int end = (start + 16 <= original_size) ? start + 16 : original_size;
-    // cast as char* to manipulate bits of float addresses
-    char *block_ptr = reinterpret_cast < char * > (compr_block);
+    const int start = out_byte_id << 2;
+    const int end = (start + 4 <= original_size) ? start + 4 : original_size;
+
     // masks to set bits when value meets pos_threshold
     // 0xc0 is mask when value is to be represented by the first two bits in a char*
     // 0xc0 means first two bits are set to 11
     const uint8_t posbits[] = {0xc0, 0x30, 0x0c, 0x03};
     // masks to set bits when value meets neg_threshold
     const uint8_t negbits[] = {0x80, 0x20, 0x08, 0x02};
     for (int i = start; i < end; i++) {
-      // adds offset to reach appropriate byte
-      char *curr_byte = block_ptr + ((i - start) >> 2);
       // adds gradient to existing residual to get updated grad
       residual[i] += grad[i];
       if (residual[i] >= pos_threshold) {
         // set data to 11
-        *curr_byte |= posbits[(i & 3)];
+        *compr_byte |= posbits[(i & 3)];
         // reduce residual by pos_threshold
         residual[i] -= pos_threshold;
       } else if (residual[i] <= neg_threshold) {
         // set data to 10
-        *curr_byte |= negbits[(i & 3)];
+        *compr_byte |= negbits[(i & 3)];
         residual[i] -= neg_threshold;
       }
     }
@@ -85,13 +174,13 @@ void Quantize2BitKernelLaunch(mshadow::Stream<xpu> *s, const std::vector<mxnet::
                               const float threshold) {
   mxnet::op::mxnet_op::Kernel<quantize_2bit, xpu>
     ::Launch(s,
-            inputs[2].Size(),         // compressed array size
-            inputs[0].Size(),         // original size
-            inputs[2].dptr<float>(),  // compressed array
-            inputs[0].dptr<float>(),  // original array
-            inputs[1].dptr<float>(),  // residual array
-            -1 *threshold,            // negative threshold
-            threshold);               // positive threshold
+            inputs[2].Size() * 4,         // compressed array byte size
+            inputs[0].Size(),             // original size
+            inputs[2].dptr<float>(),      // compressed array
+            inputs[0].dptr<float>(),      // original array
+            inputs[1].dptr<float>(),      // residual array
+            -1 *threshold,                // negative threshold
+            threshold);                   // positive threshold
 }
 
 struct dequantize_2bit {
@@ -138,6 +227,18 @@ void Dequantize2BitKernelLaunch(mshadow::Stream<xpu> *s, const std::vector<mxnet
           threshold);               // positive threshold
 }
 
+inline void Quantize1BitImpl(mshadow::Stream<mshadow::cpu> *s,
+                             const std::vector<mxnet::TBlob> &inputs,
+                             const float threshold) {
+  Quantize1BitKernelLaunch(s, inputs, threshold);
+}
+
+inline void Dequantize1BitImpl(mshadow::Stream<mshadow::cpu> *s,
+                               const std::vector<mxnet::TBlob> &inputs,
+                               const float threshold) {
+  Dequantize1BitKernelLaunch(s, inputs, threshold);
+}
+
 inline void Quantize2BitImpl(mshadow::Stream<mshadow::cpu> *s,
                              const std::vector<mxnet::TBlob> &inputs,
                              const float threshold) {
 
@@ -41,8 +41,10 @@ void GradientCompression::SetParams(const std::vector<std::pair<std::string, std
                                     & kwargs) {
   GradientCompressionParam params;
   params.InitAllowUnknown(kwargs);
-  CHECK_GT(params.threshold, 0) << "threshold must be greater than 0";
-  if (params.type == "2bit") {
+  if (params.type == "1bit") {
+    SetOneBitCompression(params.threshold);
+  } else if (params.type == "2bit") {
+    CHECK_GT(params.threshold, 0) << "threshold must be greater than 0 for two bit compression";
     SetTwoBitCompression(params.threshold);
   } else {
     LOG(FATAL) << "Unknown type for gradient compression " << params.type;
@@ -57,6 +59,11 @@ std::string GradientCompression::get_type_str() {
   return std::to_string(static_cast<int>(type_));
 }
 
+void GradientCompression::SetOneBitCompression(const float threshold) {
+  type_ = CompressionType::kOneBit;
+  threshold_ = threshold;
+}
+
 void GradientCompression::SetTwoBitCompression(const float threshold) {
   type_ = CompressionType::kTwoBit;
   threshold_ = threshold;
@@ -83,7 +90,9 @@ void GradientCompression::DecodeParams(const std::string &s) {
 }
 
 int GradientCompression::GetCompressionFactor() {
-  if (type_ == CompressionType::kTwoBit) {
+  if (type_ == CompressionType::kOneBit) {
+    return 32;
+  } else if (type_ == CompressionType::kTwoBit) {
     return 16;
   } else {
     LOG(FATAL) << "Unsupported compression type: " << get_type_str();
@@ -106,16 +115,34 @@ void GradientCompression::Quantize(const mxnet::NDArray &from, mxnet::NDArray *t
   const int a = from.ctx().dev_mask();
   const int b = to->ctx().dev_mask();
   const float threshold = threshold_;
-  if (type_ == CompressionType::kTwoBit) {
-    if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) {
+  if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) {
+    if (type_ == CompressionType::kOneBit) {
+      mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) {
+        std::vector<mxnet::TBlob> inputs = {from.data(), residual->data(), to->data()};
+        Quantize1BitImpl(ctx.get_stream<mshadow::cpu>(), inputs, threshold);
+      }, from.ctx(), {from.var()}, {to->var(), residual->var()},
+      mxnet::FnProperty::kNormal, priority, "QuantizeCPU");
+    } else if (type_ == CompressionType::kTwoBit) {
       mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) {
         std::vector<mxnet::TBlob> inputs = {from.data(), residual->data(), to->data()};
         Quantize2BitImpl(ctx.get_stream<mshadow::cpu>(), inputs, threshold);
       }, from.ctx(), {from.var()}, {to->var(), residual->var()},
       mxnet::FnProperty::kNormal, priority, "QuantizeCPU");
     } else {
+      LOG(FATAL) << "Unsupported quantization of type " << get_type_str();
+    }
+  } else {
+    if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) {
 #if MXNET_USE_CUDA
-      if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) {
+      if (type_ == CompressionType::kOneBit) {
+        mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) {
+          std::vector<mxnet::TBlob> inputs = {from.data(), residual->data(), to->data()};
+          Quantize1BitImpl(ctx.get_stream<mshadow::gpu>(), inputs, threshold);
+          // Wait for the GPU kernel to complete
+          ctx.get_stream<mshadow::gpu>()->Wait();
+        }, from.ctx(), {from.var()}, {to->var(), residual->var()},
+        mxnet::FnProperty::kNormal, priority, "QuantizeGPU");
+      } else if (type_ == CompressionType::kTwoBit) {
         mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) {
           std::vector<mxnet::TBlob> inputs = {from.data(), residual->data(), to->data()};
           Quantize2BitImpl(ctx.get_stream<mshadow::gpu>(), inputs, threshold);
@@ -124,14 +151,14 @@ void GradientCompression::Quantize(const mxnet::NDArray &from, mxnet::NDArray *t
         }, from.ctx(), {from.var()}, {to->var(), residual->var()},
         mxnet::FnProperty::kNormal, priority, "QuantizeGPU");
       } else {
-        LOG(FATAL) << "unknown device mask";
+        LOG(FATAL) << "Unsupported quantization of type " << get_type_str();
       }
 #else
     LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
 #endif
+    } else {
+      LOG(FATAL) << "Unknown device mask, from device mask " << a << " to device mask " << b;
     }
-  } else {
-    LOG(FATAL) << "Unsupported quantization of type " << get_type_str();
   }
 }
 
@@ -142,35 +169,52 @@ void GradientCompression::Dequantize(const mxnet::NDArray &from, mxnet::NDArray
   const int a = from.ctx().dev_mask();
   const int b = to->ctx().dev_mask();
   const float threshold = threshold_;
-  if (type_ == CompressionType::kTwoBit) {
-    if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) {
+  if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) {
+    if (type_ == CompressionType::kOneBit) {
+      mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) {
+        std::vector<mxnet::TBlob> inputs = {from.data(), to->data()};
+        Dequantize1BitImpl(ctx.get_stream<mshadow::cpu>(), inputs, threshold);
+      }, from.ctx(), {from.var()}, {to->var()},
+      mxnet::FnProperty::kNormal, priority, "DequantizeCPU");
+    } else if (type_ == CompressionType::kTwoBit) {
       mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) {
         std::vector<mxnet::TBlob> inputs = {from.data(), to->data()};
         Dequantize2BitImpl(ctx.get_stream<mshadow::cpu>(), inputs, threshold);
       }, from.ctx(), {from.var()}, {to->var()},
       mxnet::FnProperty::kNormal, priority, "DequantizeCPU");
     } else {
+      LOG(FATAL) << "Unsupported dequantization of type " << get_type_str();
+    }
+  } else {
+    if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) {
 #if MXNET_USE_CUDA
-      if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) {
+      if (type_ == CompressionType::kOneBit) {
         mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) {
           std::vector<mxnet::TBlob> inputs = {from.data(), to->data()};
-          Dequantize2BitImpl(ctx.get_stream<mshadow::gpu>(), inputs, threshold);
+          Dequantize1BitImpl(ctx.get_stream<mshadow::gpu>(), inputs, threshold);
           // Wait GPU kernel to complete
           ctx.get_stream<mshadow::gpu>()->Wait();
         }, from.ctx(), {from.var()}, {to->var()},
         mxnet::FnProperty::kNormal, priority, "DequantizeGPU");
+      } else if (type_ == CompressionType::kTwoBit) {
+        mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) {
+          std::vector<mxnet::TBlob> inputs = {from.data(), to->data()};
+          Dequantize2BitImpl(ctx.get_stream<mshadow::gpu>(), inputs, threshold);
+          // Wait for the GPU kernel to complete
+          ctx.get_stream<mshadow::gpu>()->Wait();
+        }, from.ctx(), {from.var()}, {to->var()},
+        mxnet::FnProperty::kNormal, priority, "DequantizeGPU");
       } else {
-        LOG(FATAL) << "unknown device mask";
+        LOG(FATAL) << "Unsupported dequantization of type " << get_type_str();
       }
 #else
-      LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+    LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
 #endif
+    } else {
+      LOG(FATAL) << "Unknown device mask, from device mask " << a << " to device mask " << b;
     }
-  } else {
-    LOG(FATAL) << "Unsupported dequantization of type " << get_type_str();
   }
 }
-
 }  // namespace kvstore
 }  // namespace mxnet
 
@@ -27,6 +27,16 @@
 
 namespace mxnet {
 namespace kvstore {
+void Quantize1BitImpl(mshadow::Stream<gpu>* s, const std::vector<TBlob>& inputs,
+                      const float threshold) {
+  Quantize1BitKernelLaunch(s, inputs, threshold);
+}
+
+void Dequantize1BitImpl(mshadow::Stream<gpu>* s, const std::vector<TBlob>& inputs,
+                        const float threshold) {
+  Dequantize1BitKernelLaunch(s, inputs, threshold);
+}
+
 void Quantize2BitImpl(mshadow::Stream<gpu>* s, const std::vector<TBlob>& inputs,
                       const float threshold) {
   Quantize2BitKernelLaunch(s, inputs, threshold);