Skip to content
This repository was archived by the owner on Nov 17, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions ci/docker/runtime_functions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1315,8 +1315,10 @@ integrationtest_ubuntu_cpu_dist_kvstore() {
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=gluon_type_cpu
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --no-multiprecision
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=compressed_cpu
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=compressed_cpu --no-multiprecision
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=compressed_cpu_1bit
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=compressed_cpu_1bit --no-multiprecision
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=compressed_cpu_2bit
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=compressed_cpu_2bit --no-multiprecision
python3 ../../tools/launch.py -n 3 --launcher local python3 test_server_profiling.py
popd
}
Expand Down
5 changes: 4 additions & 1 deletion python/mxnet/kvstore/kvstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -498,6 +498,9 @@ def set_gradient_compression(self, compression_params):
""" Specifies type of low-bit quantization for gradient compression \
and additional arguments depending on the type of compression being used.

The 1bit compression works as follows: values in the gradient which are above the
threshold will be set to +1, whereas values at or below the threshold will be set to -1.

2bit Gradient Compression takes a positive float `threshold`.
The technique works by thresholding values such that positive values in the
gradient above threshold will be set to threshold. Negative values whose absolute
Expand Down Expand Up @@ -538,7 +541,7 @@ def set_gradient_compression(self, compression_params):
A dictionary specifying the type and parameters for gradient compression.
The key `type` in this dictionary is a
required string argument and specifies the type of gradient compression.
Currently `type` can be only `2bit`
Currently `type` can be either `1bit` or `2bit`.
Other keys in this dictionary are optional and specific to the type
of gradient compression.
"""
Expand Down
139 changes: 120 additions & 19 deletions src/kvstore/gradient_compression-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,48 +32,137 @@ namespace mxnet {
namespace kvstore {

// these gpu functions are defined in gradient_compression.cu
// Each one takes the stream to run on, the blobs to operate on and the
// compression threshold.
// NOTE(review): callers appear to pass inputs as {gradient, residual, compressed}
// for the Quantize variants and {compressed, output} for the Dequantize
// variants - confirm against GradientCompression::Quantize/Dequantize.

// 1bit: pack / unpack one bit per gradient value
void Quantize1BitImpl(mshadow::Stream<mshadow::gpu> *s, const std::vector<mxnet::TBlob> &inputs,
const float threshold);
void Dequantize1BitImpl(mshadow::Stream<mshadow::gpu> *s, const std::vector<mxnet::TBlob> &inputs,
const float threshold);
// 2bit: pack / unpack two bits per gradient value
void Quantize2BitImpl(mshadow::Stream<mshadow::gpu> *s, const std::vector<mxnet::TBlob> &inputs,
const float threshold);
void Dequantize2BitImpl(mshadow::Stream<mshadow::gpu> *s, const std::vector<mxnet::TBlob> &inputs,
const float threshold);

/*!
 * \brief Kernel functor that packs gradients into one bit per value.
 *
 * One invocation of Map fills exactly one byte of the compressed buffer,
 * which covers up to eight consecutive gradient elements. The residual array
 * is updated in place so that the quantization error is carried over to the
 * next call.
 */
struct quantize_1bit {
  /*!
   * \param out_byte_id index of the output byte, counted from the start of out
   * \param original_size number of elements in grad and residual
   * \param out compressed buffer, addressed byte-wise here
   * \param grad gradient values to compress
   * \param residual accumulated error, read and written in place
   * \param threshold accumulated values strictly greater than this get a set bit
   */
  MSHADOW_XINLINE static void Map(int out_byte_id,
                                  int original_size,
                                  float *out,
                                  float *grad,
                                  float *residual,
                                  const float threshold) {
    // the single byte this invocation is responsible for; cleared before use
    char *dst = reinterpret_cast<char *>(out) + out_byte_id;
    *dst = 0;

    // [first, last) is the slice of the original array packed into this byte;
    // the last byte may hold fewer than eight values
    const int first = out_byte_id << 3;
    int last = first + 8;
    if (last > original_size) {
      last = original_size;
    }

    for (int pos = first; pos < last; ++pos) {
      // fold the new gradient into the carried-over error
      residual[pos] += grad[pos];
      if (residual[pos] > threshold) {
        // element (pos & 7) of the byte maps to bit 0x80 >> (pos & 7),
        // i.e. the first element occupies the most significant bit
        *dst |= (0x80 >> (pos & 7));
        // a set bit is dequantized to +1, so remove that from the error
        residual[pos] -= 1;
      } else {
        // a cleared bit is dequantized to -1, so add that back to the error
        residual[pos] += 1;
      }
    }
  }
};

/*!
 * \brief Launches the quantize_1bit kernel, one work item per compressed byte.
 * \param s stream to launch the kernel on
 * \param inputs blobs in the order {original array, residual array, compressed array}
 * \param threshold threshold forwarded to the kernel
 */
template<typename xpu>
void Quantize1BitKernelLaunch(mshadow::Stream<xpu> *s, const std::vector<mxnet::TBlob> &inputs,
                              const float threshold) {
  const mxnet::TBlob &original = inputs[0];
  const mxnet::TBlob &residual = inputs[1];
  const mxnet::TBlob &compressed = inputs[2];

  using QuantizeKernel = mxnet::op::mxnet_op::Kernel<quantize_1bit, xpu>;
  QuantizeKernel::Launch(s,
                         compressed.Size() * 4,      // compressed array byte size
                         original.Size(),            // original size
                         compressed.dptr<float>(),   // compressed array
                         original.dptr<float>(),     // original array
                         residual.dptr<float>(),     // residual array
                         threshold);                 // threshold
}

/*!
 * \brief Kernel functor that expands 1bit-compressed data back to floats.
 *
 * One invocation of Map produces a single output element: +1 when the
 * corresponding bit is set, -1 otherwise.
 */
struct dequantize_1bit {
  /*!
   * \param i index of the output element to fill
   * \param out destination array of dequantized values
   * \param in compressed buffer, read byte-wise here
   * \param threshold not used by this kernel
   */
  MSHADOW_XINLINE static void Map(int i,
                                  float *out,
                                  float *in,
                                  const float threshold) {
    // eight values are stored per byte, so element i lives in byte i / 8
    const char *packed_bytes = reinterpret_cast<const char *>(in);
    const char packed = packed_bytes[i >> 3];
    // within a byte the first value sits in the most significant bit
    const uint8_t mask = static_cast<uint8_t>(0x80 >> (i & 7));
    const bool bit_is_set = (packed & mask) != 0;
    // a set bit decodes to +1, a cleared bit to -1
    out[i] = bit_is_set ? 1.0f : -1.0f;
  }
};

/*!
 * \brief Launches the dequantize_1bit kernel, one work item per output element.
 * \param s stream to launch the kernel on
 * \param inputs blobs in the order {compressed array, out array}
 * \param threshold threshold forwarded to the kernel
 */
template<typename xpu>
void Dequantize1BitKernelLaunch(mshadow::Stream<xpu> *s, const std::vector<mxnet::TBlob> &inputs,
                                const float threshold) {
  const mxnet::TBlob &compressed = inputs[0];
  const mxnet::TBlob &restored = inputs[1];

  using DequantizeKernel = mxnet::op::mxnet_op::Kernel<dequantize_1bit, xpu>;
  DequantizeKernel::Launch(s,
                           restored.Size(),            // original size
                           restored.dptr<float>(),     // out array
                           compressed.dptr<float>(),   // compressed array
                           threshold);                 // threshold
}

struct quantize_2bit {
MSHADOW_XINLINE static void Map(int out_block_id,
MSHADOW_XINLINE static void Map(int out_byte_id,
int original_size,
float *out,
float *grad,
float *residual,
const float neg_threshold,
const float pos_threshold) {
// this block contains the compressed representation of
// upto 16 values starting from out_block_id*16
float *compr_block = out + out_block_id;
// upto 4 values starting from (char*)out + out_byte_id
char *compr_byte = reinterpret_cast<char *>(out) + out_byte_id;
// init to 0
*compr_block = 0;
*compr_byte = 0;
// start and end are indices in original grad array
const int start = out_block_id << 4;
const int end = (start + 16 <= original_size) ? start + 16 : original_size;
// cast as char* to manipulate bits of float addresses
char *block_ptr = reinterpret_cast < char * > (compr_block);
const int start = out_byte_id << 2;
const int end = (start + 4 <= original_size) ? start + 4 : original_size;

// masks to set bits when value meets pos_threshold
// 0xc0 is mask when value is to be represented by the first two bits in a char*
// 0xc0 means first two bits are set to 11
const uint8_t posbits[] = {0xc0, 0x30, 0x0c, 0x03};
// masks to set bits when value meets neg_threshold
const uint8_t negbits[] = {0x80, 0x20, 0x08, 0x02};
for (int i = start; i < end; i++) {
// adds offset to reach appropriate byte
char *curr_byte = block_ptr + ((i - start) >> 2);
// adds gradient to existing residual to get updated grad
residual[i] += grad[i];
if (residual[i] >= pos_threshold) {
// set data to 11
*curr_byte |= posbits[(i & 3)];
*compr_byte |= posbits[(i & 3)];
// reduce residual by pos_threshold
residual[i] -= pos_threshold;
} else if (residual[i] <= neg_threshold) {
// set data to 10
*curr_byte |= negbits[(i & 3)];
*compr_byte |= negbits[(i & 3)];
residual[i] -= neg_threshold;
}
}
Expand All @@ -85,13 +174,13 @@ void Quantize2BitKernelLaunch(mshadow::Stream<xpu> *s, const std::vector<mxnet::
const float threshold) {
mxnet::op::mxnet_op::Kernel<quantize_2bit, xpu>
::Launch(s,
inputs[2].Size(), // compressed array size
inputs[0].Size(), // original size
inputs[2].dptr<float>(), // compressed array
inputs[0].dptr<float>(), // original array
inputs[1].dptr<float>(), // residual array
-1 *threshold, // negative threshold
threshold); // positive threshold
inputs[2].Size() * 4, // compressed array byte size
inputs[0].Size(), // original size
inputs[2].dptr<float>(), // compressed array
inputs[0].dptr<float>(), // original array
inputs[1].dptr<float>(), // residual array
-1 *threshold, // negative threshold
threshold); // positive threshold
}

struct dequantize_2bit {
Expand Down Expand Up @@ -138,6 +227,18 @@ void Dequantize2BitKernelLaunch(mshadow::Stream<xpu> *s, const std::vector<mxnet
threshold); // positive threshold
}

/*!
 * \brief CPU entry point for 1bit quantization; forwards to the shared kernel launcher.
 */
inline void Quantize1BitImpl(mshadow::Stream<mshadow::cpu> *s,
                             const std::vector<mxnet::TBlob> &inputs,
                             const float threshold) {
  Quantize1BitKernelLaunch<mshadow::cpu>(s, inputs, threshold);
}

/*!
 * \brief CPU entry point for 1bit dequantization; forwards to the shared kernel launcher.
 */
inline void Dequantize1BitImpl(mshadow::Stream<mshadow::cpu> *s,
                               const std::vector<mxnet::TBlob> &inputs,
                               const float threshold) {
  Dequantize1BitKernelLaunch<mshadow::cpu>(s, inputs, threshold);
}

inline void Quantize2BitImpl(mshadow::Stream<mshadow::cpu> *s,
const std::vector<mxnet::TBlob> &inputs,
const float threshold) {
Expand Down
80 changes: 62 additions & 18 deletions src/kvstore/gradient_compression.cc
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,10 @@ void GradientCompression::SetParams(const std::vector<std::pair<std::string, std
& kwargs) {
GradientCompressionParam params;
params.InitAllowUnknown(kwargs);
CHECK_GT(params.threshold, 0) << "threshold must be greater than 0";
if (params.type == "2bit") {
if (params.type == "1bit") {
SetOneBitCompression(params.threshold);
} else if (params.type == "2bit") {
CHECK_GT(params.threshold, 0) << "threshold must be greater than 0 for two bit compression";
SetTwoBitCompression(params.threshold);
} else {
LOG(FATAL) << "Unknown type for gradient compression " << params.type;
Expand All @@ -57,6 +59,11 @@ std::string GradientCompression::get_type_str() {
return std::to_string(static_cast<int>(type_));
}

/*!
 * \brief Selects 1bit compression and records the threshold to use with it.
 * \param threshold value stored as the compression threshold
 */
void GradientCompression::SetOneBitCompression(const float threshold) {
  threshold_ = threshold;
  type_ = CompressionType::kOneBit;
}

void GradientCompression::SetTwoBitCompression(const float threshold) {
type_ = CompressionType::kTwoBit;
threshold_ = threshold;
Expand All @@ -83,7 +90,9 @@ void GradientCompression::DecodeParams(const std::string &s) {
}

int GradientCompression::GetCompressionFactor() {
if (type_ == CompressionType::kTwoBit) {
if (type_ == CompressionType::kOneBit) {
return 32;
} else if (type_ == CompressionType::kTwoBit) {
return 16;
} else {
LOG(FATAL) << "Unsupported compression type: " << get_type_str();
Expand All @@ -106,16 +115,34 @@ void GradientCompression::Quantize(const mxnet::NDArray &from, mxnet::NDArray *t
const int a = from.ctx().dev_mask();
const int b = to->ctx().dev_mask();
const float threshold = threshold_;
if (type_ == CompressionType::kTwoBit) {
if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) {
if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) {
if (type_ == CompressionType::kOneBit) {
mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) {
std::vector<mxnet::TBlob> inputs = {from.data(), residual->data(), to->data()};
Quantize1BitImpl(ctx.get_stream<mshadow::cpu>(), inputs, threshold);
}, from.ctx(), {from.var()}, {to->var(), residual->var()},
mxnet::FnProperty::kNormal, priority, "QuantizeCPU");
} else if (type_ == CompressionType::kTwoBit) {
mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) {
std::vector<mxnet::TBlob> inputs = {from.data(), residual->data(), to->data()};
Quantize2BitImpl(ctx.get_stream<mshadow::cpu>(), inputs, threshold);
}, from.ctx(), {from.var()}, {to->var(), residual->var()},
mxnet::FnProperty::kNormal, priority, "QuantizeCPU");
} else {
LOG(FATAL) << "Unsupported quantization of type " << get_type_str();
}
} else {
if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) {
#if MXNET_USE_CUDA
if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) {
if (type_ == CompressionType::kOneBit) {
mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) {
std::vector<mxnet::TBlob> inputs = {from.data(), residual->data(), to->data()};
Quantize1BitImpl(ctx.get_stream<mshadow::gpu>(), inputs, threshold);
// Wait GPU kernel to complete
ctx.get_stream<mshadow::gpu>()->Wait();
}, from.ctx(), {from.var()}, {to->var(), residual->var()},
mxnet::FnProperty::kNormal, priority, "QuantizeGPU");
} else if (type_ == CompressionType::kTwoBit) {
mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) {
std::vector<mxnet::TBlob> inputs = {from.data(), residual->data(), to->data()};
Quantize2BitImpl(ctx.get_stream<mshadow::gpu>(), inputs, threshold);
Expand All @@ -124,14 +151,14 @@ void GradientCompression::Quantize(const mxnet::NDArray &from, mxnet::NDArray *t
}, from.ctx(), {from.var()}, {to->var(), residual->var()},
mxnet::FnProperty::kNormal, priority, "QuantizeGPU");
} else {
LOG(FATAL) << "unknown device mask";
LOG(FATAL) << "Unsupported quantization of type " << get_type_str();
}
#else
LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
#endif
} else {
LOG(FATAL) << "Unknown device mask, from device mask " << a << " to device mask " << b;
}
} else {
LOG(FATAL) << "Unsupported quantization of type " << get_type_str();
}
}

Expand All @@ -142,35 +169,52 @@ void GradientCompression::Dequantize(const mxnet::NDArray &from, mxnet::NDArray
const int a = from.ctx().dev_mask();
const int b = to->ctx().dev_mask();
const float threshold = threshold_;
if (type_ == CompressionType::kTwoBit) {
if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) {
if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) {
if (type_ == CompressionType::kOneBit) {
mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) {
std::vector<mxnet::TBlob> inputs = {from.data(), to->data()};
Dequantize1BitImpl(ctx.get_stream<mshadow::cpu>(), inputs, threshold);
}, from.ctx(), {from.var()}, {to->var()},
mxnet::FnProperty::kNormal, priority, "DequantizeCPU");
} else if (type_ == CompressionType::kTwoBit) {
mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) {
std::vector<mxnet::TBlob> inputs = {from.data(), to->data()};
Dequantize2BitImpl(ctx.get_stream<mshadow::cpu>(), inputs, threshold);
}, from.ctx(), {from.var()}, {to->var()},
mxnet::FnProperty::kNormal, priority, "DequantizeCPU");
} else {
LOG(FATAL) << "Unsupported dequantization of type " << get_type_str();
}
} else {
if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) {
#if MXNET_USE_CUDA
if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) {
if (type_ == CompressionType::kOneBit) {
mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) {
std::vector<mxnet::TBlob> inputs = {from.data(), to->data()};
Dequantize2BitImpl(ctx.get_stream<mshadow::gpu>(), inputs, threshold);
Dequantize1BitImpl(ctx.get_stream<mshadow::gpu>(), inputs, threshold);
// Wait GPU kernel to complete
ctx.get_stream<mshadow::gpu>()->Wait();
}, from.ctx(), {from.var()}, {to->var()},
mxnet::FnProperty::kNormal, priority, "DequantizeGPU");
} else if (type_ == CompressionType::kTwoBit) {
mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) {
std::vector<mxnet::TBlob> inputs = {from.data(), to->data()};
Dequantize2BitImpl(ctx.get_stream<mshadow::gpu>(), inputs, threshold);
// Wait for the GPU kernel to complete
ctx.get_stream<mshadow::gpu>()->Wait();
}, from.ctx(), {from.var()}, {to->var()},
mxnet::FnProperty::kNormal, priority, "DequantizeGPU");
} else {
LOG(FATAL) << "unknown device mask";
LOG(FATAL) << "Unsupported dequantization of type " << get_type_str();
}
#else
LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
#endif
} else {
LOG(FATAL) << "Unknown device mask, from device mask " << a << " to device mask " << b;
}
} else {
LOG(FATAL) << "Unsupported dequantization of type " << get_type_str();
}
}

} // namespace kvstore
} // namespace mxnet

10 changes: 10 additions & 0 deletions src/kvstore/gradient_compression.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,16 @@

namespace mxnet {
namespace kvstore {
/*!
 * \brief GPU entry point for 1bit quantization; forwards to the shared kernel launcher.
 */
void Quantize1BitImpl(mshadow::Stream<gpu>* s, const std::vector<TBlob>& inputs,
                      const float threshold) {
  Quantize1BitKernelLaunch<gpu>(s, inputs, threshold);
}

/*!
 * \brief GPU entry point for 1bit dequantization; forwards to the shared kernel launcher.
 */
void Dequantize1BitImpl(mshadow::Stream<gpu>* s, const std::vector<TBlob>& inputs,
                        const float threshold) {
  Dequantize1BitKernelLaunch<gpu>(s, inputs, threshold);
}

void Quantize2BitImpl(mshadow::Stream<gpu>* s, const std::vector<TBlob>& inputs,
const float threshold) {
Quantize2BitKernelLaunch(s, inputs, threshold);
Expand Down
Loading