Skip to content
This repository was archived by the owner on Nov 17, 2023. It is now read-only.

Commit 9512cd3

Browse files
author
shuo-ouyang
committed
1bit gradient compression implementation
1 parent d4052fd commit 9512cd3

File tree

8 files changed

+439
-87
lines changed

8 files changed

+439
-87
lines changed

ci/docker/runtime_functions.sh

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1315,8 +1315,10 @@ integrationtest_ubuntu_cpu_dist_kvstore() {
13151315
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=gluon_type_cpu
13161316
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py
13171317
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --no-multiprecision
1318-
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=compressed_cpu
1319-
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=compressed_cpu --no-multiprecision
1318+
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=compressed_cpu_1bit
1319+
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=compressed_cpu_1bit --no-multiprecision
1320+
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=compressed_cpu_2bit
1321+
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=compressed_cpu_2bit --no-multiprecision
13201322
python3 ../../tools/launch.py -n 3 --launcher local python3 test_server_profiling.py
13211323
popd
13221324
}

python/mxnet/kvstore/kvstore.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -498,6 +498,9 @@ def set_gradient_compression(self, compression_params):
498498
""" Specifies type of low-bit quantization for gradient compression \
499499
and additional arguments depending on the type of compression being used.
500500
501+
The 1bit compression works as follows: values which are above the threshold in the
502+
gradient will be set to +1, whereas values at or below the threshold will be set to -1.
503+
501504
2bit Gradient Compression takes a positive float `threshold`.
502505
The technique works by thresholding values such that positive values in the
503506
gradient above threshold will be set to threshold. Negative values whose absolute
@@ -538,7 +541,7 @@ def set_gradient_compression(self, compression_params):
538541
A dictionary specifying the type and parameters for gradient compression.
539542
The key `type` in this dictionary is a
540543
required string argument and specifies the type of gradient compression.
541-
Currently `type` can be only `2bit`
544+
Currently `type` can be either `1bit` or `2bit`
542545
Other keys in this dictionary are optional and specific to the type
543546
of gradient compression.
544547
"""

src/kvstore/gradient_compression-inl.h

Lines changed: 120 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -32,48 +32,137 @@ namespace mxnet {
3232
namespace kvstore {
3333

3434
// these gpu functions are defined in gradient_compression.cu
35+
void Quantize1BitImpl(mshadow::Stream<mshadow::gpu> *s, const std::vector<mxnet::TBlob> &inputs,
36+
const float threshold);
37+
void Dequantize1BitImpl(mshadow::Stream<mshadow::gpu> *s, const std::vector<mxnet::TBlob> &inputs,
38+
const float threshold);
3539
void Quantize2BitImpl(mshadow::Stream<mshadow::gpu> *s, const std::vector<mxnet::TBlob> &inputs,
3640
const float threshold);
3741
void Dequantize2BitImpl(mshadow::Stream<mshadow::gpu> *s, const std::vector<mxnet::TBlob> &inputs,
3842
const float threshold);
3943

44+
struct quantize_1bit {
45+
MSHADOW_XINLINE static void Map(int out_byte_id,
46+
int original_size,
47+
float *out,
48+
float *grad,
49+
float *residual,
50+
const float threshold) {
51+
// this byte contains the compressed representation of
52+
// up to 8 values; it is located at (char*)out + out_byte_id
53+
char *compr_byte = reinterpret_cast<char *>(out) + out_byte_id;
54+
55+
// init to 0
56+
*compr_byte = 0;
57+
// start and end are indices in original grad array
58+
const int start = out_byte_id << 3;
59+
const int end = (start + 8 <= original_size) ? start + 8 : original_size;
60+
61+
// masks used to quantize data
62+
const uint8_t bits[] = {0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01};
63+
for (int i = start; i < end; ++i) {
64+
// adds gradient to existing residual to get updated grad
65+
residual[i] += grad[i];
66+
if (residual[i] > threshold) {
67+
// set data to 1
68+
*compr_byte |= bits[(i & 7)];
69+
// reduce residual by 1
70+
residual[i] -= 1;
71+
} else {
72+
// do nothing on compr_byte because it is initialized to 0
73+
// increase residual by 1
74+
// because current position will be dequantized to -1
75+
residual[i] += 1;
76+
}
77+
}
78+
}
79+
};
80+
81+
template<typename xpu>
82+
void Quantize1BitKernelLaunch(mshadow::Stream<xpu> *s, const std::vector<mxnet::TBlob> &inputs,
83+
const float threshold) {
84+
mxnet::op::mxnet_op::Kernel<quantize_1bit, xpu>
85+
::Launch(s,
86+
inputs[2].Size() * 4, // compressed array byte size
87+
inputs[0].Size(), // original size
88+
inputs[2].dptr<float>(), // compressed array
89+
inputs[0].dptr<float>(), // original array
90+
inputs[1].dptr<float>(), // residual array
91+
threshold); // threshold
92+
}
93+
94+
struct dequantize_1bit {
95+
MSHADOW_XINLINE static void Map(int i,
96+
float *out,
97+
float *in,
98+
const float threshold) {
99+
// get position of dequantized value to fill
100+
float *outval = out + i;
101+
// gets byte which holds quantized value for this position
102+
char *ch_ptr = reinterpret_cast < char * > (in + (i >> 5));
103+
ch_ptr += ((i & 31) >> 3);
104+
// masks used to dequantize data (one mask per bit position in a byte)
105+
const uint8_t bits[] = {0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01};
106+
// col denotes which bit of a byte is set for this value
107+
// col=0 implies the first bit, col=1 implies the second bit,...
108+
const int col = i & 7;
109+
const uint8_t mask = bits[col];
110+
const uint8_t masked = *ch_ptr & mask;
111+
if (masked == mask) {
112+
*outval = +1;
113+
} else {
114+
// if current position of byte is 0
115+
// dequantized it to -1
116+
*outval = -1;
117+
}
118+
}
119+
};
120+
121+
template<typename xpu>
122+
void Dequantize1BitKernelLaunch(mshadow::Stream<xpu> *s, const std::vector<mxnet::TBlob> &inputs,
123+
const float threshold) {
124+
mxnet::op::mxnet_op::Kernel<dequantize_1bit, xpu>
125+
::Launch(s,
126+
inputs[1].Size(), // original size
127+
inputs[1].dptr<float>(), // out array
128+
inputs[0].dptr<float>(), // compressed array
129+
threshold); // threshold
130+
}
131+
40132
struct quantize_2bit {
41-
MSHADOW_XINLINE static void Map(int out_block_id,
133+
MSHADOW_XINLINE static void Map(int out_byte_id,
42134
int original_size,
43135
float *out,
44136
float *grad,
45137
float *residual,
46138
const float neg_threshold,
47139
const float pos_threshold) {
48140
// this block contains the compressed representation of
49-
// upto 16 values starting from out_block_id*16
50-
float *compr_block = out + out_block_id;
141+
// up to 4 values; the byte is located at (char*)out + out_byte_id
142+
char *compr_byte = reinterpret_cast<char *>(out) + out_byte_id;
51143
// init to 0
52-
*compr_block = 0;
144+
*compr_byte = 0;
53145
// start and end are indices in original grad array
54-
const int start = out_block_id << 4;
55-
const int end = (start + 16 <= original_size) ? start + 16 : original_size;
56-
// cast as char* to manipulate bits of float addresses
57-
char *block_ptr = reinterpret_cast < char * > (compr_block);
146+
const int start = out_byte_id << 2;
147+
const int end = (start + 4 <= original_size) ? start + 4 : original_size;
148+
58149
// masks to set bits when value meets pos_threshold
59150
// 0xc0 is mask when value is to be represented by the first two bits in a char*
60151
// 0xc0 means first two bits are set to 11
61152
const uint8_t posbits[] = {0xc0, 0x30, 0x0c, 0x03};
62153
// masks to set bits when value meets neg_threshold
63154
const uint8_t negbits[] = {0x80, 0x20, 0x08, 0x02};
64155
for (int i = start; i < end; i++) {
65-
// adds offset to reach appropriate byte
66-
char *curr_byte = block_ptr + ((i - start) >> 2);
67156
// adds gradient to existing residual to get updated grad
68157
residual[i] += grad[i];
69158
if (residual[i] >= pos_threshold) {
70159
// set data to 11
71-
*curr_byte |= posbits[(i & 3)];
160+
*compr_byte |= posbits[(i & 3)];
72161
// reduce residual by pos_threshold
73162
residual[i] -= pos_threshold;
74163
} else if (residual[i] <= neg_threshold) {
75164
// set data to 10
76-
*curr_byte |= negbits[(i & 3)];
165+
*compr_byte |= negbits[(i & 3)];
77166
residual[i] -= neg_threshold;
78167
}
79168
}
@@ -85,13 +174,13 @@ void Quantize2BitKernelLaunch(mshadow::Stream<xpu> *s, const std::vector<mxnet::
85174
const float threshold) {
86175
mxnet::op::mxnet_op::Kernel<quantize_2bit, xpu>
87176
::Launch(s,
88-
inputs[2].Size(), // compressed array size
89-
inputs[0].Size(), // original size
90-
inputs[2].dptr<float>(), // compressed array
91-
inputs[0].dptr<float>(), // original array
92-
inputs[1].dptr<float>(), // residual array
93-
-1 *threshold, // negative threshold
94-
threshold); // positive threshold
177+
inputs[2].Size() * 4, // compressed array byte size
178+
inputs[0].Size(), // original size
179+
inputs[2].dptr<float>(), // compressed array
180+
inputs[0].dptr<float>(), // original array
181+
inputs[1].dptr<float>(), // residual array
182+
-1 *threshold, // negative threshold
183+
threshold); // positive threshold
95184
}
96185

97186
struct dequantize_2bit {
@@ -138,6 +227,18 @@ void Dequantize2BitKernelLaunch(mshadow::Stream<xpu> *s, const std::vector<mxnet
138227
threshold); // positive threshold
139228
}
140229

230+
inline void Quantize1BitImpl(mshadow::Stream<mshadow::cpu> *s,
231+
const std::vector<mxnet::TBlob> &inputs,
232+
const float threshold) {
233+
Quantize1BitKernelLaunch(s, inputs, threshold);
234+
}
235+
236+
inline void Dequantize1BitImpl(mshadow::Stream<mshadow::cpu> *s,
237+
const std::vector<mxnet::TBlob> &inputs,
238+
const float threshold) {
239+
Dequantize1BitKernelLaunch(s, inputs, threshold);
240+
}
241+
141242
inline void Quantize2BitImpl(mshadow::Stream<mshadow::cpu> *s,
142243
const std::vector<mxnet::TBlob> &inputs,
143244
const float threshold) {

src/kvstore/gradient_compression.cc

Lines changed: 62 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,10 @@ void GradientCompression::SetParams(const std::vector<std::pair<std::string, std
4141
& kwargs) {
4242
GradientCompressionParam params;
4343
params.InitAllowUnknown(kwargs);
44-
CHECK_GT(params.threshold, 0) << "threshold must be greater than 0";
45-
if (params.type == "2bit") {
44+
if (params.type == "1bit") {
45+
SetOneBitCompression(params.threshold);
46+
} else if (params.type == "2bit") {
47+
CHECK_GT(params.threshold, 0) << "threshold must be greater than 0 for two bit compression";
4648
SetTwoBitCompression(params.threshold);
4749
} else {
4850
LOG(FATAL) << "Unknown type for gradient compression " << params.type;
@@ -57,6 +59,11 @@ std::string GradientCompression::get_type_str() {
5759
return std::to_string(static_cast<int>(type_));
5860
}
5961

62+
void GradientCompression::SetOneBitCompression(const float threshold) {
63+
type_ = CompressionType::kOneBit;
64+
threshold_ = threshold;
65+
}
66+
6067
void GradientCompression::SetTwoBitCompression(const float threshold) {
6168
type_ = CompressionType::kTwoBit;
6269
threshold_ = threshold;
@@ -83,7 +90,9 @@ void GradientCompression::DecodeParams(const std::string &s) {
8390
}
8491

8592
int GradientCompression::GetCompressionFactor() {
86-
if (type_ == CompressionType::kTwoBit) {
93+
if (type_ == CompressionType::kOneBit) {
94+
return 32;
95+
} else if (type_ == CompressionType::kTwoBit) {
8796
return 16;
8897
} else {
8998
LOG(FATAL) << "Unsupported compression type: " << get_type_str();
@@ -106,16 +115,34 @@ void GradientCompression::Quantize(const mxnet::NDArray &from, mxnet::NDArray *t
106115
const int a = from.ctx().dev_mask();
107116
const int b = to->ctx().dev_mask();
108117
const float threshold = threshold_;
109-
if (type_ == CompressionType::kTwoBit) {
110-
if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) {
118+
if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) {
119+
if (type_ == CompressionType::kOneBit) {
120+
mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) {
121+
std::vector<mxnet::TBlob> inputs = {from.data(), residual->data(), to->data()};
122+
Quantize1BitImpl(ctx.get_stream<mshadow::cpu>(), inputs, threshold);
123+
}, from.ctx(), {from.var()}, {to->var(), residual->var()},
124+
mxnet::FnProperty::kNormal, priority, "QuantizeCPU");
125+
} else if (type_ == CompressionType::kTwoBit) {
111126
mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) {
112127
std::vector<mxnet::TBlob> inputs = {from.data(), residual->data(), to->data()};
113128
Quantize2BitImpl(ctx.get_stream<mshadow::cpu>(), inputs, threshold);
114129
}, from.ctx(), {from.var()}, {to->var(), residual->var()},
115130
mxnet::FnProperty::kNormal, priority, "QuantizeCPU");
116131
} else {
132+
LOG(FATAL) << "Unsupported quantization of type " << get_type_str();
133+
}
134+
} else {
135+
if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) {
117136
#if MXNET_USE_CUDA
118-
if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) {
137+
if (type_ == CompressionType::kOneBit) {
138+
mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) {
139+
std::vector<mxnet::TBlob> inputs = {from.data(), residual->data(), to->data()};
140+
Quantize1BitImpl(ctx.get_stream<mshadow::gpu>(), inputs, threshold);
141+
// Wait for the GPU kernel to complete
142+
ctx.get_stream<mshadow::gpu>()->Wait();
143+
}, from.ctx(), {from.var()}, {to->var(), residual->var()},
144+
mxnet::FnProperty::kNormal, priority, "QuantizeGPU");
145+
} else if (type_ == CompressionType::kTwoBit) {
119146
mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) {
120147
std::vector<mxnet::TBlob> inputs = {from.data(), residual->data(), to->data()};
121148
Quantize2BitImpl(ctx.get_stream<mshadow::gpu>(), inputs, threshold);
@@ -124,14 +151,14 @@ void GradientCompression::Quantize(const mxnet::NDArray &from, mxnet::NDArray *t
124151
}, from.ctx(), {from.var()}, {to->var(), residual->var()},
125152
mxnet::FnProperty::kNormal, priority, "QuantizeGPU");
126153
} else {
127-
LOG(FATAL) << "unknown device mask";
154+
LOG(FATAL) << "Unsupported quantization of type " << get_type_str();
128155
}
129156
#else
130157
LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
131158
#endif
159+
} else {
160+
LOG(FATAL) << "Unknown device mask, from device mask " << a << " to device mask " << b;
132161
}
133-
} else {
134-
LOG(FATAL) << "Unsupported quantization of type " << get_type_str();
135162
}
136163
}
137164

@@ -142,35 +169,52 @@ void GradientCompression::Dequantize(const mxnet::NDArray &from, mxnet::NDArray
142169
const int a = from.ctx().dev_mask();
143170
const int b = to->ctx().dev_mask();
144171
const float threshold = threshold_;
145-
if (type_ == CompressionType::kTwoBit) {
146-
if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) {
172+
if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) {
173+
if (type_ == CompressionType::kOneBit) {
174+
mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) {
175+
std::vector<mxnet::TBlob> inputs = {from.data(), to->data()};
176+
Dequantize1BitImpl(ctx.get_stream<mshadow::cpu>(), inputs, threshold);
177+
}, from.ctx(), {from.var()}, {to->var()},
178+
mxnet::FnProperty::kNormal, priority, "DequantizeCPU");
179+
} else if (type_ == CompressionType::kTwoBit) {
147180
mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) {
148181
std::vector<mxnet::TBlob> inputs = {from.data(), to->data()};
149182
Dequantize2BitImpl(ctx.get_stream<mshadow::cpu>(), inputs, threshold);
150183
}, from.ctx(), {from.var()}, {to->var()},
151184
mxnet::FnProperty::kNormal, priority, "DequantizeCPU");
152185
} else {
186+
LOG(FATAL) << "Unsupported dequantization of type " << get_type_str();
187+
}
188+
} else {
189+
if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) {
153190
#if MXNET_USE_CUDA
154-
if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) {
191+
if (type_ == CompressionType::kOneBit) {
155192
mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) {
156193
std::vector<mxnet::TBlob> inputs = {from.data(), to->data()};
157-
Dequantize2BitImpl(ctx.get_stream<mshadow::gpu>(), inputs, threshold);
194+
Dequantize1BitImpl(ctx.get_stream<mshadow::gpu>(), inputs, threshold);
158195
// Wait for the GPU kernel to complete
159196
ctx.get_stream<mshadow::gpu>()->Wait();
160197
}, from.ctx(), {from.var()}, {to->var()},
161198
mxnet::FnProperty::kNormal, priority, "DequantizeGPU");
199+
} else if (type_ == CompressionType::kTwoBit) {
200+
mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) {
201+
std::vector<mxnet::TBlob> inputs = {from.data(), to->data()};
202+
Dequantize2BitImpl(ctx.get_stream<mshadow::gpu>(), inputs, threshold);
203+
// Wait for the GPU kernel to complete
204+
ctx.get_stream<mshadow::gpu>()->Wait();
205+
}, from.ctx(), {from.var()}, {to->var()},
206+
mxnet::FnProperty::kNormal, priority, "DequantizeGPU");
162207
} else {
163-
LOG(FATAL) << "unknown device mask";
208+
LOG(FATAL) << "Unsupported dequantization of type " << get_type_str();
164209
}
165210
#else
166-
LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
211+
LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
167212
#endif
213+
} else {
214+
LOG(FATAL) << "Unknown device mask, from device mask " << a << " to device mask " << b;
168215
}
169-
} else {
170-
LOG(FATAL) << "Unsupported dequantization of type " << get_type_str();
171216
}
172217
}
173-
174218
} // namespace kvstore
175219
} // namespace mxnet
176220

src/kvstore/gradient_compression.cu

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,16 @@
2727

2828
namespace mxnet {
2929
namespace kvstore {
30+
void Quantize1BitImpl(mshadow::Stream<gpu>* s, const std::vector<TBlob>& inputs,
31+
const float threshold) {
32+
Quantize1BitKernelLaunch(s, inputs, threshold);
33+
}
34+
35+
void Dequantize1BitImpl(mshadow::Stream<gpu>* s, const std::vector<TBlob>& inputs,
36+
const float threshold) {
37+
Dequantize1BitKernelLaunch(s, inputs, threshold);
38+
}
39+
3040
void Quantize2BitImpl(mshadow::Stream<gpu>* s, const std::vector<TBlob>& inputs,
3141
const float threshold) {
3242
Quantize2BitKernelLaunch(s, inputs, threshold);

0 commit comments

Comments
 (0)