[MXNET-404] elemwise_add/sub between rsp and rsp on GPU #11179
eric-haibin-lin merged 2 commits into apache:master
Conversation
Benchmark script:

```python
import mxnet as mx
import sys
import os
import scipy
import numpy as np
from mxnet.test_utils import rand_ndarray, assert_almost_equal
import time


def measure_cost(repeat, a, b, out=None):
    # Time repeated elemwise_add calls; return average seconds per call.
    start = time.time()
    results = []
    for i in range(repeat):
        results.append(mx.nd.elemwise_add(a, b, out=out))
    for result in results:
        result.wait_to_read()
    end = time.time()
    diff = end - start
    return diff / repeat


def measure_fallback(repeat, a):
    # Helper to time the rsp -> dense cast (fallback) cost; not used in main().
    start = time.time()
    results = []
    for i in range(repeat):
        results.append(a.tostype('default'))
    for result in results:
        result.wait_to_read()
    end = time.time()
    diff = end - start
    return diff / repeat


def main():
    shape = (1000000, 512)
    context = mx.gpu(0)
    # context = mx.cpu()

    # Case 1: rsp + rsp over a grid of lhs/rhs densities.
    for lhs_density in [0.01, 0.005, 0.001, 0.0005, 0.0001, 0.000]:
        mx_lhs = rand_ndarray(shape, stype='row_sparse', density=lhs_density).as_in_context(context)
        mx_lhs_dns = mx_lhs.tostype('default')
        for rhs_density in [0.01, 0.005, 0.001, 0.0005, 0.0001, 0.000]:
            mx_rhs = rand_ndarray(shape=shape, stype='row_sparse', density=rhs_density).as_in_context(context)
            mx_rhs_dns = mx_rhs.tostype('default')
            sparse_cost = 0.0
            dns_cost = 0.0
            # Warmup, with correctness double-checked against numpy.
            np_lhs = mx_lhs_dns.asnumpy()
            check = mx.nd.elemwise_add(mx_lhs, mx_rhs)
            np_lhs = np_lhs + mx_rhs.asnumpy()
            assert_almost_equal(check.asnumpy(), np_lhs, atol=1e-5, rtol=1e-4)
            mx.nd.waitall()
            for i in range(100):
                sparse_cost += measure_cost(1, mx_lhs, mx_rhs)
                dns_cost += measure_cost(1, mx_lhs_dns, mx_rhs_dns)
            # Prints lhs density %, rhs density %, and the dense/sparse cost
            # ratio (> 1 means the sparse kernel is faster at these densities).
            print("%.2f %% %.2f %%" % (lhs_density * 100, rhs_density * 100), dns_cost / sparse_cost)

    # Case 2: dense lhs (written in place) + rsp rhs.
    for rhs_density in [1.000, 0.01, 0.005, 0.001, 0.0005, 0.0001, 0.000]:
        mx_lhs_dns = mx.nd.ones(shape, ctx=context)
        mx_lhs = mx_lhs_dns.tostype('row_sparse')
        mx_rhs = rand_ndarray(shape=shape, stype='row_sparse', density=rhs_density).as_in_context(context)
        mx_rhs_dns = mx_rhs.tostype('default')
        sparse_cost = 0.0
        dns_cost = 0.0
        # Warmup, with correctness of the in-place path checked against numpy.
        np_lhs = mx_lhs_dns.asnumpy()
        mx.nd.elemwise_add(mx_lhs, mx_rhs, out=mx_lhs)
        np_lhs = np_lhs + mx_rhs.asnumpy()
        assert_almost_equal(mx_lhs.asnumpy(), np_lhs, atol=1e-5, rtol=1e-4)
        mx.nd.waitall()
        for i in range(100):
            sparse_cost += measure_cost(1, mx_lhs, mx_rhs, out=mx_lhs)
            dns_cost += measure_cost(1, mx_lhs_dns, mx_rhs_dns, out=mx_lhs_dns)
        print("%.2f %% %.2f %%" % (1.00000 * 100, rhs_density * 100), dns_cost / sparse_cost)

    # Case 3: rsp lhs + dense rhs (written in place).
    for lhs_density in [1.000, 0.01, 0.005, 0.001, 0.0005, 0.0001, 0.000]:
        mx_rhs_dns = mx.nd.ones(shape, ctx=context)
        mx_rhs = mx_rhs_dns.tostype('row_sparse')
        mx_lhs = rand_ndarray(shape=shape, stype='row_sparse', density=lhs_density).as_in_context(context)
        mx_lhs_dns = mx_lhs.tostype('default')
        sparse_cost = 0.0
        dns_cost = 0.0
        # Warmup, with correctness of the in-place path checked against numpy.
        np_rhs = mx_rhs_dns.asnumpy()
        mx.nd.elemwise_add(mx_lhs, mx_rhs, out=mx_rhs)
        np_rhs = np_rhs + mx_lhs.asnumpy()
        assert_almost_equal(mx_rhs.asnumpy(), np_rhs, atol=1e-5, rtol=1e-4)
        mx.nd.waitall()
        for i in range(100):
            sparse_cost += measure_cost(1, mx_lhs, mx_rhs, out=mx_rhs)
            dns_cost += measure_cost(1, mx_lhs_dns, mx_rhs_dns, out=mx_rhs_dns)
        # lhs density first, then rhs (dense) density, for consistency with
        # the other cases.
        print("%.2f %% %.2f %%" % (lhs_density * 100, 1.00000 * 100), dns_cost / sparse_cost)


if __name__ == "__main__":
    main()
```
Benchmark results:
@eric-haibin-lin Please give a review when you have time, thanks!
```diff
  /* \brief Check whether the two arrays are the same array */
- inline bool IsSame(const NDArray& other) {
+ inline bool IsSame(const NDArray& other) const {
  };
```

@piiswrong I made the change here so that I can also call this function when I have a const NDArray object.
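For illustration, a minimal sketch of why the const qualifier matters, using a hypothetical `NDArrayLike` stand-in rather than MXNet's actual class: without the qualifier, `IsSame` cannot be invoked on a const object or through a const reference.

```cpp
// Hypothetical stand-in for NDArray, only to illustrate const-correctness.
struct NDArrayLike {
  const void* storage_;  // identity of the underlying buffer

  // const-qualified: callable on const objects and through const references
  bool IsSame(const NDArrayLike& other) const {
    return storage_ == other.storage_;
  }
};

// Compiles only because IsSame is const-qualified; operator inputs are
// typically passed around as const references.
bool SharesStorage(const NDArrayLike& output, const NDArrayLike& lhs) {
  return output.IsSame(lhs);
}
```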
```cpp
template<typename OP>
void ElemwiseBinaryOp::RspRspOp(mshadow::Stream<gpu> *s,
```

Do we have a unit test for write inplace?

The in-place rsp/rsp case shares the same code path as the in-place case between dns and rsp, which already has a unit test.

BTW, correctness is double-checked in the benchmark script during the warmup.
```cpp
CHECK(!scatter) << "scatter is not supported in RspRspOp on GPU yet...";
CHECK(lhs.storage_type() == kRowSparseStorage && rhs.storage_type() == kRowSparseStorage);
CHECK(output.storage_type() == kRowSparseStorage);
```

Does it support kAddTo? CHECK_NE(kAddTo)?
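If kAddTo is indeed unsupported, the guard the reviewer is suggesting could look like this (a sketch assuming `req` is the `OpReqType` in scope; not necessarily the exact line that was merged):

```cpp
CHECK_NE(req, kAddTo) << "kAddTo is not supported in RspRspOp on GPU yet";
```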
```cpp
  ElemwiseBinaryOp::DnsRspDnsOp<gpu, OP>(s, attrs, ctx, dns, rsp, req, output, reverse);
  return;
}
CHECK(req == kWriteTo) << "Should be kWriteTo but got " << req;
```

If this function assumes req is never kNullOp, better document it in the header.
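One way to document that precondition in the header, as suggested (an illustrative sketch, not the doc comment actually committed):

```cpp
/*!
 * \brief Elementwise binary op between two row_sparse tensors on GPU.
 * \note Assumes req is never kNullOp; callers must filter out kNullOp
 *       before dispatching to this function.
 */
```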
```cpp
       lhs.data().FlatTo1D<gpu, DType>(), s);
  Copy(output.aux_data(kIdx).FlatTo1D<gpu, IType>(),
       lhs.aux_data(kIdx).FlatTo1D<gpu, IType>(), s);
}
```

What about kWriteInplace in all these branches? Should we add a check?

Extra checks and tests added.
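For illustration, a guard of the kind being requested might look like the following in the copy branch (a hedged sketch using `req`, `output`, and `lhs` from the surrounding code; the checks actually added may differ):

```cpp
// In the branch that copies lhs into output, kWriteInplace is only valid
// when output already aliases lhs; otherwise the copy is required.
if (req == kWriteInplace) {
  CHECK(output.IsSame(lhs)) << "kWriteInplace expects output to share storage with lhs";
}
```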
Force-pushed: 16c160b to a25a2cd
Force-pushed: a25a2cd to d7e67a8
@eric-haibin-lin should be good for merge

* Support for elemwise_add/sub between rsp and rsp on GPU
* add extra test coverage for inplace cases
Description

As title: support elemwise_add/sub between two row_sparse NDArrays on GPU.

Comments

For performance benchmark results, please see the comments above.