Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions lib/kernels/include/kernels/combine_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,11 @@ namespace Combine {

void forward_kernel(ffStream_t stream,
GenericTensorAccessorR const &input,
GenericTensorAccessorW const &output,
DataType dataType);
GenericTensorAccessorW const &output);

void backward_kernel(ffStream_t stream,
GenericTensorAccessorR const &output_grad,
GenericTensorAccessorW const &input_grad,
DataType dataType);
GenericTensorAccessorW const &input_grad);

} // namespace Combine
} // namespace Kernels
Expand Down
6 changes: 2 additions & 4 deletions lib/kernels/include/kernels/concat_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,12 @@ namespace Concat {
void forward_kernel(ffStream_t stream,
GenericTensorAccessorW const &output,
std::vector<GenericTensorAccessorR> const &inputs,
int num_inputs,
ff_dim_t legion_axis);
ff_dim_t axis);

void backward_kernel(ffStream_t stream,
GenericTensorAccessorR const &output_grad,
std::vector<GenericTensorAccessorW> const &input_grads,
int num_inputs,
ff_dim_t legion_axis);
ff_dim_t axis);

} // namespace Concat
} // namespace Kernels
Expand Down
10 changes: 4 additions & 6 deletions lib/kernels/src/cuda/combine_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -50,17 +50,15 @@ struct BackwardKernel {

void forward_kernel(ffStream_t stream,
GenericTensorAccessorR const &input,
GenericTensorAccessorW const &output,
DataType data_type) {
DataTypeDispatch1<ForwardKernel>{}(data_type, stream, input, output);
GenericTensorAccessorW const &output) {
DataTypeDispatch1<ForwardKernel>{}(input.data_type, stream, input, output);
}

void backward_kernel(ffStream_t stream,
GenericTensorAccessorR const &output_grad,
GenericTensorAccessorW const &input_grad,
DataType data_type) {
GenericTensorAccessorW const &input_grad) {
DataTypeDispatch1<BackwardKernel>{}(
data_type, stream, output_grad, input_grad);
input_grad.data_type, stream, output_grad, input_grad);
}

} // namespace Combine
Expand Down
26 changes: 12 additions & 14 deletions lib/kernels/src/cuda/concat_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -25,30 +25,29 @@ namespace Concat {
void calc_blk_size(size_t &num_blocks,
size_t &blk_size,
ArrayShape const &shape,
req<ff_dim_t> legion_axis) {
ff_dim_t axis) {
num_blocks = 1;
blk_size = 1;
for (int d = 0; d < shape.num_dims(); d++) {
if (d <= legion_axis) {
if (d <= axis) {
blk_size *= shape[legion_dim_t(d)];
} else {
num_blocks *= shape[legion_dim_t(d)];
}
}
}

void forward_kernel(ffStream_t stream,
void forward_kernel(cudaStream_t stream,
GenericTensorAccessorW const &output,
std::vector<GenericTensorAccessorR> const &inputs,
int num_inputs,
ff_dim_t legion_axis) {
ff_dim_t axis) {
size_t num_blocks = 1, output_blk_size = 1, input_blk_sizes[MAX_NUM_INPUTS];
int num_inputs = inputs.size();
assert(num_inputs <= MAX_NUM_INPUTS);
calc_blk_size(num_blocks, output_blk_size, output.shape, legion_axis);
calc_blk_size(num_blocks, output_blk_size, output.shape, axis);
for (int i = 0; i < num_inputs; i++) {
size_t input_num_blocks = 1;
calc_blk_size(
input_num_blocks, input_blk_sizes[i], inputs[i].shape, legion_axis);
calc_blk_size(input_num_blocks, input_blk_sizes[i], inputs[i].shape, axis);
assert(input_num_blocks == num_blocks);
}

Expand All @@ -66,20 +65,19 @@ void forward_kernel(ffStream_t stream,
}
}

void backward_kernel(ffStream_t stream,
void backward_kernel(cudaStream_t stream,
GenericTensorAccessorR const &output_grad,
std::vector<GenericTensorAccessorW> const &input_grads,
int num_inputs,
ff_dim_t legion_axis) {
ff_dim_t axis) {
size_t num_blocks = 1, output_blk_size = 1, input_blk_sizes[MAX_NUM_INPUTS];
int num_inputs = input_grads.size();
assert(num_inputs <= MAX_NUM_INPUTS);

ArrayShape shape = output_grad.shape;
calc_blk_size(num_blocks, output_blk_size, shape, legion_axis);
calc_blk_size(num_blocks, output_blk_size, output_grad.shape, axis);
for (int i = 0; i < num_inputs; i++) {
shape = input_grads[i].shape;
size_t input_num_blocks = 1;
calc_blk_size(input_num_blocks, input_blk_sizes[i], shape, legion_axis);
calc_blk_size(input_num_blocks, input_blk_sizes[i], shape, axis);
assert(input_num_blocks == num_blocks);
}

Expand Down
14 changes: 7 additions & 7 deletions lib/kernels/src/cuda/conv_2d_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -268,11 +268,11 @@ Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle,

void forward_kernel(cudaStream_t stream,
Conv2DPerDeviceState const &m,
optional<Activation> const &activation,
float const *input_ptr,
float *output_ptr,
float const *filter_ptr,
float const *bias_ptr) {
float const *bias_ptr,
optional<Activation> activation) {
checkCUDNN(cudnnSetStream(m.handle.dnn, stream));

float alpha = 1.0f, beta = 0.0f;
Expand Down Expand Up @@ -313,14 +313,14 @@ void forward_kernel(cudaStream_t stream,

void backward_kernel(cudaStream_t stream,
Conv2DPerDeviceState const &m,
optional<Activation> const &activation,
float const *input_ptr,
float *input_grad_ptr,
float const *output_ptr,
float *output_grad_ptr,
float const *kernel_ptr,
float *kernel_grad_ptr,
float *bias_grad_ptr) {
float const *filter_ptr,
float *filter_grad_ptr,
float *bias_grad_ptr,
optional<Activation> activation) {
checkCUDNN(cudnnSetStream(m.handle.dnn, stream));

float alpha = 1.0f;
Expand Down Expand Up @@ -355,7 +355,7 @@ void backward_kernel(cudaStream_t stream,
m.handle.workSpaceSize,
&alpha,
m.filterDesc,
kernel_grad_ptr));
filter_grad_ptr));
// Compute bias gradiant
// NOTE: we use alpha for bias_grad to accumulate gradients
if (bias_grad_ptr != NULL) {
Expand Down
10 changes: 2 additions & 8 deletions lib/kernels/src/hip/combine_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,6 @@
#include <hip/hip_runtime.h>

namespace FlexFlow {

CombinePerDeviceState::CombinePerDeviceState(FFHandler handler)
: PerDeviceOpState(handler) {}

namespace Kernels {
namespace Combine {

Expand Down Expand Up @@ -57,18 +53,16 @@ struct BackwardKernel {
};

void forward_kernel(ffStream_t stream,
CombinePerDeviceState const *m,
GenericTensorAccessorR const &input,
GenericTensorAccessorW const &output) {
DataTypeDispatch1<ForwardKernel>{}(m->data_type, stream, input, output);
DataTypeDispatch1<ForwardKernel>{}(input.data_type, stream, input, output);
}

void backward_kernel(ffStream_t stream,
CombinePerDeviceState const *m,
GenericTensorAccessorR const &output_grad,
GenericTensorAccessorW const &input_grad) {
DataTypeDispatch1<BackwardKernel>{}(
m->data_type, stream, output_grad, input_grad);
input_grad.data_type, stream, output_grad, input_grad);
}

} // namespace Combine
Expand Down
92 changes: 25 additions & 67 deletions lib/kernels/src/hip/concat_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,59 +18,35 @@
#include <hip/hip_runtime.h>

namespace FlexFlow {

// declare Legion names
using Legion::coord_t;
using Legion::Rect;

namespace Kernels {
namespace Concat {

void init_meta(ConcatPerDeviceState *m, int legion_axis) {
m->legion_axis = legion_axis;
}

template <int N>
void calc_blk_size(coord_t &num_blocks,
coord_t &blk_size,
Rect<N> rect,
int axis) {
void calc_blk_size(size_t &num_blocks,
size_t &blk_size,
ArrayShape const &shape,
ff_dim_t axis) {
num_blocks = 1;
blk_size = 1;
for (int d = 0; d < N; d++) {
for (int d = 0; d < shape.num_dims(); d++) {
if (d <= axis) {
blk_size *= (rect.hi[d] - rect.lo[d] + 1);
blk_size *= shape[legion_dim_t(d)];
} else {
num_blocks *= (rect.hi[d] - rect.lo[d] + 1);
num_blocks *= shape[legion_dim_t(d)];
}
}
}

void forward_kernel(hipStream_t stream,
GenericTensorAccessorW const &output,
GenericTensorAccessorR const *inputs,
int num_inputs,
int axis) {
coord_t num_blocks = 1, output_blk_size = 1, input_blk_sizes[MAX_NUM_INPUTS];
std::vector<GenericTensorAccessorR> const &inputs,
ff_dim_t axis) {
size_t num_blocks = 1, output_blk_size = 1, input_blk_sizes[MAX_NUM_INPUTS];
int num_inputs = inputs.size();
assert(num_inputs <= MAX_NUM_INPUTS);
switch (output.domain.get_dim()) {
#define DIMFUNC(DIM) \
case DIM: { \
Rect<DIM> rect = output.domain; \
calc_blk_size<DIM>(num_blocks, output_blk_size, rect, axis); \
for (int i = 0; i < num_inputs; i++) { \
rect = inputs[i].domain; \
coord_t input_num_blocks = 1; \
calc_blk_size<DIM>(input_num_blocks, input_blk_sizes[i], rect, axis); \
assert(input_num_blocks == num_blocks); \
} \
break; \
}
LEGION_FOREACH_N(DIMFUNC)
#undef DIMFUNC
default:
fprintf(stderr, "Unsupported concat dimension number");
assert(false);
for (int i = 0; i < num_inputs; i++) {
size_t input_num_blocks = 1;
calc_blk_size(input_num_blocks, input_blk_sizes[i], inputs[i].shape, axis);
assert(input_num_blocks == num_blocks);
}

off_t offset = 0;
Expand All @@ -89,31 +65,19 @@ void forward_kernel(hipStream_t stream,
}
}

void backward_kernel(ffStream_t stream,
void backward_kernel(hipStream_t stream,
GenericTensorAccessorR const &output_grad,
GenericTensorAccessorW const *input_grads,
int num_inputs,
int axis) {
std::vector<GenericTensorAccessorW> const &input_grads,
ff_dim_t axis) {
coord_t num_blocks = 1, output_blk_size = 1, input_blk_sizes[MAX_NUM_INPUTS];
int num_inputs = input_grads.size();
assert(num_inputs <= MAX_NUM_INPUTS);
switch (output_grad.domain.get_dim()) {
#define DIMFUNC(DIM) \
case DIM: { \
Rect<DIM> rect = output_grad.domain; \
calc_blk_size<DIM>(num_blocks, output_blk_size, rect, axis); \
for (int i = 0; i < num_inputs; i++) { \
rect = input_grads[i].domain; \
coord_t input_num_blocks = 1; \
calc_blk_size<DIM>(input_num_blocks, input_blk_sizes[i], rect, axis); \
assert(input_num_blocks == num_blocks); \
} \
break; \
}
LEGION_FOREACH_N(DIMFUNC)
#undef DIMFUNC
default:
fprintf(stderr, "Unsupported concat dimension number");
assert(false);
calc_blk_size(num_blocks, output_blk_size, output_grad.shape, axis);
for (int i = 0; i < num_inputs; i++) {
shape = input_grads[i].shape;
size_t input_num_blocks = 1;
calc_blk_size(input_num_blocks, input_blk_sizes[i], shape, axis);
assert(input_num_blocks == num_blocks);
}

off_t offset = 0;
Expand All @@ -130,12 +94,6 @@ void backward_kernel(ffStream_t stream,
output_blk_size);
offset += input_blk_sizes[i];
}

// Rect<2> output_rect(Point<2>(0, 0), Point<2>(output_blk_size-1, batch_size
// - 1)); Rect<2> input_rect(Point<2>(0, 0), Point<2>(input_blk_sizes[0]-1,
// batch_size - 1)); print_tensor<2, float>(output_grad - output_blk_size,
// output_rect, "[Concat:backward:output]"); print_tensor<2,
// float>(input_grads[0], input_rect, "[Concat:backward:input0]");
}

} // namespace Concat
Expand Down
Loading