2 changes: 1 addition & 1 deletion deps/fmt
Submodule fmt updated 70 files
+0 −8 .github/dependabot.yml
+2 −3 .github/pull_request_template.md
+0 −30 .github/workflows/cifuzz.yml
+1 −9 .github/workflows/doc.yml
+13 −30 .github/workflows/linux.yml
+2 −17 .github/workflows/macos.yml
+0 −65 .github/workflows/scorecard.yml
+4 −12 .github/workflows/windows.yml
+41 −98 CMakeLists.txt
+1 −668 ChangeLog.rst
+1 −1 LICENSE.rst
+28 −42 README.rst
+145 −200 doc/api.rst
+6 −5 doc/build.py
+8 −134 doc/syntax.rst
+1 −1 include/fmt/args.h
+327 −466 include/fmt/chrono.h
+37 −18 include/fmt/color.h
+104 −27 include/fmt/compile.h
+1,139 −738 include/fmt/core.h
+152 −91 include/fmt/format-inl.h
+788 −1,081 include/fmt/format.h
+71 −44 include/fmt/os.h
+50 −22 include/fmt/ostream.h
+174 −201 include/fmt/printf.h
+145 −158 include/fmt/ranges.h
+37 −331 include/fmt/std.h
+33 −62 include/fmt/xchar.h
+29 −40 src/fmt.cc
+5 −1 src/format.cc
+62 −99 src/os.cc
+3 −3 support/Vagrantfile
+1 −0 support/bazel/.bazelrc
+1 −1 support/bazel/.bazelversion
+2 −2 support/bazel/BUILD.bazel
+4 −5 support/bazel/README.md
+1 −1 support/build.gradle
+54 −0 support/cmake/cxx14.cmake
+0 −7 support/rst2md.py
+3 −15 test/CMakeLists.txt
+1 −1 test/add-subdirectory-test/CMakeLists.txt
+1 −1 test/args-test.cc
+26 −386 test/chrono-test.cc
+2 −2 test/compile-error-test/CMakeLists.txt
+0 −1 test/compile-fp-test.cc
+40 −22 test/compile-test.cc
+239 −119 test/core-test.cc
+0 −2 test/enforce-checks-test.cc
+1 −1 test/find-package-test/CMakeLists.txt
+91 −27 test/format-impl-test.cc
+329 −285 test/format-test.cc
+1 −1 test/fuzzing/CMakeLists.txt
+0 −2 test/gtest-extra-test.cc
+6 −1 test/gtest-extra.h
+7 −1 test/gtest/CMakeLists.txt
+1 −1 test/gtest/gmock-gtest-all.cc
+2 −2 test/mock-allocator.h
+88 −36 test/module-test.cc
+56 −15 test/os-test.cc
+47 −16 test/ostream-test.cc
+7 −1 test/posix-mock-test.cc
+2 −0 test/posix-mock.h
+37 −13 test/printf-test.cc
+35 −138 test/ranges-test.cc
+1 −1 test/scan-test.cc
+14 −17 test/scan.h
+1 −1 test/static-export-test/CMakeLists.txt
+20 −198 test/std-test.cc
+6 −2 test/util.h
+55 −118 test/xchar-test.cc
36 changes: 22 additions & 14 deletions lib/kernels/include/kernels/element_unary_kernels.h
@@ -3,42 +3,50 @@

#include "kernels/accessor.h"
#include "kernels/device.h"
#include "legion.h"
#include "kernels/ff_handle.h"
#include "op-attrs/ops/element_unary.h"
#include <cstddef>

namespace FlexFlow {

-class ElementUnaryPerDeviceState : public PerDeviceOpState {
-public:
-ElementUnaryPerDeviceState(FFHandler handle);
+struct ElementUnaryPerDeviceState {
+PerDeviceFFHandle handle;
ffTensorDescriptor_t inputTensor, outputTensor;
ffActivationDescriptor_t actiDesc;

OperatorType op_type;
DataType data_type;
-bool inplace;
float scalar;
-char op_name[MAX_OPNAME];
};

+FF_VISITABLE_STRUCT_NO_EQ(ElementUnaryPerDeviceState,
+handle,
+inputTensor,
+outputTensor,
+actiDesc,
+op_type,
+data_type,
+scalar);

namespace Kernels {
namespace ElementUnary {

-void init_kernel(ElementUnaryPerDeviceState *m,
-Legion::Domain const &input_domain,
-Legion::Domain const &output_domain);
+ElementUnaryPerDeviceState init_kernel(PerDeviceFFHandle const &handle,
+ArrayShape const &input_shape,
+ArrayShape const &output_shape,
+DataType data_type);

void forward_kernel(ffStream_t stream,
-ElementUnaryPerDeviceState const *m,
+ElementUnaryPerDeviceState const &device_state,
GenericTensorAccessorR const &input,
GenericTensorAccessorW const &output);

void backward_kernel(ffStream_t stream,
-ElementUnaryPerDeviceState const *m,
+ElementUnaryPerDeviceState const &device_state,
GenericTensorAccessorR const &input,
-GenericTensorAccessorR const &input_grad,
-GenericTensorAccessorW const &output,
-GenericTensorAccessorW const &output_grad);
+GenericTensorAccessorW const &input_grad,
+GenericTensorAccessorR const &output,
+GenericTensorAccessorR const &output_grad);

} // namespace ElementUnary
} // namespace Kernels
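The net effect of this header change is that per-device state becomes a plain value returned by `init_kernel` rather than a class mutated through a `PerDeviceOpState` pointer. A minimal call-site sketch under that reading — the wrapper function and the `DataType::FLOAT` spelling are illustrative assumptions, not part of this diff, and note that the `.cu` definition further below additionally takes an `OperatorType`:

```cpp
#include "kernels/element_unary_kernels.h"

using namespace FlexFlow;

// Hypothetical call site; handle, shape, and accessors come from the caller.
void apply_unary_op(PerDeviceFFHandle const &handle,
                    ffStream_t stream,
                    ArrayShape const &shape,
                    GenericTensorAccessorR const &input,
                    GenericTensorAccessorW const &output) {
  // State is created once per device, then passed by const reference.
  ElementUnaryPerDeviceState state = Kernels::ElementUnary::init_kernel(
      handle, /*input_shape=*/shape, /*output_shape=*/shape, DataType::FLOAT);
  Kernels::ElementUnary::forward_kernel(stream, state, input, output);
}
```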
16 changes: 6 additions & 10 deletions lib/kernels/include/kernels/embedding_kernels.h
@@ -5,29 +5,25 @@
#include "kernels/device.h"

namespace FlexFlow {
-
-class EmbeddingPerDeviceState : public PerDeviceOpState {
-public:
-EmbeddingPerDeviceState(FFHandler handle);
-DataType input_data_type, output_data_type;
-AggrMode aggr;
-};
-
namespace Kernels {
namespace Embedding {
void forward_kernel(ffStream_t stream,
-EmbeddingPerDeviceState const *m,
GenericTensorAccessorR const &input,
GenericTensorAccessorW const &output,
GenericTensorAccessorR const &weight,
+DataType input_data_type,
+DataType output_data_type,
+AggrMode aggr,
int in_dim,
int out_dim,
int batch_size);
void backward_kernel(ffStream_t stream,
-EmbeddingPerDeviceState const *m,
GenericTensorAccessorR const &input,
GenericTensorAccessorR const &output,
GenericTensorAccessorW const &weight_grad,
+DataType input_data_type,
+DataType output_data_type,
+AggrMode aggr,
int in_dim,
int out_dim,
int batch_size);
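Since `EmbeddingPerDeviceState` is deleted outright, callers now pass the former state fields on every invocation. A hedged sketch of the new call — the wrapper and the enumerator spellings (`DataType::INT32`, `DataType::FLOAT`, `AggrMode::SUM`) are assumptions for illustration:

```cpp
#include "kernels/embedding_kernels.h"

using namespace FlexFlow;

void embed_batch(ffStream_t stream,
                 GenericTensorAccessorR const &input,   // token indices
                 GenericTensorAccessorW const &output,  // pooled embeddings
                 GenericTensorAccessorR const &weight,  // embedding table
                 int in_dim, int out_dim, int batch_size) {
  // All fields of the deleted EmbeddingPerDeviceState are now plain arguments.
  Kernels::Embedding::forward_kernel(stream, input, output, weight,
                                     DataType::INT32, DataType::FLOAT,
                                     AggrMode::SUM, in_dim, out_dim,
                                     batch_size);
}
```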
35 changes: 16 additions & 19 deletions lib/kernels/include/kernels/layer_norm_kernels.h
@@ -5,42 +5,39 @@

namespace FlexFlow {
-
-class LayerNormPerDeviceState : public PerDeviceOpState {
-public:
-LayerNormPerDeviceState(FFHandler handle,
-bool elementwise_affine_,
-int64_t effective_batch_size_,
-int64_t effective_num_elements_,
-bool profiling_,
-float eps_);
-
-public:
-bool elementwise_affine;
-int64_t effective_batch_size, effective_num_elements;
-float eps;
+struct LayerNormPerDeviceState {
float *mean, *rstd, *ds, *db, *scale, *bias;
-char op_name[MAX_OPNAME];
+DataType data_type;
};

namespace Kernels {
namespace LayerNorm {

+LayerNormPerDeviceState init_kernel(PerDeviceFFHandle handle,
+int64_t batch_size);

void forward_kernel(ffStream_t stream,
-LayerNormPerDeviceState const *m,
+LayerNormPerDeviceState const &m,
GenericTensorAccessorR const &input,
GenericTensorAccessorW const &output,
GenericTensorAccessorW const &gamma,
-GenericTensorAccessorW const &beta);
+GenericTensorAccessorW const &beta,
+DataType data_type,
+int64_t batch_size,
+int64_t num_elements,
+float eps);

void backward_kernel(ffStream_t stream,
-LayerNormPerDeviceState const *m,
+LayerNormPerDeviceState const &m,
GenericTensorAccessorR const &output_grad,
GenericTensorAccessorR const &input,
GenericTensorAccessorW const &input_grad,
GenericTensorAccessorR const &gamma,
GenericTensorAccessorW const &gamma_grad,
-GenericTensorAccessorW const &beta_grad);
+GenericTensorAccessorW const &beta_grad,
+DataType data_type,
+int64_t batch_size,
+int64_t num_elements,
+float eps);

} // namespace LayerNorm
} // namespace Kernels
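Same pattern as element-unary: the struct keeps only the device scratch buffers, `init_kernel` returns it by value, and `data_type`/`batch_size`/`num_elements`/`eps` ride along on each call. A sketch under those assumptions — the wrapper, `DataType::FLOAT`, and the eps value are illustrative:

```cpp
#include "kernels/layer_norm_kernels.h"

using namespace FlexFlow;

void layer_norm_forward(PerDeviceFFHandle handle,
                        ffStream_t stream,
                        int64_t batch_size,
                        int64_t num_elements,
                        GenericTensorAccessorR const &input,
                        GenericTensorAccessorW const &output,
                        GenericTensorAccessorW const &gamma,
                        GenericTensorAccessorW const &beta) {
  // init_kernel now returns the state (mean/rstd/ds/db/scale/bias buffers) ...
  LayerNormPerDeviceState state =
      Kernels::LayerNorm::init_kernel(handle, batch_size);
  // ... and the former member fields travel with each invocation.
  Kernels::LayerNorm::forward_kernel(stream, state, input, output, gamma, beta,
                                     DataType::FLOAT, batch_size, num_elements,
                                     /*eps=*/1e-5f);
}
```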
103 changes: 52 additions & 51 deletions lib/kernels/src/cuda/element_unary_kernels.cu
@@ -18,18 +18,6 @@
#include "kernels/element_unary_kernels.h"

namespace FlexFlow {
-
-// declare Legion names
-using Legion::coord_t;
-using Legion::Domain;
-
-ElementUnaryPerDeviceState::ElementUnaryPerDeviceState(FFHandler handler)
-: PerDeviceOpState(handler) {
-checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor));
-checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor));
-checkCUDNN(cudnnCreateActivationDescriptor(&actiDesc));
-}
-
namespace Kernels {
namespace ElementUnary {

@@ -45,13 +33,23 @@ static bool use_cudnn(OperatorType op_type) {
}
}

-void init_kernel(ElementUnaryPerDeviceState *m,
-Domain const &input_domain,
-Domain const &output_domain) {
+ElementUnaryPerDeviceState init_kernel(PerDeviceFFHandle const &handle,
+ArrayShape const &input_shape,
+ArrayShape const &output_shape,
+OperatorType op_type,
+DataType data_type) {
+
+ffTensorDescriptor_t inputTensor;
+ffTensorDescriptor_t outputTensor;
+ffActivationDescriptor_t actiDesc;

-if (use_cudnn(m->op_type)) {
+checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor));
+checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor));
+checkCUDNN(cudnnCreateActivationDescriptor(&actiDesc));
+
+if (use_cudnn(op_type)) {
cudnnActivationMode_t mode;
-switch (m->op_type) {
+switch (op_type) {
case OP_SIGMOID:
mode = CUDNN_ACTIVATION_SIGMOID;
break;
Expand All @@ -67,41 +65,46 @@ void init_kernel(ElementUnaryPerDeviceState *m,
default:
assert(false);
}
-checkCUDNN(cudnnSetActivationDescriptor(
-m->actiDesc, mode, CUDNN_PROPAGATE_NAN, 0.0));
checkCUDNN(
-cudnnSetTensorDescriptorFromDomain(m->inputTensor, input_domain));
-// input_domain == output_domain
+cudnnSetActivationDescriptor(actiDesc, mode, CUDNN_PROPAGATE_NAN, 0.0));
checkCUDNN(
-cudnnSetTensorDescriptorFromDomain(m->outputTensor, output_domain));
+cudnnSetTensorDescriptorFromArrayShape(inputTensor, input_shape));
+// input_shape == output_shape
+checkCUDNN(
+cudnnSetTensorDescriptorFromArrayShape(outputTensor, output_shape));
}
+
+ElementUnaryPerDeviceState per_device_state = {
+handle, inputTensor, outputTensor, actiDesc, op_type, data_type, scalar};
+
+return per_device_state;
}

template <DataType T>
struct ForwardKernel {
void operator()(ffStream_t stream,
-ElementUnaryPerDeviceState const *m,
+ElementUnaryPerDeviceState const &m,
GenericTensorAccessorR const &input,
GenericTensorAccessorW const &output) const {
-checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
-if (use_cudnn(m->op_type)) {
+checkCUDNN(cudnnSetStream(m.handle.dnn, stream));
+if (use_cudnn(m.op_type)) {
float alpha = 1.0f, beta = 0.0f;
-checkCUDNN(cudnnActivationForward(m->handle.dnn,
-m->actiDesc,
+checkCUDNN(cudnnActivationForward(m.handle.dnn,
+m.actiDesc,
&alpha,
-m->inputTensor,
+m.inputTensor,
input.get<T>(),
&beta,
-m->outputTensor,
+m.outputTensor,
output.get<T>()));
} else {
size_t num_elements = input.shape.num_elements();
elewise_unary_forward_kernel<<<GET_BLOCKS(num_elements),
CUDA_NUM_THREADS,
0,
stream>>>(num_elements,
-(T)m->scalar,
-m->op_type,
+(T)m.scalar,
+m.op_type,
input.get<T>(),
output.get<T>());
}
@@ -111,34 +114,34 @@
template <DataType T>
struct BackwardKernel {
void operator()(ffStream_t stream,
-ElementUnaryPerDeviceState const *m,
+ElementUnaryPerDeviceState const &m,
GenericTensorAccessorR const &input,
-GenericTensorAccessorR const &input_grad,
-GenericTensorAccessorW const &output,
-GenericTensorAccessorW const &output_grad) {
-checkCUDNN(cudnnSetStream(m->handle.dnn, stream));
+GenericTensorAccessorW const &input_grad,
+GenericTensorAccessorR const &output,
+GenericTensorAccessorR const &output_grad) {
+checkCUDNN(cudnnSetStream(m.handle.dnn, stream));

-if (use_cudnn(m->op_type)) {
+if (use_cudnn(m.op_type)) {
float alpha = 1.0f;
-checkCUDNN(cudnnActivationBackward(m->handle.dnn,
-m->actiDesc,
+checkCUDNN(cudnnActivationBackward(m.handle.dnn,
+m.actiDesc,
&alpha,
-m->outputTensor,
+m.outputTensor,
output.get<T>(),
-m->outputTensor,
+m.outputTensor,
output_grad.get<T>()),
-m->inputTensor,
+m.inputTensor,
input.get<T>(),
&alpha,
-m->inputTensor,
+m.inputTensor,
input_grad.get<T>()));
} else {
size_t num_elements = input.shape.num_elements();
elewise_unary_backward_kernel<T>
<<<GET_BLOCKS(num_elements), CUDA_NUM_THREADS, 0, stream>>>(
num_elements,
-m->scalar,
-m->op_type,
+m.scalar,
+m.op_type,
output.get<T>(),
output_grad.get<T>(),
input.get<T>(),
@@ -148,21 +151,19 @@
}

void forward_kernel(ffStream_t stream,
-ElementUnaryPerDeviceState const *m,
+ElementUnaryPerDeviceState const &device_state,
GenericTensorAccessorR const &input,
GenericTensorAccessorW const &output) {
-{
-DataTypeDispatch1<ForwardKernel>{}(m->data_type, stream, m, input, output);
-}
+{ DataTypeDispatch1<ForwardKernel>{}(m.data_type, stream, m, input, output); }

void backward_kernel(ffStream_t stream,
-ElementUnaryPerDeviceState const *m,
+ElementUnaryPerDeviceState const &device_state,
GenericTensorAccessorR const &input,
GenericTensorAccessorR const &input_grad,
GenericTensorAccessorW const &output,
GenericTensorAccessorW const &output_grad)
DataTypeDispatch1<BackwardKernel>{}(
-m->data_type, stream, m, input, input_grad, output, output_grad);
+m.data_type, stream, m, input, input_grad, output, output_grad);
}

template <typename T>
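The `forward_kernel`/`backward_kernel` wrappers rely on `DataTypeDispatch1` to turn the runtime `DataType` into the compile-time template argument of `ForwardKernel`/`BackwardKernel`. A self-contained sketch of that dispatch idiom, simplified to two types — this is not FlexFlow's actual implementation:

```cpp
#include <cassert>
#include <utility>

enum class DataType { FLOAT, DOUBLE };

// F is a functor template indexed by a DataType value, mirroring how
// ForwardKernel/BackwardKernel are declared in this file.
template <template <DataType> class F, typename... Args>
void data_type_dispatch1(DataType dt, Args &&...args) {
  switch (dt) {
  case DataType::FLOAT:
    F<DataType::FLOAT>{}(std::forward<Args>(args)...);
    break;
  case DataType::DOUBLE:
    F<DataType::DOUBLE>{}(std::forward<Args>(args)...);
    break;
  default:
    assert(false && "unsupported data type");
  }
}
```

With a functor declared as `template <DataType T> struct ForwardKernel { void operator()(/*...*/); };`, a call like `data_type_dispatch1<ForwardKernel>(m.data_type, stream, m, input, output)` selects the right instantiation at runtime, which is exactly the shape of the wrapper calls above.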