diff --git a/.flake/pkgs/fccf/default.nix b/.flake/pkgs/fccf/default.nix new file mode 100644 index 0000000000..f792b8606c --- /dev/null +++ b/.flake/pkgs/fccf/default.nix @@ -0,0 +1,54 @@ +{ fetchFromGitHub +, stdenv +, cmake +, pkg-config +, libclang +, libllvm +, lib +, zlib +, argparse +, nlohmann_json +, fmt +}: + +stdenv.mkDerivation rec { + pname = "fccf"; + version = "03d373fc65e2d7ceeac441ba4bbddfdc25618dff"; + + src = fetchFromGitHub { + owner = "p-ranav"; + repo = "fccf"; + rev = version; + sha256 = "sha256-3NdPon5ZfjoGFFgBlb0rzRnfWgSopvAc5Gls2NWHaOE="; + }; + + nativeBuildInputs = [ + cmake + pkg-config + ]; + + buildInputs = [ + libclang + libllvm + zlib + argparse + nlohmann_json + fmt + ]; + + patches = [ + ./json-package-name.patch + ./fix-argparse-include.patch + ]; + + cmakeFlags = [ + "-DCMAKE_BUILD_TYPE=Release" + "-DFETCHCONTENT_TRY_FIND_PACKAGE_MODE=ALWAYS" + ]; + + meta = with lib; { + description = "A command-line tool that quickly searches through C/C++ source code in a directory based on a search string and prints relevant code snippets that match the query"; + homepage = "https://github.com/p-ranav/fccf"; + license = licenses.mit; + }; +} diff --git a/.flake/pkgs/fccf/fix-argparse-include.patch b/.flake/pkgs/fccf/fix-argparse-include.patch new file mode 100644 index 0000000000..2cb648c1bf --- /dev/null +++ b/.flake/pkgs/fccf/fix-argparse-include.patch @@ -0,0 +1,13 @@ +diff --git a/source/main.cpp b/source/main.cpp +index 7e131d3..6c05d89 100644 +--- a/source/main.cpp ++++ b/source/main.cpp +@@ -6,7 +6,7 @@ + #include + #include + +-#include ++#include + #include + #include "searcher.hpp" + #include diff --git a/.flake/pkgs/fccf/json-package-name.patch b/.flake/pkgs/fccf/json-package-name.patch new file mode 100644 index 0000000000..51f6a012cf --- /dev/null +++ b/.flake/pkgs/fccf/json-package-name.patch @@ -0,0 +1,12 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 20bcbbf..923075f 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt 
+@@ -48,6 +48,7 @@ FetchContent_MakeAvailable(fmt) + + FetchContent_Declare(json + URL https://github.com/nlohmann/json/releases/download/v3.10.5/json.tar.xz ++ FIND_PACKAGE_ARGS NAMES nlohmann_json + ) + FetchContent_MakeAvailable(json) + diff --git a/.github/runs-on.yml b/.github/runs-on.yml index a4fff33536..5033e69d65 100644 --- a/.github/runs-on.yml +++ b/.github/runs-on.yml @@ -1,23 +1,4 @@ images: - runs-on-gpu-pinned: - platform: "linux" - arch: "x64" - owner: "135269210855" # runs-on - # to find, go to - # https://us-east-2.console.aws.amazon.com/ec2/home?region=us-east-2#Images:visibility=public-images;search=:runs-on;v=3;$case=tags:false%5C,client:false;$regex=tags:false%5C,client:false - name: "runs-on-v2.2-ubuntu22-gpu-x64-20250220122045" - - runs-on-cpu-pinned: - platform: "linux" - arch: "x64" - owner: "135269210855" # runs-on - name: "runs-on-v2.2-ubuntu22-full-x64-20250220122045" - - official-ubuntu-ami: - platform: "linux" - arch: "x64" - ami: "ami-0a60b027285c0d4c5" - flexflow-gpu-ci: platform: "linux" arch: "x64" diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 9d98fb07dd..799e3069a9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -57,9 +57,9 @@ jobs: name: GPU unit tests needs: cpu-ci runs-on: - - runs-on + - runs-on=${{ github.run_id }} - family=g4dn.xlarge - - image=runs-on-gpu-pinned + - image=flexflow-gpu-ci strategy: max-parallel: 1 diff --git a/.proj.toml b/.proj.toml index a06fb53c3a..8eed6166cd 100644 --- a/.proj.toml +++ b/.proj.toml @@ -2,57 +2,81 @@ project_name = "flexflow" testsuite_macro = "FF_TEST_SUITE" namespace_name = "FlexFlow" header_extension = ".h" +cuda_launch_cmd = [ + "nixGL", + "--", +] [targets.utils] type = "lib" -tests = true -benchmarks = true +has-cpu-only-tests = true +has-cpu-only-benchmarks = true +has-cuda-tests = false +has-cuda-benchmarks = false [targets.op-attrs] type = "lib" -tests = true -benchmarks = false +has-cpu-only-tests = true 
+has-cpu-only-benchmarks = false +has-cuda-tests = false +has-cuda-benchmarks = false [targets.kernels] type = "lib" -tests = true -benchmarks = false +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = true +has-cuda-benchmarks = false [targets.pcg] type = "lib" -tests = true -benchmarks = false +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = false +has-cuda-benchmarks = false [targets.substitutions] type = "lib" -tests = true -benchmarks = false +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = false +has-cuda-benchmarks = false [targets.compiler] type = "lib" -tests = true -benchmarks = true +has-cpu-only-tests = true +has-cpu-only-benchmarks = true +has-cuda-tests = false +has-cuda-benchmarks = false [targets.substitution-generator] type = "lib" -tests = true -benchmarks = false +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = false +has-cuda-benchmarks = false [targets.local-execution] type = "lib" -tests = true -benchmarks = false +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = false +has-cuda-benchmarks = false [targets.models] type = "lib" -tests = true -benchmarks = false +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = false +has-cuda-benchmarks = false [targets.export-model-arch] type = "bin" +cuda = false [targets.substitution-to-dot] type = "bin" +cuda = false # default_build_targets = [ # "utils", diff --git a/cmake/flexflow-utils.cmake b/cmake/flexflow-utils.cmake index 478ebda318..ef5d6d9d11 100644 --- a/cmake/flexflow-utils.cmake +++ b/cmake/flexflow-utils.cmake @@ -126,11 +126,16 @@ function(ff_add_test_executable) ${FF_TEST_EXEC_NAME} ${SRC}) + target_include_directories( + ${FF_TEST_EXEC_NAME} + PRIVATE + ${FF_TEST_EXEC_PRIVATE_INCLUDE}) + target_link_libraries( ${FF_TEST_EXEC_NAME} ${FF_TEST_EXEC_DEPS}) - target_compile_definitions(${FF_TEST_EXEC_NAME} PRIVATE 
FF_TEST_SUITE="${FF_TEST_EXEC_NAME}" FF_CUDA_TEST_SUITE="cuda-${FF_TEST_EXEC_NAME}") + target_compile_definitions(${FF_TEST_EXEC_NAME} PRIVATE FF_TEST_SUITE="cpu-${FF_TEST_EXEC_NAME}" FF_CUDA_TEST_SUITE="cuda-${FF_TEST_EXEC_NAME}") define_ff_vars(${FF_TEST_EXEC_NAME}) ff_set_cxx_properties(${FF_TEST_EXEC_NAME}) diff --git a/flake.lock b/flake.lock index c991232013..ff6e797d51 100644 --- a/flake.lock +++ b/flake.lock @@ -66,11 +66,11 @@ ] }, "locked": { - "lastModified": 1741679698, - "narHash": "sha256-poSOQS/2qImAo/PgRu37pHdOrwAsZEyC8PMM3evFLX4=", + "lastModified": 1746157536, + "narHash": "sha256-g4Hx/05+Ce3hl8OS1zm4pY/+ThD1blWKmcaPsohSX5Y=", "owner": "lockshaw", "repo": "proj", - "rev": "0de983ff66abea4703f73988d29fc807e2b0a9bd", + "rev": "5871bc7b7fb9d7d7f14c8bca6c50a0cf2e75834d", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index 77a6c61b7d..5fa48fa3fd 100644 --- a/flake.nix +++ b/flake.nix @@ -59,6 +59,7 @@ bencher-cli = pkgs.callPackage ./.flake/pkgs/bencher-cli.nix { }; ffdb = pkgs.callPackage ./.flake/pkgs/ffdb { inherit proj; }; hpp2plantuml = pkgs.python3Packages.callPackage ./.flake/pkgs/hpp2plantuml.nix { }; + fccf = pkgs.callPackage ./.flake/pkgs/fccf { }; rapidcheckFull = pkgs.symlinkJoin { name = "rapidcheckFull"; paths = (with pkgs; [ rapidcheck.out rapidcheck.dev ]); @@ -162,6 +163,7 @@ ruff jq gh + expect ]) (with pkgs.python3Packages; [ gitpython @@ -179,6 +181,7 @@ (with self.packages.${system}; [ ffdb hpp2plantuml + fccf ]) ]; }; diff --git a/lib/kernels/CMakeLists.txt b/lib/kernels/CMakeLists.txt index 8ccd7c1011..f5d88f102f 100644 --- a/lib/kernels/CMakeLists.txt +++ b/lib/kernels/CMakeLists.txt @@ -7,8 +7,7 @@ file(GLOB_RECURSE SRC CONFIGURE_DEPENDS LIST_DIRECTORIES False src/*.cc - src/cuda/cuda_helper.cu - src/cuda/ops/*.cu + src/cuda/*.cu ) add_library( @@ -30,6 +29,7 @@ target_link_libraries( cudnn nccl utils + pcg ) define_ff_vars(${project_target}) diff --git a/lib/kernels/include/kernels/accessor.h 
b/lib/kernels/include/kernels/accessor.h index 39da65c3be..f9bef91b25 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -1,25 +1,88 @@ #ifndef _FLEXFLOW_KERNELS_ACCESSOR_H #define _FLEXFLOW_KERNELS_ACCESSOR_H -#include "array_shape.h" -#include "device.h" +#include "kernels/array_shape.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include "op-attrs/datatype.h" -#include "utils/exception.h" +#include "pcg/device_type.dtg.h" +#include "utils/containers/transform.h" #include "utils/required.h" +#include namespace FlexFlow { +nonnegative_int + calculate_accessor_offset(LegionOrdered const &, + ArrayShape const &); + +class GenericTensorAccessorR { +public: + template + typename data_type_enum_to_class
::type const *get() const { + ASSERT(this->data_type == DT, "Invalid datatype requested"); + + return static_cast const *>(this->ptr); + } + + int32_t const *get_int32_ptr() const; + int64_t const *get_int64_ptr() const; + float const *get_float_ptr() const; + double const *get_double_ptr() const; + half const *get_half_ptr() const; + + GenericTensorAccessorR() = delete; + + GenericTensorAccessorR(DataType data_type, + ArrayShape const &shape, + void const *ptr, + DeviceType device_type); + + bool operator==(GenericTensorAccessorR const &) const; + bool operator!=(GenericTensorAccessorR const &) const; + + template + real_type_t
<DT> const &at(FFOrdered<nonnegative_int> const &indices) const { + return this->at<DT>(legion_ordered_from_ff_ordered(indices)); + } + + template <DataType DT> + real_type_t<DT>
const & + at(LegionOrdered const &indices) const { + ASSERT(this->device_type == DeviceType::CPU, + "GenericTensorAccessorR::at() requires CPU-allocated tensor"); + ASSERT(this->data_type == DT, "Invalid datatype requested"); + + using T = real_type_t
; + T const *data_ptr = static_cast(this->ptr); + nonnegative_int offset = calculate_accessor_offset(indices, this->shape); + return data_ptr[offset.unwrap_nonnegative()]; + } + +public: + DataType data_type; + ArrayShape shape; + void const *ptr; + DeviceType device_type; + +private: + std::tuple + tie() const; +}; + +std::string format_as(GenericTensorAccessorR const &); +std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &); + class GenericTensorAccessorW { public: template typename data_type_enum_to_class
::type *get() const { - if (this->data_type == DT) { - return static_cast *>(this->ptr); - } else { - throw mk_runtime_error(fmt::format( - "Invalid access data type ({} != {})", this->data_type, DT)); - } + ASSERT(this->data_type == DT, "Invalid datatype requested"); + + return static_cast *>(this->ptr); } int32_t *get_int32_ptr() const; @@ -28,76 +91,76 @@ class GenericTensorAccessorW { double *get_double_ptr() const; half *get_half_ptr() const; -public: - DataType data_type; - ArrayShape shape; - req ptr; -}; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorW, - data_type, - shape, - ptr); + GenericTensorAccessorW() = delete; -std::string format_as(GenericTensorAccessorW const &); -std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &); + GenericTensorAccessorW(DataType data_type, + ArrayShape const &shape, + void *ptr, + DeviceType device_type); + + bool operator==(GenericTensorAccessorW const &) const; + bool operator!=(GenericTensorAccessorW const &) const; + + operator GenericTensorAccessorR() const; -class GenericTensorAccessorR { -public: template - typename data_type_enum_to_class
::type const *get() const { - if (this->data_type == DT) { - return static_cast const *>(this->ptr); - } else { - throw mk_runtime_error(fmt::format( - "Invalid access data type ({} != {})", this->data_type, DT)); - } + real_type_t
&at(FFOrdered const &indices) { + return this->at
(legion_ordered_from_ff_ordered(indices)); } - int32_t const *get_int32_ptr() const; - int64_t const *get_int64_ptr() const; - float const *get_float_ptr() const; - double const *get_double_ptr() const; - half const *get_half_ptr() const; + template + real_type_t
&at(LegionOrdered const &indices) { + ASSERT(this->device_type == DeviceType::CPU, + "GenericTensorAccessorW::at() requires CPU-allocated tensor"); + ASSERT(this->data_type == DT, "Invalid datatype requested"); + + using T = real_type_t
; + T *data_ptr = static_cast(this->ptr); + nonnegative_int offset = calculate_accessor_offset(indices, this->shape); + return data_ptr[offset.unwrap_nonnegative()]; + } + + template + real_type_t
<DT> const &at(FFOrdered<nonnegative_int> const &indices) const { + return this->at<DT>(legion_ordered_from_ff_ordered(indices)); + } + + template <DataType DT> + real_type_t<DT>
&at(LegionOrdered const &indices) const { + ASSERT(this->device_type == DeviceType::CPU, + "GenericTensorAccessorW::at() requires CPU-allocated tensor"); + ASSERT(this->data_type == DT, "Invalid datatype requested"); + + using T = real_type_t
; + T const *data_ptr = static_cast(this->ptr); + nonnegative_int offset = calculate_accessor_offset(indices, this->shape); + return data_ptr[offset]; + } public: DataType data_type; ArrayShape shape; - req ptr; + void *ptr; + DeviceType device_type; + +private: + std::tuple + tie() const; }; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorR, - data_type, - shape, - ptr); - -std::string format_as(GenericTensorAccessorR const &); -std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &); -int32_t *get_int32_ptr(GenericTensorAccessorW const &); -int64_t *get_int64_ptr(GenericTensorAccessorW const &); -float *get_float_ptr(GenericTensorAccessorW const &); -double *get_double_ptr(GenericTensorAccessorW const &); -half *get_half_ptr(GenericTensorAccessorW const &); -std::vector - get_int32_ptrs(std::vector const &); -std::vector - get_int64_ptrs(std::vector const &); -std::vector - get_float_ptrs(std::vector const &); -std::vector - get_double_ptrs(std::vector const &); -std::vector get_half_ptrs(std::vector const &); +std::string format_as(GenericTensorAccessorW const &); +std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &); static_assert(is_fmtable const &>::value, ""); template typename data_type_enum_to_class
::type * get(GenericTensorAccessorW const &a) { - if (a.data_type == DT) { - return static_cast *>(a.ptr); - } else { - throw mk_runtime_error( - fmt::format("Invalid access data type ({} != {})", a.data_type, DT)); - } + ASSERT(a.data_type == DT, "Invalid datatype requested"); + return static_cast *>(a.ptr); } template @@ -113,12 +176,8 @@ std::vector *> template typename data_type_enum_to_class
::type const * get(GenericTensorAccessorR const &a) { - if (a.data_type == DT) { - return static_cast const *>(a.ptr); - } else { - throw mk_runtime_error( - fmt::format("Invalid access data type ({} != {})", a.data_type, DT)); - } + ASSERT(a.data_type == DT, "Invalid datatype requested"); + return static_cast const *>(a.ptr); } int32_t const *get_int32_ptr(GenericTensorAccessorR const &); @@ -137,6 +196,21 @@ std::vector std::vector get_half_ptrs(std::vector const &); +int32_t *get_int32_ptr(GenericTensorAccessorW const &); +int64_t *get_int64_ptr(GenericTensorAccessorW const &); +float *get_float_ptr(GenericTensorAccessorW const &); +double *get_double_ptr(GenericTensorAccessorW const &); +half *get_half_ptr(GenericTensorAccessorW const &); +std::vector + get_int32_ptrs(std::vector const &); +std::vector + get_int64_ptrs(std::vector const &); +std::vector + get_float_ptrs(std::vector const &); +std::vector + get_double_ptrs(std::vector const &); +std::vector get_half_ptrs(std::vector const &); + template std::vector const *> get(std::vector const &accs) { @@ -150,12 +224,8 @@ std::vector const *> GenericTensorAccessorR read_only_accessor_from_write_accessor( GenericTensorAccessorW const &write_accessor); -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - GenericTensorAccessorW const &acc2); - -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype); +bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, + GenericTensorAccessorR const &acc2); bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, ArrayShape const &expected_shape, @@ -163,8 +233,9 @@ bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, std::pair get_shape_and_datatype(GenericTensorAccessorR const &accessor); -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor); + +void copy_accessor_data_to_l_from_r(GenericTensorAccessorW 
&dst_accessor, + GenericTensorAccessorR const &src_accessor); } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/allocation.h b/lib/kernels/include/kernels/allocation.h index 6500899394..39bad6599c 100644 --- a/lib/kernels/include/kernels/allocation.h +++ b/lib/kernels/include/kernels/allocation.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_ALLOCATION_H #define _FLEXFLOW_KERNELS_ALLOCATION_H -#include "accessor.h" +#include "kernels/accessor.h" #include #include @@ -11,6 +11,8 @@ struct IAllocator { virtual void *allocate(size_t) = 0; virtual void deallocate(void *) = 0; + virtual DeviceType get_allocation_device_type() const = 0; + virtual ~IAllocator() = default; }; @@ -18,9 +20,14 @@ struct Allocator { Allocator() = delete; GenericTensorAccessorW allocate_tensor(TensorShape const &tensor_shape); + void deallocate_tensor(GenericTensorAccessorW const &); + void deallocate_tensor(GenericTensorAccessorR const &); + void *allocate(size_t mem_size); void deallocate(void *ptr); + DeviceType get_allocation_device_type() const; + template static typename std::enable_if::value, Allocator>::type diff --git a/lib/kernels/include/kernels/array_coord.struct.toml b/lib/kernels/include/kernels/array_coord.struct.toml new file mode 100644 index 0000000000..8ce121f2bf --- /dev/null +++ b/lib/kernels/include/kernels/array_coord.struct.toml @@ -0,0 +1,19 @@ +namespace = "FlexFlow" +name = "ArrayCoord" +features = [ + "eq", + "ord", + "hash", + "fmt", + "rapidcheck", + "json", +] + +includes = [ + "op-attrs/ff_ordered/ff_ordered.h", + "utils/nonnegative_int/nonnegative_int.h" +] + +[[fields]] +name = "ff_ordered" +type = "::FlexFlow::FFOrdered<::FlexFlow::nonnegative_int>" diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h index 57498ee466..25ef8116f2 100644 --- a/lib/kernels/include/kernels/array_shape.h +++ b/lib/kernels/include/kernels/array_shape.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_ARRAY_SHAPE_H #define 
_FLEXFLOW_KERNELS_ARRAY_SHAPE_H +#include "kernels/array_coord.dtg.h" #include "kernels/legion_dim.h" #include "op-attrs/tensor_shape.dtg.h" #include "utils/nonnegative_int/nonnegative_int.h" @@ -15,9 +16,7 @@ namespace FlexFlow { struct ArrayShape { public: ArrayShape() = delete; - ArrayShape(nonnegative_int *dims, nonnegative_int num_dims); - ArrayShape(TensorShape const &shape); - ArrayShape(std::vector const &); + explicit ArrayShape(LegionOrdered const &dims); /** * @brief Alias of ArrayShape::num_elements for compatibility with @@ -46,24 +45,40 @@ struct ArrayShape { std::optional at_maybe(legion_dim_t) const; std::optional at_maybe(ff_dim_t) const; - ArrayShape - sub_shape(std::optional> start, - std::optional> end) const; + ArrayShape sub_shape(ff_dim_t const &start, + std::optional const &end) const; + + ArrayShape sub_shape(legion_dim_t const &start, + std::optional const &end) const; public: LegionOrdered dims; private: std::tuple tie() const; + + friend ::std::hash; }; +std::string format_as(ArrayShape const &); +std::ostream &operator<<(std::ostream &, ArrayShape const &); + nonnegative_int get_volume(ArrayShape const &); +ArrayShape array_shape_from_tensor_shape(TensorShape const &); TensorShape get_tensor_shape(ArrayShape const &, DataType); -std::string format_as(ArrayShape const &); -std::ostream &operator<<(std::ostream &, ArrayShape const &); +std::unordered_set get_array_coord_set(ArrayShape const &); } // namespace FlexFlow +namespace std { + +template <> +struct hash<::FlexFlow::ArrayShape> { + size_t operator()(::FlexFlow::ArrayShape const &) const; +}; + +} // namespace std + #endif diff --git a/lib/kernels/include/kernels/attention_kernels.h b/lib/kernels/include/kernels/attention_kernels.h index eb5a1b8198..b3c77d3430 100644 --- a/lib/kernels/include/kernels/attention_kernels.h +++ b/lib/kernels/include/kernels/attention_kernels.h @@ -1,7 +1,6 @@ #ifndef _FLEXFLOW_OPS_KERNELS_ATTENTION_KERNELS_H #define 
_FLEXFLOW_OPS_KERNELS_ATTENTION_KERNELS_H -#include "device.h" #include "kernels/allocation.h" #include "kernels/device.h" #include "kernels/ff_handle.h" @@ -64,8 +63,7 @@ FF_VISITABLE_STRUCT_NO_EQ(MHAPerDeviceState, std::string format_as(MHAPerDeviceState const &x); std::ostream &operator<<(std::ostream &s, MHAPerDeviceState const &x); -namespace Kernels { -namespace MultiHeadAttention { +namespace Kernels::MultiHeadAttention { MHAPerDeviceState init_kernel(PerDeviceFFHandle const &, Allocator &, @@ -105,8 +103,7 @@ void backward_kernel(ffStream_t stream, void cleanup_kernel(Allocator &allocator, MHAPerDeviceState const &device_state); -} // namespace MultiHeadAttention -} // namespace Kernels +} // namespace Kernels::MultiHeadAttention } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/batch_matmul_kernels.h b/lib/kernels/include/kernels/batch_matmul_kernels.h index bfd72647b0..8b67f564d2 100644 --- a/lib/kernels/include/kernels/batch_matmul_kernels.h +++ b/lib/kernels/include/kernels/batch_matmul_kernels.h @@ -1,13 +1,11 @@ #ifndef _FLEXFLOW_OPS_KERNELS_BATCH_MATMUL_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_BATCH_MATMUL_KERNELS_H -#include "device.h" #include "kernels/allocation.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" -namespace FlexFlow { -namespace Kernels { -namespace BatchMatmul { +namespace FlexFlow::Kernels::BatchMatmul { void forward_kernel(ffStream_t stream, PerDeviceFFHandle const &handle, @@ -35,8 +33,6 @@ void backward_kernel(ffStream_t stream, int k, int batch); -} // namespace BatchMatmul -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::BatchMatmul #endif diff --git a/lib/kernels/include/kernels/batch_norm_kernels.h b/lib/kernels/include/kernels/batch_norm_kernels.h index f2ca17f429..9bb2753a12 100644 --- a/lib/kernels/include/kernels/batch_norm_kernels.h +++ b/lib/kernels/include/kernels/batch_norm_kernels.h @@ -1,15 +1,13 @@ #ifndef _FLEXFLOW_KERNELS_BATCH_NORM_KERNELS_H 
#define _FLEXFLOW_KERNELS_BATCH_NORM_KERNELS_H -#include "device.h" #include "kernels/allocation.h" #include "kernels/batch_norm_per_device_state.dtg.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include -namespace FlexFlow { -namespace Kernels { -namespace BatchNorm { +namespace FlexFlow::Kernels::BatchNorm { BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle, Allocator allocator, @@ -29,9 +27,9 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, BatchNormPerDeviceState const &per_device_state, - float const *input_ptr, - float *output_grad_ptr, float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, float *input_grad_ptr, float const *scale_ptr, float *scale_grad_ptr, @@ -46,8 +44,5 @@ void cleanup_kernel(Allocator allocator, bool relu, float *runningMean); -} // namespace BatchNorm -} // namespace Kernels -} // namespace FlexFlow - +} // namespace FlexFlow::Kernels::BatchNorm #endif diff --git a/lib/kernels/include/kernels/cast_kernels.h b/lib/kernels/include/kernels/cast_kernels.h index 96f9aadd52..5ec4cb3975 100644 --- a/lib/kernels/include/kernels/cast_kernels.h +++ b/lib/kernels/include/kernels/cast_kernels.h @@ -1,29 +1,19 @@ #ifndef _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_H -#include "device.h" #include "kernels/accessor.h" -#include "kernels/ff_handle.h" -#include "op-attrs/activation.dtg.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Cast { +namespace FlexFlow::Kernels::Cast { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorR const &output, + 
GenericTensorAccessorW const &input); -} // namespace Cast -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Cast #endif diff --git a/lib/kernels/include/kernels/cast_kernels_cpu.h b/lib/kernels/include/kernels/cast_kernels_cpu.h new file mode 100644 index 0000000000..343ba253d9 --- /dev/null +++ b/lib/kernels/include/kernels/cast_kernels_cpu.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Cast { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); + +} // namespace FlexFlow::Kernels::Cast + +#endif diff --git a/lib/kernels/include/kernels/combine_kernels.h b/lib/kernels/include/kernels/combine_kernels.h index eb263e0734..c87465a01f 100644 --- a/lib/kernels/include/kernels/combine_kernels.h +++ b/lib/kernels/include/kernels/combine_kernels.h @@ -1,12 +1,10 @@ #ifndef _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Combine { +namespace FlexFlow::Kernels::Combine { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, @@ -16,8 +14,6 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad); -} // namespace Combine -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Combine #endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H diff --git a/lib/kernels/include/kernels/combine_kernels_cpu.h b/lib/kernels/include/kernels/combine_kernels_cpu.h new file mode 100644 index 0000000000..75fdd56498 --- /dev/null +++ 
b/lib/kernels/include/kernels/combine_kernels_cpu.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Combine { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); + +} // namespace FlexFlow::Kernels::Combine + +#endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/concat_kernels.h b/lib/kernels/include/kernels/concat_kernels.h index a44affc1f2..1e3c55bf59 100644 --- a/lib/kernels/include/kernels/concat_kernels.h +++ b/lib/kernels/include/kernels/concat_kernels.h @@ -1,12 +1,10 @@ #ifndef _FLEXFLOW_OPS_KERNELS_CONCAT_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_CONCAT_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Concat { +namespace FlexFlow::Kernels::Concat { void forward_kernel(ffStream_t stream, GenericTensorAccessorW const &output, @@ -18,8 +16,6 @@ void backward_kernel(ffStream_t stream, std::vector const &input_grads, ff_dim_t axis); -} // namespace Concat -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Concat #endif diff --git a/lib/kernels/include/kernels/conv_2d_kernels.h b/lib/kernels/include/kernels/conv_2d_kernels.h index cfc64f963d..3b7c0672df 100644 --- a/lib/kernels/include/kernels/conv_2d_kernels.h +++ b/lib/kernels/include/kernels/conv_2d_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include "op-attrs/activation.dtg.h" #include "utils/visitable.h" @@ -34,8 +34,7 @@ 
FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Conv2DPerDeviceState, bwdFilterAlgo, bwdDataAlgo); -namespace Kernels { -namespace Conv2D { +namespace Kernels::Conv2D { Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, std::optional activation, @@ -61,17 +60,16 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, Conv2DPerDeviceState const &m, - float const *input_ptr, - float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, float const *filter_ptr, float *filter_grad_ptr, float *bias_grad_ptr, std::optional activation); -} // namespace Conv2D -} // namespace Kernels +} // namespace Kernels::Conv2D } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H diff --git a/lib/kernels/include/kernels/copy_tensor_accessor.h b/lib/kernels/include/kernels/copy_tensor_accessor.h new file mode 100644 index 0000000000..81fd59dafb --- /dev/null +++ b/lib/kernels/include/kernels/copy_tensor_accessor.h @@ -0,0 +1,27 @@ +#ifndef _FLEXFLOW_KERNELS_COPY_TENSOR_ACCESSOR_H +#define _FLEXFLOW_KERNELS_COPY_TENSOR_ACCESSOR_H + +#include "kernels/accessor.h" +#include "kernels/allocation.h" + +namespace FlexFlow { + +GenericTensorAccessorR + copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, + Allocator &allocator); + +GenericTensorAccessorW + copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, + Allocator &allocator); + +GenericTensorAccessorR + copy_tensor_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &, + Allocator &cpu_allocator); + +GenericTensorAccessorW + copy_tensor_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &, + Allocator &cpu_allocator); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/datatype_dispatch.h b/lib/kernels/include/kernels/datatype_dispatch.h index e83fc3325d..50ca66a820 100644 --- a/lib/kernels/include/kernels/datatype_dispatch.h +++ 
b/lib/kernels/include/kernels/datatype_dispatch.h @@ -1,7 +1,8 @@ #ifndef _FLEXFLOW_KERNELS_DATATYPE_DISPATCH_H #define _FLEXFLOW_KERNELS_DATATYPE_DISPATCH_H -#include "accessor.h" +#include "op-attrs/datatype.h" +#include "utils/exception.h" namespace FlexFlow { @@ -33,7 +34,7 @@ struct DataTypeDispatch1 { template >()( std::declval()...))> - Out operator()(Args... args) const { + Out operator()(Args &&...args) const { return F
{}(std::forward(args)...); } }; @@ -41,7 +42,7 @@ struct DataTypeDispatch1 { template >()( std::declval()...))> - Out operator()(DataType data_type, Args... args) { + Out operator()(DataType data_type, Args &&...args) { return dispatch(data_type, std::forward(args)...); } }; @@ -54,13 +55,13 @@ struct DataTypeDispatch2 { template struct OutputType { template - void operator()(Args... args) const { + void operator()(Args &&...args) const { F{}(std::forward(args)...); } }; template - void operator()(DataType output_type, Args... args) const { + void operator()(DataType output_type, Args &&...args) const { dispatch(output_type, std::forward(args)...); } }; @@ -68,7 +69,7 @@ struct DataTypeDispatch2 { template void operator()(DataType input_data_type, DataType output_data_type, - Args... args) { + Args &&...args) { dispatch( input_data_type, output_data_type, std::forward(args)...); } diff --git a/lib/kernels/include/kernels/dropout_kernels.h b/lib/kernels/include/kernels/dropout_kernels.h index c0e503be5b..2cc6dd60a3 100644 --- a/lib/kernels/include/kernels/dropout_kernels.h +++ b/lib/kernels/include/kernels/dropout_kernels.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H -#include "device.h" #include "kernels/allocation.h" #include "kernels/array_shape.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include @@ -31,8 +31,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(DropoutPerDeviceState, reserveSpaceSize, dropoutStateSize); -namespace Kernels { -namespace Dropout { +namespace Kernels::Dropout { DropoutPerDeviceState init_kernel(PerDeviceFFHandle handle, float rate, @@ -56,8 +55,7 @@ void cleanup_kernel(Allocator allocator, ffDropoutDescriptor_t dropoutDesc, void *dropoutStates); -} // namespace Dropout -} // namespace Kernels +} // namespace Kernels::Dropout } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H diff --git 
a/lib/kernels/include/kernels/element_binary_kernels.h b/lib/kernels/include/kernels/element_binary_kernels.h index 41447e98e6..fd596f2ccf 100644 --- a/lib/kernels/include/kernels/element_binary_kernels.h +++ b/lib/kernels/include/kernels/element_binary_kernels.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_OPS_KERNELS_ELEMENT_BINARY_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_ELEMENT_BINARY_KERNELS_H -#include "device.h" #include "ff_handle.h" #include "kernels/array_shape.h" +#include "kernels/device.h" #include "op-attrs/datatype.h" #include "op-attrs/operator_type.h" @@ -26,8 +26,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(ElementBinaryPerDeviceState, opDesc, reduceAddDesc); -namespace Kernels { -namespace ElementBinary { +namespace Kernels::ElementBinary { ElementBinaryPerDeviceState init_kernel(PerDeviceFFHandle handle, OperatorType op_type, @@ -58,8 +57,7 @@ void backward_kernel(ffStream_t stream, bool broadcast_inputRHS, PerDeviceFFHandle handle); -} // namespace ElementBinary -} // namespace Kernels +} // namespace Kernels::ElementBinary } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/element_unary_kernels.h b/lib/kernels/include/kernels/element_unary_kernels.h index 8c6864b2d9..0257b3b4a6 100644 --- a/lib/kernels/include/kernels/element_unary_kernels.h +++ b/lib/kernels/include/kernels/element_unary_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_ELEMENT_UNARY_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_ELEMENT_UNARY_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include "op-attrs/ops/element_unary.h" #include @@ -19,8 +19,7 @@ FF_VISITABLE_STRUCT_NO_EQ(ElementUnaryPerDeviceState, outputTensor, actiDesc); -namespace Kernels { -namespace ElementUnary { +namespace Kernels::ElementUnary { ElementUnaryPerDeviceState init_kernel(ArrayShape const &input_shape, ArrayShape const &output_shape, @@ -37,13 +36,12 @@ void backward_kernel(ffStream_t stream, 
ElementUnaryPerDeviceState const &device_state, ElementUnaryAttrs const &attrs, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad); -} // namespace ElementUnary -} // namespace Kernels +} // namespace Kernels::ElementUnary } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/embedding_kernels.h b/lib/kernels/include/kernels/embedding_kernels.h index 06582ca1d5..f51a730314 100644 --- a/lib/kernels/include/kernels/embedding_kernels.h +++ b/lib/kernels/include/kernels/embedding_kernels.h @@ -1,13 +1,11 @@ #ifndef _FLEXFLOW_OPS_KERNELS_EMBEDDING_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_EMBEDDING_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" #include "op-attrs/ops/embedding.h" -namespace FlexFlow { -namespace Kernels { -namespace Embedding { +namespace FlexFlow::Kernels::Embedding { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output, @@ -19,11 +17,11 @@ void forward_kernel(ffStream_t stream, int out_dim, int batch_size); void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, - DataType input_data_type, DataType output_data_type, + DataType input_data_type, std::optional aggr, int in_dim, int out_dim, @@ -35,8 +33,6 @@ void rand_generate_int32_wrapper(int32_t *ptr, size_t size, int32_t p); template __global__ void rand_generate_int(TD *ptr, size_t size, TD p); -} // namespace Embedding -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Embedding #endif // _FLEXFLOW_OPS_KERNELS_EMBEDDING_KERNELS_H diff 
--git a/lib/kernels/include/kernels/ff_handle.h b/lib/kernels/include/kernels/ff_handle.h index 179ce41cbf..31b3296a98 100644 --- a/lib/kernels/include/kernels/ff_handle.h +++ b/lib/kernels/include/kernels/ff_handle.h @@ -5,7 +5,7 @@ #include #endif -#include "device.h" +#include "kernels/device.h" #include "utils/visitable.h" namespace FlexFlow { diff --git a/lib/kernels/include/kernels/flat_kernels.h b/lib/kernels/include/kernels/flat_kernels.h index 3e600c48de..b2b1164f92 100644 --- a/lib/kernels/include/kernels/flat_kernels.h +++ b/lib/kernels/include/kernels/flat_kernels.h @@ -1,23 +1,20 @@ #ifndef _FLEXFLOW_OPS_KERNELS_FLAT_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_FLAT_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Flat { +namespace FlexFlow::Kernels::Flat { void forward_kernel(ffStream_t stream, GenericTensorAccessorR input, float *output_ptr); + void backward_kernel(ffStream_t stream, GenericTensorAccessorR input, - float *input_grad_ptr, - float const *output_grad_ptr); + float const *output_grad_ptr, + float *input_grad_ptr); -} // namespace Flat -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Flat #endif // _FLEXFLOW_OPS_KERNELS_FLAT_KERNELS_H diff --git a/lib/kernels/include/kernels/format_accessor_contents.h b/lib/kernels/include/kernels/format_accessor_contents.h new file mode 100644 index 0000000000..b50cffbbef --- /dev/null +++ b/lib/kernels/include/kernels/format_accessor_contents.h @@ -0,0 +1,13 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_FORMAT_ACCESSOR_CONTENTS_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_FORMAT_ACCESSOR_CONTENTS_H + +#include "kernels/accessor.h" + +namespace FlexFlow { + +std::string format_accessor_r_contents(GenericTensorAccessorR const &); +std::string format_accessor_w_contents(GenericTensorAccessorW const &); + +} // namespace FlexFlow + +#endif diff --git 
a/lib/kernels/include/kernels/gather_kernels.h b/lib/kernels/include/kernels/gather_kernels.h index 13bf4b898a..8cbc7e457e 100644 --- a/lib/kernels/include/kernels/gather_kernels.h +++ b/lib/kernels/include/kernels/gather_kernels.h @@ -15,23 +15,21 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GatherPerDeviceState, handle, legion_dim); -namespace Kernels { -namespace Gather { +namespace Kernels::Gather { void forward_kernel(ffStream_t stream, - GatherPerDeviceState const &m, + GatherPerDeviceState const &per_device_state, GenericTensorAccessorR const &input, GenericTensorAccessorR const &index, GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - GatherPerDeviceState const &m, + GatherPerDeviceState const &per_device_state, GenericTensorAccessorR const &output_grad, GenericTensorAccessorR const &index, GenericTensorAccessorW const &input_grad); -} // namespace Gather -} // namespace Kernels +} // namespace Kernels::Gather } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/layer_norm_kernels.h b/lib/kernels/include/kernels/layer_norm_kernels.h index be13d32879..10cf2fb14b 100644 --- a/lib/kernels/include/kernels/layer_norm_kernels.h +++ b/lib/kernels/include/kernels/layer_norm_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H -#include "device.h" #include "kernels/allocation.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" namespace FlexFlow { @@ -30,8 +30,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LayerNormPerDeviceState, bias, data_type); -namespace Kernels { -namespace LayerNorm { +namespace Kernels::LayerNorm { // todo: this may have some problem. 
LayerNormPerDeviceState init_kernel(PerDeviceFFHandle const &handle, @@ -57,8 +56,7 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorW const &gamma_grad, GenericTensorAccessorW const &beta_grad); -} // namespace LayerNorm -} // namespace Kernels +} // namespace Kernels::LayerNorm } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H diff --git a/lib/kernels/include/kernels/legion_dim.h b/lib/kernels/include/kernels/legion_dim.h index 7b9b9c455c..947bbd00bb 100644 --- a/lib/kernels/include/kernels/legion_dim.h +++ b/lib/kernels/include/kernels/legion_dim.h @@ -2,7 +2,13 @@ #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_LEGION_DIM_H #include "kernels/legion_dim_t.dtg.h" -#include "op-attrs/dim_ordered/dim_ordered.h" +#include "kernels/legion_ordered/legion_ordered.h" +#include "op-attrs/ff_dim_t.dtg.h" +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "utils/containers/set_of.h" +#include "utils/containers/transform.h" +#include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { @@ -11,7 +17,10 @@ legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value); legion_dim_t legion_dim_from_ff_dim(ff_dim_t, nonnegative_int num_dimensions); template -using LegionOrdered = DimOrdered; +std::set key_range(LegionOrdered const &d) { + return transform(set_of(nonnegative_range(num_elements(d))), + [](nonnegative_int i) { return legion_dim_t{i}; }); +} template FFOrdered @@ -25,17 +34,6 @@ LegionOrdered return LegionOrdered(ff_ordered.rbegin(), ff_ordered.rend()); } -template -std::string format_as(LegionOrdered const &v) { - std::vector as_vec(v.cbegin(), v.cend()); - return fmt::format("", as_vec); -} - -template -std::ostream &operator<<(std::ostream &s, LegionOrdered const &v) { - return (s << fmt::to_string(v)); -} - } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/legion_ordered/legion_ordered.h 
b/lib/kernels/include/kernels/legion_ordered/legion_ordered.h new file mode 100644 index 0000000000..ad8b3bad6d --- /dev/null +++ b/lib/kernels/include/kernels/legion_ordered/legion_ordered.h @@ -0,0 +1,197 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_LEGION_ORDERED_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_LEGION_ORDERED_H + +#include "kernels/legion_dim_t.dtg.h" +#include "utils/fmt/vector.h" +#include "utils/stack_vector/stack_vector.h" + +namespace FlexFlow { + +template +struct LegionOrdered { + LegionOrdered() {} + + LegionOrdered(std::initializer_list const &l) + : contents(l.begin(), l.end()) {} + + LegionOrdered(std::vector const &contents) + : contents(contents.begin(), contents.end()) {} + + template + LegionOrdered(It begin, It end) : contents(begin, end) {} + + template + LegionOrdered(stack_vector const &contents) + : contents(contents.begin(), contents.end()) {} + + T const &at(legion_dim_t idx) const { + int raw = idx.value.unwrap_nonnegative(); + return this->contents.at(raw); + } + + T &at(legion_dim_t idx) { + int raw = idx.value.unwrap_nonnegative(); + return this->contents.at(raw); + } + + T const &operator[](legion_dim_t idx) const { + return this->at(idx); + } + + T &operator[](legion_dim_t idx) { + return this->at(idx); + } + + bool idx_is_valid(legion_dim_t const &idx) const { + int raw = idx.value.unwrap_nonnegative(); + return raw < this->contents.size(); + } + + bool operator==(LegionOrdered const &other) const { + return this->contents == other.contents; + } + + bool operator!=(LegionOrdered const &other) const { + return this->contents != other.contents; + } + + using iterator = typename stack_vector::iterator; + using const_iterator = + typename stack_vector::const_iterator; + using reverse_iterator = + typename stack_vector::reverse_iterator; + using const_reverse_iterator = + typename stack_vector::const_reverse_iterator; + using value_type = T; + using pointer = value_type *; + using 
const_pointer = value_type const *; + using reference = value_type &; + using const_reference = value_type const &; + + iterator begin() { + return this->contents.begin(); + } + + const_iterator begin() const { + return this->cbegin(); + } + + const_iterator cbegin() const { + return this->contents.cbegin(); + } + + iterator end() { + return this->contents.end(); + } + + const_iterator end() const { + return this->cend(); + } + + const_iterator cend() const { + return this->contents.cend(); + } + + reverse_iterator rbegin() { + return this->contents.rbegin(); + } + + const_reverse_iterator rbegin() const { + return this->crbegin(); + } + + const_reverse_iterator crbegin() const { + return this->contents.crbegin(); + } + + reverse_iterator rend() { + return this->contents.rend(); + } + + const_reverse_iterator rend() const { + return this->crend(); + } + + const_reverse_iterator crend() const { + return this->contents.crend(); + } + + size_t size() const { + return this->contents.size(); + } + + size_t empty() const { + return this->contents.empty(); + } + + size_t num_dims() const { + return this->size(); + } + + friend struct ::std::hash; + +private: + stack_vector contents; +}; + +template +auto operator<(LegionOrdered const &lhs, LegionOrdered const &rhs) + -> std::enable_if_t, bool> { + return std::lexicographical_compare( + lhs.cbegin(), lhs.cend(), rhs.cbegin(), rhs.cend()); +} + +template +std::string format_as(LegionOrdered const &v) { + std::vector as_vec(v.cbegin(), v.cend()); + return fmt::format("", as_vec); +} + +template +std::ostream &operator<<(std::ostream &s, LegionOrdered const &v) { + return (s << fmt::to_string(v)); +} + +} // namespace FlexFlow + +namespace nlohmann { +template +struct adl_serializer<::FlexFlow::LegionOrdered> { + static ::FlexFlow::LegionOrdered from_json(nlohmann::json const &j) { + return {j.template get>()}; + } + + static void to_json(nlohmann::json &j, + ::FlexFlow::LegionOrdered const &x) { + j = std::vector{x.cbegin(), 
x.cend()}; + } +}; +} // namespace nlohmann + +namespace std { + +template +struct hash<::FlexFlow::LegionOrdered> { + size_t operator()(::FlexFlow::LegionOrdered const &t) const { + static_assert(::FlexFlow::is_hashable::value, + "Elements must be hashable"); + + return get_std_hash(t.contents); + } +}; + +} // namespace std + +namespace rc { + +template +struct Arbitrary<::FlexFlow::LegionOrdered> { + static Gen<::FlexFlow::LegionOrdered> arbitrary() { + return gen::construct<::FlexFlow::LegionOrdered>( + gen::arbitrary<::FlexFlow::stack_vector>()); + } +}; + +} // namespace rc + +#endif diff --git a/lib/kernels/include/kernels/legion_ordered/slice.h b/lib/kernels/include/kernels/legion_ordered/slice.h new file mode 100644 index 0000000000..6980c0d9ec --- /dev/null +++ b/lib/kernels/include/kernels/legion_ordered/slice.h @@ -0,0 +1,24 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_SLICE_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_SLICE_H + +#include "kernels/legion_ordered/legion_ordered.h" +#include "utils/containers/slice.h" +#include "utils/containers/transform.h" +#include "utils/containers/vector_of.h" + +namespace FlexFlow { + +template +LegionOrdered slice(LegionOrdered const &d, + legion_dim_t const &start, + std::optional const &end) { + int raw_start = start.value.unwrap_nonnegative(); + std::optional raw_end = transform( + end, [](legion_dim_t const &i) { return i.value.unwrap_nonnegative(); }); + + return LegionOrdered{slice(vector_of(d), raw_start, raw_end)}; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/legion_ordered/transform.h b/lib/kernels/include/kernels/legion_ordered/transform.h new file mode 100644 index 0000000000..55cc1ff1ea --- /dev/null +++ b/lib/kernels/include/kernels/legion_ordered/transform.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_TRANSFORM_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_TRANSFORM_H + 
+#include "kernels/legion_ordered/legion_ordered.h" +#include "utils/containers/vector_of.h" +#include "utils/containers/vector_transform.h" + +namespace FlexFlow { + +template > +LegionOrdered transform(LegionOrdered const &d, F &&f) { + return LegionOrdered{vector_transform(vector_of(d), f)}; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/linear_kernels.h b/lib/kernels/include/kernels/linear_kernels.h index 3128e39fd0..21d84c2567 100644 --- a/lib/kernels/include/kernels/linear_kernels.h +++ b/lib/kernels/include/kernels/linear_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_LINEAR_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_LINEAR_KERNELS_H -#include "device.h" #include "ff_handle.h" +#include "kernels/device.h" #include "op-attrs/datatype.h" #include "op-attrs/ops/linear_attrs.dtg.h" @@ -33,8 +33,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LinearPerDeviceState, weight_type, output_type); -namespace Kernels { -namespace Linear { +namespace Kernels::Linear { LinearPerDeviceState init_kernel(PerDeviceFFHandle handle, float *one_ptr, @@ -51,29 +50,28 @@ bool use_activation(Activation activation); void forward_kernel(ffStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *output_ptr, - void const *filter_ptr, - void const *bias_ptr, + float const *input_ptr, + float *output_ptr, + float const *filter_ptr, + float const *bias_ptr, int in_dim, int out_dim, int batch_size); void backward_kernel(ffStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, - void const *output_ptr, - void *output_grad_ptr, - void const *kernel_ptr, - void *kernel_grad_ptr, - void *bias_ptr, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *kernel_ptr, + float *kernel_grad_ptr, + float *bias_grad_ptr, int in_dim, int out_dim, int batch_size); -} // namespace Linear -} // namespace Kernels +} // namespace 
Kernels::Linear } // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/local_cpu_allocator.h b/lib/kernels/include/kernels/local_cpu_allocator.h similarity index 74% rename from lib/local-execution/include/local-execution/local_cpu_allocator.h rename to lib/kernels/include/kernels/local_cpu_allocator.h index d1e81facf2..9653dcf00e 100644 --- a/lib/local-execution/include/local-execution/local_cpu_allocator.h +++ b/lib/kernels/include/kernels/local_cpu_allocator.h @@ -1,3 +1,6 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LOCAL_CPU_ALLOCATOR_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LOCAL_CPU_ALLOCATOR_H + #include "kernels/allocation.h" #include @@ -12,6 +15,8 @@ struct LocalCPUAllocator : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + DeviceType get_allocation_device_type() const override; + private: std::unordered_map> ptrs; }; @@ -20,3 +25,5 @@ CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalCPUAllocator); Allocator create_local_cpu_memory_allocator(); } // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/local_cuda_allocator.h b/lib/kernels/include/kernels/local_cuda_allocator.h index 18a4b6e78a..b8e0540974 100644 --- a/lib/kernels/include/kernels/local_cuda_allocator.h +++ b/lib/kernels/include/kernels/local_cuda_allocator.h @@ -12,6 +12,8 @@ struct LocalCudaAllocator : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + DeviceType get_allocation_device_type() const override; + private: std::unordered_set ptrs; }; diff --git a/lib/kernels/include/kernels/managed_ff_stream.h b/lib/kernels/include/kernels/managed_ff_stream.h index 2f690b2eb3..576edb0ffa 100644 --- a/lib/kernels/include/kernels/managed_ff_stream.h +++ b/lib/kernels/include/kernels/managed_ff_stream.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_MANAGED_FF_STREAM_H #define _FLEXFLOW_KERNELS_MANAGED_FF_STREAM_H -#include "device.h" +#include "kernels/device.h" namespace 
FlexFlow { @@ -19,6 +19,9 @@ struct ManagedFFStream { ffStream_t const &raw_stream() const; +private: + void cleanup(); + private: ffStream_t *stream; }; diff --git a/lib/kernels/include/kernels/managed_per_device_ff_handle.h b/lib/kernels/include/kernels/managed_per_device_ff_handle.h index 0a83a5eecb..9bd9370685 100644 --- a/lib/kernels/include/kernels/managed_per_device_ff_handle.h +++ b/lib/kernels/include/kernels/managed_per_device_ff_handle.h @@ -7,7 +7,10 @@ namespace FlexFlow { struct ManagedPerDeviceFFHandle { public: - ManagedPerDeviceFFHandle(); + ManagedPerDeviceFFHandle() = delete; + + ManagedPerDeviceFFHandle(size_t workSpaceSize, + bool allowTensorOpMathConversion); ManagedPerDeviceFFHandle(ManagedPerDeviceFFHandle const &) = delete; ManagedPerDeviceFFHandle & @@ -21,6 +24,9 @@ struct ManagedPerDeviceFFHandle { PerDeviceFFHandle const &raw_handle() const; +private: + void cleanup(); + private: PerDeviceFFHandle *handle; }; diff --git a/lib/kernels/include/kernels/metrics_kernels.h b/lib/kernels/include/kernels/metrics_kernels.h index e4660808b9..430608db55 100644 --- a/lib/kernels/include/kernels/metrics_kernels.h +++ b/lib/kernels/include/kernels/metrics_kernels.h @@ -1,25 +1,24 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_METRICS_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_METRICS_KERNELS_H -#include "perf_metrics.h" +#include "kernels/perf_metrics.h" +#include "pcg/metric_attrs.h" namespace FlexFlow { -void update_metrics_sparse_label_kernel(ffStream_t, - MetricsAttrs const &, - float const *logit_ptr, - int const *label_ptr, - int num_samples, - int num_classes, - PerfMetrics &perf_zc); -void update_metrics_label_kernel(ffStream_t, - MetricsAttrs const &, - float const *logit_ptr, - float const *label_ptr, - int num_samples, - int num_classes, - PerfMetrics &perf_zc); +void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, + int const *label_ptr, + MetricsAttrs const &me, + int num_effective_samples, + int num_classes, + 
PerfMetrics &perf_zc); +void update_metrics_label_kernel_wrapper(float const *logit_ptr, + float const *label_ptr, + MetricsAttrs const &me, + int num_samples, + int num_classes, + PerfMetrics &perf_zc); } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/nccl.h b/lib/kernels/include/kernels/nccl.h index b8a6784676..042911d172 100644 --- a/lib/kernels/include/kernels/nccl.h +++ b/lib/kernels/include/kernels/nccl.h @@ -23,15 +23,11 @@ struct ncclUniqueId {}; struct ncclComm_t {}; #endif -namespace FlexFlow { -namespace Kernels { -namespace NCCL { +namespace FlexFlow::Kernels::NCCL { ncclUniqueId generate_unique_id(); ncclComm_t create_comm(ncclUniqueId const &, int num_ranks, int my_rank); -} // namespace NCCL -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::NCCL #endif diff --git a/lib/kernels/include/kernels/optimizer_kernels.h b/lib/kernels/include/kernels/optimizer_kernels.h index 9ca6bf8e2b..d552831c78 100644 --- a/lib/kernels/include/kernels/optimizer_kernels.h +++ b/lib/kernels/include/kernels/optimizer_kernels.h @@ -1,7 +1,8 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H -#include "device.h" +#include "kernels/device.h" +#include "kernels/ff_handle.h" namespace FlexFlow { @@ -16,15 +17,18 @@ void sgd_ps_update_task_gpu(ffStream_t, float *weight_ptr, float *sgd_v_ptr); +#ifdef FF_USE_NCCL void sgd_nccl_update_task_gpu(ffStream_t, float lr, float momentum, bool nesterov, - float weight_decay PerDeviceFFHandle const &, + float weight_decay, + PerDeviceFFHandle const &, float const *weight_grad_ptr, size_t size, float *weight_ptr, float *sgd_v_ptr); +#endif void adam_ps_update_task_gpu(ffStream_t, float alpha_t, @@ -33,9 +37,11 @@ void adam_ps_update_task_gpu(ffStream_t, float weight_decay, float epsilon, float const *weight_grad_ptr, - float *adam_m_ptr, + size_t size, + int num_replicas, + float *weight_ptr, float *adam_v_ptr, 
- float *weight_ptr); + float *adam_m_ptr); void adam_nccl_update_task_gpu(ffStream_t, float alpha_t, @@ -45,9 +51,10 @@ void adam_nccl_update_task_gpu(ffStream_t, float epsilon, PerDeviceFFHandle const &, float const *weight_grad_ptr, - float *adam_m_ptr, + size_t size, + float *weight_ptr, float *adam_v_ptr, - float *weight_ptr); + float *adam_m_ptr); } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/partition_kernels.h b/lib/kernels/include/kernels/partition_kernels.h index 64ef1a1352..aa3a7a1ef7 100644 --- a/lib/kernels/include/kernels/partition_kernels.h +++ b/lib/kernels/include/kernels/partition_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" namespace FlexFlow { @@ -13,8 +13,7 @@ struct RepartitionPerDeviceState { FF_VISITABLE_STRUCT_NO_EQ(RepartitionPerDeviceState, handle, data_type); -namespace Kernels { -namespace Repartition { +namespace Kernels::Repartition { RepartitionPerDeviceState init_kernel(PerDeviceFFHandle const &handle, DataType data_type); @@ -26,11 +25,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, RepartitionPerDeviceState const &m, - GenericTensorAccessorW const &output_grad, - GenericTensorAccessorR const &input_grad); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); -} // namespace Repartition -} // namespace Kernels +} // namespace Kernels::Repartition } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H diff --git a/lib/local-execution/include/local-execution/per_device_op_state.variant.toml b/lib/kernels/include/kernels/per_device_op_state.variant.toml similarity index 100% rename from lib/local-execution/include/local-execution/per_device_op_state.variant.toml rename to lib/kernels/include/kernels/per_device_op_state.variant.toml diff --git 
a/lib/kernels/include/kernels/pool_2d_kernels.h b/lib/kernels/include/kernels/pool_2d_kernels.h index 798c0507f8..76aa07d0a4 100644 --- a/lib/kernels/include/kernels/pool_2d_kernels.h +++ b/lib/kernels/include/kernels/pool_2d_kernels.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_OPS_KERNELS_POOL_2D_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_POOL_2D_KERNELS_H -#include "device.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include "op-attrs/activation.dtg.h" #include "op-attrs/ops/pool_2d.h" @@ -25,8 +25,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Pool2DPerDeviceState, poolDesc, relu); -namespace Kernels { -namespace Pool2D { +namespace Kernels::Pool2D { Pool2DPerDeviceState init_kernel(PerDeviceFFHandle handle, std::optional activation, @@ -70,13 +69,12 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, Pool2DPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, void const *output_ptr, - void const *output_grad_ptr); + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr); -} // namespace Pool2D -} // namespace Kernels +} // namespace Kernels::Pool2D } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_POOL_2D_KERNELS_H diff --git a/lib/kernels/include/kernels/profiling.h b/lib/kernels/include/kernels/profiling.h index 655d540685..7c4145c426 100644 --- a/lib/kernels/include/kernels/profiling.h +++ b/lib/kernels/include/kernels/profiling.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_PROFILING_H #define _FLEXFLOW_KERNELS_PROFILING_H -#include "device.h" +#include "kernels/device.h" #include "kernels/profiling_settings.dtg.h" #include "utils/visitable.h" diff --git a/lib/kernels/include/kernels/reduce_kernels.h b/lib/kernels/include/kernels/reduce_kernels.h index 4287472875..10e8e4393b 100644 --- a/lib/kernels/include/kernels/reduce_kernels.h +++ b/lib/kernels/include/kernels/reduce_kernels.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REDUCE_KERNELS_H #define 
_FLEXFLOW_OPS_KERNELS_REDUCE_KERNELS_H -#include "array_shape.h" -#include "device.h" -#include "ff_handle.h" +#include "kernels/array_shape.h" +#include "kernels/device.h" +#include "kernels/ff_handle.h" #include "op-attrs/operator_type.dtg.h" namespace FlexFlow { @@ -25,8 +25,7 @@ FF_VISITABLE_STRUCT(ReducePerDeviceState, op_type, reduction_size); -namespace Kernels { -namespace Reduce { +namespace Kernels::Reduce { ReducePerDeviceState init_kernel(PerDeviceFFHandle const &, OperatorType const &, @@ -43,8 +42,7 @@ void backward_kernel(ffStream_t stream, ReducePerDeviceState const &m, float const *output_grad_ptr, float *input_grad_ptr); -} // namespace Reduce -} // namespace Kernels +} // namespace Kernels::Reduce } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_REDUCE_KERNELS_H diff --git a/lib/kernels/include/kernels/reduction_kernels.h b/lib/kernels/include/kernels/reduction_kernels.h index fb3baf215c..08f73cd9ab 100644 --- a/lib/kernels/include/kernels/reduction_kernels.h +++ b/lib/kernels/include/kernels/reduction_kernels.h @@ -1,12 +1,10 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Reduction { +namespace FlexFlow::Kernels::Reduction { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, @@ -14,11 +12,9 @@ void forward_kernel(ffStream_t stream, size_t num_replicas); void backward_kernel(ffStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); -} // namespace Reduction -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Reduction #endif // _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H diff --git a/lib/kernels/include/kernels/replicate_kernels.h b/lib/kernels/include/kernels/replicate_kernels.h 
index 409fc81f44..0b113868ee 100644 --- a/lib/kernels/include/kernels/replicate_kernels.h +++ b/lib/kernels/include/kernels/replicate_kernels.h @@ -1,24 +1,20 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Replicate { +namespace FlexFlow::Kernels::Replicate { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas); -} // namespace Replicate -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Replicate #endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H diff --git a/lib/kernels/include/kernels/replicate_kernels_cpu.h b/lib/kernels/include/kernels/replicate_kernels_cpu.h new file mode 100644 index 0000000000..2a2eaa5eb6 --- /dev/null +++ b/lib/kernels/include/kernels/replicate_kernels_cpu.h @@ -0,0 +1,18 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Replicate { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output); + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW &input, + size_t num_replicas); + +} // namespace FlexFlow::Kernels::Replicate + +#endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/reshape_kernels.h b/lib/kernels/include/kernels/reshape_kernels.h index a83caa6bea..88c11d2fb0 100644 --- a/lib/kernels/include/kernels/reshape_kernels.h +++ b/lib/kernels/include/kernels/reshape_kernels.h @@ -1,8 +1,8 @@ #ifndef 
_FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" #include "utils/required_core.h" namespace FlexFlow { @@ -13,8 +13,7 @@ struct ReshapePerDeviceState { FF_VISITABLE_STRUCT(ReshapePerDeviceState, data_type); -namespace Kernels { -namespace Reshape { +namespace Kernels::Reshape { ReshapePerDeviceState init_kernel(DataType data_type); @@ -25,11 +24,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, ReshapePerDeviceState const &per_device_state, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); -} // namespace Reshape -} // namespace Kernels +} // namespace Kernels::Reshape } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H diff --git a/lib/kernels/include/kernels/reverse_kernels.h b/lib/kernels/include/kernels/reverse_kernels.h index 42a83ae219..768707175c 100644 --- a/lib/kernels/include/kernels/reverse_kernels.h +++ b/lib/kernels/include/kernels/reverse_kernels.h @@ -1,30 +1,21 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_H -#include "device.h" +#include "kernels/device.h" +#include "kernels/reverse_kernels_cpu.h" -namespace FlexFlow { -namespace Kernels { -namespace Reverse { +namespace FlexFlow::Kernels::Reverse { void forward_kernel(ffStream_t stream, - float const *in_ptr, - float *out_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t output_size); + GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW &output_accessor, + ReverseAttrs const &); void backward_kernel(ffStream_t stream, - float const *out_grad_ptr, - float *in_grad_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t input_size); + GenericTensorAccessorR const 
&output_accessor, + GenericTensorAccessorW &input_accessor, + ReverseAttrs const &); -} // namespace Reverse -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Reverse #endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_H diff --git a/lib/kernels/include/kernels/reverse_kernels_cpu.h b/lib/kernels/include/kernels/reverse_kernels_cpu.h new file mode 100644 index 0000000000..ec82000f8f --- /dev/null +++ b/lib/kernels/include/kernels/reverse_kernels_cpu.h @@ -0,0 +1,20 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" +#include "op-attrs/ops/reverse_attrs.dtg.h" + +namespace FlexFlow::Kernels::Reverse { + +void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW &output_accessor, + ReverseAttrs const &); + +void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor, + GenericTensorAccessorW &input_accessor, + ReverseAttrs const &); + +} // namespace FlexFlow::Kernels::Reverse + +#endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/reverse_kernels_params.h b/lib/kernels/include/kernels/reverse_kernels_params.h new file mode 100644 index 0000000000..766d70b915 --- /dev/null +++ b/lib/kernels/include/kernels/reverse_kernels_params.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REVERSE_KERNELS_PARAMS_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REVERSE_KERNELS_PARAMS_H + +#include "kernels/array_shape.h" +#include "kernels/reverse_kernels_params.dtg.h" +#include "op-attrs/ops/reverse_attrs.dtg.h" + +namespace FlexFlow { + +ReverseKernelsParams + compute_reverse_kernels_params(ArrayShape const &output_shape, + ReverseAttrs const &attrs); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/reverse_kernels_params.struct.toml b/lib/kernels/include/kernels/reverse_kernels_params.struct.toml 
new file mode 100644 index 0000000000..a5dbd750bc --- /dev/null +++ b/lib/kernels/include/kernels/reverse_kernels_params.struct.toml @@ -0,0 +1,28 @@ +namespace = "FlexFlow" +name = "ReverseKernelsParams" +features = [ + "eq", + "ord", + "hash", + "fmt", +] + +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + +[[fields]] +name = "num_out_blks" +type = "::FlexFlow::nonnegative_int" + +[[fields]] +name = "reverse_dim_size" +type = "::FlexFlow::nonnegative_int" + +[[fields]] +name = "in_blk_size" +type = "::FlexFlow::nonnegative_int" + +[[fields]] +name = "out_size" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/kernels/include/kernels/softmax_kernels.h b/lib/kernels/include/kernels/softmax_kernels.h index 061230ec52..60101578e3 100644 --- a/lib/kernels/include/kernels/softmax_kernels.h +++ b/lib/kernels/include/kernels/softmax_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_SOFTMAX_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_SOFTMAX_KERNELS_H -#include "device.h" #include "ff_handle.h" +#include "kernels/device.h" namespace FlexFlow { @@ -15,8 +15,7 @@ struct SoftmaxPerDeviceState { FF_VISITABLE_STRUCT(SoftmaxPerDeviceState, handle, inputTensor, dim); -namespace Kernels { -namespace Softmax { +namespace Kernels::Softmax { SoftmaxPerDeviceState init_kernel(PerDeviceFFHandle const &handle, int dim, @@ -31,12 +30,11 @@ void forward_kernel(ffStream_t stream, float *output_ptr); void backward_kernel(ffStream_t stream, - float *input_grad_ptr, float const *output_grad_ptr, + float *input_grad_ptr, size_t num_elements); -} // namespace Softmax -} // namespace Kernels +} // namespace Kernels::Softmax } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/split_kernels.h b/lib/kernels/include/kernels/split_kernels.h index 36434d4be8..3b580f94be 100644 --- a/lib/kernels/include/kernels/split_kernels.h +++ b/lib/kernels/include/kernels/split_kernels.h @@ -1,12 +1,9 @@ #ifndef _FLEXFLOW_OPS_KERNELS_SPLIT_KERNELS_H #define 
_FLEXFLOW_OPS_KERNELS_SPLIT_KERNELS_H -#include "device.h" +#include "kernels/device.h" -namespace FlexFlow { - -namespace Kernels { -namespace Split { +namespace FlexFlow::Kernels::Split { void forward_kernel(ffStream_t stream, float **out_ptrs, float const *in_ptr, @@ -22,8 +19,6 @@ void backward_kernel(ffStream_t stream, coord_t num_blks, int numOutputs); -} // namespace Split -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Split #endif // _FLEXFLOW_OPS_KERNELS_SPLIT_KERNELS_H diff --git a/lib/kernels/include/kernels/topk_kernels.h b/lib/kernels/include/kernels/topk_kernels.h index ae1c739f6c..085594d57f 100644 --- a/lib/kernels/include/kernels/topk_kernels.h +++ b/lib/kernels/include/kernels/topk_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_TOPK_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_TOPK_KERNELS_H -#include "device.h" #include "kernels/allocation.h" +#include "kernels/device.h" namespace FlexFlow { @@ -12,8 +12,7 @@ struct TopKPerDeviceState { FF_VISITABLE_STRUCT(TopKPerDeviceState, sorted); -namespace Kernels { -namespace TopK { +namespace Kernels::TopK { TopKPerDeviceState init_kernel(bool sorted); @@ -35,8 +34,7 @@ void backward_kernel(ffStream_t stream, int length, int k); -} // namespace TopK -} // namespace Kernels +} // namespace Kernels::TopK } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_TOPK_KERNELS_H diff --git a/lib/kernels/include/kernels/transpose_kernels.h b/lib/kernels/include/kernels/transpose_kernels.h index 0f1cc2ae61..776370dcbd 100644 --- a/lib/kernels/include/kernels/transpose_kernels.h +++ b/lib/kernels/include/kernels/transpose_kernels.h @@ -1,15 +1,14 @@ #ifndef _FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" #include "op-attrs/ops/transpose_attrs.dtg.h" #include namespace FlexFlow { -namespace Kernels { -namespace Transpose { +namespace Kernels::Transpose { 
void forward_kernel(cudaStream_t stream, TransposeAttrs const &attrs, @@ -18,11 +17,10 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, TransposeAttrs const &attrs, - GenericTensorAccessorW const &in_grad, - GenericTensorAccessorR const &out_grad); + GenericTensorAccessorR const &out_grad, + GenericTensorAccessorW const &in_grad); -} // namespace Transpose -} // namespace Kernels +} // namespace Kernels::Transpose } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc deleted file mode 100644 index 27b7eb390d..0000000000 --- a/lib/kernels/src/accessor.cc +++ /dev/null @@ -1,192 +0,0 @@ -#include "kernels/accessor.h" - -namespace FlexFlow { - -int32_t *GenericTensorAccessorW::get_int32_ptr() const { - return this->get(); -} - -int64_t *GenericTensorAccessorW::get_int64_ptr() const { - return this->get(); -} - -float *GenericTensorAccessorW::get_float_ptr() const { - return this->get(); -} - -double *GenericTensorAccessorW::get_double_ptr() const { - return this->get(); -} - -half *GenericTensorAccessorW::get_half_ptr() const { - return this->get(); -} - -std::string format_as(GenericTensorAccessorW const &a) { - return fmt::format("", - a.data_type, - a.shape, - a.ptr); -} - -std::ostream &operator<<(std::ostream &s, GenericTensorAccessorW const &a) { - return (s << fmt::to_string(a)); -} - -int32_t const *GenericTensorAccessorR::get_int32_ptr() const { - return this->get(); -} - -int64_t const *GenericTensorAccessorR::get_int64_ptr() const { - return this->get(); -} - -float const *GenericTensorAccessorR::get_float_ptr() const { - return this->get(); -} - -double const *GenericTensorAccessorR::get_double_ptr() const { - return this->get(); -} - -half const *GenericTensorAccessorR::get_half_ptr() const { - return get(); -} - -std::string format_as(GenericTensorAccessorR const &a) { - return fmt::format("", - a.data_type, - a.shape, - 
a.ptr); -} - -std::ostream &operator<<(std::ostream &s, GenericTensorAccessorR const &a) { - return (s << fmt::to_string(a)); -} - -int32_t *get_int32_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -int64_t *get_int64_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -float *get_float_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -double *get_double_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -half *get_half_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -std::vector - get_int32_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_int64_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_float_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_double_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_half_ptrs(std::vector const &a) { - return get(a); -} - -int32_t const *get_int32_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -int64_t const *get_int64_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -float const *get_float_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -double const *get_double_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -half const *get_half_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -std::vector - get_int32_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_int64_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_float_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_double_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_half_ptrs(std::vector const &a) { - return get(a); -} - -GenericTensorAccessorR read_only_accessor_from_write_accessor( - GenericTensorAccessorW const &writable) { - return GenericTensorAccessorR{ - writable.data_type, writable.shape, req(writable.ptr)}; -} - -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - 
GenericTensorAccessorW const &acc2) { - return acc1.shape == acc2.shape && acc1.data_type == acc2.data_type; -} - -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype) { - return accessor.shape == expected_shape && - accessor.data_type == expected_dtype; -} - -bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype) { - return accessor.shape == expected_shape && - accessor.data_type == expected_dtype; -} - -std::pair - get_shape_and_datatype(GenericTensorAccessorR const &accessor) { - return std::make_pair(accessor.shape, accessor.data_type); -} - -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor) { - return std::make_pair(accessor.shape, accessor.data_type); -} - -} // namespace FlexFlow diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc deleted file mode 100644 index d666592e77..0000000000 --- a/lib/kernels/src/allocation.cc +++ /dev/null @@ -1,21 +0,0 @@ -#include "kernels/allocation.h" -#include "op-attrs/tensor_shape.h" - -namespace FlexFlow { - -void *Allocator::allocate(size_t mem_size) { - return this->i_allocator->allocate(mem_size); -} - -void Allocator::deallocate(void *ptr) { - this->i_allocator->deallocate(ptr); -} - -GenericTensorAccessorW - Allocator::allocate_tensor(TensorShape const &tensor_shape) { - void *ptr = - this->allocate(get_size_in_bytes(tensor_shape).unwrap_nonnegative()); - return {tensor_shape.data_type, tensor_shape, ptr}; -} - -} // namespace FlexFlow diff --git a/lib/kernels/src/cpu/ops/cast_kernels.cc b/lib/kernels/src/cpu/ops/cast_kernels.cc new file mode 100644 index 0000000000..cdd57b8947 --- /dev/null +++ b/lib/kernels/src/cpu/ops/cast_kernels.cc @@ -0,0 +1,51 @@ +#include "kernels/cast_kernels_cpu.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow::Kernels::Cast { + +template +void 
cpu_cast_forward(IDT const *input, ODT *output, size_t volume) { + for (size_t i = 0; i < volume; ++i) { + output[i] = static_cast(input[i]); + } +} + +template +void cpu_cast_backward(IDT const *input, ODT *output, size_t volume, ODT beta) { + for (size_t i = 0; i < volume; i++) { + output[i] = static_cast(input[i]) + beta * output[i]; + } +} + +template +struct CPUForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + size_t volume = input.shape.get_volume().unwrap_nonnegative(); + cpu_cast_forward(input.get(), output.get(), volume); + } +}; + +template +struct CPUBackwardKernel { + void operator()(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + size_t volume = output.shape.get_volume().unwrap_nonnegative(); + cpu_cast_backward( + output.get(), input.get(), volume, cast_to(1.0f)); + } +}; + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + DataTypeDispatch2{}( + input.data_type, output.data_type, input, output); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + DataTypeDispatch2{}( + output.data_type, input.data_type, output, input); +} + +} // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/src/cpu/ops/combine_kernels.cc b/lib/kernels/src/cpu/ops/combine_kernels.cc new file mode 100644 index 0000000000..577984f21a --- /dev/null +++ b/lib/kernels/src/cpu/ops/combine_kernels.cc @@ -0,0 +1,39 @@ +#include "kernels/combine_kernels_cpu.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow::Kernels::Combine { + +template +struct CPUForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + memcpy(output.get
(), + input.get
(), + input.shape.get_volume().unwrap_nonnegative() * + size_of_datatype(DT).unwrap_nonnegative()); + } +}; + +template +struct CPUBackwardKernel { + void operator()(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { + size_t num_elements = output_grad.shape.get_volume().unwrap_nonnegative(); + for (int i = 0; i < num_elements; ++i) { + input_grad.get
()[i] += output_grad.get
()[i]; + } + } +}; + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + DataTypeDispatch1{}(input.data_type, input, output); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { + DataTypeDispatch1{}( + input_grad.data_type, output_grad, input_grad); +} + +} // namespace FlexFlow::Kernels::Combine diff --git a/lib/kernels/src/cpu/initializer_kernels.cc b/lib/kernels/src/cpu/ops/initializer_kernels.cc similarity index 100% rename from lib/kernels/src/cpu/initializer_kernels.cc rename to lib/kernels/src/cpu/ops/initializer_kernels.cc diff --git a/lib/kernels/src/cpu/ops/replicate_kernels.cc b/lib/kernels/src/cpu/ops/replicate_kernels.cc new file mode 100644 index 0000000000..798a4ea8c7 --- /dev/null +++ b/lib/kernels/src/cpu/ops/replicate_kernels.cc @@ -0,0 +1,51 @@ +#include "kernels/datatype_dispatch.h" +#include "kernels/replicate_kernels_cpu.h" + +namespace FlexFlow::Kernels::Replicate { + +template +struct CPUForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output) { + memcpy(output.get
(), + input.get
(), + input.shape.num_elements().unwrap_nonnegative() * + size_of_datatype(DT).unwrap_nonnegative()); + } +}; + +template +struct CPUBackwardKernel { + void operator()(GenericTensorAccessorR const &output, + GenericTensorAccessorW &input, + nonnegative_int num_elements, + nonnegative_int num_replicas) { + using T = real_type_t
; + + for (nonnegative_int i : nonnegative_range(num_elements)) { + T cur_sum = 0; + for (nonnegative_int replica_idx : nonnegative_range(num_replicas)) { + cur_sum += output.at
(LegionOrdered{replica_idx, i}); + } + input.at
(LegionOrdered{i}) = cur_sum; + } + } +}; + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output) { + DataTypeDispatch1{}(input.data_type, input, output); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW &input, + size_t num_replicas) { + nonnegative_int num_elements = input.shape.num_elements(); + DataTypeDispatch1{}(input.data_type, + output, + input, + num_elements, + nonnegative_int{num_replicas}); +} + +} // namespace FlexFlow::Kernels::Replicate diff --git a/lib/kernels/src/cpu/ops/reverse_kernels.cc b/lib/kernels/src/cpu/ops/reverse_kernels.cc new file mode 100644 index 0000000000..4d9eb8cc09 --- /dev/null +++ b/lib/kernels/src/cpu/ops/reverse_kernels.cc @@ -0,0 +1,46 @@ +#include "kernels/datatype_dispatch.h" +#include "kernels/reverse_kernels_cpu.h" +#include + +namespace FlexFlow::Kernels::Reverse { + +template +struct CPUReverseForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + ReverseAttrs const &attrs) { + nonnegative_int reverse_axis_size = input.shape.at(attrs.axis); + + for (ArrayCoord const &input_coord : get_array_coord_set(input.shape)) { + nonnegative_int input_reverse_axis_coord = + input_coord.ff_ordered.at(attrs.axis); + + ArrayCoord output_coord = input_coord; + output_coord.ff_ordered.at(attrs.axis) = + nonnegative_int{reverse_axis_size.unwrap_nonnegative() - + input_reverse_axis_coord.unwrap_nonnegative() - 1}; + + output.at
(output_coord.ff_ordered) = + input.at
(input_coord.ff_ordered); + } + } +}; + +void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW &output_accessor, + ReverseAttrs const &attrs) { + + DataTypeDispatch1{}( + input_accessor.data_type, input_accessor, output_accessor, attrs); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad_accessor, + GenericTensorAccessorW &input_grad_accessor, + ReverseAttrs const &attrs) { + DataTypeDispatch1{}(output_grad_accessor.data_type, + output_grad_accessor, + input_grad_accessor, + attrs); +} + +} // namespace FlexFlow::Kernels::Reverse diff --git a/lib/kernels/src/cuda/cuda_helper.cu b/lib/kernels/src/cuda/cuda_helper.cu index 66388c0ec8..86b2d8a437 100644 --- a/lib/kernels/src/cuda/cuda_helper.cu +++ b/lib/kernels/src/cuda/cuda_helper.cu @@ -1,4 +1,4 @@ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "utils/containers/reversed.h" @@ -29,13 +29,13 @@ cudaError_t get_legion_stream(cudaStream_t *stream) { #error "Unknown device, please make sure if CUDA is enabled" #endif -__global__ void scale_kernel(float *ptr, coord_t size, float a, float b) { +__global__ void scale_kernel(float *ptr, size_t size, float a, float b) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = (b - a) * ptr[i] + a; } } -__global__ void ones_kernel(float *ptr, coord_t size) { +__global__ void ones_kernel(float *ptr, size_t size) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = 1.0f; } @@ -49,7 +49,7 @@ __global__ void assign_kernel(DT *ptr, size_t size, DT value) { } template -__global__ void copy_kernel(DT *dst, const DT *src, coord_t size) { +__global__ void copy_kernel(DT *dst, const DT *src, size_t size) { CUDA_KERNEL_LOOP(i, size) { dst[i] = src[i]; } @@ -281,11 +281,11 @@ template __global__ void add_kernel(bool *dst, bool const *src, unsigned long size); template __global__ void - copy_kernel(float *dst, float const *src, coord_t size); + copy_kernel(float *dst, float const *src, size_t size); template 
__global__ void - copy_kernel(int32_t *dst, int32_t const *src, coord_t size); + copy_kernel(int32_t *dst, int32_t const *src, size_t size); template __global__ void - copy_kernel(int64_t *dst, int64_t const *src, coord_t size); + copy_kernel(int64_t *dst, int64_t const *src, size_t size); template __global__ void apply_add_with_scale(float *data_ptr, float const *grad_ptr, diff --git a/lib/kernels/src/cuda/embedding_kernels.cu b/lib/kernels/src/cuda/embedding_kernels.cu index e6a614ba70..cb84f0e777 100644 --- a/lib/kernels/src/cuda/embedding_kernels.cu +++ b/lib/kernels/src/cuda/embedding_kernels.cu @@ -13,16 +13,15 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/embedding_kernels.h" -namespace FlexFlow { -namespace Kernels { -namespace Embedding { +namespace FlexFlow::Kernels::Embedding { void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p) { cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); // Randomly initialize the intput tensor to avoid out of index range issues rand_generate_int<<>>( @@ -31,36 +30,14 @@ void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p) { void rand_generate_int32_wrapper(int32_t *ptr, size_t size, int32_t p) { cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); // Randomly initialize the intput tensor to avoid out of index range issues rand_generate_int<<>>( ptr, size, p); } -template -__global__ void embed_forward_no_aggr( - TI const *input, TD *output, TD const *embed, int out_dim, int batch_size); -template -__global__ void embed_forward_with_aggr(TI const *input, - TD *output, - TD const *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr); -template -__global__ void embed_backward_no_aggr( - TI const *input, TD const *output, TD *embed, int out_dim, int batch_size); -template -__global__ void embed_backward_with_aggr(TI const *input, - TD const *output, - 
TD *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr); - -template +template __global__ void embed_forward_no_aggr(int32_t const *input, TD *output, TD const *embed, @@ -75,7 +52,7 @@ __global__ void embed_forward_no_aggr(int32_t const *input, } } -template +template __global__ void embed_forward_no_aggr(int64_t const *input, TD *output, TD const *embed, @@ -90,14 +67,14 @@ __global__ void embed_forward_no_aggr(int64_t const *input, } } -template +template __global__ void embed_forward_with_aggr(int32_t const *input, TD *output, TD const *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { output[i] = 0; @@ -115,14 +92,14 @@ __global__ void embed_forward_with_aggr(int32_t const *input, } } -template +template __global__ void embed_forward_with_aggr(int64_t const *input, TD *output, TD const *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { output[i] = 0; @@ -140,7 +117,7 @@ __global__ void embed_forward_with_aggr(int64_t const *input, } } -template +template __global__ void embed_backward_no_aggr(int32_t const *input, TD const *output, TD *embed, @@ -154,7 +131,7 @@ __global__ void embed_backward_no_aggr(int32_t const *input, } } -template +template __global__ void embed_backward_no_aggr(int64_t const *input, TD const *output, TD *embed, @@ -171,11 +148,11 @@ __global__ void embed_backward_no_aggr(int64_t const *input, // Specialization for half type template <> -__global__ void embed_backward_no_aggr(int32_t const *input, - half const *output, - half *embed, - int out_dim, - int batch_size) { +__global__ void embed_backward_no_aggr(int32_t const *input, + half const *output, + half *embed, + int out_dim, + int batch_size) { CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; int off = i % out_dim; @@ 
-192,11 +169,11 @@ __global__ void embed_backward_no_aggr(int32_t const *input, } template <> -__global__ void embed_backward_no_aggr(int64_t const *input, - half const *output, - half *embed, - int out_dim, - int batch_size) { +__global__ void embed_backward_no_aggr(int64_t const *input, + half const *output, + half *embed, + int out_dim, + int batch_size) { CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; int off = i % out_dim; @@ -212,14 +189,14 @@ __global__ void embed_backward_no_aggr(int64_t const *input, } } -template +template __global__ void embed_backward_with_aggr(int32_t const *input, TD const *output, TD *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -238,14 +215,14 @@ __global__ void embed_backward_with_aggr(int32_t const *input, } } -template +template __global__ void embed_backward_with_aggr(int64_t const *input, TD const *output, TD *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -267,14 +244,13 @@ __global__ void embed_backward_with_aggr(int64_t const *input, // Specialization for half type template <> -__global__ void - embed_backward_with_aggr(int32_t const *input, - half const *output, - half *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr) { +__global__ void embed_backward_with_aggr(int32_t const *input, + half const *output, + half *embed, + int out_dim, + int in_dim, + int batch_size, + AggregateOp aggr) { half scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -301,14 +277,13 @@ __global__ void } template <> -__global__ void - embed_backward_with_aggr(int64_t const *input, - half const *output, - half *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional 
aggr) { +__global__ void embed_backward_with_aggr(int64_t const *input, + half const *output, + half *embed, + int out_dim, + int in_dim, + int batch_size, + AggregateOp aggr) { half scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -351,35 +326,229 @@ struct ForwardKernel { int in_dim, int out_dim, int batch_size) { - assert(input.data_type == DataType::INT32 || - input.data_type == DataType::INT64); - assert(weight.data_type == DataType::HALF || - weight.data_type == DataType::FLOAT || - weight.data_type == DataType::DOUBLE); + throw mk_runtime_error(fmt::format( + "Invalid type combination: input type {} and output type {}", TI, TD)); + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { if (!aggr.has_value()) { - embed_forward_no_aggr, real_type_t> - << + <<>>(input.get(), - output.get(), - weight.get(), + stream>>>(input.get(), + output.get(), + weight.get(), out_dim, batch_size); } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); - embed_forward_with_aggr, real_type_t> - << + <<>>(input.get(), - output.get(), - weight.get(), + stream>>>(input.get(), + output.get(), + 
weight.get(), out_dim, in_dim, batch_size, - aggr); + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t 
stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); } } }; @@ -388,39 +557,229 @@ template struct BackwardKernel { void operator()(cudaStream_t stream, std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + throw mk_runtime_error(fmt::format( + "Invalid type combination: input type {} and output type {}", TI, TD)); + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + 
weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const 
&weight_grad, int in_dim, int out_dim, int batch_size) { - assert(input.data_type == DataType::INT32 || - input.data_type == DataType::INT64); - assert(output.data_type == DataType::HALF || - output.data_type == DataType::FLOAT || - output.data_type == DataType::DOUBLE); if (!aggr.has_value()) { - embed_backward_no_aggr, real_type_t> - << + <<>>(input.get(), - output.get(), - weight_grad.get(), + stream>>>(input.get(), + output.get(), + weight_grad.get(), out_dim, batch_size); } else { - embed_backward_with_aggr, real_type_t> - << + <<>>(input.get(), - output.get(), - weight_grad.get(), + stream>>>(input.get(), + output.get(), + weight_grad.get(), out_dim, in_dim, batch_size, - aggr); + aggr.value()); } } }; @@ -448,27 +807,25 @@ void forward_kernel(ffStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorR const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, - DataType input_data_type, DataType output_data_type, + DataType input_data_type, std::optional aggr, int in_dim, int out_dim, int batch_size) { - DataTypeDispatch2{}(input_data_type, - output_data_type, + DataTypeDispatch2{}(output_data_type, + input_data_type, stream, aggr, - input, output, + input, weight_grad, in_dim, out_dim, batch_size); } -} // namespace Embedding -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Embedding diff --git a/lib/kernels/src/cuda/loss_function_kernels.cu b/lib/kernels/src/cuda/loss_function_kernels.cu index 6c22efda21..2fccf4b48f 100644 --- a/lib/kernels/src/cuda/loss_function_kernels.cu +++ b/lib/kernels/src/cuda/loss_function_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/loss_function_kernels.h" namespace FlexFlow { diff --git a/lib/kernels/src/cuda/metrics_functions.cu b/lib/kernels/src/cuda/metrics_functions.cu index 2e037eb472..54ecd076f6 100644 --- a/lib/kernels/src/cuda/metrics_functions.cu +++ b/lib/kernels/src/cuda/metrics_functions.cu @@ -13,17 +13,42 @@ * limitations under the License. */ -#include "flexflow/model.h" -#include "flexflow/utils/cuda_helper.h" +#include "internal/device.h" +#include "kernels/metrics_kernels.h" +#include "kernels/perf_metrics.h" +#include "pcg/metric_attrs.h" namespace FlexFlow { +struct CUDAPerfMetrics { + int train_all; + int train_correct; + float cce_loss; + float sparse_cce_loss; + float mse_loss; + float rmse_loss; + float mae_loss; + double start_time; + double current_time; + + CUDAPerfMetrics() = delete; + CUDAPerfMetrics(PerfMetrics const &perf) + : train_all(perf.train_all), + train_correct(perf.train_correct.value_or(-1)), + cce_loss(perf.cce_loss.value_or(-1)), + sparse_cce_loss(perf.sparse_cce_loss.value_or(-1)), + mse_loss(perf.mse_loss.value_or(-1)), + rmse_loss(perf.rmse_loss.value_or(-1)), + mae_loss(perf.mae_loss.value_or(-1)), start_time(perf.start_time), + current_time(perf.current_time) {} +}; + float const LOG_MIN_VALUE = 0.00000001f; __global__ void update_metrics_sparse_label_kernel(float const *logits, int const *labels, - PerfMetrics *perf, - const Metrics metrics, + CUDAPerfMetrics *perf, + const MetricsAttrs metrics, int num_samples, int num_classes) { CUDA_KERNEL_LOOP(b, num_samples) { @@ -72,8 +97,8 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, __global__ void update_metrics_label_kernel(float const *logits, float const *labels, - PerfMetrics *perf, - const Metrics metrics, + CUDAPerfMetrics *perf, + const MetricsAttrs metrics, int num_samples, int num_classes) { CUDA_KERNEL_LOOP(b, num_samples) { @@ -136,17 +161,17 @@ __global__ void 
update_metrics_label_kernel(float const *logits, } } -void Metrics::update_metrics_sparse_label_kernel_wrapper( - float const *logit_ptr, - int const *label_ptr, - Metrics const *me, - int num_effective_samples, - int num_classes, - PerfMetrics &perf_zc) { - PerfMetrics *perf; - checkCUDA(cudaMalloc(&perf, sizeof(PerfMetrics))); - checkCUDA( - cudaMemcpy(perf, &perf_zc, sizeof(PerfMetrics), cudaMemcpyHostToDevice)); +void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, + int const *label_ptr, + MetricsAttrs const &me, + int num_effective_samples, + int num_classes, + PerfMetrics &perf_zc) { + CUDAPerfMetrics perf(perf_zc); + CUDAPerfMetrics *perf_cuda; + checkCUDA(cudaMalloc(&perf_cuda, sizeof(CUDAPerfMetrics))); + checkCUDA(cudaMemcpy( + perf_cuda, &perf, sizeof(CUDAPerfMetrics), cudaMemcpyHostToDevice)); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -154,32 +179,33 @@ void Metrics::update_metrics_sparse_label_kernel_wrapper( CUDA_NUM_THREADS, 0, stream>>>( - logit_ptr, label_ptr, perf, *me, num_effective_samples, num_classes); + logit_ptr, label_ptr, perf_cuda, me, num_effective_samples, num_classes); checkCUDA(cudaStreamSynchronize(stream)); - checkCUDA( - cudaMemcpy(&perf_zc, perf, sizeof(PerfMetrics), cudaMemcpyDeviceToHost)); - checkCUDA(cudaFree(perf)); + checkCUDA(cudaMemcpy( + &perf, perf_cuda, sizeof(CUDAPerfMetrics), cudaMemcpyDeviceToHost)); + checkCUDA(cudaFree(perf_cuda)); } -void Metrics::update_metrics_label_kernel_wrapper(float const *logit_ptr, - float const *label_ptr, - Metrics const *me, - int num_samples, - int num_classes, - PerfMetrics &perf_zc) { - PerfMetrics *perf; - checkCUDA(cudaMalloc(&perf, sizeof(PerfMetrics))); - checkCUDA( - cudaMemcpy(perf, &perf_zc, sizeof(PerfMetrics), cudaMemcpyHostToDevice)); +void update_metrics_label_kernel_wrapper(float const *logit_ptr, + float const *label_ptr, + MetricsAttrs const &me, + int num_samples, + int num_classes, + PerfMetrics &perf_zc) { + CUDAPerfMetrics 
perf(perf_zc); + CUDAPerfMetrics *perf_cuda; + checkCUDA(cudaMalloc(&perf_cuda, sizeof(CUDAPerfMetrics))); + checkCUDA(cudaMemcpy( + perf_cuda, &perf, sizeof(CUDAPerfMetrics), cudaMemcpyHostToDevice)); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); update_metrics_label_kernel<<>>( - logit_ptr, label_ptr, perf, *me, num_samples, num_classes); + logit_ptr, label_ptr, perf_cuda, me, num_samples, num_classes); checkCUDA(cudaStreamSynchronize(stream)); - checkCUDA( - cudaMemcpy(&perf_zc, perf, sizeof(PerfMetrics), cudaMemcpyDeviceToHost)); - checkCUDA(cudaFree(perf)); + checkCUDA(cudaMemcpy( + &perf, perf_cuda, sizeof(CUDAPerfMetrics), cudaMemcpyDeviceToHost)); + checkCUDA(cudaFree(perf_cuda)); } }; // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/attention_kernels.cu b/lib/kernels/src/cuda/ops/attention_kernels.cu index 38c32ad9e4..e5bdb6f21d 100644 --- a/lib/kernels/src/cuda/ops/attention_kernels.cu +++ b/lib/kernels/src/cuda/ops/attention_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/attention_kernels.h" #include "kernels/device.h" diff --git a/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu b/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu index eb23514c5f..348eed9f0c 100644 --- a/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu +++ b/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/batch_matmul_kernels.h" namespace FlexFlow { diff --git a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu index 4e153a028e..ceb3a1b3d9 100644 --- a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu +++ b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/allocation.h" #include "kernels/batch_norm_kernels.h" #include "kernels/ff_handle.h" @@ -53,9 +53,9 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, BatchNormPerDeviceState const &m, - float const *input_ptr, - float *output_grad_ptr, float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, float *input_grad_ptr, float const *scale_ptr, float *scale_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/cast_kernels.cu b/lib/kernels/src/cuda/ops/cast_kernels.cu index fe7aec68b9..f3ea6db660 100644 --- a/lib/kernels/src/cuda/ops/cast_kernels.cu +++ b/lib/kernels/src/cuda/ops/cast_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/cast_kernels.h" #include "kernels/datatype_dispatch.h" @@ -50,30 +50,26 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - size_t volume = input.shape.get_volume().unwrap_nonnegative(); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + size_t volume = output.shape.get_volume().unwrap_nonnegative(); cast_backward<<>>( - input.get(), output.get(), volume, cast_to(1.0f)); + output.get(), input.get(), volume, cast_to(1.0f)); } }; void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { + GenericTensorAccessorW const &output) { DataTypeDispatch2{}( - input_type, output_type, stream, input, output); + input.data_type, output.data_type, stream, input, output); } void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { + GenericTensorAccessorR const &output, + 
GenericTensorAccessorW const &input) { DataTypeDispatch2{}( - input_type, output_type, stream, input, output); + output.data_type, input.data_type, stream, output, input); } } // namespace Cast diff --git a/lib/kernels/src/cuda/ops/combine_kernels.cu b/lib/kernels/src/cuda/ops/combine_kernels.cu index 7cc67ceed8..08cc343fd2 100644 --- a/lib/kernels/src/cuda/ops/combine_kernels.cu +++ b/lib/kernels/src/cuda/ops/combine_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/accessor.h" #include "kernels/combine_kernels.h" #include "kernels/datatype_dispatch.h" diff --git a/lib/kernels/src/cuda/ops/concat_kernels.cu b/lib/kernels/src/cuda/ops/concat_kernels.cu index 2715ff16e9..37dbbe12f8 100644 --- a/lib/kernels/src/cuda/ops/concat_kernels.cu +++ b/lib/kernels/src/cuda/ops/concat_kernels.cu @@ -13,50 +13,58 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/concat_kernels.h" #include -namespace FlexFlow { -namespace Kernels { -namespace Concat { +namespace FlexFlow::Kernels::Concat { void calc_blk_size(size_t &num_blocks, size_t &blk_size, ArrayShape const &shape, ff_dim_t axis) { - blk_size = shape.sub_shape(legion_dim_t{0_n}, axis) + legion_dim_t legion_axis = legion_dim_from_ff_dim(axis, shape.num_dims()); + assert(legion_axis.value < shape.num_dims()); + if (legion_axis.value == 0_n) { + legion_axis.value = 1_n; + } + blk_size = shape.sub_shape(legion_dim_t{0_n}, legion_axis) .num_elements() .unwrap_nonnegative(); - num_blocks = - shape.sub_shape(axis, std::nullopt).num_elements().unwrap_nonnegative(); + num_blocks = shape.sub_shape(legion_axis, std::nullopt) + .num_elements() + .unwrap_nonnegative(); } void forward_kernel(cudaStream_t stream, GenericTensorAccessorW const &output, std::vector const &inputs, ff_dim_t axis) { - size_t num_blocks = 1, output_blk_size = 1, input_blk_sizes[MAX_NUM_INPUTS]; - int num_inputs = 
inputs.size(); - assert(num_inputs <= MAX_NUM_INPUTS); + assert(inputs.size() <= MAX_NUM_INPUTS); + size_t num_blocks = 1, output_blk_size = 1; calc_blk_size(num_blocks, output_blk_size, output.shape, axis); - for (int i = 0; i < num_inputs; i++) { - size_t input_num_blocks = 1; - calc_blk_size(input_num_blocks, input_blk_sizes[i], inputs[i].shape, axis); - assert(input_num_blocks == num_blocks); - } - off_t offset = 0; - for (int i = 0; i < num_inputs; i++) { - copy_with_stride<<>>(output.get_float_ptr() + offset, - inputs[i].get_float_ptr(), - num_blocks, + input.get_float_ptr(), + blocks_to_copy, output_blk_size, - input_blk_sizes[i]); - offset += input_blk_sizes[i]; + input_blk_size); + + offset += (output_blk_size == input_blk_size) + ? input_blk_size * input_num_blocks + : input_blk_size; } } @@ -64,32 +72,32 @@ void backward_kernel(cudaStream_t stream, GenericTensorAccessorR const &output_grad, std::vector const &input_grads, ff_dim_t axis) { - size_t num_blocks = 1, output_blk_size = 1, input_blk_sizes[MAX_NUM_INPUTS]; - int num_inputs = input_grads.size(); - assert(num_inputs <= MAX_NUM_INPUTS); - + assert(input_grads.size() <= MAX_NUM_INPUTS); + size_t num_blocks = 1, output_blk_size = 1; calc_blk_size(num_blocks, output_blk_size, output_grad.shape, axis); - for (int i = 0; i < num_inputs; i++) { - ArrayShape shape = input_grads[i].shape; - size_t input_num_blocks = 1; - calc_blk_size(input_num_blocks, input_blk_sizes[i], shape, axis); - assert(input_num_blocks == num_blocks); - } - off_t offset = 0; - for (int i = 0; i < num_inputs; i++) { - add_with_stride<<>>(input_grads[i].get_float_ptr(), + stream>>>(input_grad.get_float_ptr(), output_grad.get_float_ptr() + offset, - num_blocks, - input_blk_sizes[i], + blocks_to_add, + input_blk_size, output_blk_size); - offset += input_blk_sizes[i]; + + offset += (output_blk_size == input_blk_size) + ? 
input_blk_size * input_num_blocks + : input_blk_size; } } -} // namespace Concat -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Concat diff --git a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu index dac55539d2..16db62a57f 100644 --- a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu @@ -1,4 +1,4 @@ -#include "device.h" +#include "internal/device.h" #include "kernels/conv_2d_kernels.h" namespace FlexFlow { @@ -313,10 +313,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, Conv2DPerDeviceState const &m, - float const *input_ptr, - float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, float const *filter_ptr, float *filter_grad_ptr, float *bias_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/dropout_kernels.cu b/lib/kernels/src/cuda/ops/dropout_kernels.cu index adf0cd8e89..c5fa56bc78 100644 --- a/lib/kernels/src/cuda/ops/dropout_kernels.cu +++ b/lib/kernels/src/cuda/ops/dropout_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/dropout_kernels.h" #include "kernels/ff_handle.h" diff --git a/lib/kernels/src/cuda/ops/element_binary_kernels.cu b/lib/kernels/src/cuda/ops/element_binary_kernels.cu index 44273a323f..3a4a77b3dd 100644 --- a/lib/kernels/src/cuda/ops/element_binary_kernels.cu +++ b/lib/kernels/src/cuda/ops/element_binary_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/element_binary_kernels.h" #include "kernels/ff_handle.h" #include "op-attrs/datatype.h" diff --git a/lib/kernels/src/cuda/ops/element_unary_kernels.cu b/lib/kernels/src/cuda/ops/element_unary_kernels.cu index 056c80ecf6..218e74b939 100644 --- a/lib/kernels/src/cuda/ops/element_unary_kernels.cu +++ b/lib/kernels/src/cuda/ops/element_unary_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/element_unary_kernels.h" #include "op-attrs/get_op_type.h" @@ -290,10 +290,10 @@ struct BackwardKernel { OperatorType op_type, std::optional scalar, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad) { checkCUDNN(cudnnSetStream(handle.dnn, stream)); if (use_cudnn(op_type)) { @@ -356,20 +356,20 @@ void backward_kernel(ffStream_t stream, ElementUnaryPerDeviceState const &device_state, ElementUnaryAttrs const &attrs, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad) { DataTypeDispatch1{}(input.data_type, stream, device_state, get_op_type(attrs), attrs.scalar, handle, - input, - input_grad, output, - output_grad); + output_grad, + input, + input_grad); } } // namespace ElementUnary diff --git a/lib/kernels/src/cuda/ops/flat_kernels.cu b/lib/kernels/src/cuda/ops/flat_kernels.cu index 973d05f596..594a183ff0 100644 --- 
a/lib/kernels/src/cuda/ops/flat_kernels.cu +++ b/lib/kernels/src/cuda/ops/flat_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/accessor.h" #include "kernels/flat_kernels.h" @@ -35,8 +35,8 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, GenericTensorAccessorR input, - float *input_grad_ptr, - float const *output_grad_ptr) { + float const *output_grad_ptr, + float *input_grad_ptr) { float alpha = 1.0f; apply_add_with_scale diff --git a/lib/kernels/src/cuda/ops/gather_kernels.cu b/lib/kernels/src/cuda/ops/gather_kernels.cu index 31c1bac217..19e495a540 100644 --- a/lib/kernels/src/cuda/ops/gather_kernels.cu +++ b/lib/kernels/src/cuda/ops/gather_kernels.cu @@ -13,14 +13,12 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/device.h" #include "kernels/gather_kernels.h" -namespace FlexFlow { -namespace Kernels { -namespace Gather { +namespace FlexFlow::Kernels::Gather { template __global__ void gather_forward(float const *input, @@ -125,11 +123,15 @@ void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &index, GenericTensorAccessorW const &output) { checkCUDA(get_legion_stream(&stream)); - coord_t stride = - output.shape.sub_shape(std::nullopt, add_to_legion_dim(m.legion_dim, 1)) + output.shape + .sub_shape(legion_dim_t{0_n}, add_to_legion_dim(m.legion_dim, 1)) .num_elements() .unwrap_nonnegative(); + if (m.legion_dim.value == 0_n) { + stride = 1; + } + coord_t output_dim_size = output.shape.at(m.legion_dim).unwrap_nonnegative(); coord_t input_dim_size = input.shape.at(m.legion_dim).unwrap_nonnegative(); @@ -157,9 +159,13 @@ void backward_kernel(ffStream_t stream, coord_t stride = output_grad.shape - .sub_shape(std::nullopt, add_to_legion_dim(m.legion_dim, 1)) - .get_volume() + .sub_shape(legion_dim_t{0_n}, add_to_legion_dim(m.legion_dim, 
1)) + .num_elements() .unwrap_nonnegative(); + if (m.legion_dim.value == 0_n) { + stride = 1; + } + coord_t output_dim_size = output_grad.shape.at(m.legion_dim).unwrap_nonnegative(); coord_t input_dim_size = @@ -180,6 +186,4 @@ void backward_kernel(ffStream_t stream, output_dim_size); } -} // namespace Gather -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Gather diff --git a/lib/kernels/src/cuda/ops/linear_kernels.cu b/lib/kernels/src/cuda/ops/linear_kernels.cu index ca51f0d216..02bda55828 100644 --- a/lib/kernels/src/cuda/ops/linear_kernels.cu +++ b/lib/kernels/src/cuda/ops/linear_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/allocation.h" #include "kernels/linear_kernels.h" #include "utils/integer_conversions.h" @@ -108,10 +108,10 @@ LinearPerDeviceState init_kernel(PerDeviceFFHandle handle, void forward_kernel(cudaStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *output_ptr, - void const *weight_ptr, - void const *bias_ptr, + float const *input_ptr, + float *output_ptr, + float const *weight_ptr, + float const *bias_ptr, int in_dim, int out_dim, int batch_size) { @@ -135,14 +135,14 @@ void forward_kernel(cudaStream_t stream, batch_size, in_dim, &alpha, - weight_ptr, + static_cast(weight_ptr), weight_type, in_dim, - input_ptr, + static_cast(input_ptr), input_type, in_dim, &beta, - output_ptr, + static_cast(output_ptr), output_type, out_dim, compute_type, @@ -156,14 +156,14 @@ void forward_kernel(cudaStream_t stream, batch_size, 1, &alpha, - bias_ptr, + static_cast(bias_ptr), weight_type, 1, - m.one_ptr, + static_cast(m.one_ptr), CUDA_R_32F, 1, &alpha, - output_ptr, + static_cast(output_ptr), output_type, out_dim, compute_type, @@ -174,10 +174,10 @@ void forward_kernel(cudaStream_t stream, m.actiDesc, &alpha, m.outputTensor, - output_ptr, + static_cast(output_ptr), &beta, m.outputTensor, - output_ptr)); + 
static_cast(output_ptr))); } else if (m.activation == Activation::GELU) { size_t elements = size_t_from_int(out_dim) * size_t_from_int(batch_size); constexpr float B = 0.7978845608028654f; // sqrt(2.0/M_PI) @@ -191,13 +191,13 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, - void const *output_ptr, - void *output_grad_ptr, - void const *kernel_ptr, - void *kernel_grad_ptr, - void *bias_grad_ptr, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *kernel_ptr, + float *kernel_grad_ptr, + float *bias_grad_ptr, int in_dim, int out_dim, int batch_size) { @@ -216,11 +216,17 @@ void backward_kernel(cudaStream_t stream, int output_size = out_dim * batch_size; if (m.activation.has_value()) { if (m.activation == Activation::RELU) { - relu_backward_kernel( - m.output_type, output_grad_ptr, output_ptr, output_size, stream); + relu_backward_kernel(m.output_type, + static_cast(output_grad_ptr), + static_cast(output_ptr), + output_size, + stream); } else if (m.activation == Activation::SIGMOID) { - sigmoid_backward_kernel( - m.output_type, output_grad_ptr, output_ptr, output_size, stream); + sigmoid_backward_kernel(m.output_type, + static_cast(output_grad_ptr), + static_cast(output_ptr), + output_size, + stream); } else { // TODO: only support relu and sigmoid for now assert(false && "Unsupported activation for Linear"); @@ -235,14 +241,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - input_ptr, + static_cast(input_ptr), input_type, in_dim, - output_grad_ptr, + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - kernel_grad_ptr, + static_cast(kernel_grad_ptr), weight_type, in_dim, compute_type, @@ -261,12 +267,12 @@ void backward_kernel(cudaStream_t stream, in_dim, out_dim, &alpha, - (float *)kernel_grad_ptr, + kernel_grad_ptr, in_dim, &lambda, - (float 
*)kernel_ptr, + kernel_ptr, in_dim, - (float *)kernel_grad_ptr, + kernel_grad_ptr, in_dim)); } else { assert(false && "Only L2 regularization is supported"); @@ -284,14 +290,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - m.one_ptr, + static_cast(m.one_ptr), CUDA_R_32F, 1, - output_grad_ptr, + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - bias_grad_ptr, + static_cast(bias_grad_ptr), weight_type, 1, compute_type, @@ -307,14 +313,14 @@ void backward_kernel(cudaStream_t stream, batch_size, out_dim, &alpha, - kernel_ptr, + static_cast(kernel_ptr), weight_type, in_dim, - output_grad_ptr, + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - input_grad_ptr, + static_cast(input_grad_ptr), input_type, in_dim, compute_type, diff --git a/lib/kernels/src/cuda/ops/partition_kernels.cu b/lib/kernels/src/cuda/ops/partition_kernels.cu index 2831562f58..b8dfac5204 100644 --- a/lib/kernels/src/cuda/ops/partition_kernels.cu +++ b/lib/kernels/src/cuda/ops/partition_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/partition_kernels.h" @@ -40,8 +40,8 @@ template struct BackwardKernel { void operator()(cudaStream_t stream, RepartitionPerDeviceState const &m, - GenericTensorAccessorW const &input_grad, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { add_kernel> <<{}( - m.data_type, stream, m, input_grad, output_grad); + m.data_type, stream, m, output_grad, input_grad); } } // namespace Repartition diff --git a/lib/kernels/src/cuda/ops/pool_2d_kernels.cu b/lib/kernels/src/cuda/ops/pool_2d_kernels.cu index 51fa29d289..e8ea3f64c2 100644 --- a/lib/kernels/src/cuda/ops/pool_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/pool_2d_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/pool_2d_kernels.h" namespace FlexFlow { @@ -112,10 +112,10 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, Pool2DPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, void const *output_ptr, - void const *output_grad_ptr) { + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); diff --git a/lib/kernels/src/cuda/ops/reduce_kernels.cu b/lib/kernels/src/cuda/ops/reduce_kernels.cu index 02a89da807..563bbae21d 100644 --- a/lib/kernels/src/cuda/ops/reduce_kernels.cu +++ b/lib/kernels/src/cuda/ops/reduce_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/reduce_kernels.h" namespace FlexFlow { diff --git a/lib/kernels/src/cuda/ops/reduction_kernels.cu b/lib/kernels/src/cuda/ops/reduction_kernels.cu index 5d95a3766a..d9c09b082d 100644 --- a/lib/kernels/src/cuda/ops/reduction_kernels.cu +++ b/lib/kernels/src/cuda/ops/reduction_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/reduction_kernels.h" @@ -55,8 +55,8 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { checkCUDA(cudaMemcpyAsync(input.get(), output.get(), input.shape.num_elements().unwrap_nonnegative() * @@ -75,9 +75,9 @@ void forward_kernel(cudaStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { - DataTypeDispatch1{}(input.data_type, stream, input, output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + DataTypeDispatch1{}(output.data_type, stream, output, input); } } // namespace Reduction diff --git a/lib/kernels/src/cuda/ops/replicate_kernels.cu b/lib/kernels/src/cuda/ops/replicate_kernels.cu index 4706f38fd4..4685fd7a2d 100644 --- a/lib/kernels/src/cuda/ops/replicate_kernels.cu +++ b/lib/kernels/src/cuda/ops/replicate_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/replicate_kernels.h" @@ -22,8 +22,8 @@ namespace Kernels { namespace Replicate { template -__global__ void replicate_backward_kernel(T *input_ptr, - T const *output_ptr, +__global__ void replicate_backward_kernel(T const *output_ptr, + T *input_ptr, size_t num_elements, size_t num_replicas) { CUDA_KERNEL_LOOP(i, num_elements) { @@ -38,7 +38,6 @@ struct ForwardKernel { void operator()(cudaStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - checkCUDA(cudaMemcpyAsync((void *)output.get(), (void *)input.get(), input.shape.num_elements().unwrap_nonnegative() * @@ -51,15 +50,15 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas) { size_t total_elements = input.shape.num_elements().unwrap_nonnegative() * num_replicas; replicate_backward_kernel> <<>>( - input.get(), output.get(), + input.get(), input.shape.num_elements().unwrap_nonnegative(), num_replicas); } @@ -72,11 +71,11 @@ void forward_kernel(cudaStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas) { DataTypeDispatch1{}( - input.data_type, stream, input, output, num_replicas); + input.data_type, stream, output, input, num_replicas); } } // namespace Replicate diff --git a/lib/kernels/src/cuda/ops/reshape_kernels.cu b/lib/kernels/src/cuda/ops/reshape_kernels.cu index c5a289ce6b..a6a390b38e 100644 --- a/lib/kernels/src/cuda/ops/reshape_kernels.cu +++ b/lib/kernels/src/cuda/ops/reshape_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/reshape_kernels.h" @@ -43,8 +43,8 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { float alpha = 1.0f; apply_add_with_scale> <<{}(m.data_type, stream, input, output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + DataTypeDispatch1{}(m.data_type, stream, output, input); } } // namespace Reshape diff --git a/lib/kernels/src/cuda/ops/reverse_kernels.cu b/lib/kernels/src/cuda/ops/reverse_kernels.cu index 8391a499df..582aa02386 100644 --- a/lib/kernels/src/cuda/ops/reverse_kernels.cu +++ b/lib/kernels/src/cuda/ops/reverse_kernels.cu @@ -13,13 +13,11 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/reverse_kernels.h" +#include "kernels/reverse_kernels_params.h" -namespace FlexFlow { - -namespace Kernels { -namespace Reverse { +namespace FlexFlow::Kernels::Reverse { __global__ void reverse_forward_kernel(float const *in_ptr, float *out_ptr, @@ -27,23 +25,24 @@ __global__ void reverse_forward_kernel(float const *in_ptr, coord_t reverse_dim_size, coord_t in_blk_size) { CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { + coord_t out_idx = i; coord_t blk_idx = i / (reverse_dim_size * in_blk_size); i = i - blk_idx * (reverse_dim_size * in_blk_size); coord_t reverse_dim_idx = i / in_blk_size; i = i - reverse_dim_idx * in_blk_size; coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + i; - out_ptr[i] = in_ptr[in_idx]; + out_ptr[out_idx] = in_ptr[in_idx]; } } -void forward_kernel(cudaStream_t stream, - float const *in_ptr, - float *out_ptr, - coord_t num_out_blks, - coord_t 
reverse_dim_size, - coord_t in_blk_size, - coord_t output_size) { +static void forward_kernel_internal(cudaStream_t stream, + float const *in_ptr, + float *out_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t output_size) { reverse_forward_kernel<< 0.0f) { - V[i] = V[i] * momentum + gt; - if (nesterov) { - gt = gt + momentum * V[i]; - } else { - gt = V[i]; - } - } - W[i] -= lr * gt; - } -} - -__host__ void SGDOptimizer::ps_update_task_gpu(SGDOptimizer const *op, - float const *w_grad_ptr, - size_t size, - int num_replicas, - float *w_ptr, - float *v_ptr) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - // Step 1: Gather gradients in the first replica - for (int i = 1; i < num_replicas; i++) { - float const *src = w_grad_ptr + i * size; - apply_add_with_scale - <<>>( - (float *)w_grad_ptr, src, size, 1.0f); - } - // checkCUDA(cudaDeviceSynchronize()); - // Step 2: SGD update - sgd_update<<>>( - size, - op->lr, - op->weight_decay, - op->momentum, - op->nesterov, - w_grad_ptr, - v_ptr, - w_ptr); - // checkCUDA(cudaDeviceSynchronize()); -} - -#ifdef FF_USE_NCCL -__host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, - PerDeviceOpState const *meta, - float const *w_grad_ptr, - size_t size, - float *w_ptr, - float *v_ptr) { - // Use NCCL to sync gradients - // fprintf(stderr, "weight(%p) Before ncclAllReduce...\n", w_grad_ptr); - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - checkNCCL(ncclAllReduce(w_grad_ptr, - (float *)w_grad_ptr, - size, - ncclFloat, - ncclSum, - meta->handle.ncclComm, - stream)); - // fprintf(stderr, "weight(%p) After ncclAllReduce...\n", w_grad_ptr); - // print_tensor((float*)w_grad_ptr, 16, "[After ncclAllReduce]"); - - // Step 2: SGD update - sgd_update<<>>( - size, - op->lr, - op->weight_decay, - op->momentum, - op->nesterov, - w_grad_ptr, - v_ptr, - w_ptr); - // checkCUDA(cudaDeviceSynchronize()); -} -#endif - -// 
================================================================== -// Adam Optimizer -// ================================================================== -__global__ void - add_kernel(int count, float scale, float const *src, float *dst) { - CUDA_KERNEL_LOOP(i, count) { - dst[i] += src[i] * scale; - } -} - -__global__ void scale_kernel(int count, float a, float b, float *ptr) { - CUDA_KERNEL_LOOP(i, count) { - ptr[i] = (b - a) * ptr[i] + a; - } -} - -__global__ void adam_update(int count, - float alpha_t, - float beta1, - float beta2, - float weight_decay, - float epsilon, - float const *WGrad, - float *M, - float *V, - float *W) { - // Reference for weight decay - // https://www.fast.ai/2018/07/02/adam-weight-decay/ - CUDA_KERNEL_LOOP(i, count) { - // W[i] -= weight_decay * alpha_t * W[i]; - // float gt = WGrad[i]; - float gt = WGrad[i] + weight_decay * W[i]; - float mt = beta1 * M[i] + (1 - beta1) * gt; - float vt = beta2 * V[i] + (1 - beta2) * gt * gt; - M[i] = mt; - V[i] = vt; - W[i] -= alpha_t * mt / (sqrt(vt) + epsilon); - } -} - -__host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, - float const *w_grad_ptr, - size_t size, - int num_replicas, - float *w_ptr, - float *v_ptr, - float *m_ptr) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - // Step 1: Gather gradients in the first replica - for (int i = 1; i < num_replicas; i++) { - float const *src = w_grad_ptr + i * size; - add_kernel<<>>( - size, 1.0f, src, (float *)w_grad_ptr); - } - // checkCUDA(cudaDeviceSynchronize()); - // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", - // op->alpha, op->alpha_t, op->weight_decay); - // Step 2: Adam update - adam_update<<>>( - size, - op->alpha_t, - op->beta1, - op->beta2, - op->weight_decay, - op->epsilon, - w_grad_ptr, - m_ptr, - v_ptr, - w_ptr); - // checkCUDA(cudaDeviceSynchronize()); -} - -#ifdef FF_USE_NCCL -__host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, - PerDeviceOpState 
const *meta, - float const *w_grad_ptr, - size_t size, - float *w_ptr, - float *v_ptr, - float *m_ptr) { - // Use NCCL to sync gradients - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - checkNCCL(ncclAllReduce(w_grad_ptr, - (float *)w_grad_ptr, - size, - ncclFloat, - ncclSum, - meta->handle.ncclComm, - stream)); - // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", - // op->alpha, op->alpha_t, op->weight_decay); - // Step 2: Adam update - adam_update<<>>( - size, - op->alpha_t, - op->beta1, - op->beta2, - op->weight_decay, - op->epsilon, - w_grad_ptr, - m_ptr, - v_ptr, - w_ptr); - // checkCUDA(cudaDeviceSynchronize()); -} -#endif - -} // namespace FlexFlow diff --git a/lib/kernels/src/cuda/optimizer_kernels.cu b/lib/kernels/src/cuda/optimizer_kernels.cu new file mode 100644 index 0000000000..fe817876ce --- /dev/null +++ b/lib/kernels/src/cuda/optimizer_kernels.cu @@ -0,0 +1,205 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "internal/device.h" +#include "kernels/nccl.h" +#include "kernels/optimizer_kernels.h" +#include "utils/exception.h" + +namespace FlexFlow { + +__global__ void sgd_update(size_t count, + float lr, + float weight_decay, + float momentum, + bool nesterov, + float const *WGrad, + float *V, + float *W) { + // Refernce https://pytorch.org/docs/stable/_modules/torch/optim/sgd.html#SGD + CUDA_KERNEL_LOOP(i, count) { + float gt = WGrad[i] + weight_decay * W[i]; + if (momentum > 0.0f) { + V[i] = V[i] * momentum + gt; + if (nesterov) { + gt = gt + momentum * V[i]; + } else { + gt = V[i]; + } + } + W[i] -= lr * gt; + } +} + +__host__ void sgd_ps_update_task_gpu(ffStream_t stream, + float lr, + float momentum, + bool nesterov, + float weight_decay, + float const *weight_grad_ptr, + size_t size, + int num_replicas, + float *weight_ptr, + float *sgd_v_ptr) { + // Step 1: Gather gradients in the first replica + for (int i = 1; i < num_replicas; i++) { + float const *src = weight_grad_ptr + i * size; + apply_add_with_scale + <<>>( + (float *)weight_grad_ptr, src, size, 1.0f); + } + + // Step 2: SGD update + sgd_update<<>>(size, + lr, + weight_decay, + momentum, + nesterov, + weight_grad_ptr, + sgd_v_ptr, + weight_ptr); +} + +#ifdef FF_USE_NCCL +__host__ void sgd_nccl_update_task_gpu(ffStream_t stream, + float lr, + float momentum, + bool nesterov, + float weight_decay, + PerDeviceFFHandle const &handle, + float const *w_grad_ptr, + size_t size, + float *w_ptr, + float *v_ptr) { + // Step 1: Use NCCL to sync gradients + ncclComm_t comm = handle.ncclComm; + checkNCCL(ncclAllReduce( + w_grad_ptr, (float *)w_grad_ptr, size, ncclFloat, ncclSum, comm, stream)); + + // Step 2: SGD update + sgd_update<<>>( + size, lr, weight_decay, momentum, nesterov, w_grad_ptr, v_ptr, w_ptr); +} +#endif + +// ================================================================== +// Adam Optimizer +// ================================================================== +__global__ void + 
add_kernel(int count, float scale, float const *src, float *dst) { + CUDA_KERNEL_LOOP(i, count) { + dst[i] += src[i] * scale; + } +} + +__global__ void scale_kernel(int count, float a, float b, float *ptr) { + CUDA_KERNEL_LOOP(i, count) { + ptr[i] = (b - a) * ptr[i] + a; + } +} + +__global__ void adam_update(int count, + float alpha_t, + float beta1, + float beta2, + float weight_decay, + float epsilon, + float const *WGrad, + float *M, + float *V, + float *W) { + // Reference for weight decay + // https://www.fast.ai/2018/07/02/adam-weight-decay/ + CUDA_KERNEL_LOOP(i, count) { + // W[i] -= weight_decay * alpha_t * W[i]; + // float gt = WGrad[i]; + float gt = WGrad[i] + weight_decay * W[i]; + float mt = beta1 * M[i] + (1 - beta1) * gt; + float vt = beta2 * V[i] + (1 - beta2) * gt * gt; + M[i] = mt; + V[i] = vt; + W[i] -= alpha_t * mt / (sqrt(vt) + epsilon); + } +} + +__host__ void adam_ps_update_task_gpu(ffStream_t stream, + float alpha_t, + float beta1, + float beta2, + float weight_decay, + float epsilon, + float const *w_grad_ptr, + size_t size, + int num_replicas, + float *w_ptr, + float *v_ptr, + float *m_ptr) { + // Step 1: Gather gradients in the first replica + for (int i = 1; i < num_replicas; i++) { + float const *src = w_grad_ptr + i * size; + add_kernel<<>>( + (float *)w_grad_ptr, src, size); + } + + // Step 2: Adam update + adam_update<<>>(size, + alpha_t, + beta1, + beta2, + weight_decay, + epsilon, + w_grad_ptr, + m_ptr, + v_ptr, + w_ptr); +} + +#ifdef FF_USE_NCCL +__host__ void nccl_update_task_gpu(ffStream_t stream, + float alpha_t, + float beta1, + float beta2, + float weight_decay, + float epsilon, + PerDeviceFFHandle const &handle, + float const *w_grad_ptr, + size_t size, + float *w_ptr, + float *v_ptr, + float *m_ptr) { + // Step 1: Use NCCL to sync gradients + checkNCCL(ncclAllReduce(w_grad_ptr, + (float *)w_grad_ptr, + size, + ncclFloat, + ncclSum, + handle.ncclComm, + stream)); + + // Step 2: Adam update + adam_update<<>>(size, + alpha_t, + 
beta1, + beta2, + weight_decay, + epsilon, + w_grad_ptr, + m_ptr, + v_ptr, + w_ptr); +} +#endif + +} // namespace FlexFlow diff --git a/lib/kernels/src/hip/embedding_kernels.cpp b/lib/kernels/src/hip/embedding_kernels.cpp index 7ca3149f2f..aefe53cc46 100644 --- a/lib/kernels/src/hip/embedding_kernels.cpp +++ b/lib/kernels/src/hip/embedding_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/embedding_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include @@ -364,8 +364,8 @@ struct ForwardKernel { weight.data_type == DataType::FLOAT || weight.data_type == DataType::DOUBLE); - if (aggr == AggregateOp::NONE) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_no_aggr), + if (aggr == AggregateOp::AVG || aggr == AggregateOp::SUM) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_with_aggr), GET_BLOCKS(output.shape.get_volume()), CUDA_NUM_THREADS, 0, @@ -374,10 +374,11 @@ struct ForwardKernel { output.get(), weight.get(), out_dim, - batch_size); + in_dim, + batch_size, + aggr); } else { - assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); - hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_with_aggr), + hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_no_aggr), GET_BLOCKS(output.shape.get_volume()), CUDA_NUM_THREADS, 0, @@ -386,9 +387,7 @@ struct ForwardKernel { output.get(), weight.get(), out_dim, - in_dim, - batch_size, - aggr); + batch_size); } } } @@ -408,8 +407,9 @@ struct BackwardKernel { assert(output.data_type == DataType::HALF || output.data_type == DataType::FLOAT || output.data_type == DataType::DOUBLE); - if (aggr == AggregateOp::NONE) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_no_aggr), + + if (aggr == AggregateOp::AVG || aggr == AggregateOp::SUM) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_with_aggr), GET_BLOCKS(output.shape.get_volume()), CUDA_NUM_THREADS, 0, @@ -418,9 +418,11 @@ struct BackwardKernel { output.get(), weight_grad.get(), out_dim, - batch_size); + in_dim, 
+ batch_size, + aggr); } else { - hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_with_aggr), + hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_no_aggr), GET_BLOCKS(output.shape.get_volume()), CUDA_NUM_THREADS, 0, @@ -429,9 +431,7 @@ struct BackwardKernel { output.get(), weight_grad.get(), out_dim, - in_dim, - batch_size, - aggr); + batch_size); } } } diff --git a/lib/kernels/src/hip/loss_function_kernels.cpp b/lib/kernels/src/hip/loss_function_kernels.cpp index e82b5c96d5..05068f1bd0 100644 --- a/lib/kernels/src/hip/loss_function_kernels.cpp +++ b/lib/kernels/src/hip/loss_function_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/loss_function_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/attention_kernels.cpp b/lib/kernels/src/hip/ops/attention_kernels.cpp index 005cef30d1..b374ead305 100644 --- a/lib/kernels/src/hip/ops/attention_kernels.cpp +++ b/lib/kernels/src/hip/ops/attention_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/attention_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/batch_matmul_kernels.cpp b/lib/kernels/src/hip/ops/batch_matmul_kernels.cpp index c4b3be823f..6d9ae8a268 100644 --- a/lib/kernels/src/hip/ops/batch_matmul_kernels.cpp +++ b/lib/kernels/src/hip/ops/batch_matmul_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/batch_matmul_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/batch_norm_kernels.cpp b/lib/kernels/src/hip/ops/batch_norm_kernels.cpp index 8e94b462cd..764a3e0b58 100644 --- a/lib/kernels/src/hip/ops/batch_norm_kernels.cpp +++ b/lib/kernels/src/hip/ops/batch_norm_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/batch_norm_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/allocation.h" #include "kernels/ff_handle.h" #include diff --git 
a/lib/kernels/src/hip/ops/cast_kernels.cpp b/lib/kernels/src/hip/ops/cast_kernels.cpp index fa0c37ffa1..1035657c04 100644 --- a/lib/kernels/src/hip/ops/cast_kernels.cpp +++ b/lib/kernels/src/hip/ops/cast_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/cast_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/combine_kernels.cpp b/lib/kernels/src/hip/ops/combine_kernels.cpp index aa01f02276..f1e0422747 100644 --- a/lib/kernels/src/hip/ops/combine_kernels.cpp +++ b/lib/kernels/src/hip/ops/combine_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/combine_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/accessor.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/concat_kernels.cpp b/lib/kernels/src/hip/ops/concat_kernels.cpp index aa38be739b..a215d67942 100644 --- a/lib/kernels/src/hip/ops/concat_kernels.cpp +++ b/lib/kernels/src/hip/ops/concat_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/concat_kernels.h" -#include "device.h" +#include "internal/device.h" #include #include diff --git a/lib/kernels/src/hip/ops/conv_2d_kernels.h b/lib/kernels/src/hip/ops/conv_2d_kernels.h index bcf015d561..76a73ab08c 100644 --- a/lib/kernels/src/hip/ops/conv_2d_kernels.h +++ b/lib/kernels/src/hip/ops/conv_2d_kernels.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_HIP_CONV_2D_KERNELS_H #define _FLEXFLOW_KERNELS_HIP_CONV_2D_KERNELS_H -#include "device.h" +#include "kernels/device.h" namespace FlexFlow { namespace Kernels { diff --git a/lib/kernels/src/hip/ops/dropout_kernels.cpp b/lib/kernels/src/hip/ops/dropout_kernels.cpp index baaf8e6902..d85c0ae054 100644 --- a/lib/kernels/src/hip/ops/dropout_kernels.cpp +++ b/lib/kernels/src/hip/ops/dropout_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/dropout_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/ff_handle.h" #include diff --git 
a/lib/kernels/src/hip/ops/element_binary_kernels.cpp b/lib/kernels/src/hip/ops/element_binary_kernels.cpp index bc66bbff2f..9e0452b09b 100644 --- a/lib/kernels/src/hip/ops/element_binary_kernels.cpp +++ b/lib/kernels/src/hip/ops/element_binary_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/element_binary_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/ff_handle.h" #include "op-attrs/datatype.h" #include "op-attrs/operator_type.dtg.h" diff --git a/lib/kernels/src/hip/ops/element_unary_kernels.cpp b/lib/kernels/src/hip/ops/element_unary_kernels.cpp index f4b0ccb82d..163f13a6da 100644 --- a/lib/kernels/src/hip/ops/element_unary_kernels.cpp +++ b/lib/kernels/src/hip/ops/element_unary_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/element_unary_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "op-attrs/get_op_type.h" #include diff --git a/lib/kernels/src/hip/ops/flat_kernels.cpp b/lib/kernels/src/hip/ops/flat_kernels.cpp index 763fb9e322..dedfb4b9a9 100644 --- a/lib/kernels/src/hip/ops/flat_kernels.cpp +++ b/lib/kernels/src/hip/ops/flat_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/flat_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/accessor.h" #include diff --git a/lib/kernels/src/hip/ops/gather_kernels.cpp b/lib/kernels/src/hip/ops/gather_kernels.cpp index 17c0014e98..6e9e4c6a2c 100644 --- a/lib/kernels/src/hip/ops/gather_kernels.cpp +++ b/lib/kernels/src/hip/ops/gather_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/gather_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/partition_kernels.cpp b/lib/kernels/src/hip/ops/partition_kernels.cpp index 4591247faa..26748a7e45 100644 --- a/lib/kernels/src/hip/ops/partition_kernels.cpp +++ b/lib/kernels/src/hip/ops/partition_kernels.cpp @@ -14,7 +14,7 @@ */ #include 
"kernels/partition_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/pool_2d_kernels.cpp b/lib/kernels/src/hip/ops/pool_2d_kernels.cpp index ed942c105c..7e5ae2ab80 100644 --- a/lib/kernels/src/hip/ops/pool_2d_kernels.cpp +++ b/lib/kernels/src/hip/ops/pool_2d_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/pool_2d_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/reduce_kernels.cpp b/lib/kernels/src/hip/ops/reduce_kernels.cpp index 468543dd5b..c0bcc84d48 100644 --- a/lib/kernels/src/hip/ops/reduce_kernels.cpp +++ b/lib/kernels/src/hip/ops/reduce_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/reduce_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/replicate_kernels.cpp b/lib/kernels/src/hip/ops/replicate_kernels.cpp index 8d27bb1908..ee7bf701c0 100644 --- a/lib/kernels/src/hip/ops/replicate_kernels.cpp +++ b/lib/kernels/src/hip/ops/replicate_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/replicate_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/reshape_kernels.cpp b/lib/kernels/src/hip/ops/reshape_kernels.cpp index 47978a5f4a..810b929e24 100644 --- a/lib/kernels/src/hip/ops/reshape_kernels.cpp +++ b/lib/kernels/src/hip/ops/reshape_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/reshape_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/reverse_kernels.cpp b/lib/kernels/src/hip/ops/reverse_kernels.cpp index 03e97245bf..a56ff3540a 100644 --- a/lib/kernels/src/hip/ops/reverse_kernels.cpp +++ b/lib/kernels/src/hip/ops/reverse_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/reverse_kernels.h" -#include "device.h" 
+#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/softmax_kernels.cpp b/lib/kernels/src/hip/ops/softmax_kernels.cpp index 3a8f2813b7..610675850b 100644 --- a/lib/kernels/src/hip/ops/softmax_kernels.cpp +++ b/lib/kernels/src/hip/ops/softmax_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/softmax_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/split_kernels.cpp b/lib/kernels/src/hip/ops/split_kernels.cpp index 5599ae6d6f..3034b633a6 100644 --- a/lib/kernels/src/hip/ops/split_kernels.cpp +++ b/lib/kernels/src/hip/ops/split_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/split_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/topk_kernels.cpp b/lib/kernels/src/hip/ops/topk_kernels.cpp index f085c5831f..777d9edffa 100644 --- a/lib/kernels/src/hip/ops/topk_kernels.cpp +++ b/lib/kernels/src/hip/ops/topk_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/topk_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/transpose_kernels.cpp b/lib/kernels/src/hip/ops/transpose_kernels.cpp index ef9dd58c63..c5122f34bf 100644 --- a/lib/kernels/src/hip/ops/transpose_kernels.cpp +++ b/lib/kernels/src/hip/ops/transpose_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/transpose_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/accessor.h" #include "utils/exception.h" #include diff --git a/lib/kernels/src/device.cc b/lib/kernels/src/internal/device.cc similarity index 97% rename from lib/kernels/src/device.cc rename to lib/kernels/src/internal/device.cc index f46099c79a..eb3d229c2a 100644 --- a/lib/kernels/src/device.cc +++ b/lib/kernels/src/internal/device.cc @@ -1,4 +1,4 @@ -#include "device.h" +#include "internal/device.h" namespace FlexFlow { diff --git 
a/lib/kernels/src/device.h b/lib/kernels/src/internal/device.h similarity index 98% rename from lib/kernels/src/device.h rename to lib/kernels/src/internal/device.h index ceff2f92ff..226c7ad174 100644 --- a/lib/kernels/src/device.h +++ b/lib/kernels/src/internal/device.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_KERNELS_SRC_DEVICE_H -#define _FLEXFLOW_KERNELS_SRC_DEVICE_H +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_INTERNAL_DEVICE_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_INTERNAL_DEVICE_H #include "kernels/array_shape.h" #include "kernels/device.h" diff --git a/lib/kernels/src/kernels/accessor.cc b/lib/kernels/src/kernels/accessor.cc new file mode 100644 index 0000000000..b5042f77a0 --- /dev/null +++ b/lib/kernels/src/kernels/accessor.cc @@ -0,0 +1,249 @@ +#include "kernels/accessor.h" +#include "kernels/allocation.h" +#include "kernels/datatype_dispatch.h" +#include "utils/containers/reversed.h" +#include "utils/containers/vector_of.h" +#include "utils/nonnegative_int/nonnegative_range.h" +#include + +namespace FlexFlow { + +nonnegative_int + calculate_accessor_offset(LegionOrdered const &indices, + ArrayShape const &shape) { + ASSERT(indices.size() == shape.num_dims(), + "Number of indices does not match the number of dimensions"); + + nonnegative_int offset = 0_n; + nonnegative_int multiplier = 1_n; + + for (legion_dim_t dim : reversed(vector_of(key_range(shape.dims)))) { + ASSERT(indices.at(dim) < shape.at(legion_dim_t{dim}), + "Out of bounds access", + dim); + + offset += indices.at(dim) * multiplier; + multiplier *= shape.at(legion_dim_t{dim}); + } + + return offset; +} + +void copy_accessor_data_to_l_from_r( + GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorR const &src_accessor) { + size_t num_bytes = + dst_accessor.shape.get_volume().unwrap_nonnegative() * + size_of_datatype(dst_accessor.data_type).unwrap_nonnegative(); + + DeviceType dst_device_type = dst_accessor.device_type; + DeviceType src_device_type = src_accessor.device_type; + + if 
(src_device_type == DeviceType::CPU && + dst_device_type == DeviceType::CPU) { + memcpy(dst_accessor.ptr, src_accessor.ptr, num_bytes); + } else if (src_device_type == DeviceType::CPU && + dst_device_type == DeviceType::GPU) { + checkCUDA(cudaMemcpy( + dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyHostToDevice)); + } else if (src_device_type == DeviceType::GPU && + dst_device_type == DeviceType::CPU) { + checkCUDA(cudaMemcpy( + dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyDeviceToHost)); + } else { + assert(src_device_type == DeviceType::GPU); + assert(dst_device_type == DeviceType::GPU); + checkCUDA(cudaMemcpy(dst_accessor.ptr, + src_accessor.ptr, + num_bytes, + cudaMemcpyDeviceToDevice)); + } +} + +GenericTensorAccessorW::operator GenericTensorAccessorR() const { + return read_only_accessor_from_write_accessor(*this); +} + +GenericTensorAccessorW::GenericTensorAccessorW( + DataType data_type, + ArrayShape const &shape, + void *ptr, + DeviceType device_type = DeviceType::GPU) + : data_type(data_type), shape(shape), ptr(ptr), device_type(device_type) {} + +std::tuple + GenericTensorAccessorW::tie() const { + return std::tie(this->data_type, this->shape, this->ptr, this->device_type); +} + +bool GenericTensorAccessorW::operator==( + GenericTensorAccessorW const &other) const { + return this->tie() == other.tie(); +} + +bool GenericTensorAccessorW::operator!=( + GenericTensorAccessorW const &other) const { + return this->tie() != other.tie(); +} + +int32_t *GenericTensorAccessorW::get_int32_ptr() const { + return this->get(); +} + +int64_t *GenericTensorAccessorW::get_int64_ptr() const { + return this->get(); +} + +float *GenericTensorAccessorW::get_float_ptr() const { + return this->get(); +} + +double *GenericTensorAccessorW::get_double_ptr() const { + return this->get(); +} + +half *GenericTensorAccessorW::get_half_ptr() const { + return this->get(); +} + +std::string format_as(GenericTensorAccessorW const &a) { + return fmt::format("", + 
a.data_type, + a.shape, + a.ptr); +} + +std::ostream &operator<<(std::ostream &s, GenericTensorAccessorW const &a) { + return (s << fmt::to_string(a)); +} + +GenericTensorAccessorR::GenericTensorAccessorR( + DataType data_type, + ArrayShape const &shape, + void const *ptr, + DeviceType device_type = DeviceType::GPU) + : data_type(data_type), shape(shape), ptr(ptr), device_type(device_type) {} + +std::tuple + GenericTensorAccessorR::tie() const { + return std::tie(this->data_type, this->shape, this->ptr, this->device_type); +} + +bool GenericTensorAccessorR::operator==( + GenericTensorAccessorR const &other) const { + return this->tie() == other.tie(); +} + +bool GenericTensorAccessorR::operator!=( + GenericTensorAccessorR const &other) const { + return this->tie() != other.tie(); +} + +int32_t const *GenericTensorAccessorR::get_int32_ptr() const { + return this->get(); +} + +int64_t const *GenericTensorAccessorR::get_int64_ptr() const { + return this->get(); +} + +float const *GenericTensorAccessorR::get_float_ptr() const { + return this->get(); +} + +double const *GenericTensorAccessorR::get_double_ptr() const { + return this->get(); +} + +half const *GenericTensorAccessorR::get_half_ptr() const { + return get(); +} + +std::string format_as(GenericTensorAccessorR const &a) { + return fmt::format("", + a.data_type, + a.shape, + a.ptr); +} + +std::ostream &operator<<(std::ostream &s, GenericTensorAccessorR const &a) { + return (s << fmt::to_string(a)); +} + +int32_t const *get_int32_ptr(GenericTensorAccessorR const &a) { + return get(a); +} + +int64_t const *get_int64_ptr(GenericTensorAccessorR const &a) { + return get(a); +} + +float const *get_float_ptr(GenericTensorAccessorR const &a) { + return get(a); +} + +double const *get_double_ptr(GenericTensorAccessorR const &a) { + return get(a); +} + +half const *get_half_ptr(GenericTensorAccessorR const &a) { + return get(a); +} + +std::vector + get_int32_ptrs(std::vector const &a) { + return get(a); +} + +std::vector 
+ get_int64_ptrs(std::vector const &a) { + return get(a); +} + +std::vector + get_float_ptrs(std::vector const &a) { + return get(a); +} + +std::vector + get_double_ptrs(std::vector const &a) { + return get(a); +} + +std::vector + get_half_ptrs(std::vector const &a) { + return get(a); +} + +GenericTensorAccessorR read_only_accessor_from_write_accessor( + GenericTensorAccessorW const &writable) { + return GenericTensorAccessorR{writable.data_type, + writable.shape, + req(writable.ptr), + writable.device_type}; +} + +bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, + GenericTensorAccessorR const &acc2) { + return acc1.shape == acc2.shape && acc1.data_type == acc2.data_type; +} + +bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, + ArrayShape const &expected_shape, + DataType const &expected_dtype) { + return accessor.shape == expected_shape && + accessor.data_type == expected_dtype; +} + +std::pair + get_shape_and_datatype(GenericTensorAccessorR const &accessor) { + return std::make_pair(accessor.shape, accessor.data_type); +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/allocation.cc b/lib/kernels/src/kernels/allocation.cc new file mode 100644 index 0000000000..b9f253bcff --- /dev/null +++ b/lib/kernels/src/kernels/allocation.cc @@ -0,0 +1,38 @@ +#include "kernels/allocation.h" +#include "op-attrs/tensor_shape.h" + +namespace FlexFlow { + +void *Allocator::allocate(size_t mem_size) { + return this->i_allocator->allocate(mem_size); +} + +void Allocator::deallocate(void *ptr) { + this->i_allocator->deallocate(ptr); +} + +DeviceType Allocator::get_allocation_device_type() const { + return this->i_allocator->get_allocation_device_type(); +} + +GenericTensorAccessorW + Allocator::allocate_tensor(TensorShape const &tensor_shape) { + void *ptr = + this->allocate(get_size_in_bytes(tensor_shape).unwrap_nonnegative()); + return GenericTensorAccessorW{ + tensor_shape.data_type, + 
array_shape_from_tensor_shape(tensor_shape), + ptr, + this->get_allocation_device_type(), + }; +} + +void Allocator::deallocate_tensor(GenericTensorAccessorW const &t) { + this->deallocate(t.ptr); +} + +void Allocator::deallocate_tensor(GenericTensorAccessorR const &t) { + this->deallocate(const_cast(t.ptr)); +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/kernels/array_shape.cc similarity index 51% rename from lib/kernels/src/array_shape.cc rename to lib/kernels/src/kernels/array_shape.cc index 243185ada4..34a53c1bb3 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/kernels/array_shape.cc @@ -1,23 +1,20 @@ #include "kernels/array_shape.h" +#include "kernels/legion_ordered/slice.h" +#include "op-attrs/ff_ordered/ff_ordered_of.h" +#include "op-attrs/ff_ordered/slice.h" +#include "utils/containers/cartesian_product.h" #include "utils/containers/product.h" #include "utils/containers/reversed.h" +#include "utils/containers/transform.h" +#include "utils/containers/unordered_set_of.h" #include "utils/containers/vector_of.h" +#include "utils/hash/tuple.h" +#include "utils/hash/vector.h" #include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { -static LegionOrdered - legion_dims_from_ff_dims(FFOrdered const &ff_ordered) { - return LegionOrdered{reversed(vector_of(ff_ordered))}; -} - -ArrayShape::ArrayShape(nonnegative_int *_dims, nonnegative_int num_dims) - : dims(_dims, _dims + num_dims.unwrap_nonnegative()) {} - -ArrayShape::ArrayShape(TensorShape const &shape) - : dims(legion_dims_from_ff_dims(shape.dims.ff_ordered)) {} - -ArrayShape::ArrayShape(std::vector const &input_dims) +ArrayShape::ArrayShape(LegionOrdered const &input_dims) : dims(input_dims) {} nonnegative_int ArrayShape::get_volume() const { @@ -59,10 +56,19 @@ bool ArrayShape::operator!=(ArrayShape const &other) const { return this->tie() != other.tie(); } -ArrayShape ArrayShape::sub_shape( - std::optional> start, - std::optional> end) 
const { - NOT_IMPLEMENTED(); +ArrayShape + ArrayShape::sub_shape(ff_dim_t const &start, + std::optional const &maybe_end) const { + FFOrdered ff_ordered_dims = + ff_ordered_from_legion_ordered(this->dims); + FFOrdered sliced = slice(ff_ordered_dims, start, maybe_end); + return ArrayShape{legion_ordered_from_ff_ordered(sliced)}; +} + +ArrayShape + ArrayShape::sub_shape(legion_dim_t const &start, + std::optional const &maybe_end) const { + return ArrayShape{slice(this->dims, start, maybe_end)}; } std::optional ArrayShape::at_maybe(legion_dim_t index) const { @@ -81,15 +87,6 @@ std::tuple const &> ArrayShape::tie() const { return std::tie(this->dims); } -nonnegative_int get_volume(ArrayShape const &shape) { - return shape.get_volume(); -} - -TensorShape get_tensor_shape(ArrayShape const &shape, DataType dtype) { - return TensorShape{TensorDims{ff_ordered_from_legion_ordered(shape.dims)}, - dtype}; -} - std::string format_as(ArrayShape const &x) { std::ostringstream oss; oss << " get_array_coord_set(ArrayShape const &shape) { + std::vector> per_dim_ranges = + transform(vector_of(ff_ordered_from_legion_ordered(shape.dims)), + [](nonnegative_int dim_size) -> std::vector { + return nonnegative_range(dim_size); + }); + + std::unordered_set> raw_points = + unordered_set_of(cartesian_product(per_dim_ranges)); + + return transform(raw_points, + [](std::vector const &raw_point) { + return ArrayCoord{ff_ordered_of(raw_point)}; + }); +} + } // namespace FlexFlow + +namespace std { + +using namespace FlexFlow; + +size_t hash::operator()(ArrayShape const &s) const { + return get_std_hash(s.tie()); +} + +} // namespace std diff --git a/lib/kernels/src/kernels/copy_tensor_accessor.cc b/lib/kernels/src/kernels/copy_tensor_accessor.cc new file mode 100644 index 0000000000..d8619d8ce6 --- /dev/null +++ b/lib/kernels/src/kernels/copy_tensor_accessor.cc @@ -0,0 +1,66 @@ +#include "kernels/copy_tensor_accessor.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow { + +template 
+struct CopyTensorAccessorW { + GenericTensorAccessorW operator()(GenericTensorAccessorW const &src_accessor, + Allocator &allocator) { + TensorShape shape = + get_tensor_shape(src_accessor.shape, src_accessor.data_type); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + + return dst_accessor; + } +}; + +GenericTensorAccessorW + copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, + Allocator &allocator) { + return DataTypeDispatch1{}( + src_accessor.data_type, src_accessor, allocator); +} + +template +struct CopyTensorAccessorR { + GenericTensorAccessorR operator()(GenericTensorAccessorR const &src_accessor, + Allocator &allocator) { + TensorShape shape = + get_tensor_shape(src_accessor.shape, src_accessor.data_type); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + + return read_only_accessor_from_write_accessor(dst_accessor); + } +}; + +GenericTensorAccessorR + copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, + Allocator &allocator) { + return DataTypeDispatch1{}( + src_accessor.data_type, src_accessor, allocator); +} + +GenericTensorAccessorR copy_tensor_accessor_r_to_cpu_if_necessary( + GenericTensorAccessorR const &accessor, Allocator &cpu_allocator) { + if (accessor.device_type == DeviceType::GPU) { + return copy_tensor_accessor_r(accessor, cpu_allocator); + } else { + return accessor; + } +} + +GenericTensorAccessorW copy_tensor_accessor_w_to_cpu_if_necessary( + GenericTensorAccessorW const &accessor, Allocator &cpu_allocator) { + if (accessor.device_type == DeviceType::GPU) { + return copy_tensor_accessor_w(accessor, cpu_allocator); + } else { + return accessor; + } +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/format_accessor_contents.cc b/lib/kernels/src/kernels/format_accessor_contents.cc new file mode 100644 index 
0000000000..1b8ab35d89 --- /dev/null +++ b/lib/kernels/src/kernels/format_accessor_contents.cc @@ -0,0 +1,184 @@ +#include "kernels/format_accessor_contents.h" +#include "kernels/copy_tensor_accessor.h" +#include "kernels/datatype_dispatch.h" +#include "kernels/local_cpu_allocator.h" +#include "utils/indent.h" +#include + +namespace FlexFlow { + +template +struct Print1DCPUAccessorR { + void operator()(GenericTensorAccessorR const &accessor, + std::ostream &stream) { + ASSERT(accessor.device_type == DeviceType::CPU); + nonnegative_int dims = accessor.shape.num_dims(); + ASSERT(dims == 1_n); + + nonnegative_int ncols = accessor.shape.at(ff_dim_t{0_n}); + + stream << "[" + << join_strings(nonnegative_range(ncols), + " ", + [&](nonnegative_int col_idx) -> std::string { + return fmt::to_string( + accessor.at
(FFOrdered{col_idx})); + }) + << "]"; + } +}; + +static std::string + format_1d_accessor_r_contents(GenericTensorAccessorR const &accessor) { + ASSERT(accessor.device_type == DeviceType::CPU); + ASSERT(accessor.shape.num_dims() == 1_n); + + std::ostringstream oss; + DataTypeDispatch1{}(accessor.data_type, accessor, oss); + return oss.str(); +} + +template +struct Print2DCPUAccessorR { + void operator()(GenericTensorAccessorR const &accessor, + std::ostream &stream) { + ASSERT(accessor.device_type == DeviceType::CPU); + nonnegative_int dims = accessor.shape.num_dims(); + ASSERT(dims == 2_n); + nonnegative_int dim0_size = accessor.shape.at(ff_dim_t{0_n}); + nonnegative_int dim1_size = accessor.shape.at(ff_dim_t{1_n}); + + auto render_1d = [&](nonnegative_int dim0_idx) -> std::string { + return "[" + + join_strings(nonnegative_range(dim1_size), + " ", + [&](nonnegative_int dim1_idx) -> std::string { + return fmt::to_string( + accessor.at
(FFOrdered{dim0_idx, dim1_idx})); + }) + + "]"; + }; + + stream << "[\n" + << indent( + join_strings(nonnegative_range(dim0_size), "\n", render_1d)) + << "\n]"; + } +}; + +static std::string + format_2d_accessor_r_contents(GenericTensorAccessorR const &accessor) { + ASSERT(accessor.device_type == DeviceType::CPU); + ASSERT(accessor.shape.num_dims() == 2_n); + + std::ostringstream oss; + DataTypeDispatch1{}(accessor.data_type, accessor, oss); + return oss.str(); +} + +template +struct Print3DCPUAccessorR { + void operator()(GenericTensorAccessorR const &accessor, + std::ostream &stream) { + ASSERT(accessor.device_type == DeviceType::CPU); + nonnegative_int dims = accessor.shape.num_dims(); + ASSERT(dims == 3_n); + + nonnegative_int dim0_size = accessor.shape.at(ff_dim_t{0_n}); + nonnegative_int dim1_size = accessor.shape.at(ff_dim_t{1_n}); + nonnegative_int dim2_size = accessor.shape.at(ff_dim_t{2_n}); + + auto render_1d = [&](nonnegative_int dim0_idx, + nonnegative_int dim1_idx) -> std::string { + return "[" + + join_strings(nonnegative_range(dim2_size), + " ", + [&](nonnegative_int dim2_idx) -> std::string { + return fmt::to_string(accessor.at
( + FFOrdered{dim0_idx, dim1_idx, dim2_idx})); + }) + + "]"; + }; + + auto render_2d = [&](nonnegative_int dim0_idx) -> std::string { + return "[\n" + + indent(join_strings(nonnegative_range(dim1_size), + "\n", + [&](nonnegative_int dim1_idx) -> std::string { + return render_1d(dim0_idx, dim1_idx); + })) + + "\n]"; + }; + + stream << "[\n" + << indent( + join_strings(nonnegative_range(dim0_size), "\n", render_2d)) + << "\n]"; + } +}; + +static std::string + format_3d_accessor_r_contents(GenericTensorAccessorR const &accessor) { + ASSERT(accessor.device_type == DeviceType::CPU); + ASSERT(accessor.shape.num_dims() == 3_n); + + std::ostringstream oss; + DataTypeDispatch1{}(accessor.data_type, accessor, oss); + return oss.str(); +} + +static std::string + format_1d_accessor_w_contents(GenericTensorAccessorW const &accessor) { + return format_1d_accessor_r_contents( + read_only_accessor_from_write_accessor(accessor)); +} + +static std::string + format_2d_accessor_w_contents(GenericTensorAccessorW const &accessor) { + return format_2d_accessor_r_contents( + read_only_accessor_from_write_accessor(accessor)); +} + +static std::string + format_3d_accessor_w_contents(GenericTensorAccessorW const &accessor) { + return format_3d_accessor_r_contents( + read_only_accessor_from_write_accessor(accessor)); +} + +std::string format_accessor_r_contents(GenericTensorAccessorR const &accessor) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor = + copy_tensor_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator); + + int num_dims = accessor.shape.num_dims().unwrap_nonnegative(); + switch (num_dims) { + case 1: + return format_1d_accessor_r_contents(accessor); + case 2: + return format_2d_accessor_r_contents(accessor); + case 3: + return format_3d_accessor_r_contents(accessor); + default: + PANIC("Unhandled accessor dimensionality", num_dims); + } +} + +std::string format_accessor_w_contents(GenericTensorAccessorW const &accessor) 
{ + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW cpu_accessor = + copy_tensor_accessor_w_to_cpu_if_necessary(accessor, cpu_allocator); + + int num_dims = cpu_accessor.shape.num_dims().unwrap_nonnegative(); + switch (num_dims) { + case 1: + return format_1d_accessor_w_contents(cpu_accessor); + case 2: + return format_2d_accessor_w_contents(cpu_accessor); + case 3: + return format_3d_accessor_w_contents(cpu_accessor); + default: + PANIC("Unhandled accessor dimensionality", num_dims); + } +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/legion_dim.cc b/lib/kernels/src/kernels/legion_dim.cc similarity index 78% rename from lib/kernels/src/legion_dim.cc rename to lib/kernels/src/kernels/legion_dim.cc index bbb15c5636..f3482b1d9b 100644 --- a/lib/kernels/src/legion_dim.cc +++ b/lib/kernels/src/kernels/legion_dim.cc @@ -1,7 +1,11 @@ #include "kernels/legion_dim.h" +#include "utils/archetypes/value_type.h" namespace FlexFlow { +using T = value_type<0>; +template std::set key_range(LegionOrdered const &); + legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value) { return legion_dim_t{ nonnegative_int{legion_dim.value.unwrap_nonnegative() + value}}; @@ -11,6 +15,7 @@ legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim, nonnegative_int num_dimensions) { return legion_dim_t{nonnegative_int{num_dimensions.unwrap_nonnegative() - ff_dim.value.unwrap_nonnegative() - 1}}; + ; } } // namespace FlexFlow diff --git a/lib/kernels/src/kernels/legion_ordered/legion_ordered.cc b/lib/kernels/src/kernels/legion_ordered/legion_ordered.cc new file mode 100644 index 0000000000..8af44173b0 --- /dev/null +++ b/lib/kernels/src/kernels/legion_ordered/legion_ordered.cc @@ -0,0 +1,10 @@ +#include "kernels/legion_ordered/legion_ordered.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template struct LegionOrdered; + +} // namespace FlexFlow diff --git 
a/lib/kernels/src/kernels/legion_ordered/slice.cc b/lib/kernels/src/kernels/legion_ordered/slice.cc new file mode 100644 index 0000000000..69fcf570aa --- /dev/null +++ b/lib/kernels/src/kernels/legion_ordered/slice.cc @@ -0,0 +1,12 @@ +#include "kernels/legion_ordered/slice.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template LegionOrdered slice(LegionOrdered const &, + legion_dim_t const &, + std::optional const &); + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/legion_ordered/transform.cc b/lib/kernels/src/kernels/legion_ordered/transform.cc new file mode 100644 index 0000000000..d9fb38198e --- /dev/null +++ b/lib/kernels/src/kernels/legion_ordered/transform.cc @@ -0,0 +1,12 @@ +#include "kernels/legion_ordered/transform.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; +using Out = value_type<1>; +using F = std::function; + +template LegionOrdered transform(LegionOrdered const &, F &&); + +} // namespace FlexFlow diff --git a/lib/local-execution/src/local_cpu_allocator.cc b/lib/kernels/src/kernels/local_cpu_allocator.cc similarity index 52% rename from lib/local-execution/src/local_cpu_allocator.cc rename to lib/kernels/src/kernels/local_cpu_allocator.cc index 4ca5f987a8..738d1abf27 100644 --- a/lib/local-execution/src/local_cpu_allocator.cc +++ b/lib/kernels/src/kernels/local_cpu_allocator.cc @@ -1,20 +1,27 @@ -#include "local-execution/local_cpu_allocator.h" +#include "kernels/local_cpu_allocator.h" +#include "kernels/device.h" #include "utils/containers/contains_key.h" +#include +#include namespace FlexFlow { void *LocalCPUAllocator::allocate(size_t requested_memory_size) { void *ptr = malloc(requested_memory_size); + ASSERT(ptr != nullptr); this->ptrs.insert({ptr, std::unique_ptr(ptr, free)}); return ptr; } void LocalCPUAllocator::deallocate(void *ptr) { - if (contains_key(this->ptrs, ptr)) { - this->ptrs.erase(ptr); - } else { - throw 
std::runtime_error( - "Deallocating a pointer that was not allocated by this Allocator"); - } + ASSERT(contains_key(this->ptrs, ptr), + "Deallocating a pointer that was not allocated by this Allocator"); + + free(ptr); + this->ptrs.erase(ptr); +} + +DeviceType LocalCPUAllocator::get_allocation_device_type() const { + return DeviceType::CPU; } Allocator create_local_cpu_memory_allocator() { diff --git a/lib/kernels/src/local_cuda_allocator.cc b/lib/kernels/src/kernels/local_cuda_allocator.cc similarity index 59% rename from lib/kernels/src/local_cuda_allocator.cc rename to lib/kernels/src/kernels/local_cuda_allocator.cc index cdcfb017a0..1b081517bf 100644 --- a/lib/kernels/src/local_cuda_allocator.cc +++ b/lib/kernels/src/kernels/local_cuda_allocator.cc @@ -1,6 +1,7 @@ #include "kernels/local_cuda_allocator.h" #include "kernels/device.h" #include "utils/containers/contains.h" +#include namespace FlexFlow { void *LocalCudaAllocator::allocate(size_t requested_memory_size) { @@ -11,13 +12,15 @@ void *LocalCudaAllocator::allocate(size_t requested_memory_size) { } void LocalCudaAllocator::deallocate(void *ptr) { - if (contains(this->ptrs, ptr)) { - checkCUDA(cudaFree(ptr)); - this->ptrs.erase(ptr); - } else { - throw std::runtime_error( - "Deallocating a pointer that was not allocated by this Allocator"); - } + ASSERT(contains(this->ptrs, ptr), + "Deallocating a pointer that was not allocated by this Allocator"); + + checkCUDA(cudaFree(ptr)); + this->ptrs.erase(ptr); +} + +DeviceType LocalCudaAllocator::get_allocation_device_type() const { + return DeviceType::GPU; } LocalCudaAllocator::~LocalCudaAllocator() { @@ -27,7 +30,8 @@ LocalCudaAllocator::~LocalCudaAllocator() { } Allocator create_local_cuda_memory_allocator() { - return Allocator::create(); + Allocator allocator = Allocator::create(); + return allocator; } } // namespace FlexFlow diff --git a/lib/kernels/src/kernels/reverse_kernels_params.cc b/lib/kernels/src/kernels/reverse_kernels_params.cc new file mode 
100644 index 0000000000..c647181872 --- /dev/null +++ b/lib/kernels/src/kernels/reverse_kernels_params.cc @@ -0,0 +1,30 @@ +#include "kernels/reverse_kernels_params.h" + +namespace FlexFlow { + +ReverseKernelsParams + compute_reverse_kernels_params(ArrayShape const &output_shape, + ReverseAttrs const &attrs) { + auto axis = attrs.axis; + nonnegative_int in_blk_size = 1_n; + nonnegative_int reverse_dim_size = 1_n; + nonnegative_int num_out_blks = 1_n; + for (nonnegative_int i : nonnegative_range(output_shape.get_dim())) { + if (i < axis.value) { + in_blk_size *= output_shape.at(ff_dim_t{i}); + } else if (i == axis.value) { + reverse_dim_size = output_shape.at(ff_dim_t{i}); + } else { + num_out_blks *= output_shape.at(ff_dim_t{i}); + } + } + + return ReverseKernelsParams{ + num_out_blks, + reverse_dim_size, + in_blk_size, + output_shape.get_volume(), + }; +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/managed_ff_stream.cc b/lib/kernels/src/managed_ff_stream.cc index 7385b6cc3e..f0348aa91c 100644 --- a/lib/kernels/src/managed_ff_stream.cc +++ b/lib/kernels/src/managed_ff_stream.cc @@ -1,28 +1,36 @@ #include "kernels/managed_ff_stream.h" +#include "utils/exception.h" namespace FlexFlow { ManagedFFStream::ManagedFFStream() : stream(new ffStream_t) { - checkCUDA(cudaStreamCreate(stream)); + checkCUDA(cudaStreamCreate(this->stream)); } ManagedFFStream::ManagedFFStream(ManagedFFStream &&other) noexcept : stream(std::exchange(other.stream, nullptr)) {} ManagedFFStream &ManagedFFStream::operator=(ManagedFFStream &&other) noexcept { - std::swap(this->stream, other.stream); + if (this != &other) { + this->cleanup(); + this->stream = std::exchange(other.stream, nullptr); + } return *this; } ManagedFFStream::~ManagedFFStream() { - if (stream != nullptr) { - checkCUDA(cudaStreamDestroy(*stream)); - delete stream; + this->cleanup(); +} + +void ManagedFFStream::cleanup() { + if (this->stream != nullptr) { + checkCUDA(cudaStreamDestroy(*this->stream)); + delete 
this->stream; } } ffStream_t const &ManagedFFStream::raw_stream() const { - return *stream; + return *this->stream; } } // namespace FlexFlow diff --git a/lib/kernels/src/managed_per_device_ff_handle.cc b/lib/kernels/src/managed_per_device_ff_handle.cc index c050e887b6..ea26d2350c 100644 --- a/lib/kernels/src/managed_per_device_ff_handle.cc +++ b/lib/kernels/src/managed_per_device_ff_handle.cc @@ -1,16 +1,17 @@ #include "kernels/managed_per_device_ff_handle.h" -#include "device.h" +#include "internal/device.h" namespace FlexFlow { -ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle() { - handle = new PerDeviceFFHandle; - handle->workSpaceSize = 1024 * 1024; - handle->allowTensorOpMathConversion = true; - - checkCUDNN(cudnnCreate(&handle->dnn)); - checkCUBLAS(cublasCreate(&handle->blas)); - checkCUDA(cudaMalloc(&handle->workSpace, handle->workSpaceSize)); +ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( + size_t workSpaceSize, bool allowTensorOpMathConversion) { + this->handle = new PerDeviceFFHandle{}; + this->handle->workSpaceSize = workSpaceSize; + this->handle->allowTensorOpMathConversion = allowTensorOpMathConversion; + + checkCUDNN(cudnnCreate(&this->handle->dnn)); + checkCUBLAS(cublasCreate(&this->handle->blas)); + checkCUDA(cudaMalloc(&this->handle->workSpace, this->handle->workSpaceSize)); } ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( @@ -19,16 +20,23 @@ ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( ManagedPerDeviceFFHandle &ManagedPerDeviceFFHandle::operator=( ManagedPerDeviceFFHandle &&other) noexcept { - std::swap(this->handle, other.handle); + if (this != &other) { + this->cleanup(); + this->handle = std::exchange(other.handle, nullptr); + } return *this; } ManagedPerDeviceFFHandle::~ManagedPerDeviceFFHandle() { - if (handle != nullptr) { - checkCUDNN(cudnnDestroy(handle->dnn)); - checkCUBLAS(cublasDestroy(handle->blas)); - checkCUDA(cudaFree(handle->workSpace)); - delete handle; + this->cleanup(); +} + +void 
ManagedPerDeviceFFHandle::cleanup() { + if (this->handle != nullptr) { + checkCUDNN(cudnnDestroy(this->handle->dnn)); + checkCUBLAS(cublasDestroy(this->handle->blas)); + checkCUDA(cudaFree(this->handle->workSpace)); + delete this->handle; } } diff --git a/lib/kernels/test/CMakeLists.txt b/lib/kernels/test/CMakeLists.txt index 00da2d0d70..066cb96753 100644 --- a/lib/kernels/test/CMakeLists.txt +++ b/lib/kernels/test/CMakeLists.txt @@ -14,6 +14,7 @@ ff_add_test_executable( cudnn cudart cublas + pcg ) set(FF_TEST_EXEC_NAME "kernels-tests") diff --git a/lib/kernels/test/src/cpu/ops/replicate_kernels.cc b/lib/kernels/test/src/cpu/ops/replicate_kernels.cc new file mode 100644 index 0000000000..8630dcd8cd --- /dev/null +++ b/lib/kernels/test/src/cpu/ops/replicate_kernels.cc @@ -0,0 +1,57 @@ +#include "internal/test_utils.h" +#include "kernels/format_accessor_contents.h" +#include "kernels/replicate_kernels_cpu.h" +#include "test/utils/doctest/check_kv.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Replicate::cpu_forward_kernel") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR input = + create_1d_accessor_r_with_contents({1, 3, 2}, cpu_allocator); + + TensorShape result_shape = TensorShape{ + TensorDims{FFOrdered{3_n}}, + DataType::FLOAT, + }; + GenericTensorAccessorW result = + create_zero_filled_accessor_w(result_shape, cpu_allocator); + + GenericTensorAccessorR correct = input; + + Kernels::Replicate::cpu_forward_kernel(input, result); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + + TEST_CASE("Replicate::cpu_backward_kernel") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR output = create_2d_accessor_r_with_contents( + { + {1, 2, 3}, + {4, 3, 3}, + {1, 3, 5}, + }, + cpu_allocator); + + GenericTensorAccessorR correct = create_1d_accessor_r_with_contents( + {1 + 2 + 3, 4 + 3 
+ 3, 1 + 3 + 5}, cpu_allocator); + + TensorShape result_shape = TensorShape{ + TensorDims{FFOrdered{3_n}}, + DataType::FLOAT, + }; + GenericTensorAccessorW result = + create_zero_filled_accessor_w(result_shape, cpu_allocator); + Kernels::Replicate::cpu_backward_kernel(output, result, 3); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("result", format_accessor_w_contents(result))); + } +} diff --git a/lib/kernels/test/src/cpu/ops/reverse_kernels.cc b/lib/kernels/test/src/cpu/ops/reverse_kernels.cc new file mode 100644 index 0000000000..db0016cb0b --- /dev/null +++ b/lib/kernels/test/src/cpu/ops/reverse_kernels.cc @@ -0,0 +1,206 @@ +#include "internal/test_utils.h" +#include "kernels/format_accessor_contents.h" +#include "kernels/reverse_kernels_cpu.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Reverse::cpu_forward_kernel") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR input = create_3d_accessor_r_with_contents( + { + { + {1, 3, 2}, + {4, 2, 1}, + }, + { + {3, 3, 6}, + {2, 1, 5}, + }, + }, + cpu_allocator); + + GenericTensorAccessorW result = create_zero_filled_accessor_w( + TensorShape{ + TensorDims{FFOrdered{2_n, 2_n, 3_n}}, + DataType::FLOAT, + }, + cpu_allocator); + + SUBCASE("axis = ff_dim_t{0}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{0_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {3, 3, 6}, + {2, 1, 5}, + }, + { + {1, 3, 2}, + {4, 2, 1}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + + SUBCASE("axis = ff_dim_t{1}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{1_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {4, 2, 1}, + {1, 3, 2}, + }, + { + {2, 1, 5}, + {3, 3, 6}, + }, + 
}, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + + SUBCASE("axis = ff_dim_t{2}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{2_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {2, 3, 1}, + {1, 2, 4}, + }, + { + {6, 3, 3}, + {5, 1, 2}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + } + + TEST_CASE("Reverse::cpu_backward_kernel") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR input = create_3d_accessor_r_with_contents( + { + { + {1, 3, 2}, + {4, 2, 1}, + }, + { + {3, 3, 6}, + {2, 1, 5}, + }, + }, + cpu_allocator); + + GenericTensorAccessorW result = create_zero_filled_accessor_w( + TensorShape{ + TensorDims{FFOrdered{2_n, 2_n, 3_n}}, + DataType::FLOAT, + }, + cpu_allocator); + + SUBCASE("axis = ff_dim_t{0}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{0_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {3, 3, 6}, + {2, 1, 5}, + }, + { + {1, 3, 2}, + {4, 2, 1}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + + SUBCASE("axis = ff_dim_t{1}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{1_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {4, 2, 1}, + {1, 3, 2}, + }, + { + {2, 1, 5}, + {3, 3, 6}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + + 
SUBCASE("axis = ff_dim_t{2}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{2_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {2, 3, 1}, + {1, 2, 4}, + }, + { + {6, 3, 3}, + {5, 1, 2}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + } +} diff --git a/lib/kernels/test/src/internal/test_utils.cc b/lib/kernels/test/src/internal/test_utils.cc new file mode 100644 index 0000000000..0f34a6aa06 --- /dev/null +++ b/lib/kernels/test/src/internal/test_utils.cc @@ -0,0 +1,392 @@ +#include "internal/test_utils.h" +#include "op-attrs/tensor_shape.h" +#include "utils/containers/require_all_same1.h" +#include "utils/join_strings.h" +#include + +namespace FlexFlow { + +GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, + Allocator &allocator) { + GenericTensorAccessorW result_accessor = allocator.allocate_tensor(shape); + fill_with_zeros(result_accessor); + return result_accessor; +} + +GenericTensorAccessorR create_zero_filled_accessor_r(TensorShape const &shape, + Allocator &allocator) { + GenericTensorAccessorW accessor = + create_zero_filled_accessor_w(shape, allocator); + return read_only_accessor_from_write_accessor(accessor); +} + +GenericTensorAccessorW + create_1d_accessor_w_with_contents(std::vector const &contents, + Allocator &allocator) { + nonnegative_int ncols = num_elements(contents); + ASSERT(ncols > 0); + + TensorShape shape = TensorShape{ + TensorDims{FFOrdered{ncols}}, + DataType::FLOAT, + }; + + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); + + for (nonnegative_int col_idx : nonnegative_range(ncols)) { + cpu_accessor.at(FFOrdered{col_idx}) = + contents.at(col_idx.unwrap_nonnegative()); + } + + GenericTensorAccessorW result = 
allocator.allocate_tensor(shape); + copy_accessor_data_to_l_from_r( + result, read_only_accessor_from_write_accessor(cpu_accessor)); + + return result; +} + +GenericTensorAccessorW create_2d_accessor_w_with_contents( + std::vector> const &contents, Allocator &allocator) { + nonnegative_int nrows = num_elements(contents); + ASSERT(nrows > 0); + + nonnegative_int ncols = throw_if_unexpected( + require_all_same1(transform(contents, [](std::vector const &row) { + return num_elements(row); + }))); + ASSERT(ncols > 0); + + TensorShape shape = TensorShape{ + TensorDims{FFOrdered{nrows, ncols}}, + DataType::FLOAT, + }; + + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); + + for (nonnegative_int row_idx : nonnegative_range(nrows)) { + for (nonnegative_int col_idx : nonnegative_range(ncols)) { + cpu_accessor.at(FFOrdered{row_idx, col_idx}) = + contents.at(row_idx.unwrap_nonnegative()) + .at(col_idx.unwrap_nonnegative()); + } + } + + GenericTensorAccessorW result = allocator.allocate_tensor(shape); + copy_accessor_data_to_l_from_r( + result, read_only_accessor_from_write_accessor(cpu_accessor)); + + return result; +} + +GenericTensorAccessorW create_3d_accessor_w_with_contents( + std::vector>> const &contents, + Allocator &allocator) { + nonnegative_int dim0_size = num_elements(contents); + ASSERT(dim0_size > 0); + + nonnegative_int dim1_size = throw_if_unexpected(require_all_same1( + transform(contents, [](std::vector> const &m) { + return num_elements(m); + }))); + ASSERT(dim1_size > 0); + + nonnegative_int dim2_size = throw_if_unexpected(require_all_same1( + transform(contents, [](std::vector> const &m) { + return throw_if_unexpected( + require_all_same1(transform(m, [](std::vector const &vec) { + return num_elements(vec); + }))); + }))); + ASSERT(dim2_size > 0); + + TensorShape shape = TensorShape{ + TensorDims{FFOrdered{dim0_size, dim1_size, dim2_size}}, + DataType::FLOAT, + }; + 
+ Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); + + for (nonnegative_int dim0_idx : nonnegative_range(dim0_size)) { + for (nonnegative_int dim1_idx : nonnegative_range(dim1_size)) { + for (nonnegative_int dim2_idx : nonnegative_range(dim2_size)) { + cpu_accessor.at( + FFOrdered{dim0_idx, dim1_idx, dim2_idx}) = + contents.at(dim0_idx.unwrap_nonnegative()) + .at(dim1_idx.unwrap_nonnegative()) + .at(dim2_idx.unwrap_nonnegative()); + } + } + } + + GenericTensorAccessorW result = allocator.allocate_tensor(shape); + copy_accessor_data_to_l_from_r( + result, read_only_accessor_from_write_accessor(cpu_accessor)); + + return result; +} + +GenericTensorAccessorW create_4d_accessor_w_with_contents( + std::vector>>> const &contents, + Allocator &allocator) { + nonnegative_int dim0_size = num_elements(contents); + ASSERT(dim0_size > 0); + + nonnegative_int dim1_size = throw_if_unexpected(require_all_same1(transform( + contents, [](std::vector>> const &t) { + return num_elements(t); + }))); + ASSERT(dim1_size > 0); + + nonnegative_int dim2_size = throw_if_unexpected(require_all_same1(transform( + contents, [](std::vector>> const &m) { + return throw_if_unexpected(require_all_same1( + transform(m, [](std::vector> const &vec) { + return num_elements(vec); + }))); + }))); + ASSERT(dim2_size > 0); + + nonnegative_int dim3_size = throw_if_unexpected(require_all_same1(transform( + contents, [](std::vector>> const &t) { + return throw_if_unexpected(require_all_same1( + transform(t, [](std::vector> const &mat) { + return throw_if_unexpected(require_all_same1( + transform(mat, [](std::vector const &vec) { + return num_elements(vec); + }))); + }))); + }))); + ASSERT(dim3_size > 0); + + TensorShape shape = TensorShape{ + TensorDims{FFOrdered{dim0_size, dim1_size, dim2_size, dim3_size}}, + DataType::FLOAT, + }; + + GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); + + for 
(nonnegative_int dim0_idx : nonnegative_range(dim0_size)) { + for (nonnegative_int dim1_idx : nonnegative_range(dim1_size)) { + for (nonnegative_int dim2_idx : nonnegative_range(dim2_size)) { + for (nonnegative_int dim3_idx : nonnegative_range(dim3_size)) { + accessor.at( + FFOrdered{dim0_idx, dim1_idx, dim2_idx, dim3_idx}) = + contents.at(dim0_idx.unwrap_nonnegative()) + .at(dim1_idx.unwrap_nonnegative()) + .at(dim2_idx.unwrap_nonnegative()) + .at(dim3_idx.unwrap_nonnegative()); + } + } + } + } + + return accessor; +} + +GenericTensorAccessorR + create_1d_accessor_r_with_contents(std::vector const &contents, + Allocator &allocator) { + return read_only_accessor_from_write_accessor( + create_1d_accessor_w_with_contents(contents, allocator)); +} + +GenericTensorAccessorR create_2d_accessor_r_with_contents( + std::vector> const &contents, Allocator &allocator) { + return read_only_accessor_from_write_accessor( + create_2d_accessor_w_with_contents(contents, allocator)); +} + +GenericTensorAccessorR create_3d_accessor_r_with_contents( + std::vector>> const &contents, + Allocator &allocator) { + return read_only_accessor_from_write_accessor( + create_3d_accessor_w_with_contents(contents, allocator)); +} + +GenericTensorAccessorR create_4d_accessor_r_with_contents( + std::vector>>> const &contents, + Allocator &allocator) { + return read_only_accessor_from_write_accessor( + create_4d_accessor_w_with_contents(contents, allocator)); +} + +template +struct CreateRandomFilledAccessorW { + GenericTensorAccessorW operator()(TensorShape const &shape, + Allocator &allocator) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW src_accessor = cpu_allocator.allocate_tensor(shape); + + using T = real_type_t
; + T *data_ptr = src_accessor.get
(); + + std::random_device rd; + std::mt19937 gen(rd()); + size_t num_elements = get_num_elements(shape).unwrap_nonnegative(); + if constexpr (std::is_same::value) { + std::bernoulli_distribution dist(0.5); + for (size_t i = 0; i < num_elements; i++) { + data_ptr[i] = dist(gen); + } + } else if constexpr (std::is_floating_point::value) { + std::uniform_real_distribution dist(-1.0, 1.0); + for (size_t i = 0; i < num_elements; i++) { + data_ptr[i] = dist(gen); + } + } else if constexpr (std::is_integral::value) { + std::uniform_int_distribution dist(0, 99); + for (size_t i = 0; i < num_elements; i++) { + data_ptr[i] = dist(gen); + } + } + + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + + return dst_accessor; + } +}; + +GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, + Allocator &allocator) { + return DataTypeDispatch1{}( + shape.data_type, shape, allocator); +} + +GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, + Allocator &allocator) { + GenericTensorAccessorW accessor = + create_random_filled_accessor_w(shape, allocator); + + return read_only_accessor_from_write_accessor(accessor); +} + +template +struct FillWithZeros { + void operator()(GenericTensorAccessorW const &accessor) { + using T = real_type_t
; + + if (accessor.device_type == DeviceType::CPU) { + memset(accessor.ptr, + 0, + accessor.shape.get_volume().unwrap_nonnegative() * sizeof(T)); + } else { + checkCUDA(cudaMemset(accessor.ptr, + 0, + accessor.shape.get_volume().unwrap_nonnegative() * + sizeof(T))); + } + } +}; + +void fill_with_zeros(GenericTensorAccessorW const &accessor) { + DataTypeDispatch1{}(accessor.data_type, accessor); +} + +template +struct CPUAccessorRContainsNonZero { + bool operator()(GenericTensorAccessorR const &accessor) { + using T = real_type_t
; + + T const *data_ptr = accessor.get
(); + + int volume = accessor.shape.num_elements().unwrap_nonnegative(); + for (size_t i = 0; i < volume; i++) { + if (data_ptr[i] != 0) { + return true; + } + } + + return false; + } +}; + +bool contains_non_zero(GenericTensorAccessorR const &accessor) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor = + copy_tensor_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator); + return DataTypeDispatch1{}( + cpu_accessor.data_type, cpu_accessor); +} + +template +struct AccessorsAreEqual { + bool operator()(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor_a = + copy_tensor_accessor_r_to_cpu_if_necessary(accessor_a, cpu_allocator); + GenericTensorAccessorR cpu_accessor_b = + copy_tensor_accessor_r_to_cpu_if_necessary(accessor_b, cpu_allocator); + + using T = real_type_t
; + T const *a_data_ptr = cpu_accessor_a.get
(); + T const *b_data_ptr = cpu_accessor_b.get
(); + + int volume = accessor_a.shape.num_elements().unwrap_nonnegative(); + for (size_t i = 0; i < volume; i++) { + if (a_data_ptr[i] != b_data_ptr[i]) { + return false; + } + } + + return true; + } +}; + +bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b) { + ASSERT(accessor_a.shape == accessor_b.shape, + "accessors_are_equal expects accessors to have the same shape"); + + return DataTypeDispatch1{}( + accessor_a.data_type, accessor_a, accessor_b); +} + +template +struct CreateFilledAccessorW { + GenericTensorAccessorW operator()(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val) { + using T = real_type_t
; + if (!val.template has()) { + throw mk_runtime_error("create_filed_accessor expected data type of " + "shape and passed-in value to match"); + } + + auto unwrapped_value = val.get(); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW src_accessor = cpu_allocator.allocate_tensor(shape); + + T *data_ptr = src_accessor.get
(); + + int volume = dst_accessor.shape.num_elements().unwrap_nonnegative(); + for (size_t i = 0; i < volume; i++) { + data_ptr[i] = unwrapped_value; + } + + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + return dst_accessor; + } +}; + +GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val) { + + return DataTypeDispatch1{}( + shape.data_type, shape, allocator, val); +} + +GenericTensorAccessorR create_filled_accessor_r(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val) { + GenericTensorAccessorW w_accessor = + create_filled_accessor_w(shape, allocator, val); + return read_only_accessor_from_write_accessor(w_accessor); +} +} // namespace FlexFlow diff --git a/lib/kernels/test/src/internal/test_utils.h b/lib/kernels/test/src/internal/test_utils.h new file mode 100644 index 0000000000..a4fc9b88c8 --- /dev/null +++ b/lib/kernels/test/src/internal/test_utils.h @@ -0,0 +1,78 @@ +#ifndef _FLEXFLOW_KERNELS_TEST_SRC_INTERNAL_TEST_UTILS_H +#define _FLEXFLOW_KERNELS_TEST_SRC_INTERNAL_TEST_UTILS_H + +#include "kernels/copy_tensor_accessor.h" +#include "kernels/datatype_dispatch.h" +#include "kernels/device.h" +#include "kernels/local_cpu_allocator.h" +#include "kernels/local_cuda_allocator.h" +#include "kernels/managed_ff_stream.h" +#include "kernels/managed_per_device_ff_handle.h" +#include "op-attrs/datatype.h" +#include "op-attrs/datatype_value.dtg.h" +#include +#include +#include +#include + +namespace FlexFlow { + +GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, + Allocator &allocator); + +GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, + Allocator &allocator); + +GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, + Allocator &allocator); + +GenericTensorAccessorR create_zero_filled_accessor_r(TensorShape const &shape, + Allocator &allocator); + +GenericTensorAccessorW + 
create_1d_accessor_w_with_contents(std::vector const &contents, + Allocator &allocator); +GenericTensorAccessorR + create_1d_accessor_r_with_contents(std::vector const &contents, + Allocator &allocator); + +GenericTensorAccessorW create_2d_accessor_w_with_contents( + std::vector> const &contents, Allocator &allocator); +GenericTensorAccessorR create_2d_accessor_r_with_contents( + std::vector> const &contents, Allocator &allocator); + +GenericTensorAccessorW create_3d_accessor_w_with_contents( + std::vector>> const &contents, + Allocator &allocator); +GenericTensorAccessorR create_3d_accessor_r_with_contents( + std::vector>> const &contents, + Allocator &allocator); + +GenericTensorAccessorW create_4d_accessor_w_with_contents( + std::vector>>> const &contents, + Allocator &allocator); +GenericTensorAccessorR create_4d_accessor_r_with_contents( + std::vector>>> const &contents, + Allocator &allocator); + +bool contains_non_zero(GenericTensorAccessorR const &accessor); + +void fill_with_zeros(GenericTensorAccessorW const &accessor); + +void print_2d_tensor_accessor_contents(GenericTensorAccessorR const &accessor, + std::ostream &stream); + +bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b); + +GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val); + +GenericTensorAccessorR create_filled_accessor_r(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/test/src/kernels/accessor.cc b/lib/kernels/test/src/kernels/accessor.cc new file mode 100644 index 0000000000..98f8471212 --- /dev/null +++ b/lib/kernels/test/src/kernels/accessor.cc @@ -0,0 +1,73 @@ +#include "kernels/accessor.h" +#include "internal/test_utils.h" +#include "kernels/local_cpu_allocator.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("calculate_accessor_offset") { + 
SUBCASE("one dimension") { + std::vector indices = {4_n}; + ArrayShape shape = ArrayShape{ + std::vector{ + 13_n, + }, + }; + + nonnegative_int result = calculate_accessor_offset(indices, shape); + nonnegative_int correct = 4_n; + + CHECK(result == correct); + } + + SUBCASE("multiple dimensions") { + std::vector indices = {2_n, 4_n}; + ArrayShape shape = ArrayShape{ + std::vector{ + 6_n, + 5_n, + }, + }; + + nonnegative_int result = calculate_accessor_offset(indices, shape); + nonnegative_int correct = 2_n * 5_n + 4_n; + + CHECK(result == correct); + } + + SUBCASE("zero dimensions") { + std::vector indices = {}; + ArrayShape shape = ArrayShape{std::vector{}}; + + nonnegative_int result = calculate_accessor_offset(indices, shape); + nonnegative_int correct = 0_n; + + CHECK(result == correct); + } + + SUBCASE("index and shape dimensions do not match") { + std::vector indices = {1_n, 2_n, 4_n}; + ArrayShape shape = ArrayShape{ + std::vector{ + 6_n, + 5_n, + }, + }; + + CHECK_THROWS(calculate_accessor_offset(indices, shape)); + } + + SUBCASE("out of bounds index") { + std::vector indices = {2_n, 5_n}; + ArrayShape shape = ArrayShape{ + std::vector{ + 6_n, + 5_n, + }, + }; + + CHECK_THROWS(calculate_accessor_offset(indices, shape)); + } + } +} diff --git a/lib/kernels/test/src/kernels/array_shape.cc b/lib/kernels/test/src/kernels/array_shape.cc new file mode 100644 index 0000000000..1fb4c0b541 --- /dev/null +++ b/lib/kernels/test/src/kernels/array_shape.cc @@ -0,0 +1,49 @@ +#include "kernels/array_shape.h" +#include "test/utils/doctest/fmt/unordered_set.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("get_array_coord_set") { + SUBCASE("ArrayShape is not empty") { + ArrayShape input = ArrayShape{ + LegionOrdered{2_n, 1_n, 3_n}, + }; + + std::unordered_set result = get_array_coord_set(input); + std::unordered_set correct = { + ArrayCoord{FFOrdered{0_n, 0_n, 0_n}}, + ArrayCoord{FFOrdered{0_n, 0_n, 1_n}}, + ArrayCoord{FFOrdered{1_n, 
0_n, 0_n}}, + ArrayCoord{FFOrdered{1_n, 0_n, 1_n}}, + ArrayCoord{FFOrdered{2_n, 0_n, 0_n}}, + ArrayCoord{FFOrdered{2_n, 0_n, 1_n}}, + }; + + CHECK(result == correct); + } + + SUBCASE("ArrayShape has a dimension of size zero") { + ArrayShape input = ArrayShape{ + LegionOrdered{2_n, 0_n, 3_n}, + }; + + std::unordered_set result = get_array_coord_set(input); + std::unordered_set correct = {}; + + CHECK(result == correct); + } + + SUBCASE("ArrayShape is zero-dimensional") { + ArrayShape input = ArrayShape{LegionOrdered{}}; + + std::unordered_set result = get_array_coord_set(input); + std::unordered_set correct = { + ArrayCoord{FFOrdered{}}, + }; + + CHECK(result == correct); + } + } +} diff --git a/lib/kernels/test/src/kernels/format_accessor_contents.cc b/lib/kernels/test/src/kernels/format_accessor_contents.cc new file mode 100644 index 0000000000..915a84c335 --- /dev/null +++ b/lib/kernels/test/src/kernels/format_accessor_contents.cc @@ -0,0 +1,94 @@ +#include "kernels/format_accessor_contents.h" +#include "internal/test_utils.h" +#include "kernels/local_cpu_allocator.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("format_accessor_r_contents(GenericTensorAccessorR)") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("accessor is 1d") { + GenericTensorAccessorR accessor = + create_1d_accessor_r_with_contents({1, 2, 3, 2}, cpu_allocator); + + std::string correct = "[1 2 3 2]"; + + std::string result = format_accessor_r_contents(accessor); + + CHECK(result == correct); + } + + SUBCASE("accessor is 2d") { + GenericTensorAccessorR accessor = create_2d_accessor_r_with_contents( + { + {1, 2, 3, 5}, + {4, 3, 3, 2}, + {1, 1, 5, 8}, + }, + cpu_allocator); + + std::string correct = "[\n" + " [1 2 3 5]\n" + " [4 3 3 2]\n" + " [1 1 5 8]\n" + "]"; + + std::string result = format_accessor_r_contents(accessor); + + CHECK(result == correct); + } + + SUBCASE("accessor is 3d") { + GenericTensorAccessorR accessor = 
create_3d_accessor_r_with_contents( + { + { + {1, 2, 3, 6}, + {4, 3, 3, 9}, + {1, 1, 5, 1}, + }, + { + {4, 1, 8, 7}, + {9, 4, 2, 4}, + {1, 0, 0, 6}, + }, + { + {2, 1, 1, 9}, + {1, 3, 6, 2}, + {1, 9, 8, 9}, + }, + }, + cpu_allocator); + + std::string correct = "[\n" + " [\n" + " [1 2 3 6]\n" + " [4 3 3 9]\n" + " [1 1 5 1]\n" + " ]\n" + " [\n" + " [4 1 8 7]\n" + " [9 4 2 4]\n" + " [1 0 0 6]\n" + " ]\n" + " [\n" + " [2 1 1 9]\n" + " [1 3 6 2]\n" + " [1 9 8 9]\n" + " ]\n" + "]"; + + std::string result = format_accessor_r_contents(accessor); + + CHECK(result == correct); + } + + SUBCASE("accessor is some other dimension") { + GenericTensorAccessorR accessor = + create_4d_accessor_r_with_contents({{{{5}}}}, cpu_allocator); + + CHECK_THROWS(format_accessor_r_contents(accessor)); + } + } +} diff --git a/lib/kernels/test/src/kernels/legion_dim.cc b/lib/kernels/test/src/kernels/legion_dim.cc new file mode 100644 index 0000000000..34822ed1c3 --- /dev/null +++ b/lib/kernels/test/src/kernels/legion_dim.cc @@ -0,0 +1,32 @@ +#include "kernels/legion_dim.h" +#include "test/utils/doctest/fmt/set.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("key_range(LegionOrdered)") { + SUBCASE("input is non-empty") { + LegionOrdered input = {5, 3, 2, 3}; + + std::set result = key_range(input); + std::set correct = { + legion_dim_t{0_n}, + legion_dim_t{1_n}, + legion_dim_t{2_n}, + legion_dim_t{3_n}, + }; + + CHECK(result == correct); + } + + SUBCASE("input is empty") { + LegionOrdered input = {}; + + std::set result = key_range(input); + std::set correct = {}; + + CHECK(result == correct); + } + } +} diff --git a/lib/kernels/test/src/kernels/legion_ordered/legion_ordered.cc b/lib/kernels/test/src/kernels/legion_ordered/legion_ordered.cc new file mode 100644 index 0000000000..4b50cad735 --- /dev/null +++ b/lib/kernels/test/src/kernels/legion_ordered/legion_ordered.cc @@ -0,0 +1,12 @@ +#include "kernels/legion_ordered/legion_ordered.h" +#include 
"test/utils/rapidcheck.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE_TEMPLATE( + "Arbitrary> with T=", T, int, double, char) { + RC_SUBCASE([](LegionOrdered) {}); + } +} diff --git a/lib/kernels/test/src/kernels/legion_ordered/slice.cc b/lib/kernels/test/src/kernels/legion_ordered/slice.cc new file mode 100644 index 0000000000..d0211d270e --- /dev/null +++ b/lib/kernels/test/src/kernels/legion_ordered/slice.cc @@ -0,0 +1,30 @@ +#include "kernels/legion_ordered/slice.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("slice(LegionOrdered, ..., ...") { + LegionOrdered d = LegionOrdered{ + 1, + 2, + 3, + 4, + }; + SUBCASE("legion_dim_t, legion_dim_t") { + LegionOrdered result = slice(d, + legion_dim_t{nonnegative_int{1}}, + legion_dim_t{nonnegative_int{3}}); + LegionOrdered correct = LegionOrdered{2, 3}; + + CHECK(result == correct); + } + SUBCASE("legion_dim_t, std::nullopt_t") { + LegionOrdered result = + slice(d, legion_dim_t{nonnegative_int{1}}, std::nullopt); + LegionOrdered correct = LegionOrdered{2, 3, 4}; + + CHECK(result == correct); + } + } +} diff --git a/lib/kernels/test/src/kernels/legion_ordered/transform.cc b/lib/kernels/test/src/kernels/legion_ordered/transform.cc new file mode 100644 index 0000000000..759507264f --- /dev/null +++ b/lib/kernels/test/src/kernels/legion_ordered/transform.cc @@ -0,0 +1,36 @@ +#include "kernels/legion_ordered/transform.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("transform(LegionOrdered, F)") { + SUBCASE("input is empty") { + LegionOrdered input = {}; + + LegionOrdered result = + transform(input, [](std::string const &) -> int { + CHECK(false); + return 0; + }); + LegionOrdered correct = {}; + + CHECK(result == correct); + } + + SUBCASE("input is not empty") { + LegionOrdered input = {2, 1, 2, 5}; + + LegionOrdered result = + transform(input, [](int x) { return fmt::to_string(x); }); + 
LegionOrdered correct = LegionOrdered{ + "2", + "1", + "2", + "5", + }; + + CHECK(result == correct); + } + } +} diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index 64264f6c39..9064ae4824 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -1,10 +1,10 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/attention_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test multi-head attention kernel") { nonnegative_int num_samples = 10_n; nonnegative_int num_heads = 4_n; @@ -19,7 +19,9 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int kvSeqLength = 20_n; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -39,16 +41,26 @@ TEST_SUITE(FF_TEST_SUITE) { /*kvSeqLength=*/kvSeqLength.unwrap_nonnegative(), /*add_bias_kv=*/false); - TensorShape query_shape = make_float_tensor_shape_from_legion_dims( - {qoSeqLength, num_samples, qSize}); - TensorShape key_shape = make_float_tensor_shape_from_legion_dims( - {kvSeqLength, num_samples, kSize}); - TensorShape value_shape = make_float_tensor_shape_from_legion_dims( - {kvSeqLength, num_samples, vSize}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {qoSeqLength, num_samples, oProjSize}); - TensorShape weight_shape = make_float_tensor_shape_from_legion_dims( - {nonnegative_int{state.weightSize}}); + TensorShape query_shape = TensorShape{ + TensorDims{FFOrdered{qoSeqLength, num_samples, qSize}}, + DataType::FLOAT, + }; + TensorShape key_shape = TensorShape{ + TensorDims{FFOrdered{kvSeqLength, num_samples, kSize}}, + DataType::FLOAT, + }; + TensorShape value_shape 
= TensorShape{ + TensorDims{FFOrdered{kvSeqLength, num_samples, vSize}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{qoSeqLength, num_samples, oProjSize}}, + DataType::FLOAT, + }; + TensorShape weight_shape = TensorShape{ + TensorDims{FFOrdered{nonnegative_int{state.weightSize}}}, + DataType::FLOAT, + }; GenericTensorAccessorW query_accessor = create_random_filled_accessor_w(query_shape, allocator); @@ -72,9 +84,7 @@ TEST_SUITE(FF_TEST_SUITE) { weight_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index cacd5b60fb..5f63b48198 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -1,10 +1,10 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/batch_matmul_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test BatchMatmul Kernel") { nonnegative_int m = 10_n; nonnegative_int n = 10_n; @@ -15,16 +15,24 @@ TEST_SUITE(FF_TEST_SUITE) { int seq_length = -1; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape_a = - make_float_tensor_shape_from_legion_dims({m, k, batch}); - TensorShape input_shape_b = - make_float_tensor_shape_from_legion_dims({k, n, batch}); - TensorShape output_shape = - make_float_tensor_shape_from_legion_dims({m, n, batch}); + 
TensorShape input_shape_a = TensorShape{ + TensorDims{FFOrdered{batch, k, m}}, + DataType::FLOAT, + }; + TensorShape input_shape_b = TensorShape{ + TensorDims{FFOrdered{batch, n, k}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{batch, n, m}}, + DataType::FLOAT, + }; GenericTensorAccessorW a_accessor = create_random_filled_accessor_w(input_shape_a, allocator); diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index b4c43cf1d8..903ad8cc43 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -1,10 +1,11 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/batch_norm_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test BatchNorm Kernel") { nonnegative_int output_n = 1_n; nonnegative_int output_c = 10_n; @@ -12,7 +13,9 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int output_w = 10_n; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -26,25 +29,33 @@ TEST_SUITE(FF_TEST_SUITE) { /*output_w=*/output_w.unwrap_nonnegative(), /*relu=*/true); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape scale_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape bias_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); + TensorShape input_shape = TensorShape{ + 
TensorDims{FFOrdered{output_n, output_c, output_h, output_w}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{output_n, output_c, output_h, output_w}}, + DataType::FLOAT, + }; + TensorShape scale_shape = TensorShape{ + TensorDims{FFOrdered{output_n, output_c, output_h, output_w}}, + DataType::FLOAT, + }; + TensorShape bias_shape = TensorShape{ + TensorDims{FFOrdered{output_n, output_c, output_h, output_w}}, + DataType::FLOAT, + }; GenericTensorAccessorW input_accessor = create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW output_accessor = create_random_filled_accessor_w(output_shape, allocator); - GenericTensorAccessorW scale_accessor = - create_filled_accessor_w(scale_shape, allocator, 1.0f); + GenericTensorAccessorW scale_accessor = create_filled_accessor_w( + scale_shape, allocator, make_float_data_type_value(1)); SUBCASE("forward_kernel") { - GenericTensorAccessorW bias_accessor = - create_filled_accessor_w(bias_shape, allocator, 0.0f); + GenericTensorAccessorW bias_accessor = create_filled_accessor_w( + bias_shape, allocator, make_float_data_type_value(0)); Kernels::BatchNorm::forward_kernel( /*stream=*/managed_stream.raw_stream(), @@ -54,10 +65,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*scale_ptr=*/scale_accessor.get_float_ptr(), /*bias_ptr=*/bias_accessor.get_float_ptr()); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { @@ -73,9 +81,9 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::BatchNorm::backward_kernel( /*stream=*/managed_stream.raw_stream(), /*per_device_state=*/state, - /*input_ptr=*/input_accessor.get_float_ptr(), - /*output_grad_ptr=*/output_grad_accessor.get_float_ptr(), /*output_ptr=*/output_accessor.get_float_ptr(), + /*output_grad_ptr=*/output_grad_accessor.get_float_ptr(), + 
/*input_ptr=*/input_accessor.get_float_ptr(), /*input_grad_ptr=*/input_grad_accessor.get_float_ptr(), /*scale_ptr=*/scale_accessor.get_float_ptr(), /*scale_grad_ptr=*/scale_grad_accessor.get_float_ptr(), @@ -83,19 +91,9 @@ TEST_SUITE(FF_TEST_SUITE) { /*numElements=*/ input_accessor.shape.num_elements().unwrap_nonnegative()); - std::vector host_input_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - std::vector host_scale_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(scale_grad_accessor)); - std::vector host_bias_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(bias_grad_accessor)); - - CHECK(contains_non_zero(host_input_grad_data)); - CHECK(contains_non_zero(host_scale_grad_data)); - CHECK(contains_non_zero(host_bias_grad_data)); + CHECK(contains_non_zero(input_grad_accessor)); + CHECK(contains_non_zero(scale_grad_accessor)); + CHECK(contains_non_zero(bias_grad_accessor)); } Kernels::BatchNorm::cleanup_kernel(allocator, diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index 0e0769014d..0c41fe12ac 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -1,56 +1,86 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/cast_kernels.h" -#include "test_utils.h" -#include +#include "kernels/cast_kernels_cpu.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Call Cast Forward and Backward Kernels") { ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({100_n, 100_n}); - TensorShape output_shape = - make_double_tensor_shape_from_legion_dims({100_n, 100_n}); - - GenericTensorAccessorW output_accessor = - 
create_random_filled_accessor_w(output_shape, allocator); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n, 100_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{100_n, 100_n}}, + DataType::DOUBLE, + }; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - - Kernels::Cast::forward_kernel(managed_stream.raw_stream(), - input_accessor, - output_accessor, - DataType::FLOAT, - DataType::DOUBLE); + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); - std::vector host_double_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); + Kernels::Cast::forward_kernel( + managed_stream.raw_stream(), input_accessor, output_accessor); - CHECK(contains_non_zero(host_double_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { + GenericTensorAccessorR grad_output_accessor = + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW grad_input_accessor = - allocator.allocate_tensor(input_shape); - - Kernels::Cast::backward_kernel( - managed_stream.raw_stream(), - read_only_accessor_from_write_accessor(output_accessor), - grad_input_accessor, - DataType::DOUBLE, - DataType::FLOAT); - - std::vector host_grad_float_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(grad_input_accessor)); - CHECK(contains_non_zero(host_grad_float_data)); + create_zero_filled_accessor_w(input_shape, allocator); + + Kernels::Cast::backward_kernel(managed_stream.raw_stream(), + grad_output_accessor, + grad_input_accessor); + + CHECK(contains_non_zero(grad_input_accessor)); + } + } + + TEST_CASE("Check Cast Forward Kernel against CPU Kernel") { + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = 
create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 2_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 2_n}}, + DataType::DOUBLE, + }; + + // Only calling forward kernel as backward kernel is exactly the same + SUBCASE("forward_kernel") { + // Run GPU Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + create_zero_filled_accessor_w(output_shape, gpu_allocator); + + Kernels::Cast::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + // Run CPU Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + create_zero_filled_accessor_w(output_shape, cpu_allocator); + + Kernels::Cast::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index 2b6b9bf589..2040dcbd5d 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -1,39 +1,39 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/combine_kernels.h" -#include "test_utils.h" +#include "kernels/combine_kernels_cpu.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Test combine kernel") { - ManagedPerDeviceFFHandle managed_handle{}; +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("Call Combine Forward and Backward Kernels") { + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; 
Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({100_n, 100_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n, 100_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Combine::forward_kernel( managed_stream.raw_stream(), input_accessor, output_accessor); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); @@ -41,9 +41,66 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_accessor, input_grad_accessor); - std::vector host_input_grad = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_input_grad)); + CHECK(contains_non_zero(input_grad_accessor)); + } + } + + TEST_CASE("Check Combine Forward Kernel against CPU Kernel") { + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{5_n, 5_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = input_shape; + + SUBCASE("forward_kernel") { + // Run GPU Combine 
Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + gpu_allocator.allocate_tensor(output_shape); + + Kernels::Combine::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + // Run CPU Combine Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + cpu_allocator.allocate_tensor(output_shape); + + Kernels::Combine::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); + } + + SUBCASE("backward_kernel") { + // Run GPU Combine Backward Kernel + GenericTensorAccessorR output_grad_accessor_gpu = + create_random_filled_accessor_r(output_shape, gpu_allocator); + GenericTensorAccessorW input_grad_accessor_gpu = + create_zero_filled_accessor_w(input_shape, gpu_allocator); + + Kernels::Combine::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor_gpu, + input_grad_accessor_gpu); + + // Run CPU Combine Backward Kernel + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + create_zero_filled_accessor_w(input_shape, cpu_allocator); + + Kernels::Combine::cpu_backward_kernel(output_grad_accessor_cpu, + input_grad_accessor_cpu); + + CHECK(accessors_are_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index 215e599716..c2df907917 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -1,56 +1,113 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/concat_kernels.h" -#include "test_utils.h" #include 
"utils/containers/repeat.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test concat kernel forward and backward") { - nonnegative_int num_inputs = 3_n; - nonnegative_int size_per_input = 100_n; - ff_dim_t concat_axis = ff_dim_t{0_n}; - - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({size_per_input}); - TensorShape output_shape = - make_float_tensor_shape_from_legion_dims({size_per_input, num_inputs}); - Allocator allocator = create_local_cuda_memory_allocator(); + const nonnegative_int num_inputs = 4_n; + SUBCASE("forward_kernel") { - std::vector input_accessors = - repeat(num_inputs, [&]() { - return read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - }); - GenericTensorAccessorW output_accessor = - allocator.allocate_tensor(output_shape); - - Kernels::Concat::forward_kernel(managed_stream.raw_stream(), - output_accessor, - input_accessors, - concat_axis); - - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - CHECK(contains_non_zero(host_output_data)); + auto run_forward_test = [&](nonnegative_int input_rows, + nonnegative_int input_cols, + TensorShape output_shape, + ff_dim_t concat_axis) { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{input_rows, input_cols}}, + DataType::FLOAT, + }; + + std::vector input_accessors = + repeat(num_inputs, [&]() { + return create_random_filled_accessor_r(input_shape, allocator); + }); + + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); + + Kernels::Concat::forward_kernel(managed_stream.raw_stream(), + output_accessor, + input_accessors, + concat_axis); + + 
CHECK(contains_non_zero(output_accessor)); + }; + + SUBCASE("test forward concat, axis = 0") { + nonnegative_int input_rows = 2_n; + nonnegative_int input_cols = 4_n; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{num_inputs * input_rows, input_cols}}, + DataType::FLOAT, + }; + run_forward_test(input_rows, input_cols, output_shape, ff_dim_t{0_n}); + } + + SUBCASE("test forward concat, axis = 1") { + nonnegative_int input_rows = 4_n; + nonnegative_int input_cols = 2_n; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{input_rows, num_inputs * input_cols}}, + DataType::FLOAT, + }; + run_forward_test(input_rows, input_cols, output_shape, ff_dim_t{1_n}); + } } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); - std::vector input_grad_accessors = repeat( - num_inputs, [&]() { return allocator.allocate_tensor(input_shape); }); - Kernels::Concat::backward_kernel(managed_stream.raw_stream(), - output_grad_accessor, - input_grad_accessors, - concat_axis); + auto run_backward_test = [&](nonnegative_int input_rows, + nonnegative_int input_cols, + TensorShape output_shape, + ff_dim_t concat_axis) { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{input_rows, input_cols}}, + DataType::FLOAT, + }; + + GenericTensorAccessorR output_grad_accessor = + create_random_filled_accessor_r(output_shape, allocator); + + std::vector input_grad_accessors = + repeat(num_inputs, [&]() { + return create_zero_filled_accessor_w(input_shape, allocator); + }); + + Kernels::Concat::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor, + input_grad_accessors, + concat_axis); + + for (auto &accessor : input_grad_accessors) { + CHECK(contains_non_zero(accessor)); + } + }; + + SUBCASE("test backward concat, axis = 0") { + nonnegative_int input_rows = 2_n; + nonnegative_int input_cols = 4_n; + TensorShape 
output_shape = TensorShape{ + TensorDims{FFOrdered{num_inputs * input_rows, input_cols}}, + DataType::FLOAT, + }; + run_backward_test(input_rows, input_cols, output_shape, ff_dim_t{0_n}); + } + + SUBCASE("test backward concat, axis = 1") { + nonnegative_int input_rows = 4_n; + nonnegative_int input_cols = 2_n; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{input_rows, num_inputs * input_cols}}, + DataType::FLOAT, + }; + run_backward_test(input_rows, input_cols, output_shape, ff_dim_t{1_n}); + } } } } diff --git a/lib/kernels/test/src/test_cuda.cc b/lib/kernels/test/src/test_cuda.cc index ed5852bc31..de3215cf2d 100644 --- a/lib/kernels/test/src/test_cuda.cc +++ b/lib/kernels/test/src/test_cuda.cc @@ -1,10 +1,10 @@ -#include "doctest/doctest.h" -#include "test_utils.h" +#include "internal/test_utils.h" +#include #include namespace FlexFlow { -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test CUDA") { int deviceCount = 0; diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 86f8f2102b..409b06d9a9 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -1,38 +1,37 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/dropout_kernels.h" -#include "test_utils.h" #include "utils/containers/count.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Dropout Kernels") { unsigned long long seed = 12345; float dropout_rate = 0.1; ArrayShape shape = ArrayShape{ - std::vector{10_n, 10_n}, + std::vector{10_n, 10_n}, }; - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10_n, 10_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 10_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle 
managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); DropoutPerDeviceState state = Kernels::Dropout::init_kernel( managed_handle.raw_handle(), dropout_rate, seed, shape, allocator); - auto get_zero_count = [](std::vector const &data) { - return count(data, [](float x) { return x == 0.0f; }); - }; - SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -41,11 +40,7 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output_accessor = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - CHECK(contains_non_zero(host_output_accessor)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 83f7f0445e..f8a3abdb98 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -1,21 +1,27 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/flat_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Flat Kernel") { Allocator allocator = create_local_cuda_memory_allocator(); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); + TensorShape input_shape = TensorShape{ + 
TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 2.0f)); + read_only_accessor_from_write_accessor(create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(2))); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = @@ -25,33 +31,21 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor, output_accessor.get_float_ptr()); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 2.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 0.0f); - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 1.0f); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, make_float_data_type_value(0)); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(1)); Kernels::Flat::backward_kernel(managed_stream.raw_stream(), input_accessor, - input_grad_accessor.get_float_ptr(), - output_grad_accessor.get_float_ptr()); - - std::vector backward_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor.get_float_ptr(), + input_grad_accessor.get_float_ptr()); - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - CHECK(backward_output_data == expected_output_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git 
a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 1a8cf5f82a..f0be809475 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -1,61 +1,107 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/gather_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { + +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Gather Forward and Backward Kernel") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - Allocator allocator = create_local_cuda_memory_allocator(); GatherPerDeviceState state = {managed_handle.raw_handle(), - legion_dim_t{2_n}}; + legion_dim_t{0_n}}; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({50_n}); + SUBCASE("forward_kernel") { + auto run_forward_test = [&](TensorShape input_shape, + TensorShape index_shape, + TensorShape output_shape) { + GenericTensorAccessorR input_accessor = + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); - GenericTensorAccessorR index_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + Kernels::Gather::forward_kernel(managed_stream.raw_stream(), + state, + input_accessor, + index_accessor, + output_accessor); - SUBCASE("forward_kernel") { - GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - GenericTensorAccessorW output_accessor = - 
allocator.allocate_tensor(output_shape); - - Kernels::Gather::forward_kernel(managed_stream.raw_stream(), - state, - input_accessor, - index_accessor, - output_accessor); - - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); + }; + + SUBCASE("test gather forward, 2D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 20_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 20_n}}, + DataType::FLOAT, + }; + run_forward_test(input_shape, index_shape, output_shape); + } + + SUBCASE("test gather forward, 1D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{10_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{10_n}}, + DataType::FLOAT, + }; + run_forward_test(input_shape, index_shape, output_shape); + } } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); - GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); - - Kernels::Gather::backward_kernel(managed_stream.raw_stream(), - state, - output_grad_accessor, - index_accessor, - input_grad_accessor); - - std::vector host_input_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_input_grad_data)); + auto run_backward_test = [&](TensorShape input_shape, + TensorShape index_shape, + TensorShape output_shape) { + GenericTensorAccessorR output_grad_accessor = + 
create_random_filled_accessor_r(output_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW input_grad_accessor = + allocator.allocate_tensor(input_shape); + + Kernels::Gather::backward_kernel(managed_stream.raw_stream(), + state, + output_grad_accessor, + index_accessor, + input_grad_accessor); + CHECK(contains_non_zero(input_grad_accessor)); + }; + + SUBCASE("test gather backward, 2D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 25_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 25_n}}, + DataType::FLOAT, + }; + run_backward_test(input_shape, index_shape, output_shape); + } } } } diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 5386c1d943..02a95ba58a 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -1,23 +1,30 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/layer_norm_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test LayerNorm Forward and Backward Kernel") { nonnegative_int batch_size = 10_n; nonnegative_int feature_size = 10_n; float epsilon = 1e-5f; bool elementwise_affine = true; - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({batch_size, feature_size}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, feature_size}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; - TensorShape feature_shape = - make_float_tensor_shape_from_legion_dims({feature_size}); + TensorShape feature_shape = TensorShape{ + 
TensorDims{FFOrdered{feature_size}}, + DataType::FLOAT, + }; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -31,16 +38,15 @@ TEST_SUITE(FF_TEST_SUITE) { epsilon); GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - GenericTensorAccessorW gamma_accessor = - create_filled_accessor_w(feature_shape, allocator, 1.0f); + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorW gamma_accessor = create_filled_accessor_w( + feature_shape, allocator, make_float_data_type_value(1)); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - GenericTensorAccessorW beta_accessor = - create_filled_accessor_w(feature_shape, allocator, 0.0f); + GenericTensorAccessorW beta_accessor = create_filled_accessor_w( + feature_shape, allocator, make_float_data_type_value(0)); Kernels::LayerNorm::forward_kernel(managed_stream.raw_stream(), state, @@ -52,8 +58,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW gamma_grad_accessor = diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc new file mode 100644 index 0000000000..fb5920adcc --- /dev/null +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -0,0 +1,107 @@ +#include "internal/test_utils.h" +#include "kernels/gather_kernels.h" +#include + +using namespace 
::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("Test ManagedFFStream") { + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + ManagedFFStream managed_stream{}; + Allocator allocator = create_local_cuda_memory_allocator(); + + GatherPerDeviceState state = {managed_handle.raw_handle(), + legion_dim_t{0_n}}; + + SUBCASE("forward_kernel") { + auto run_forward_test = [&](TensorShape const &input_shape, + TensorShape const &index_shape, + TensorShape const &output_shape) { + GenericTensorAccessorR input_accessor = + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); + + Kernels::Gather::forward_kernel(/*stream=*/managed_stream.raw_stream(), + /*per_device_state=*/state, + /*input=*/input_accessor, + /*index=*/index_accessor, + /*output=*/output_accessor); + + CHECK(contains_non_zero(output_accessor)); + }; + + SUBCASE("test gather forward, 2D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 20_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 20_n}}, + DataType::FLOAT, + }; + run_forward_test(input_shape, index_shape, output_shape); + } + + SUBCASE("test gather forward, 1D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{10_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{10_n}}, + DataType::FLOAT, + }; + run_forward_test(input_shape, index_shape, output_shape); + } + } + + SUBCASE("backward_kernel") { + auto run_backward_test = [&](TensorShape const &input_shape, + 
TensorShape const &index_shape, + TensorShape const &output_shape) { + GenericTensorAccessorR output_grad_accessor = + create_random_filled_accessor_r(output_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW input_grad_accessor = + allocator.allocate_tensor(input_shape); + + Kernels::Gather::backward_kernel(/*stream=*/managed_stream.raw_stream(), + /*per_device_state=*/state, + /*output_grad=*/output_grad_accessor, + /*index=*/index_accessor, + /*input_grad=*/input_grad_accessor); + CHECK(contains_non_zero(input_grad_accessor)); + }; + + SUBCASE("test gather backward, 2D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 25_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 25_n}}, + DataType::FLOAT, + }; + run_backward_test(input_shape, index_shape, output_shape); + } + } + } +} diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc new file mode 100644 index 0000000000..fc67764cdb --- /dev/null +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -0,0 +1,37 @@ +#include "kernels/managed_per_device_ff_handle.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("Test ManagedPerDeviceFFHandle") { + ManagedPerDeviceFFHandle base_handle{/*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); + + SUBCASE("constructor") { + CHECK(base_handle.raw_handle().workSpaceSize == 1024 * 1024); + CHECK(base_handle.raw_handle().allowTensorOpMathConversion == true); + } + + SUBCASE("move constructor") { + ManagedPerDeviceFFHandle new_handle(std::move(base_handle)); + CHECK(&new_handle.raw_handle() == 
base_handle_ptr); + } + + SUBCASE("move assignment operator") { + SUBCASE("move assign to other") { + ManagedPerDeviceFFHandle new_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + new_handle = std::move(base_handle); + CHECK(&new_handle.raw_handle() == base_handle_ptr); + } + + SUBCASE("move assign to self") { + base_handle = std::move(base_handle); + CHECK(&base_handle.raw_handle() == base_handle_ptr); + } + } + } +} diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 4fd1b53210..5452266dad 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -1,12 +1,15 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/partition_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Partition Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -14,48 +17,36 @@ TEST_SUITE(FF_TEST_SUITE) { RepartitionPerDeviceState state = Kernels::Repartition::init_kernel( managed_handle.raw_handle(), DataType::FLOAT); - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10_n, 10_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 10_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { - GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + GenericTensorAccessorR input_accessor = create_filled_accessor_r( + input_shape, allocator, make_float_data_type_value(1)); 
GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Repartition::forward_kernel( managed_stream.raw_stream(), state, input_accessor, output_accessor); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 2.0f); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, make_float_data_type_value(1)); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(2)); Kernels::Repartition::backward_kernel(managed_stream.raw_stream(), state, - input_grad_accessor, - output_grad_accessor); - - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor, + input_grad_accessor); - std::vector expected_grad_input_data( - input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 3.0f); - CHECK(host_grad_input_data == expected_grad_input_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index 62b61707c6..f2ada8387e 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -1,9 +1,10 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/pool_2d_kernels.h" -#include "test_utils.h" +#include 
"op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Pool2D Forward and Backward Kernel") { nonnegative_int input_w = 10_n; nonnegative_int input_h = 10_n; @@ -22,7 +23,9 @@ TEST_SUITE(FF_TEST_SUITE) { PoolOp pool_type = PoolOp::MAX; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -46,10 +49,14 @@ TEST_SUITE(FF_TEST_SUITE) { /*stride_w=*/stride_w.unwrap_nonnegative(), /*pool_type=*/pool_type); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims( - {input_w, input_h, input_c, input_n}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {output_w, output_h, output_c, output_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{input_n, input_c, input_h, input_w}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{output_n, input_c, output_h, output_w}}, + DataType::FLOAT, + }; GenericTensorAccessorW input_accessor = create_random_filled_accessor_w(input_shape, allocator); @@ -62,28 +69,23 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.ptr, output_accessor.ptr); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 1.0f); + GenericTensorAccessorW output_grad_accessor = create_filled_accessor_w( + output_shape, allocator, make_float_data_type_value(1)); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); 
Kernels::Pool2D::backward_kernel(managed_stream.raw_stream(), state, - input_accessor.ptr, - input_grad_accessor.ptr, output_accessor.ptr, - output_grad_accessor.ptr); + output_grad_accessor.ptr, + input_accessor.ptr, + input_grad_accessor.ptr); - std::vector host_input_grad = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_input_grad)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index 04a3817b84..e13b149769 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -1,27 +1,33 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/reduction_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Reduction Forward and Backward Kernel") { std::size_t num_replicas = 5; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims( - {10_n, 10_n, 10_n, 10_n, 10_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 10_n, 10_n, 10_n, 10_n}}, + DataType::FLOAT, + }; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); SUBCASE("forward_kernel") { - TensorShape output_shape = - make_float_tensor_shape_from_legion_dims({10_n}); + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{10_n}}, + DataType::FLOAT, + }; GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); 
GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -30,30 +36,22 @@ TEST_SUITE(FF_TEST_SUITE) { output_accessor, num_replicas); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { TensorShape output_shape = input_shape; - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, make_float_data_type_value(1)); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); Kernels::Reduction::backward_kernel(managed_stream.raw_stream(), - input_grad_accessor, - output_grad_accessor); - - std::vector expected_grad_input_data( - input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - std::vector host_grad_data = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(host_grad_data == expected_grad_input_data); + output_grad_accessor, + input_grad_accessor); + + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index fa726898f2..83a9a992f7 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -1,55 +1,150 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" +#include "kernels/format_accessor_contents.h" #include "kernels/replicate_kernels.h" -#include "test_utils.h" +#include "kernels/replicate_kernels_cpu.h" +#include "test/utils/doctest/check_kv.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Test Replicate Kernel") { + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + 
TEST_CASE("Call Replicate Forward and Backward Kernels") { nonnegative_int num_replicas = 10_n; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); - TensorShape output_shape = input_shape; + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{3_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{3_n}}, + DataType::FLOAT, + }; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - Allocator allocator = create_local_cuda_memory_allocator(); + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); SUBCASE("forward_kernel") { - GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); - GenericTensorAccessorW output_accessor = - allocator.allocate_tensor(output_shape); + GenericTensorAccessorR input = + create_1d_accessor_r_with_contents({1, 3, 2}, gpu_allocator); + + GenericTensorAccessorW output = + gpu_allocator.allocate_tensor(output_shape); Kernels::Replicate::forward_kernel( - managed_stream.raw_stream(), input_accessor, output_accessor); + managed_stream.raw_stream(), input, output); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); + GenericTensorAccessorR correct = input; - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK_MESSAGE(accessors_are_equal(output, correct), + check_kv("output", format_accessor_w_contents(output))); } SUBCASE("backward_kernel") { - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 1.0f); - GenericTensorAccessorR 
output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + GenericTensorAccessorR output_grad = create_2d_accessor_r_with_contents( + { + {1, 2, 3}, + {4, 3, 3}, + {1, 3, 5}, + }, + gpu_allocator); + + GenericTensorAccessorR correct = create_1d_accessor_r_with_contents( + {1 + 2 + 3, 4 + 3 + 3, 1 + 3 + 5}, cpu_allocator); + + GenericTensorAccessorW input_grad = + gpu_allocator.allocate_tensor(input_shape); Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), - input_grad_accessor, - output_grad_accessor, + output_grad, + input_grad, num_replicas.unwrap_nonnegative()); - std::vector check_aggregated_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(check_aggregated_data)); + CHECK_MESSAGE( + accessors_are_equal(input_grad, correct), + check_kv("input_grad", format_accessor_w_contents(input_grad))); + } + } + + TEST_CASE("Check Replicate Forward and Backward Kernel against CPU Kernel") { + nonnegative_int num_replicas = 2_n; + + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{5_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{5_n, num_replicas}}, + DataType::FLOAT, + }; + + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("forward_kernel") { + // Run GPU Replicate Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + create_zero_filled_accessor_w(output_shape, gpu_allocator); + + Kernels::Replicate::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + // 
Run CPU Replicate Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + create_zero_filled_accessor_w(output_shape, cpu_allocator); + + Kernels::Replicate::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK_MESSAGE( + accessors_are_equal(output_accessor_gpu, output_accessor_cpu), + check_kv("input", format_accessor_r_contents(input_accessor_cpu)), + check_kv("gpu", format_accessor_w_contents(output_accessor_gpu)), + check_kv("cpu", format_accessor_w_contents(output_accessor_cpu))); + } + + SUBCASE("backward_kernel") { + // Run GPU Replicate Backward Kernel + GenericTensorAccessorR output_grad_accessor_gpu = + create_random_filled_accessor_r(output_shape, gpu_allocator); + GenericTensorAccessorW input_grad_accessor_gpu = + create_zero_filled_accessor_w(input_shape, gpu_allocator); + + Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor_gpu, + input_grad_accessor_gpu, + num_replicas.unwrap_nonnegative()); + + // Run CPU Replicate Backward Kernel + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + create_zero_filled_accessor_w(input_shape, cpu_allocator); + + Kernels::Replicate::cpu_backward_kernel( + output_grad_accessor_cpu, + input_grad_accessor_cpu, + num_replicas.unwrap_nonnegative()); + + CHECK_MESSAGE( + accessors_are_equal(input_grad_accessor_gpu, input_grad_accessor_cpu), + check_kv("output_grad", + format_accessor_r_contents(output_grad_accessor_cpu)), + check_kv("gpu", format_accessor_w_contents(input_grad_accessor_gpu)), + check_kv("cpu", format_accessor_w_contents(input_grad_accessor_cpu))); } } } diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index d329a347b3..66c6bf849b 100644 --- 
a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -1,16 +1,21 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/reshape_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Reshape Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; ReshapePerDeviceState state = @@ -18,42 +23,28 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Reshape::forward_kernel( managed_stream.raw_stream(), state, input_accessor, output_accessor); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - 
create_filled_accessor_w(input_shape, allocator, 2.0f); + allocator.allocate_tensor(input_shape); Kernels::Reshape::backward_kernel(managed_stream.raw_stream(), state, - input_grad_accessor, - output_grad_accessor); - - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor, + input_grad_accessor); - std::vector expected_grad_input_data( - input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 3.0f); - CHECK(host_grad_input_data == expected_grad_input_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index 9c8475f6d6..6e12c48ac3 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -1,63 +1,124 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/reverse_kernels.h" -#include "test_utils.h" +#include "kernels/reverse_kernels_cpu.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Call Reverse Forward and Backward Kernels") { - nonnegative_int reverse_dim_size = 10_n; - nonnegative_int in_blk_size = 10_n; - nonnegative_int num_out_blks = 1_n; - - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{1_n, 10_n, 10_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{0_n}, + }; + SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - 
read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + read_only_accessor_from_write_accessor(create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(1))); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Reverse::forward_kernel( - managed_stream.raw_stream(), - input_accessor.get_float_ptr(), - output_accessor.get_float_ptr(), - num_out_blks.unwrap_nonnegative(), - reverse_dim_size.unwrap_nonnegative(), - in_blk_size.unwrap_nonnegative(), - input_accessor.shape.num_elements().unwrap_nonnegative()); - - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(check_output_data)); + managed_stream.raw_stream(), input_accessor, output_accessor, attrs); + + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); - - Kernels::Reverse::backward_kernel( - managed_stream.raw_stream(), - output_grad_accessor.get_float_ptr(), - input_grad_accessor.get_float_ptr(), - num_out_blks.unwrap_nonnegative(), - reverse_dim_size.unwrap_nonnegative(), - in_blk_size.unwrap_nonnegative(), - input_grad_accessor.shape.num_elements().unwrap_nonnegative()); - - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_grad_input_data)); + allocator.allocate_tensor(input_shape); + + Kernels::Reverse::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor, + input_grad_accessor, + attrs); + + CHECK(contains_non_zero(input_grad_accessor)); + } + } + + TEST_CASE("Check Reverse Forward and Backward Kernels against CPU Kernels") { + TensorShape input_shape = 
TensorShape{ + TensorDims{FFOrdered{1_n, 4_n, 3_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = input_shape; + + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{0_n}, + }; + + SUBCASE("forward_kernel") { + // Run GPU Cast Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + create_zero_filled_accessor_w(output_shape, gpu_allocator); + + Kernels::Reverse::forward_kernel(managed_stream.raw_stream(), + input_accessor_gpu, + output_accessor_gpu, + attrs); + + // Run CPU Cast Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + create_zero_filled_accessor_w(output_shape, cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel( + input_accessor_cpu, output_accessor_cpu, attrs); + + CHECK(accessors_are_equal(output_accessor_cpu, output_accessor_cpu)); + } + + SUBCASE("backward_kernel") { + // Run GPU Cast Backward Kernel + GenericTensorAccessorR output_grad_accessor_gpu = + create_random_filled_accessor_r(output_shape, gpu_allocator); + + GenericTensorAccessorW input_grad_accessor_gpu = + create_zero_filled_accessor_w(input_shape, gpu_allocator); + + Kernels::Reverse::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor_gpu, + input_grad_accessor_gpu, + attrs); + + // Run CPU Cast Backward Kernel + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + create_zero_filled_accessor_w(input_shape, cpu_allocator); 
+ + Kernels::Reverse::cpu_backward_kernel( + output_grad_accessor_cpu, input_grad_accessor_cpu, attrs); + + CHECK(accessors_are_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index c9eaa76b86..904cca2d3e 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -1,10 +1,10 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/softmax_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Softmax Kernel Operations") { nonnegative_int input_n = 1_n; nonnegative_int input_c = 1_n; @@ -12,12 +12,17 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int input_w = 100_n; nonnegative_int channels = 100_n; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; SoftmaxPerDeviceState state = @@ -40,30 +45,22 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 1.0f); + GenericTensorAccessorR output_grad_accessor = + create_random_filled_accessor_r(output_shape, allocator); 
GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); Kernels::Softmax::backward_kernel( managed_stream.raw_stream(), - input_grad_accessor.get_float_ptr(), output_grad_accessor.get_float_ptr(), + input_grad_accessor.get_float_ptr(), output_grad_accessor.shape.num_elements().unwrap_nonnegative()); - std::vector expected_input_grad_data = std::vector( - input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - std::vector host_input_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(host_input_grad_data == expected_input_grad_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index ea0d280f68..44e8f42f76 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -1,24 +1,33 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/split_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" #include "utils/containers/repeat.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Split Forward and Backward Kernel") { nonnegative_int num_outputs = 2_n; coord_t out_blk_sizes[] = {50, 50}; coord_t in_blk_size = 100; coord_t num_blks = 1; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({50_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + 
TensorDims{FFOrdered{50_n}}, + DataType::FLOAT, + }; SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = @@ -47,8 +56,8 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_ptrs[i] = output_grad_accessor.get_float_ptr(); } - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 0.0f); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(0)); Kernels::Split::backward_kernel(managed_stream.raw_stream(), input_grad_accessor.get_float_ptr(), diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index 02d99c86a1..3c15661396 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -1,58 +1,54 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/transpose_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Transpose Kernel Operations") { TransposeAttrs attrs = TransposeAttrs{ - FFOrdered{ - ff_dim_t{0_n}, + FFOrdered{ ff_dim_t{1_n}, + ff_dim_t{0_n}, }, }; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10_n, 10_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 10_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = 
allocator.allocate_tensor(output_shape); Kernels::Transpose::forward_kernel( managed_stream.raw_stream(), attrs, input_accessor, output_accessor); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = create_random_filled_accessor_w(input_shape, allocator); Kernels::Transpose::backward_kernel(managed_stream.raw_stream(), attrs, - input_grad_accessor, - output_grad_accessor); + output_grad_accessor, + input_grad_accessor); - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_grad_input_data)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc deleted file mode 100644 index 903b666fa9..0000000000 --- a/lib/kernels/test/src/test_utils.cc +++ /dev/null @@ -1,106 +0,0 @@ -#include "test_utils.h" - -GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); - std::vector host_data(volume); - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution dist(-1.0f, 1.0f); - - for (auto &val : host_data) { - val = dist(gen); - } - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * 
sizeof(float), - cudaMemcpyHostToDevice)); - } - - return accessor; -} - -GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - float val, - bool cpu_fill) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); - std::vector host_data(volume, val); - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); - } - - return accessor; -} - -GenericTensorAccessorW create_iota_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); - std::vector host_data(volume); - - for (size_t i = 0; i < volume; i++) { - host_data[i] = i; - } - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); - } - - return accessor; -} - -void fill_tensor_accessor_w(GenericTensorAccessorW accessor, - float val, - bool cpu_fill) { - size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); - std::vector host_data(volume, val); - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); - } -} - -TensorShape - make_float_tensor_shape_from_legion_dims(FFOrdered dims) { - return TensorShape{ - TensorDims{ - dims, - }, - DataType::FLOAT, - }; -} - -TensorShape - make_double_tensor_shape_from_legion_dims(FFOrdered dims) { - return TensorShape{ - TensorDims{ - dims, - }, - DataType::DOUBLE, - }; 
-} diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h deleted file mode 100644 index 08f0f382fb..0000000000 --- a/lib/kernels/test/src/test_utils.h +++ /dev/null @@ -1,72 +0,0 @@ -#ifndef _FLEXFLOW_KERNELS_TEST_UTILS -#define _FLEXFLOW_KERNELS_TEST_UTILS - -#include "kernels/device.h" -#include "kernels/local_cuda_allocator.h" -#include "kernels/managed_ff_stream.h" -#include "kernels/managed_per_device_ff_handle.h" -#include -#include -#include -#include -#include - -using namespace FlexFlow; - -GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill = false); - -GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - float val, - bool cpu_fill = false); - -GenericTensorAccessorW create_iota_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill = false); - -void fill_tensor_accessor_w(GenericTensorAccessorW accessor, - float val, - bool cpu_fill = false); - -TensorShape - make_float_tensor_shape_from_legion_dims(FFOrdered dims); - -TensorShape - make_double_tensor_shape_from_legion_dims(FFOrdered dims); - -template -std::vector load_data_to_host_from_device(GenericTensorAccessorR accessor) { - int volume = accessor.shape.get_volume(); - - std::vector local_data(volume); - checkCUDA(cudaMemcpy(local_data.data(), - accessor.ptr, - local_data.size() * sizeof(T), - cudaMemcpyDeviceToHost)); - return local_data; -} - -template -bool contains_non_zero(std::vector &data) { - return !all_of( - data.begin(), data.end(), [](T const &val) { return val == 0; }); -} - -// Specialize doctest's StringMaker for std::vector -template <> -struct doctest::StringMaker> { - static doctest::String convert(std::vector const &vec) { - std::ostringstream oss; - for (size_t i = 0; i < vec.size(); ++i) { - oss << vec[i]; - if (i != vec.size() - 1) { - oss << ", "; - } - } - return doctest::String(("[" + oss.str() + 
"]").c_str()); - } -}; - -#endif diff --git a/lib/local-execution/include/local-execution/per_device_op_state.h b/lib/local-execution/include/local-execution/per_device_op_state.h index 1edd5b6360..f1f357a86e 100644 --- a/lib/local-execution/include/local-execution/per_device_op_state.h +++ b/lib/local-execution/include/local-execution/per_device_op_state.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H #define _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H +#include "kernels/per_device_op_state.dtg.h" #include "local-execution/device_specific_device_states.dtg.h" -#include "local-execution/per_device_op_state.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h index 54c8dfc5f1..48584588e3 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H +#include "kernels/per_device_op_state.dtg.h" #include "local-execution/device_specific.h" #include "local-execution/itask_argument_accessor.h" -#include "local-execution/per_device_op_state.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/tracked_allocator.h b/lib/local-execution/include/local-execution/tracked_allocator.h index 731e04fdc8..f697337c52 100644 --- a/lib/local-execution/include/local-execution/tracked_allocator.h +++ b/lib/local-execution/include/local-execution/tracked_allocator.h @@ -13,6 +13,9 @@ struct TrackedAllocator : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + + DeviceType get_allocation_device_type() const override; + size_t get_current_mem_usage(); private: diff --git a/lib/local-execution/src/local_task_argument_accessor.cc 
b/lib/local-execution/src/local_task_argument_accessor.cc index 54eca7e514..5d099c6b46 100644 --- a/lib/local-execution/src/local_task_argument_accessor.cc +++ b/lib/local-execution/src/local_task_argument_accessor.cc @@ -24,8 +24,8 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( auto tensor_backing = std::get( this->tensor_slots_backing.at(slot_grad_pair)); if (priv == Permissions::RO) { - GenericTensorAccessorR readonly_tensor_backing = { - tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr}; + GenericTensorAccessorR readonly_tensor_backing = + read_only_accessor_from_write_accessor(tensor_backing); return readonly_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { return tensor_backing; @@ -33,6 +33,7 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( throw mk_runtime_error(fmt::format("Unhandled privilege mode {}", priv)); } } + VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( slot_id_t slot, Permissions priv, IsGrad is_grad) const { SlotGradId slot_grad_pair = SlotGradId{slot, is_grad}; @@ -43,7 +44,7 @@ VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( for (GenericTensorAccessorW const &tensor_backing : variadic_tensor_backing) { readonly_variadic_tensor_backing.push_back( - {tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr}); + read_only_accessor_from_write_accessor(tensor_backing)); } return readonly_variadic_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { diff --git a/lib/local-execution/src/ops/batch_norm.cc b/lib/local-execution/src/ops/batch_norm.cc index 1df6da8d8e..5cf8742918 100644 --- a/lib/local-execution/src/ops/batch_norm.cc +++ b/lib/local-execution/src/ops/batch_norm.cc @@ -134,9 +134,9 @@ static std::optional profiling, "[BatchNorm] backward_time = {:.2lf}ms\n", per_device_state, - input.get_float_ptr(), - output_grad.get_float_ptr(), output.get_float_ptr(), + 
output_grad.get_float_ptr(), + input.get_float_ptr(), input_grad.get_float_ptr(), scale.get_float_ptr(), scale_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/cast.cc b/lib/local-execution/src/ops/cast.cc index 3e7baf49a9..e9adf88422 100644 --- a/lib/local-execution/src/ops/cast.cc +++ b/lib/local-execution/src/ops/cast.cc @@ -54,9 +54,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { profiling, "[Cast] forward_time = {:.2lf}ms\n", input, - output, - input.data_type, - attrs.dtype); + output); } static std::optional @@ -73,9 +71,7 @@ static std::optional profiling, "[Cast] forward_time = {:.2lf}ms\n", input_grad, - output_grad, - input.data_type, - attrs.dtype); + output_grad); } TaskImplFunction get_cast_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/conv_2d.cc b/lib/local-execution/src/ops/conv_2d.cc index bb1504a3f5..55ff354483 100644 --- a/lib/local-execution/src/ops/conv_2d.cc +++ b/lib/local-execution/src/ops/conv_2d.cc @@ -107,8 +107,8 @@ static std::optional acc.get_argument(PER_DEVICE_STATE); auto attrs = acc.get_argument(ATTRS); - auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); auto filter = acc.get_tensor(FILTER); auto input_grad = acc.get_tensor_grad(INPUT); @@ -120,10 +120,10 @@ static std::optional profiling, "[Conv2d] backward_time = {:.2lf}ms\n", per_device_state, - input.get_float_ptr(), - input_grad.get_float_ptr(), output.get_float_ptr(), output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), filter.get_float_ptr(), filter_grad.get_float_ptr(), bias_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/element_unary.cc b/lib/local-execution/src/ops/element_unary.cc index c5ff9199f3..311b8e7924 100644 --- a/lib/local-execution/src/ops/element_unary.cc +++ b/lib/local-execution/src/ops/element_unary.cc @@ -58,8 +58,10 @@ static DeviceSpecificDeviceStates ParallelTensorShape output_shape = 
throw_if_unexpected(get_output_shape(attrs, input_shape)); - ElementUnaryPerDeviceState per_device_state = init_kernel( - get_piece_shape(input_shape), get_piece_shape(output_shape), attrs); + ElementUnaryPerDeviceState per_device_state = + init_kernel(array_shape_from_tensor_shape(get_piece_shape(input_shape)), + array_shape_from_tensor_shape(get_piece_shape(output_shape)), + attrs); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; @@ -88,10 +90,10 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor_grad(INPUT); auto output = acc.get_tensor(OUTPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto input_grad = acc.get_tensor_grad(INPUT); auto const &attrs = acc.get_argument(ATTRS); auto handle = acc.get_argument(HANDLE); @@ -106,10 +108,10 @@ static std::optional per_device_state, attrs, handle, - input, - input_grad, output, - output_grad); + output_grad, + input, + input_grad); } TaskImplFunction get_element_unary_init_task_impl() { diff --git a/lib/local-execution/src/ops/flat.cc b/lib/local-execution/src/ops/flat.cc index 0f872b5d50..af6fc16272 100644 --- a/lib/local-execution/src/ops/flat.cc +++ b/lib/local-execution/src/ops/flat.cc @@ -40,15 +40,15 @@ static std::optional ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); return profile(backward_kernel, profiling, "[Flat] backward_time = {:.2lf}ms\n", input, - input_grad.get_float_ptr(), - output_grad.get_float_ptr()); + output_grad.get_float_ptr(), + input_grad.get_float_ptr()); } TaskImplFunction get_flat_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/linear.cc 
b/lib/local-execution/src/ops/linear.cc index 6f0901e66a..9641cdbd4a 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -26,9 +26,9 @@ OpTaskInvocation init(LinearAttrs const &attrs) { binding.bind_arg(HANDLE, ff_handle()); binding.bind_arg(ATTRS, attrs); - binding.bind(INPUT, input_tensor(0)); // input - binding.bind(WEIGHT, weight_tensor(0)); // weight - binding.bind(OUTPUT, output_tensor(0)); // output + binding.bind(INPUT, input_tensor(0)); + binding.bind(WEIGHT, weight_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); return {task_id_t::LINEAR_INIT_TASK_ID, binding}; } @@ -36,11 +36,11 @@ OpTaskInvocation init(LinearAttrs const &attrs) { OpTaskInvocation forward(LinearAttrs const &attrs) { OpTaskBinding binding; - binding.bind(INPUT, input_tensor(0)); // input - binding.bind(WEIGHT, weight_tensor(0)); // weight - binding.bind(OUTPUT, output_tensor(0)); // output + binding.bind(INPUT, input_tensor(0)); + binding.bind(WEIGHT, weight_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); if (attrs.use_bias) { - binding.bind(BIAS, weight_tensor(1)); // bias + binding.bind(BIAS, weight_tensor(1)); } binding.bind_arg(PROFILING, profiling_settings()); @@ -124,20 +124,21 @@ static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); - auto output = acc.get_tensor(OUTPUT); - auto bias = acc.get_tensor(BIAS); + auto output = acc.get_tensor(OUTPUT); auto input_grad = acc.get_tensor_grad(INPUT); auto weight_grad = acc.get_tensor_grad(WEIGHT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); + auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); auto attrs = acc.get_argument(ATTRS); - float const *bias_ptr = NULL; + float *bias_grad_ptr = NULL; if (attrs.use_bias) { - bias_ptr = bias.get_float_ptr(); + auto bias_grad = 
acc.get_tensor_grad(BIAS); + bias_grad_ptr = bias_grad.get_float_ptr(); } nonnegative_int in_dim = input.shape.at(ff_dim_t{0_n}); @@ -148,13 +149,13 @@ static std::optional profiling, "[Linear] backward_time = {:.2lf}ms\n", per_device_state, - (void *)input.get_float_ptr(), - (void *)input_grad.get_float_ptr(), - (void *)output.get_float_ptr(), - (void *)output_grad.get_float_ptr(), - (void *)weight.get_float_ptr(), - (void *)weight_grad.get_float_ptr(), - (void *)bias_ptr, + output.get_float_ptr(), + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + weight.get_float_ptr(), + weight_grad.get_float_ptr(), + bias_grad_ptr, in_dim.unwrap_nonnegative(), out_dim.unwrap_nonnegative(), batch_size.unwrap_nonnegative()); diff --git a/lib/local-execution/src/ops/pool_2d.cc b/lib/local-execution/src/ops/pool_2d.cc index fb0635efba..f85874dc0a 100644 --- a/lib/local-execution/src/ops/pool_2d.cc +++ b/lib/local-execution/src/ops/pool_2d.cc @@ -115,19 +115,19 @@ static std::optional Pool2DPerDeviceState state = acc.get_argument(PER_DEVICE_STATE); - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); auto output_grad = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto input_grad = acc.get_tensor(INPUT); return profile(backward_kernel, profiling, "[Pool2D] backward_time = {:.2lf}ms\n", state, - input.get_float_ptr(), - input_grad.get_float_ptr(), output.get_float_ptr(), - output_grad.get_float_ptr()); + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr()); } TaskImplFunction get_pool_2d_init_task_impl() { diff --git a/lib/local-execution/src/ops/reduction.cc b/lib/local-execution/src/ops/reduction.cc index ee1a7c6c4e..b07d9fe965 100644 --- a/lib/local-execution/src/ops/reduction.cc +++ b/lib/local-execution/src/ops/reduction.cc @@ -63,13 +63,13 @@ static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { 
ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); return profile(backward_kernel, profiling, "[Reduction] backward_time = {:.2lf}ms\n", - input_grad, - output_grad); + output_grad, + input_grad); } TaskImplFunction get_reduction_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/repartition.cc b/lib/local-execution/src/ops/repartition.cc index 6c0c813c8d..7b6e9fe2f6 100644 --- a/lib/local-execution/src/ops/repartition.cc +++ b/lib/local-execution/src/ops/repartition.cc @@ -85,8 +85,8 @@ static std::optional ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto output_grad = acc.get_tensor_grad(INPUT); + auto input_grad = acc.get_tensor_grad(OUTPUT); return profile(backward_kernel, profiling, diff --git a/lib/local-execution/src/ops/replicate.cc b/lib/local-execution/src/ops/replicate.cc index d3ada35d93..99aeb913ba 100644 --- a/lib/local-execution/src/ops/replicate.cc +++ b/lib/local-execution/src/ops/replicate.cc @@ -66,8 +66,8 @@ static std::optional return profile(backward_kernel, profiling, "[replicate] backward_time = {:.2lf}ms\n", - input_grad, output_grad, + input_grad, attrs.replicate_degree.unwrap_nonnegative()); } diff --git a/lib/local-execution/src/ops/reshape.cc b/lib/local-execution/src/ops/reshape.cc index fc3a75607d..e382b2668e 100644 --- a/lib/local-execution/src/ops/reshape.cc +++ b/lib/local-execution/src/ops/reshape.cc @@ -86,8 +86,8 @@ static std::optional profiling, "[Reshape] backward time = {:.2lf}ms\n", per_device_state, - input_grad, - output_grad); + output_grad, + input_grad); } TaskImplFunction get_reshape_init_task_impl() { diff --git a/lib/local-execution/src/ops/reverse.cc 
b/lib/local-execution/src/ops/reverse.cc index ddd47d355d..00f56c6892 100644 --- a/lib/local-execution/src/ops/reverse.cc +++ b/lib/local-execution/src/ops/reverse.cc @@ -48,30 +48,12 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto output = acc.get_tensor(OUTPUT); auto attrs = acc.get_argument(ATTRS); - nonnegative_int output_size = output.shape.get_volume(); - auto axis = attrs.axis; - nonnegative_int in_blk_size = 1_n; - nonnegative_int reverse_dim_size = 1_n; - nonnegative_int num_out_blks = 1_n; - for (nonnegative_int i : nonnegative_range(output.shape.get_dim())) { - if (i < axis.value) { - in_blk_size *= output.shape.at(ff_dim_t{i}); - } else if (i == axis.value) { - reverse_dim_size = output.shape.at(ff_dim_t{i}); - } else { - num_out_blks *= output.shape.at(ff_dim_t{i}); - } - } - return profile(forward_kernel, profiling, "[reverse] forward_time = {:.2lf}ms\n", - input.get_float_ptr(), - output.get_float_ptr(), - num_out_blks.unwrap_nonnegative(), - reverse_dim_size.unwrap_nonnegative(), - in_blk_size.unwrap_nonnegative(), - output_size.unwrap_nonnegative()); + input, + output, + attrs); } static std::optional @@ -81,30 +63,12 @@ static std::optional auto output_grad = acc.get_tensor_grad(OUTPUT); auto attrs = acc.get_argument(ATTRS); - int axis = input_grad.shape.num_dims().unwrap_nonnegative() - - attrs.axis.value.unwrap_nonnegative() - 1; - nonnegative_int in_blk_size = 1_n; - nonnegative_int reverse_dim_size = 1_n; - nonnegative_int num_out_blks = 1_n; - for (nonnegative_int i : nonnegative_range(input_grad.shape.get_dim())) { - if (i < axis) { - in_blk_size *= input_grad.shape.at(ff_dim_t{i}); - } else if (i == axis) { - reverse_dim_size = input_grad.shape.at(ff_dim_t{i}); - } else { - num_out_blks *= input_grad.shape.at(ff_dim_t{i}); - } - } - return profile(backward_kernel, profiling, "[reverse] backward_time = {:.2lf}ms\n", - output_grad.get_float_ptr(), - input_grad.get_float_ptr(), - 
num_out_blks.unwrap_nonnegative(), - reverse_dim_size.unwrap_nonnegative(), - in_blk_size.unwrap_nonnegative(), - input_grad.shape.get_volume().unwrap_nonnegative()); + output_grad, + input_grad, + attrs); } TaskImplFunction get_reverse_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/softmax.cc b/lib/local-execution/src/ops/softmax.cc index 0e94422c5f..e008098e05 100644 --- a/lib/local-execution/src/ops/softmax.cc +++ b/lib/local-execution/src/ops/softmax.cc @@ -106,8 +106,8 @@ static std::optional return profile(backward_kernel, profiling, "[SoftMax] backward_time = {:.2lf}ms\n", - input_grad.get_float_ptr(), output_grad.get_float_ptr(), + input_grad.get_float_ptr(), output_grad.shape.get_volume().unwrap_nonnegative()); } diff --git a/lib/local-execution/src/ops/transpose.cc b/lib/local-execution/src/ops/transpose.cc index 4146836b9a..1859bb0ccc 100644 --- a/lib/local-execution/src/ops/transpose.cc +++ b/lib/local-execution/src/ops/transpose.cc @@ -67,8 +67,8 @@ static std::optional profiling, "[Transpose] Backward_time = {:.2lf} [ms]", attrs, - input_grad, - output_grad); + output_grad, + input_grad); } OpTaskInvocation backward(TransposeAttrs const &attrs) { diff --git a/lib/local-execution/src/per_device_state.cc b/lib/local-execution/src/per_device_op_state.cc similarity index 100% rename from lib/local-execution/src/per_device_state.cc rename to lib/local-execution/src/per_device_op_state.cc diff --git a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc index e6c3a11711..ed181aea32 100644 --- a/lib/local-execution/src/tracked_allocator.cc +++ b/lib/local-execution/src/tracked_allocator.cc @@ -23,8 +23,13 @@ size_t TrackedAllocator::get_current_mem_usage() { return this->current_mem_usage; } +DeviceType TrackedAllocator::get_allocation_device_type() const { + return this->allocator.get_allocation_device_type(); +} + Allocator get_tracked_memory_allocator(Allocator const &base_allocator) { - return 
Allocator::create(base_allocator); + Allocator allocator = Allocator::create(base_allocator); + return allocator; } } // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index da3af6e3ad..9f8b4092c1 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -12,68 +12,71 @@ // TEST_SUITE(FF_CUDA_TEST_SUITE) { // TEST_CASE("Local Cost Estimator") { // // local backing initialization -// ManagedPerDeviceFFHandle managed_handle{}; +// ManagedPerDeviceFFHandle managed_handle{ +// /*workSpaceSize=*/1024 * 1024, +// /*allowTensorOpMathConversion=*/true}; -// RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ -// DeviceSpecific::create(managed_handle.raw_handle()), -// EnableProfiling::YES, -// ProfilingSettings{/*warmup_iters=*/0, -// /*measure_iters=*/1}}; +// RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ +// DeviceSpecific::create(managed_handle.raw_handle()), +// EnableProfiling::YES, +// ProfilingSettings{/*warmup_iters=*/0, +// /*measure_iters=*/1}}; -// LocalCostEstimator cost_estimator = -// LocalCostEstimator{runtime_arg_config}; +// LocalCostEstimator cost_estimator = +// LocalCostEstimator{runtime_arg_config}; -// SUBCASE("Estimate cost -- Attention Op") { -// int embed_dim = 32; -// int num_heads = 10; -// MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ -// /*embed_dim=*/embed_dim, -// /*num_heads=*/num_heads, -// /*kdim=*/embed_dim, -// /*vdim=*/embed_dim, -// /*dropout=*/0.0, -// /*bias=*/true, -// /*add_bias_kv=*/false, -// /*add_zero_attn=*/false, -// }; +// SUBCASE("Estimate cost -- Attention Op") { +// int embed_dim = 32; +// int num_heads = 10; +// MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ +// /*embed_dim=*/embed_dim, +// /*num_heads=*/num_heads, +// /*kdim=*/embed_dim, +// /*vdim=*/embed_dim, +// /*dropout=*/0.0, +// /*bias=*/true, +// 
/*add_bias_kv=*/false, +// /*add_zero_attn=*/false, +// }; -// size_t batch_size = 40; -// size_t seq_len = 48; -// size_t feature_size = 36; +// size_t batch_size = 40; +// size_t seq_len = 48; +// size_t feature_size = 36; -// DataType dtype = DataType::FLOAT; -// ParallelTensorShape inputs_shape = lift_to_parallel(TensorShape{ -// TensorDims{FFOrdered{batch_size, seq_len, feature_size}}, -// DataType::FLOAT, -// }); +// DataType dtype = DataType::FLOAT; +// ParallelTensorShape inputs_shape = lift_to_parallel(TensorShape{ +// TensorDims{FFOrdered{batch_size, seq_len, +// feature_size}}, DataType::FLOAT, +// }); -// ParallelTensorShape weights_shape = throw_if_unexpected( -// get_weights_shape(attrs, inputs_shape, inputs_shape, -// inputs_shape)); -// ParallelTensorAttrs weight_attrs = -// ParallelTensorAttrs{weights_shape, -// /*sync_type=*/std::nullopt, -// /*initializer=*/std::nullopt, -// CreateGrad::YES}; +// ParallelTensorShape weights_shape = throw_if_unexpected( +// get_weights_shape(attrs, inputs_shape, inputs_shape, +// inputs_shape)); +// ParallelTensorAttrs weight_attrs = +// ParallelTensorAttrs{weights_shape, +// /*sync_type=*/std::nullopt, +// /*initializer=*/std::nullopt, +// CreateGrad::YES}; -// ParallelTensorShape output_shape = throw_if_unexpected( -// get_output_shape(attrs, inputs_shape, inputs_shape, inputs_shape)); -// ParallelTensorAttrs output_attrs = -// ParallelTensorAttrs{output_shape, -// /*sync_type=*/std::nullopt, -// /*initializer=*/std::nullopt, -// CreateGrad::YES}; +// ParallelTensorShape output_shape = throw_if_unexpected( +// get_output_shape(attrs, inputs_shape, inputs_shape, +// inputs_shape)); +// ParallelTensorAttrs output_attrs = +// ParallelTensorAttrs{output_shape, +// /*sync_type=*/std::nullopt, +// /*initializer=*/std::nullopt, +// CreateGrad::YES}; -// CostDetails result = cost_estimator.estimate_cost( -// PCGOperatorAttrs{attrs}, -// std::vector{ -// inputs_shape, inputs_shape, inputs_shape}, -// 
std::vector{weight_attrs}, -// std::vector{output_attrs}, -// make_1d_machine_view(gpu_id_t{0}, gpu_id_t{1})); +// CostDetails result = cost_estimator.estimate_cost( +// PCGOperatorAttrs{attrs}, +// std::vector{ +// inputs_shape, inputs_shape, inputs_shape}, +// std::vector{weight_attrs}, +// std::vector{output_attrs}, +// make_1d_machine_view(gpu_id_t{0}, gpu_id_t{1})); -// CHECK(result.total_elapsed_time > 0); -// CHECK(result.total_mem_usage > 0); +// CHECK(result.total_elapsed_time > 0); +// CHECK(result.total_mem_usage > 0); +// } +// } // } -// } -// } diff --git a/lib/local-execution/test/src/test_local_slots_backing.cc b/lib/local-execution/test/src/test_local_slots_backing.cc index dffb19398c..e55d1eddf5 100644 --- a/lib/local-execution/test/src/test_local_slots_backing.cc +++ b/lib/local-execution/test/src/test_local_slots_backing.cc @@ -1,6 +1,6 @@ #include "kernels/attention_kernels.h" +#include "kernels/local_cpu_allocator.h" #include "local-execution/local_cost_estimator.h" -#include "local-execution/local_cpu_allocator.h" #include "local-execution/local_slots_backing.h" #include "op-attrs/ops/attention.h" #include "op-attrs/parallel_tensor_shape.h" @@ -106,24 +106,24 @@ TEST_SUITE(FF_TEST_SUITE) { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( query_guid, local_slots_backing.gradient_tensor_mapping); - std::pair correct = {ArrayShape{query_shape}, - dtype}; + std::pair correct = { + array_shape_from_tensor_shape(query_shape), dtype}; CHECK(result == correct); } SUBCASE("Key grad") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( key_guid, local_slots_backing.gradient_tensor_mapping); - std::pair correct = {ArrayShape{key_shape}, - dtype}; + std::pair correct = { + array_shape_from_tensor_shape(key_shape), dtype}; CHECK(result == correct); } SUBCASE("Value grad") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( value_guid, local_slots_backing.gradient_tensor_mapping); - std::pair 
correct = {ArrayShape{value_shape}, - dtype}; + std::pair correct = { + array_shape_from_tensor_shape(value_shape), dtype}; CHECK(result == correct); } } @@ -135,9 +135,9 @@ TEST_SUITE(FF_TEST_SUITE) { get_result_shape_and_dtype_for_tensor_guid_and_map( output_guid, local_slots_backing.tensor_mapping); std::pair correct = { - ArrayShape{ + array_shape_from_tensor_shape( get_tensor_attrs(cg_builder.computation_graph, output_guid) - .shape}, + .shape), dtype}; CHECK(result == correct); } @@ -146,9 +146,9 @@ TEST_SUITE(FF_TEST_SUITE) { get_result_shape_and_dtype_for_tensor_guid_and_map( output_guid, local_slots_backing.gradient_tensor_mapping); std::pair correct = { - ArrayShape{ + array_shape_from_tensor_shape( get_tensor_attrs(cg_builder.computation_graph, output_guid) - .shape}, + .shape), dtype}; CHECK(result == correct); } diff --git a/lib/local-execution/test/src/test_local_task_arg_accessor.cc b/lib/local-execution/test/src/test_local_task_arg_accessor.cc index 0fab0f6a60..a39bb229e2 100644 --- a/lib/local-execution/test/src/test_local_task_arg_accessor.cc +++ b/lib/local-execution/test/src/test_local_task_arg_accessor.cc @@ -1,5 +1,5 @@ #include "doctest/doctest.h" -#include "local-execution/local_cpu_allocator.h" +#include "kernels/local_cpu_allocator.h" #include "local-execution/local_task_argument_accessor.h" #include "local-execution/task_signature_impl.h" #include "utils/fmt/variant.h" diff --git a/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml b/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml index 27aa50f38f..09ee99915d 100644 --- a/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml +++ b/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml @@ -10,5 +10,6 @@ features = [ [[values]] name = "SUM" -[[value]] +[[values]] name = "AVG" + diff --git a/lib/op-attrs/include/op-attrs/datatype_value.h b/lib/op-attrs/include/op-attrs/datatype_value.h new file mode 100644 index 0000000000..723e69bddd --- /dev/null +++ 
b/lib/op-attrs/include/op-attrs/datatype_value.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DATATYPE_VALUE_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DATATYPE_VALUE_H + +#include "op-attrs/datatype_value.dtg.h" + +namespace FlexFlow { + +DataTypeValue make_float_data_type_value(float value); +DataTypeValue make_double_data_type_value(double value); +DataTypeValue make_int32_data_type_value(int32_t value); +DataTypeValue make_int64_data_type_value(int64_t value); +DataTypeValue make_bool_data_type_value(bool value); + +} // namespace FlexFlow + +#endif // _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h b/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h index f2355289dc..5c47745209 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h +++ b/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h @@ -17,13 +17,9 @@ struct DimOrdered { DimOrdered(std::initializer_list const &l) : contents(l.begin(), l.end()) {} - /* template ::value>::type> */ DimOrdered(std::vector const &contents) : contents(contents.begin(), contents.end()) {} - /* template ::value>::type> */ template DimOrdered(It begin, It end) : contents(begin, end) {} @@ -62,10 +58,6 @@ struct DimOrdered { return this->contents != other.contents; } - bool operator<(DimOrdered const &other) const { - return this->contents < other.contents; - } - using iterator = typename stack_vector::iterator; using const_iterator = typename stack_vector::const_iterator; @@ -116,7 +108,7 @@ struct DimOrdered { } reverse_iterator rend() { - return this->contents.crend(); + return this->contents.rend(); } const_reverse_iterator rend() const { @@ -145,195 +137,26 @@ struct DimOrdered { stack_vector contents; }; -template -struct DimOrdered { - DimOrdered() {} - - DimOrdered(std::initializer_list const &l) - : contents(l.begin(), l.end()) {} - - DimOrdered(std::vector const &contents) - : 
contents(contents.begin(), contents.end()) {} - - template - DimOrdered(It begin, It end) : contents(begin, end) {} - - template - DimOrdered(stack_vector const &contents) - : contents(contents.begin(), contents.end()) {} - - T const &at(ff_dim_t idx) const { - int raw = idx.value.unwrap_nonnegative(); - return this->contents.at(raw); - } - - T const &at(relative_ff_dim_t idx) const { - int raw = idx.value; - if (raw < 0) { - raw = this->contents.size() + raw; - } - return this->contents.at(raw); - } - - T &at(ff_dim_t idx) { - int raw = idx.value.unwrap_nonnegative(); - return this->contents.at(raw); - } - - T &at(relative_ff_dim_t idx) { - int raw = idx.value; - if (raw < 0) { - raw = this->contents.size() + raw; - } - return this->contents.at(raw); - } - - T const &operator[](ff_dim_t idx) const { - return this->at(idx); - } - - T const &operator[](relative_ff_dim_t idx) const { - return this->at(idx); - } - - T &operator[](ff_dim_t idx) { - return this->at(idx); - } - - T &operator[](relative_ff_dim_t idx) { - return this->at(idx); - } - - bool idx_is_valid(ff_dim_t const &idx) const { - int raw = idx.value.unwrap_nonnegative(); - return raw < this->contents.size(); - } - - bool idx_is_valid(relative_ff_dim_t const &idx) const { - int raw = idx.value; - if (raw < 0) { - raw = this->contents.size() + raw; - } - return (raw >= 0 && raw < this->contents.size()); - } - - bool operator==(DimOrdered const &other) const { - return this->contents == other.contents; - } - - bool operator!=(DimOrdered const &other) const { - return this->contents != other.contents; - } - - bool operator<(DimOrdered const &other) const { - return this->contents < other.contents; - } - - using iterator = typename stack_vector::iterator; - using const_iterator = - typename stack_vector::const_iterator; - using reverse_iterator = - typename stack_vector::reverse_iterator; - using const_reverse_iterator = - typename stack_vector::const_reverse_iterator; - using value_type = T; - using pointer 
= value_type *; - using const_pointer = value_type const *; - using reference = value_type &; - using const_reference = value_type const &; - - iterator begin() { - return this->contents.begin(); - } - - const_iterator begin() const { - return this->cbegin(); - } - - const_iterator cbegin() const { - return this->contents.cbegin(); - } - - iterator end() { - return this->contents.end(); - } - - const_iterator end() const { - return this->cend(); - } - - const_iterator cend() const { - return this->contents.cend(); - } - - reverse_iterator rbegin() { - return this->contents.rbegin(); - } - - const_reverse_iterator rbegin() const { - return this->crbegin(); - } - - const_reverse_iterator crbegin() const { - return this->contents.crbegin(); - } - - reverse_iterator rend() { - return this->contents.crend(); - } - - const_reverse_iterator rend() const { - return this->crend(); - } - - const_reverse_iterator crend() const { - return this->contents.crend(); - } - - size_t size() const { - return this->contents.size(); - } - - size_t empty() const { - return this->contents.empty(); - } - - size_t num_dims() const { - return this->size(); - } - - friend struct ::std::hash; - -private: - stack_vector contents; -}; - -template -using FFOrdered = DimOrdered; +template +auto operator<(DimOrdered const &lhs, DimOrdered const &rhs) + -> std::enable_if_t, bool> { + return std::lexicographical_compare( + lhs.cbegin(), lhs.cend(), rhs.cbegin(), rhs.cend()); +} -template -std::string format_as(FFOrdered const &v) { +template +std::string format_as(DimOrdered const &v) { std::vector as_vec(v.cbegin(), v.cend()); return fmt::format("", as_vec); } -template -std::ostream &operator<<(std::ostream &s, FFOrdered const &v) { +template +std::ostream &operator<<(std::ostream &s, DimOrdered const &v) { return (s << fmt::to_string(v)); } } // namespace FlexFlow -/* template */ -/* void to_json(json &j, DimOrdered const &x) { */ -/* /1* j = std::vector{x.cbegin(), x.cend()}; *1/ */ -/* } */ - 
-/* template */ -/* void from_json(json const &j, DimOrdered &x) { */ -/* /1* x = DimOrdered{j.template get>()}; *1/ */ -/* } */ - namespace nlohmann { template struct adl_serializer<::FlexFlow::DimOrdered> { diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/slice.h b/lib/op-attrs/include/op-attrs/dim_ordered/slice.h index 166916dd44..76526447be 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/slice.h +++ b/lib/op-attrs/include/op-attrs/dim_ordered/slice.h @@ -2,7 +2,7 @@ #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_SLICE_H #include "op-attrs/dim_ordered/dim_ordered.h" -#include "utils/containers/subvec.h" +#include "utils/containers/slice.h" #include "utils/containers/transform.h" #include "utils/containers/vector_of.h" #include "utils/optional.h" @@ -18,35 +18,8 @@ DimOrdered nonoverloaded_slice(DimOrdered const &d, }; return DimOrdered{ - subvec(vector_of(d), to_raw_idx(start), to_raw_idx(end))}; + slice(vector_of(d), to_raw_idx(start), to_raw_idx(end))}; } - -template -FFOrdered ff_dim_t_nonoverloaded_slice(FFOrdered const &d, - std::optional const &start, - std::optional const &end) { - auto to_raw_idx = - [](std::optional const &idx) -> std::optional { - return transform( - idx, [](ff_dim_t const &i) { return i.value.unwrap_nonnegative(); }); - }; - - return FFOrdered{subvec(vector_of(d), to_raw_idx(start), to_raw_idx(end))}; -} - -template -FFOrdered relative_ff_dim_t_nonoverloaded_slice( - FFOrdered const &d, - std::optional const &start, - std::optional const &end) { - auto to_raw_idx = - [](std::optional const &idx) -> std::optional { - return transform(idx, [](relative_ff_dim_t const &i) { return i.value; }); - }; - - return FFOrdered{subvec(vector_of(d), to_raw_idx(start), to_raw_idx(end))}; -} - template DimOrdered slice(DimOrdered const &d, std::optional const &start = std::nullopt, @@ -54,20 +27,6 @@ DimOrdered slice(DimOrdered const &d, return ff_dim_t_nonoverloaded_slice(d, start, end); } -template -FFOrdered 
slice(FFOrdered const &d, - std::optional const &start = std::nullopt, - std::optional const &end = std::nullopt) { - return ff_dim_t_nonoverloaded_slice(d, start, end); -} - -template -FFOrdered slice(FFOrdered const &d, - std::optional const &start = std::nullopt, - std::optional const &end = std::nullopt) { - return relative_ff_dim_t_nonoverloaded_slice(d, start, end); -} - } // namespace FlexFlow #endif diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/concat.h b/lib/op-attrs/include/op-attrs/ff_ordered/concat.h similarity index 95% rename from lib/op-attrs/include/op-attrs/dim_ordered/concat.h rename to lib/op-attrs/include/op-attrs/ff_ordered/concat.h index 9b9eaf9b93..a5faed2b36 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/concat.h +++ b/lib/op-attrs/include/op-attrs/ff_ordered/concat.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_CONCAT_H #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_CONCAT_H -#include "op-attrs/dim_ordered/dim_ordered.h" +#include "op-attrs/ff_ordered/ff_ordered.h" #include "utils/containers/concat_vectors.h" #include "utils/containers/transform.h" diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/enumerate.h b/lib/op-attrs/include/op-attrs/ff_ordered/enumerate.h similarity index 95% rename from lib/op-attrs/include/op-attrs/dim_ordered/enumerate.h rename to lib/op-attrs/include/op-attrs/ff_ordered/enumerate.h index 9e4271a1ff..bc8636615c 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/enumerate.h +++ b/lib/op-attrs/include/op-attrs/ff_ordered/enumerate.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_ENUMERATE_H #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_ENUMERATE_H -#include "op-attrs/dim_ordered/dim_ordered.h" +#include "op-attrs/ff_ordered/ff_ordered.h" #include "utils/bidict/bidict.h" #include "utils/containers/count.h" diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered.h 
b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered.h new file mode 100644 index 0000000000..92ed211c31 --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered.h @@ -0,0 +1,228 @@ +#ifndef _FLEXFLOW_OPATTRS_INCLUDE_OPATTRS_DIM_ORDERED_FF_ORDERED_H +#define _FLEXFLOW_OPATTRS_INCLUDE_OPATTRS_DIM_ORDERED_FF_ORDERED_H + +#include "op-attrs/ff_dim_t.dtg.h" +#include "op-attrs/relative_ff_dim_t.dtg.h" +#include "utils/fmt/vector.h" +#include "utils/stack_vector/stack_vector.h" + +namespace FlexFlow { + +template +struct FFOrdered { + FFOrdered() {} + + FFOrdered(std::initializer_list const &l) : contents(l.begin(), l.end()) {} + + FFOrdered(std::vector const &contents) + : contents(contents.begin(), contents.end()) {} + + template + FFOrdered(It begin, It end) : contents(begin, end) {} + + template + FFOrdered(stack_vector const &contents) + : contents(contents.begin(), contents.end()) {} + + T const &at(ff_dim_t idx) const { + int raw = idx.value.unwrap_nonnegative(); + return this->contents.at(raw); + } + + T const &at(relative_ff_dim_t idx) const { + int raw = idx.value; + if (raw < 0) { + raw = this->contents.size() + raw; + } + return this->contents.at(raw); + } + + T &at(ff_dim_t idx) { + int raw = idx.value.unwrap_nonnegative(); + return this->contents.at(raw); + } + + T &at(relative_ff_dim_t idx) { + int raw = idx.value; + if (raw < 0) { + raw = this->contents.size() + raw; + } + return this->contents.at(raw); + } + + T const &operator[](ff_dim_t idx) const { + return this->at(idx); + } + + T const &operator[](relative_ff_dim_t idx) const { + return this->at(idx); + } + + T &operator[](ff_dim_t idx) { + return this->at(idx); + } + + T &operator[](relative_ff_dim_t idx) { + return this->at(idx); + } + + bool idx_is_valid(ff_dim_t const &idx) const { + int raw = idx.value.unwrap_nonnegative(); + return raw < this->contents.size(); + } + + bool idx_is_valid(relative_ff_dim_t const &idx) const { + int raw = idx.value; + if (raw < 0) { + raw = 
this->contents.size() + raw; + } + return (raw >= 0 && raw < this->contents.size()); + } + + bool operator==(FFOrdered const &other) const { + return this->contents == other.contents; + } + + bool operator!=(FFOrdered const &other) const { + return this->contents != other.contents; + } + + using iterator = typename stack_vector::iterator; + using const_iterator = + typename stack_vector::const_iterator; + using reverse_iterator = + typename stack_vector::reverse_iterator; + using const_reverse_iterator = + typename stack_vector::const_reverse_iterator; + using value_type = T; + using pointer = value_type *; + using const_pointer = value_type const *; + using reference = value_type &; + using const_reference = value_type const &; + + iterator begin() { + return this->contents.begin(); + } + + const_iterator begin() const { + return this->cbegin(); + } + + const_iterator cbegin() const { + return this->contents.cbegin(); + } + + iterator end() { + return this->contents.end(); + } + + const_iterator end() const { + return this->cend(); + } + + const_iterator cend() const { + return this->contents.cend(); + } + + reverse_iterator rbegin() { + return this->contents.rbegin(); + } + + const_reverse_iterator rbegin() const { + return this->crbegin(); + } + + const_reverse_iterator crbegin() const { + return this->contents.crbegin(); + } + + reverse_iterator rend() { + return this->contents.rend(); + } + + const_reverse_iterator rend() const { + return this->crend(); + } + + const_reverse_iterator crend() const { + return this->contents.crend(); + } + + size_t size() const { + return this->contents.size(); + } + + size_t empty() const { + return this->contents.empty(); + } + + size_t num_dims() const { + return this->size(); + } + + friend struct ::std::hash; + +private: + stack_vector contents; +}; + +template +auto operator<(FFOrdered const &lhs, FFOrdered const &rhs) + -> std::enable_if_t, bool> { + return std::lexicographical_compare( + lhs.cbegin(), lhs.cend(), 
rhs.cbegin(), rhs.cend()); +} + +template +std::string format_as(FFOrdered const &v) { + std::vector as_vec(v.cbegin(), v.cend()); + return fmt::format("", as_vec); +} + +template +std::ostream &operator<<(std::ostream &s, FFOrdered const &v) { + return (s << fmt::to_string(v)); +} + +} // namespace FlexFlow + +namespace nlohmann { +template +struct adl_serializer<::FlexFlow::FFOrdered> { + static ::FlexFlow::FFOrdered from_json(nlohmann::json const &j) { + return {j.template get>()}; + } + + static void to_json(nlohmann::json &j, ::FlexFlow::FFOrdered const &x) { + j = std::vector{x.cbegin(), x.cend()}; + } +}; +} // namespace nlohmann + +namespace std { + +template +struct hash<::FlexFlow::FFOrdered> { + size_t operator()(::FlexFlow::FFOrdered const &t) const { + static_assert(::FlexFlow::is_hashable::value, + "Elements must be hashable"); + + return get_std_hash(t.contents); + } +}; + +} // namespace std + +namespace rc { + +template +struct Arbitrary<::FlexFlow::FFOrdered> { + static Gen<::FlexFlow::FFOrdered> arbitrary() { + return gen::construct<::FlexFlow::FFOrdered>( + gen::arbitrary<::FlexFlow::stack_vector>()); + } +}; + +} // namespace rc + +#endif diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_from_map.h b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_from_map.h similarity index 88% rename from lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_from_map.h rename to lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_from_map.h index f8f49233ec..9232afddfb 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_from_map.h +++ b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_from_map.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_FF_ORDERED_FROM_MAP_H #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_FF_ORDERED_FROM_MAP_H -#include "op-attrs/dim_ordered/dim_ordered.h" -#include "op-attrs/dim_ordered/ff_ordered_of.h" #include "op-attrs/ff_dim_t.h" +#include 
"op-attrs/ff_ordered/ff_ordered.h" +#include "op-attrs/ff_ordered/ff_ordered_of.h" namespace FlexFlow { diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_of.h b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_of.h similarity index 88% rename from lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_of.h rename to lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_of.h index 8cc1bf3a51..ace60b7e3d 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_of.h +++ b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_of.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_FF_ORDERED_OF_H #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_FF_ORDERED_OF_H -#include "op-attrs/dim_ordered/dim_ordered.h" +#include "op-attrs/ff_ordered/ff_ordered.h" namespace FlexFlow { diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/get_idxs.h b/lib/op-attrs/include/op-attrs/ff_ordered/get_idxs.h similarity index 91% rename from lib/op-attrs/include/op-attrs/dim_ordered/get_idxs.h rename to lib/op-attrs/include/op-attrs/ff_ordered/get_idxs.h index 4e7f8530a4..5ff390d3fe 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/get_idxs.h +++ b/lib/op-attrs/include/op-attrs/ff_ordered/get_idxs.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_GET_IDXS_H #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_GET_IDXS_H -#include "op-attrs/dim_ordered/dim_ordered.h" #include "op-attrs/ff_dim_t.h" +#include "op-attrs/ff_ordered/ff_ordered.h" #include "utils/containers/count.h" #include "utils/containers/transform.h" diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/slice.h b/lib/op-attrs/include/op-attrs/ff_ordered/slice.h new file mode 100644 index 0000000000..79217c4cc3 --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ff_ordered/slice.h @@ -0,0 +1,49 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_SLICE_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_SLICE_H 
+ +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "utils/containers/slice.h" +#include "utils/containers/transform.h" +#include "utils/containers/vector_of.h" + +namespace FlexFlow { + +template +FFOrdered ff_dim_t_nonoverloaded_slice(FFOrdered const &d, + ff_dim_t const &start, + std::optional const &end) { + int raw_start = start.value.unwrap_nonnegative(); + std::optional raw_end = transform( + end, [](ff_dim_t const &i) { return i.value.unwrap_nonnegative(); }); + return FFOrdered{slice(vector_of(d), raw_start, raw_end)}; +} + +template +FFOrdered relative_ff_dim_t_nonoverloaded_slice( + FFOrdered const &d, + relative_ff_dim_t const &start, + std::optional const &end) { + int raw_start = start.value; + std::optional raw_end = + transform(end, [](relative_ff_dim_t const &i) { return i.value; }); + + return FFOrdered{slice(vector_of(d), raw_start, raw_end)}; +} + +template +FFOrdered slice(FFOrdered const &d, + ff_dim_t const &start = ff_dim_t{0_n}, + std::optional const &end = std::nullopt) { + return ff_dim_t_nonoverloaded_slice(d, start, end); +} + +template +FFOrdered slice(FFOrdered const &d, + relative_ff_dim_t const &start = relative_ff_dim_t{0}, + std::optional const &end = std::nullopt) { + return relative_ff_dim_t_nonoverloaded_slice(d, start, end); +} + +} // namespace FlexFlow + +#endif diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/transform.h b/lib/op-attrs/include/op-attrs/ff_ordered/transform.h new file mode 100644 index 0000000000..3a8eeb9ecf --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ff_ordered/transform.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_TRANSFORM_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_TRANSFORM_H + +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "utils/containers/vector_of.h" +#include "utils/containers/vector_transform.h" + +namespace FlexFlow { + +template > +FFOrdered transform(FFOrdered const &d, F &&f) { + return 
FFOrdered{vector_transform(vector_of(d), f)}; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/zip.h b/lib/op-attrs/include/op-attrs/ff_ordered/zip.h new file mode 100644 index 0000000000..fe207740f7 --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ff_ordered/zip.h @@ -0,0 +1,18 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_ZIP_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_ZIP_H + +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "utils/containers/vector_of.h" +#include "utils/containers/zip.h" + +namespace FlexFlow { + +template +FFOrdered> zip(FFOrdered const &lhs, + FFOrdered const &rhs) { + return FFOrdered>{zip(vector_of(lhs), vector_of(rhs))}; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/op-attrs/include/op-attrs/ops/transpose_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/transpose_attrs.struct.toml index b1c5f60382..50756f095b 100644 --- a/lib/op-attrs/include/op-attrs/ops/transpose_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/transpose_attrs.struct.toml @@ -12,7 +12,7 @@ features = [ includes = [ "op-attrs/ff_dim_t.h", "op-attrs/ff_dim_t.dtg.h", - "op-attrs/dim_ordered/dim_ordered.h", + "op-attrs/ff_ordered/ff_ordered.h", ] [[fields]] diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml b/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml index be3a95eec8..d68ef02ec1 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml @@ -12,7 +12,7 @@ features = [ includes = [ "op-attrs/parallel_tensor_shape/sum_degree.dtg.h", "op-attrs/parallel_tensor_shape/discard_copy_degree.dtg.h", - "op-attrs/dim_ordered/dim_ordered.h", + "op-attrs/ff_ordered/ff_ordered.h", "utils/nonnegative_int/nonnegative_int.h", ] diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.struct.toml 
b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.struct.toml index f24fa12309..d2f8758377 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.struct.toml +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.struct.toml @@ -10,7 +10,7 @@ features = [ ] includes = [ - "op-attrs/dim_ordered/dim_ordered.h", + "op-attrs/ff_ordered/ff_ordered.h", "op-attrs/shard_parallel_dim.dtg.h", "op-attrs/replica_parallel_dim_set.dtg.h", "", diff --git a/lib/op-attrs/include/op-attrs/tensor_dims.h b/lib/op-attrs/include/op-attrs/tensor_dims.h index 97f3432c2f..ba35295e09 100644 --- a/lib/op-attrs/include/op-attrs/tensor_dims.h +++ b/lib/op-attrs/include/op-attrs/tensor_dims.h @@ -19,7 +19,7 @@ std::optional get_broadcast_target_dims(std::unordered_set const &); TensorDims slice_tensor_dims(TensorDims const &, - std::optional const &start, + relative_ff_dim_t const &start, std::optional const &stop); } // namespace FlexFlow diff --git a/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml b/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml index e86b866fd6..8c6d1098cc 100644 --- a/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml +++ b/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml @@ -10,7 +10,7 @@ features = [ ] includes = [ - "op-attrs/dim_ordered/dim_ordered.h", + "op-attrs/ff_ordered/ff_ordered.h", "utils/nonnegative_int/nonnegative_int.h", ] diff --git a/lib/op-attrs/include/op-attrs/tensor_shape.h b/lib/op-attrs/include/op-attrs/tensor_shape.h index a3cd8bfd9a..298ea04638 100644 --- a/lib/op-attrs/include/op-attrs/tensor_shape.h +++ b/lib/op-attrs/include/op-attrs/tensor_shape.h @@ -12,7 +12,7 @@ nonnegative_int get_num_elements(TensorShape const &); nonnegative_int get_size_in_bytes(TensorShape const &); TensorShape slice_tensor_shape(TensorShape const &, - std::optional const &start, + relative_ff_dim_t const &start, std::optional const &stop); } // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/datatype_value.cc 
b/lib/op-attrs/src/op-attrs/datatype_value.cc new file mode 100644 index 0000000000..4604ef0b4e --- /dev/null +++ b/lib/op-attrs/src/op-attrs/datatype_value.cc @@ -0,0 +1,25 @@ +#include "op-attrs/datatype_value.h" + +namespace FlexFlow { + +DataTypeValue make_float_data_type_value(float value) { + return DataTypeValue{value}; +} + +DataTypeValue make_double_data_type_value(double value) { + return DataTypeValue{value}; +} + +DataTypeValue make_int32_data_type_value(int32_t value) { + return DataTypeValue{value}; +} + +DataTypeValue make_int64_data_type_value(int64_t value) { + return DataTypeValue{value}; +} + +DataTypeValue make_bool_data_type_value(bool value) { + return DataTypeValue{value}; +} + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/concat.cc b/lib/op-attrs/src/op-attrs/dim_ordered/concat.cc deleted file mode 100644 index cb29f708a3..0000000000 --- a/lib/op-attrs/src/op-attrs/dim_ordered/concat.cc +++ /dev/null @@ -1 +0,0 @@ -#include "op-attrs/dim_ordered/concat.h" diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/enumerate.cc b/lib/op-attrs/src/op-attrs/dim_ordered/enumerate.cc deleted file mode 100644 index 6edd5485af..0000000000 --- a/lib/op-attrs/src/op-attrs/dim_ordered/enumerate.cc +++ /dev/null @@ -1 +0,0 @@ -#include "op-attrs/dim_ordered/enumerate.h" diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_from_map.cc b/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_from_map.cc deleted file mode 100644 index 2de88f38c8..0000000000 --- a/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_from_map.cc +++ /dev/null @@ -1 +0,0 @@ -#include "op-attrs/dim_ordered/ff_ordered_from_map.h" diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_of.cc b/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_of.cc deleted file mode 100644 index 8e5c2fd38a..0000000000 --- a/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_of.cc +++ /dev/null @@ -1 +0,0 @@ -#include "op-attrs/dim_ordered/ff_ordered_of.h" diff --git 
a/lib/op-attrs/src/op-attrs/dim_ordered/get_idxs.cc b/lib/op-attrs/src/op-attrs/dim_ordered/get_idxs.cc deleted file mode 100644 index 175ae8d4bd..0000000000 --- a/lib/op-attrs/src/op-attrs/dim_ordered/get_idxs.cc +++ /dev/null @@ -1 +0,0 @@ -#include "op-attrs/dim_ordered/get_idxs.h" diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/slice.cc b/lib/op-attrs/src/op-attrs/dim_ordered/slice.cc index 75ab1a32aa..8c3dbd7bbc 100644 --- a/lib/op-attrs/src/op-attrs/dim_ordered/slice.cc +++ b/lib/op-attrs/src/op-attrs/dim_ordered/slice.cc @@ -1,26 +1 @@ #include "op-attrs/dim_ordered/slice.h" -#include "utils/archetypes/value_type.h" - -namespace FlexFlow { - -using T = value_type<0>; - -template FFOrdered - ff_dim_t_nonoverloaded_slice(FFOrdered const &d, - std::optional const &start, - std::optional const &end); - -template FFOrdered relative_ff_dim_t_nonoverloaded_slice( - FFOrdered const &d, - std::optional const &start, - std::optional const &end); - -template FFOrdered slice(FFOrdered const &d, - std::optional const &start, - std::optional const &end); - -template FFOrdered slice(FFOrdered const &d, - std::optional const &start, - std::optional const &end); - -} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/transform.cc b/lib/op-attrs/src/op-attrs/dim_ordered/transform.cc new file mode 100644 index 0000000000..73683eba94 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/dim_ordered/transform.cc @@ -0,0 +1 @@ +#include "op-attrs/dim_ordered/transform.h" diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/enumerate.cc b/lib/op-attrs/src/op-attrs/ff_ordered/enumerate.cc new file mode 100644 index 0000000000..e06c144149 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/enumerate.cc @@ -0,0 +1,10 @@ +#include "op-attrs/ff_ordered/enumerate.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template std::map enumerate(FFOrdered const &); + +} // namespace FlexFlow diff --git 
a/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered.cc b/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered.cc new file mode 100644 index 0000000000..1420586809 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered.cc @@ -0,0 +1,14 @@ +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template struct FFOrdered; + +template std::string format_as(FFOrdered const &); + +template std::ostream &operator<<(std::ostream &, FFOrdered const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered_from_map.cc b/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered_from_map.cc new file mode 100644 index 0000000000..e39fedb858 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered_from_map.cc @@ -0,0 +1,13 @@ +#include "op-attrs/ff_ordered/ff_ordered_from_map.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template FFOrdered ff_ordered_from_map(std::map const &); + +template FFOrdered + ff_ordered_from_map(std::unordered_map const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/get_idxs.cc b/lib/op-attrs/src/op-attrs/ff_ordered/get_idxs.cc new file mode 100644 index 0000000000..3da15bebba --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/get_idxs.cc @@ -0,0 +1,10 @@ +#include "op-attrs/ff_ordered/get_idxs.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template std::vector get_idxs(FFOrdered const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/slice.cc b/lib/op-attrs/src/op-attrs/ff_ordered/slice.cc new file mode 100644 index 0000000000..059fd811cd --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/slice.cc @@ -0,0 +1,24 @@ +#include "op-attrs/ff_ordered/slice.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + 
+template FFOrdered ff_dim_t_nonoverloaded_slice( + FFOrdered const &, ff_dim_t const &, std::optional const &); + +template FFOrdered relative_ff_dim_t_nonoverloaded_slice( + FFOrdered const &, + relative_ff_dim_t const &, + std::optional const &); + +template FFOrdered slice(FFOrdered const &, + ff_dim_t const &, + std::optional const &); + +template FFOrdered slice(FFOrdered const &, + relative_ff_dim_t const &, + std::optional const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/transform.cc b/lib/op-attrs/src/op-attrs/ff_ordered/transform.cc new file mode 100644 index 0000000000..74bf4895a3 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/transform.cc @@ -0,0 +1,12 @@ +#include "op-attrs/ff_ordered/transform.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; +using Out = value_type<1>; +using F = std::function; + +template FFOrdered transform(FFOrdered const &, F &&); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/zip.cc b/lib/op-attrs/src/op-attrs/ff_ordered/zip.cc new file mode 100644 index 0000000000..dc715ea97c --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/zip.cc @@ -0,0 +1,12 @@ +#include "op-attrs/ff_ordered/zip.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T1 = value_type<0>; +using T2 = value_type<1>; + +template FFOrdered> zip(FFOrdered const &, + FFOrdered const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ops/batch_norm.cc b/lib/op-attrs/src/op-attrs/ops/batch_norm.cc index d4763ef004..ddd92bd417 100644 --- a/lib/op-attrs/src/op-attrs/ops/batch_norm.cc +++ b/lib/op-attrs/src/op-attrs/ops/batch_norm.cc @@ -1,6 +1,6 @@ #include "op-attrs/ops/batch_norm.h" -#include "op-attrs/dim_ordered/concat.h" -#include "op-attrs/dim_ordered/slice.h" +#include "op-attrs/ff_ordered/concat.h" +#include "op-attrs/ff_ordered/slice.h" #include "op-attrs/parallel_tensor_shape.h" #include 
"op-attrs/tensor_shape.h" #include "utils/containers/any_of.h" diff --git a/lib/op-attrs/src/op-attrs/ops/concat.cc b/lib/op-attrs/src/op-attrs/ops/concat.cc index fc42241ef2..bf0ba553e4 100644 --- a/lib/op-attrs/src/op-attrs/ops/concat.cc +++ b/lib/op-attrs/src/op-attrs/ops/concat.cc @@ -1,6 +1,6 @@ #include "op-attrs/ops/concat.h" -#include "op-attrs/dim_ordered/enumerate.h" -#include "op-attrs/dim_ordered/ff_ordered_from_map.h" +#include "op-attrs/ff_ordered/enumerate.h" +#include "op-attrs/ff_ordered/ff_ordered_from_map.h" #include "op-attrs/parallel_tensor_shape.h" #include "op-attrs/tensor_dims.h" #include "op-attrs/tensor_shape.h" diff --git a/lib/op-attrs/src/op-attrs/ops/embedding.cc b/lib/op-attrs/src/op-attrs/ops/embedding.cc index 4dc602646b..5b5b91a8e7 100644 --- a/lib/op-attrs/src/op-attrs/ops/embedding.cc +++ b/lib/op-attrs/src/op-attrs/ops/embedding.cc @@ -1,8 +1,10 @@ #include "op-attrs/ops/embedding.h" -#include "op-attrs/dim_ordered/slice.h" -#include "op-attrs/dim_ordered/transform.h" +#include "op-attrs/ff_ordered/slice.h" +#include "op-attrs/ff_ordered/transform.h" +#include "op-attrs/ops/embedding_attrs.dtg.h" #include "op-attrs/parallel_tensor_dims.h" #include "utils/containers/product.h" +#include "utils/fmt/optional.h" #include "utils/integer_conversions.h" namespace FlexFlow { diff --git a/lib/op-attrs/src/op-attrs/ops/flat.cc b/lib/op-attrs/src/op-attrs/ops/flat.cc index 8ed12167b3..b4eeda76ab 100644 --- a/lib/op-attrs/src/op-attrs/ops/flat.cc +++ b/lib/op-attrs/src/op-attrs/ops/flat.cc @@ -1,6 +1,6 @@ #include "op-attrs/ops/flat.h" -#include "op-attrs/dim_ordered/concat.h" -#include "op-attrs/dim_ordered/slice.h" +#include "op-attrs/ff_ordered/concat.h" +#include "op-attrs/ff_ordered/slice.h" #include "op-attrs/parallel_tensor_shape.h" #include "op-attrs/tensor_dims.h" #include "utils/containers/any_of.h" diff --git a/lib/op-attrs/src/op-attrs/ops/layer_norm.cc b/lib/op-attrs/src/op-attrs/ops/layer_norm.cc index 00c6bb5e9b..c9798368e2 
100644 --- a/lib/op-attrs/src/op-attrs/ops/layer_norm.cc +++ b/lib/op-attrs/src/op-attrs/ops/layer_norm.cc @@ -1,6 +1,6 @@ #include "op-attrs/ops/layer_norm.h" -#include "op-attrs/dim_ordered/ff_ordered_of.h" -#include "op-attrs/dim_ordered/get_idxs.h" +#include "op-attrs/ff_ordered/ff_ordered_of.h" +#include "op-attrs/ff_ordered/get_idxs.h" #include "op-attrs/parallel_tensor_shape.h" #include "op-attrs/tensor_shape.h" #include "utils/containers/all_of.h" diff --git a/lib/op-attrs/src/op-attrs/ops/linear.cc b/lib/op-attrs/src/op-attrs/ops/linear.cc index fb26113613..bee9d0cf4f 100644 --- a/lib/op-attrs/src/op-attrs/ops/linear.cc +++ b/lib/op-attrs/src/op-attrs/ops/linear.cc @@ -1,11 +1,12 @@ #include "op-attrs/ops/linear.h" -#include "op-attrs/dim_ordered/slice.h" -#include "op-attrs/dim_ordered/transform.h" +#include "op-attrs/ff_ordered/slice.h" +#include "op-attrs/ff_ordered/transform.h" #include "op-attrs/initializers/kaiming_initializer_mode.h" #include "op-attrs/parallel_tensor_shape.h" #include "op-attrs/tensor_shape.h" #include "utils/containers/product.h" #include "utils/expected.h" +#include "utils/fmt/optional.h" #include "utils/integer_conversions.h" namespace FlexFlow { @@ -101,7 +102,7 @@ tl::expected SumDegree sum_degree = SumDegree{1_n}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{ get_sum_degree(input) * product(slice(ff_ordered_shard_degrees(input), - std::nullopt, + relative_ff_dim_t{0}, relative_ff_dim_t{-1}))}; FFOrdered shard_degrees = FFOrdered{ shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree, @@ -126,8 +127,10 @@ tl::expected SumDegree sum_degree = SumDegree{get_sum_degree(input) * shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{product(slice( - ff_ordered_shard_degrees(input), std::nullopt, relative_ff_dim_t{-1}))}; + DiscardCopyDegree discard_copy_degree = + DiscardCopyDegree{product(slice(ff_ordered_shard_degrees(input), + relative_ff_dim_t{0}, + 
relative_ff_dim_t{-1}))}; FFOrdered shard_degrees = FFOrdered{get_discard_copy_degree(input)}; diff --git a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc index 7a8f91e498..3f2245b2dc 100644 --- a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc +++ b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc @@ -1,6 +1,6 @@ #include "op-attrs/parallel_tensor_dims.h" -#include "op-attrs/dim_ordered/transform.h" -#include "op-attrs/dim_ordered/zip.h" +#include "op-attrs/ff_ordered/transform.h" +#include "op-attrs/ff_ordered/zip.h" #include "op-attrs/replica_parallel_dim.h" #include "op-attrs/replica_parallel_dim_set.h" #include "op-attrs/shard_parallel_dim.h" diff --git a/lib/op-attrs/src/op-attrs/tensor_dims.cc b/lib/op-attrs/src/op-attrs/tensor_dims.cc index 8d0592eab7..760278297c 100644 --- a/lib/op-attrs/src/op-attrs/tensor_dims.cc +++ b/lib/op-attrs/src/op-attrs/tensor_dims.cc @@ -1,6 +1,6 @@ #include "op-attrs/tensor_dims.h" -#include "op-attrs/dim_ordered/slice.h" -#include "op-attrs/dim_ordered/zip.h" +#include "op-attrs/ff_ordered/slice.h" +#include "op-attrs/ff_ordered/zip.h" #include "op-attrs/replica_parallel_dim_set.h" #include "op-attrs/shard_parallel_dim.dtg.h" #include "utils/containers/all_of.h" @@ -67,7 +67,7 @@ std::optional } TensorDims slice_tensor_dims(TensorDims const &dims, - std::optional const &start, + relative_ff_dim_t const &start, std::optional const &stop) { return TensorDims{ slice(dims.ff_ordered, start, stop), diff --git a/lib/op-attrs/src/op-attrs/tensor_shape.cc b/lib/op-attrs/src/op-attrs/tensor_shape.cc index 04b18794f1..afc14af54c 100644 --- a/lib/op-attrs/src/op-attrs/tensor_shape.cc +++ b/lib/op-attrs/src/op-attrs/tensor_shape.cc @@ -29,7 +29,7 @@ nonnegative_int get_size_in_bytes(TensorShape const &s) { } TensorShape slice_tensor_shape(TensorShape const &shape, - std::optional const &start, + relative_ff_dim_t const &start, std::optional const &stop) { return TensorShape{ 
slice_tensor_dims(shape.dims, start, stop), diff --git a/lib/op-attrs/test/src/op-attrs/datatype_value.cc b/lib/op-attrs/test/src/op-attrs/datatype_value.cc new file mode 100644 index 0000000000..9b0e90b601 --- /dev/null +++ b/lib/op-attrs/test/src/op-attrs/datatype_value.cc @@ -0,0 +1,68 @@ +#include "op-attrs/datatype_value.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("test make_data_type_value") { + SUBCASE("make_float_data_type_value") { + float value = 1.0f; + DataTypeValue data_type_value = make_float_data_type_value(value); + + CHECK(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK(data_type_value.get() == value); + } + + SUBCASE("make_double_data_type_value") { + double value = 2.71828; + DataTypeValue data_type_value = make_double_data_type_value(value); + + CHECK(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK(data_type_value.get() == value); + } + + SUBCASE("make_int32_data_type_value") { + int32_t value = -42; + DataTypeValue data_type_value = make_int32_data_type_value(value); + + CHECK(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK(data_type_value.get() == value); + } + + SUBCASE("make_int64_data_type_value") { + int64_t value = 1LL << 40; + DataTypeValue data_type_value = make_int64_data_type_value(value); + + CHECK(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK(data_type_value.get() == value); + } + + SUBCASE("make_bool_data_type_value") { + bool value = true; + 
DataTypeValue data_type_value = make_bool_data_type_value(value); + + CHECK(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK(data_type_value.get() == value); + } + } +} diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/dim_ordered.cc b/lib/op-attrs/test/src/op-attrs/dim_ordered/dim_ordered.cc index d7901a0c53..a5a261da25 100644 --- a/lib/op-attrs/test/src/op-attrs/dim_ordered/dim_ordered.cc +++ b/lib/op-attrs/test/src/op-attrs/dim_ordered/dim_ordered.cc @@ -10,8 +10,4 @@ TEST_SUITE(FF_TEST_SUITE) { "Arbitrary> with T=", T, int, double, char) { RC_SUBCASE([](DimOrdered) {}); } - - TEST_CASE_TEMPLATE("Arbitrary> with T=", T, int, double, char) { - RC_SUBCASE([](FFOrdered) {}); - } } diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/concat.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/concat.cc similarity index 97% rename from lib/op-attrs/test/src/op-attrs/dim_ordered/concat.cc rename to lib/op-attrs/test/src/op-attrs/ff_ordered/concat.cc index 2ac641cfc2..d8e04124bc 100644 --- a/lib/op-attrs/test/src/op-attrs/dim_ordered/concat.cc +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/concat.cc @@ -1,4 +1,4 @@ -#include "op-attrs/dim_ordered/concat.h" +#include "op-attrs/ff_ordered/concat.h" #include using namespace ::FlexFlow; diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/enumerate.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/enumerate.cc similarity index 92% rename from lib/op-attrs/test/src/op-attrs/dim_ordered/enumerate.cc rename to lib/op-attrs/test/src/op-attrs/ff_ordered/enumerate.cc index bf4c33d65a..e1a94e72c3 100644 --- a/lib/op-attrs/test/src/op-attrs/dim_ordered/enumerate.cc +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/enumerate.cc @@ -1,4 +1,4 @@ -#include "op-attrs/dim_ordered/enumerate.h" +#include "op-attrs/ff_ordered/enumerate.h" #include "test/utils/doctest/fmt/map.h" #include diff --git 
a/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered.cc new file mode 100644 index 0000000000..b0812ba9d6 --- /dev/null +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered.cc @@ -0,0 +1,11 @@ +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "test/utils/rapidcheck.h" +#include + +using namespace FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE_TEMPLATE("Arbitrary> with T=", T, int, double, char) { + RC_SUBCASE([](FFOrdered) {}); + } +} diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/ff_ordered_from_map.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered_from_map.cc similarity index 96% rename from lib/op-attrs/test/src/op-attrs/dim_ordered/ff_ordered_from_map.cc rename to lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered_from_map.cc index bba989920e..73036d5662 100644 --- a/lib/op-attrs/test/src/op-attrs/dim_ordered/ff_ordered_from_map.cc +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered_from_map.cc @@ -1,4 +1,4 @@ -#include "op-attrs/dim_ordered/ff_ordered_from_map.h" +#include "op-attrs/ff_ordered/ff_ordered_from_map.h" #include using namespace ::FlexFlow; diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/slice.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/slice.cc similarity index 79% rename from lib/op-attrs/test/src/op-attrs/dim_ordered/slice.cc rename to lib/op-attrs/test/src/op-attrs/ff_ordered/slice.cc index b2fddd058e..2f1dfecd65 100644 --- a/lib/op-attrs/test/src/op-attrs/dim_ordered/slice.cc +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/slice.cc @@ -1,4 +1,4 @@ -#include "op-attrs/dim_ordered/slice.h" +#include "op-attrs/ff_ordered/slice.h" #include using namespace ::FlexFlow; @@ -25,13 +25,6 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } - SUBCASE("std::nullopt_t, ff_dim_t") { - FFOrdered result = - slice(d, std::nullopt, ff_dim_t{nonnegative_int{3}}); - FFOrdered correct = FFOrdered{1, 2, 3}; - - CHECK(result == correct); - } 
SUBCASE("relative_ff_dim_t, relative_ff_dim_t") { FFOrdered result = slice(d, relative_ff_dim_t{1}, relative_ff_dim_t{-1}); @@ -45,12 +38,6 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } - SUBCASE("std::nullopt_t, relative_ff_dim_t") { - FFOrdered result = slice(d, std::nullopt, relative_ff_dim_t{-1}); - FFOrdered correct = FFOrdered{1, 2, 3}; - - CHECK(result == correct); - } SUBCASE("start index = stop index") { FFOrdered result = slice(d, relative_ff_dim_t{1}, relative_ff_dim_t{1}); @@ -86,10 +73,10 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK_THROWS(slice(d, relative_ff_dim_t{10}, std::nullopt)); } SUBCASE("stop index out of bounds (too low)") { - CHECK_THROWS(slice(d, std::nullopt, relative_ff_dim_t{-10})); + CHECK_THROWS(slice(d, relative_ff_dim_t{0}, relative_ff_dim_t{-10})); } SUBCASE("stop index out of bounds (too high)") { - CHECK_THROWS(slice(d, std::nullopt, relative_ff_dim_t{10})); + CHECK_THROWS(slice(d, relative_ff_dim_t{0}, relative_ff_dim_t{10})); } } } diff --git a/lib/op-attrs/test/src/op-attrs/ff_ordered/transform.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/transform.cc new file mode 100644 index 0000000000..4bf189ec77 --- /dev/null +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/transform.cc @@ -0,0 +1,35 @@ +#include "op-attrs/ff_ordered/transform.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("transform(FFOrdered, F)") { + SUBCASE("input is empty") { + FFOrdered input = {}; + + FFOrdered result = transform(input, [](std::string const &) -> int { + CHECK(false); + return 0; + }); + FFOrdered correct = {}; + + CHECK(result == correct); + } + + SUBCASE("input is not empty") { + FFOrdered input = {2, 1, 2, 5}; + + FFOrdered result = + transform(input, [](int x) { return fmt::to_string(x); }); + FFOrdered correct = FFOrdered{ + "2", + "1", + "2", + "5", + }; + + CHECK(result == correct); + } + } +} diff --git a/lib/op-attrs/test/src/op-attrs/ff_ordered/zip.cc 
b/lib/op-attrs/test/src/op-attrs/ff_ordered/zip.cc new file mode 100644 index 0000000000..19167cd0ff --- /dev/null +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/zip.cc @@ -0,0 +1,38 @@ +#include "op-attrs/ff_ordered/zip.h" +#include "test/utils/doctest/fmt/pair.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("zip(FFOrdered, FFOrdered)") { + FFOrdered lhs_input = {9, 9, 8, 9}; + FFOrdered rhs_input = {"m", "m", "k", "l", "m"}; + + SUBCASE("lhs is longer") { + FFOrdered> result = zip(lhs_input, rhs_input); + + FFOrdered> correct = { + {9, "m"}, + {9, "m"}, + {8, "k"}, + {9, "l"}, + }; + + CHECK(result == correct); + } + + SUBCASE("rhs is longer") { + FFOrdered> result = zip(rhs_input, lhs_input); + + FFOrdered> correct = { + {"m", 9}, + {"m", 9}, + {"k", 8}, + {"l", 9}, + }; + + CHECK(result == correct); + } + } +} diff --git a/lib/pcg/include/pcg/metric.enum.toml b/lib/pcg/include/pcg/metric.enum.toml new file mode 100644 index 0000000000..ebb2323203 --- /dev/null +++ b/lib/pcg/include/pcg/metric.enum.toml @@ -0,0 +1,26 @@ +namespace = "FlexFlow" +name = "Metric" +features = [ + "hash", + "json", + "rapidcheck", + "fmt", +] + +[[values]] +name = "ACCURACY" + +[[values]] +name = "CATEGORICAL_CROSSENTROPY" + +[[values]] +name = "SPARSE_CATEGORICAL_CROSSENTROPY" + +[[values]] +name = "MEAN_SQUARED_ERROR" + +[[values]] +name = "ROOT_MEAN_SQUARED_ERROR" + +[[values]] +name = "MEAN_ABSOLUTE_ERROR" diff --git a/lib/pcg/include/pcg/metric_attrs.h b/lib/pcg/include/pcg/metric_attrs.h new file mode 100644 index 0000000000..343c2154dd --- /dev/null +++ b/lib/pcg/include/pcg/metric_attrs.h @@ -0,0 +1,28 @@ +#ifndef _FF_METRICS_H_ +#define _FF_METRICS_H_ + +#include "op-attrs/ops/loss_functions/loss_functions.h" +#include "pcg/metric.dtg.h" +#include "utils/fmt.h" +#include + +namespace FlexFlow { + +class MetricsAttrs { +public: + MetricsAttrs() = delete; + MetricsAttrs(LossFunction, std::unordered_set const &); + +public: + LossFunction 
loss_type; + bool measure_accuracy; + bool measure_categorical_crossentropy; + bool measure_sparse_categorical_crossentropy; + bool measure_mean_squared_error; + bool measure_root_mean_squared_error; + bool measure_mean_absolute_error; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/pcg/src/pcg/metric_attrs.cc b/lib/pcg/src/pcg/metric_attrs.cc new file mode 100644 index 0000000000..9a93e75350 --- /dev/null +++ b/lib/pcg/src/pcg/metric_attrs.cc @@ -0,0 +1,38 @@ +#include "pcg/metric_attrs.h" + +namespace FlexFlow { +MetricsAttrs::MetricsAttrs(LossFunction _loss_type, + std::unordered_set const &metrics) + : loss_type(_loss_type), measure_accuracy(false), + measure_categorical_crossentropy(false), + measure_sparse_categorical_crossentropy(false), + measure_mean_squared_error(false), measure_root_mean_squared_error(false), + measure_mean_absolute_error(false) { + for (Metric const &m : metrics) { + switch (m) { + case Metric::ACCURACY: + measure_accuracy = true; + continue; + case Metric::CATEGORICAL_CROSSENTROPY: + measure_categorical_crossentropy = true; + continue; + case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: + measure_sparse_categorical_crossentropy = true; + continue; + case Metric::MEAN_SQUARED_ERROR: + measure_mean_squared_error = true; + continue; + case Metric::ROOT_MEAN_SQUARED_ERROR: + measure_root_mean_squared_error = true; + continue; + case Metric::MEAN_ABSOLUTE_ERROR: + measure_mean_absolute_error = true; + continue; + default: + throw mk_runtime_error(fmt::format( + "Initializing MetricsAttrs with unrecognized metrics type {}", m)); + } + } +} + +} // namespace FlexFlow diff --git a/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc b/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc index 2cf149f78a..940024c9b6 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc @@ -1,5 +1,5 @@ #include 
"pcg/parallel_computation_graph/generate_weight_transform.h" -#include "op-attrs/dim_ordered/enumerate.h" +#include "op-attrs/ff_ordered/enumerate.h" #include "op-attrs/parallel_tensor_shape.h" namespace FlexFlow { diff --git a/lib/runtime/src/metrics_functions.cc b/lib/runtime/src/metrics_functions.cc index feb6e704b2..33e15baed2 100644 --- a/lib/runtime/src/metrics_functions.cc +++ b/lib/runtime/src/metrics_functions.cc @@ -25,39 +25,6 @@ namespace FlexFlow { LegionRuntime::Logger::Category log_metrics("metrics"); -MetricsAttrs::MetricsAttrs(LossFunction _loss_type, - std::vector const &metrics) - : loss_type(_loss_type), measure_accuracy(false), - measure_categorical_crossentropy(false), - measure_sparse_categorical_crossentropy(false), - measure_mean_squared_error(false), measure_root_mean_squared_error(false), - measure_mean_absolute_error(false) { - for (Metric const &m : metrics) { - switch (m) { - case Metric::ACCURACY: - measure_accuracy = true; - continue; - case Metric::CATEGORICAL_CROSSENTROPY: - measure_categorical_crossentropy = true; - continue; - case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: - measure_sparse_categorical_crossentropy = true; - continue; - case Metric::MEAN_SQUARED_ERROR: - measure_mean_squared_error = true; - continue; - case Metric::ROOT_MEAN_SQUARED_ERROR: - measure_root_mean_squared_error = true; - continue; - case Metric::MEAN_ABSOLUTE_ERROR: - measure_mean_absolute_error = true; - continue; - default: - throw mk_runtime_error("Unrecogonized metrics type {}", m); - } - } -} - enum Slots { LOGIT, LABEL, diff --git a/lib/runtime/src/metrics_functions.h b/lib/runtime/src/metrics_functions.h index fbb0b633bf..73dc3bbc51 100644 --- a/lib/runtime/src/metrics_functions.h +++ b/lib/runtime/src/metrics_functions.h @@ -16,38 +16,13 @@ #ifndef _FF_METRICS_FUNCTIONS_H_ #define _FF_METRICS_FUNCTIONS_H_ +#include "kernels/metric.h" #include "kernels/perf_metrics.h" #include "legion.h" -#include "op-attrs/ops/loss_functions.h" #include 
"task_spec/task_invocation.h" -#include "utils/fmt.h" namespace FlexFlow { -enum class Metric { - ACCURACY, - CATEGORICAL_CROSSENTROPY, - SPARSE_CATEGORICAL_CROSSENTROPY, - MEAN_SQUARED_ERROR, - ROOT_MEAN_SQUARED_ERROR, - MEAN_ABSOLUTE_ERROR, -}; - -class MetricsAttrs { -public: - MetricsAttrs() = delete; - MetricsAttrs(LossFunction, std::vector const &); - -public: - LossFunction loss_type; - bool measure_accuracy; - bool measure_categorical_crossentropy; - bool measure_sparse_categorical_crossentropy; - bool measure_mean_squared_error; - bool measure_root_mean_squared_error; - bool measure_mean_absolute_error; -}; - TypedIndexTaskInvocation compute_metrics(MetricsAttrs const &, parallel_tensor_guid_t const &logit, @@ -79,40 +54,4 @@ VISITABLE_STRUCT(::FlexFlow::MetricsAttrs, measure_root_mean_squared_error, measure_mean_absolute_error); -namespace fmt { - -template <> -struct formatter<::FlexFlow::Metric> : formatter { - template - auto format(::FlexFlow::Metric m, FormatContext &ctx) const - -> decltype(ctx.out()) { - using namespace FlexFlow; - - string_view name = "unknown"; - switch (m) { - case Metric::ACCURACY: - name = "Accuracy"; - break; - case Metric::CATEGORICAL_CROSSENTROPY: - name = "CategoricalCrossEntropy"; - break; - case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: - name = "SparseCategoricalCrossEntropy"; - break; - case Metric::MEAN_SQUARED_ERROR: - name = "MeanSquaredError"; - break; - case Metric::ROOT_MEAN_SQUARED_ERROR: - name = "RootMeanSquaredError"; - break; - case Metric::MEAN_ABSOLUTE_ERROR: - name = "MeanAbsoluteError"; - break; - } - return formatter::format(name, ctx); - } -}; - -} // namespace fmt - #endif diff --git a/lib/runtime/src/ops/embedding.cc b/lib/runtime/src/ops/embedding.cc index 253fd3cb4f..83e7c15460 100644 --- a/lib/runtime/src/ops/embedding.cc +++ b/lib/runtime/src/ops/embedding.cc @@ -77,11 +77,11 @@ static std::optional return profile(backward_kernel, profiling, "[Embedding] backward_time = {:.2lf}ms\n", - input, 
output, + input, weight_grad, - input.data_type, output.data_type, + input.data_type, attrs.aggr, input.shape.get_dim(), output.shape.get_dim(), diff --git a/lib/utils/include/utils/containers/subvec.h b/lib/utils/include/utils/containers/slice.h similarity index 69% rename from lib/utils/include/utils/containers/subvec.h rename to lib/utils/include/utils/containers/slice.h index c89e9227de..a82fb383b5 100644 --- a/lib/utils/include/utils/containers/subvec.h +++ b/lib/utils/include/utils/containers/slice.h @@ -9,9 +9,9 @@ namespace FlexFlow { template -std::vector subvec(std::vector const &v, - std::optional const &maybe_start, - std::optional const &maybe_end) { +std::vector slice(std::vector const &v, + int const &maybe_start, + std::optional const &maybe_end) { auto begin_iter = v.cbegin(); auto end_iter = v.cend(); @@ -22,15 +22,13 @@ std::vector subvec(std::vector const &v, if (idx < 0) { new_idx = size + idx; } - if (new_idx < 0 || new_idx > size) { - throw mk_runtime_error("Index {} is out of bounds for array {}"); - } + + ASSERT(new_idx >= 0, "Index out of bounds"); + ASSERT(new_idx <= size, "Index out of bounds"); return new_idx; }; - if (maybe_start.has_value()) { - begin_iter += resolve_loc(maybe_start.value()); - } + begin_iter += resolve_loc(maybe_start); if (maybe_end.has_value()) { end_iter = v.cbegin() + resolve_loc(maybe_end.value()); diff --git a/lib/utils/include/utils/containers/zip_strict.h b/lib/utils/include/utils/containers/zip_strict.h index 64049042d4..5606fccff1 100644 --- a/lib/utils/include/utils/containers/zip_strict.h +++ b/lib/utils/include/utils/containers/zip_strict.h @@ -4,21 +4,17 @@ #include "utils/containers/zip.h" #include "utils/exception.h" #include "utils/fmt/vector.h" +#include namespace FlexFlow { template std::vector> zip_strict(std::vector const &lhs, std::vector const &rhs) { - if (lhs.size() != rhs.size()) { - throw mk_runtime_error( - fmt::format("zip_strict requires lhs and rhs to have the same length, " - "but 
received lhs={} (length {}), rhs={} (length {})", - lhs, - lhs.size(), - rhs, - rhs.size())); - } + ASSERT(lhs.size() == rhs.size(), + "zip_strict requires lhs and rhs to have the same length", + lhs, + rhs); return zip(lhs, rhs); } diff --git a/lib/utils/include/utils/exception.h b/lib/utils/include/utils/exception.h index 080cbb3611..f95eb8a38d 100644 --- a/lib/utils/include/utils/exception.h +++ b/lib/utils/include/utils/exception.h @@ -3,6 +3,7 @@ #include "utils/fmt.h" #include +#include #include #include diff --git a/lib/utils/include/utils/indent.h b/lib/utils/include/utils/indent.h new file mode 100644 index 0000000000..eccbd34cfc --- /dev/null +++ b/lib/utils/include/utils/indent.h @@ -0,0 +1,12 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_INDENT_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_INDENT_H + +#include + +namespace FlexFlow { + +std::string indent(std::string const &, int indent_size = 2); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/stack_vector/stack_vector.h b/lib/utils/include/utils/stack_vector/stack_vector.h index 5d4d6eaad3..64d005a10e 100644 --- a/lib/utils/include/utils/stack_vector/stack_vector.h +++ b/lib/utils/include/utils/stack_vector/stack_vector.h @@ -272,18 +272,6 @@ struct stack_vector { return !(*this == other); } - bool operator<(stack_vector const &other) const { - for (std::size_t i = 0; i < std::min(this->m_size, other.m_size); i++) { - if (this->at(i) < other.at(i)) { - return true; - } else if (this->at(i) > other.at(i)) { - return false; - } - } - - return (this->m_size < other.m_size); - } - std::size_t size() const { return this->m_size; } @@ -305,17 +293,16 @@ struct stack_vector { private: std::size_t m_size = 0; std::array contents; - - static_assert( - implies, is_equal_comparable>::value, - ""); - static_assert( - implies, is_neq_comparable>::value, - ""); - static_assert( - implies, is_lt_comparable>::value, ""); }; +template +auto operator<(stack_vector const &lhs, + stack_vector 
const &rhs) + -> std::enable_if_t, bool> { + return std::lexicographical_compare( + lhs.begin(), lhs.end(), rhs.begin(), rhs.end()); +} + template std::ostream &operator<<(std::ostream &s, stack_vector const &v) { return s << fmt::to_string(v); diff --git a/lib/utils/src/utils/containers/slice.cc b/lib/utils/src/utils/containers/slice.cc new file mode 100644 index 0000000000..f960c21881 --- /dev/null +++ b/lib/utils/src/utils/containers/slice.cc @@ -0,0 +1,3 @@ +#include "utils/containers/slice.h" + +namespace FlexFlow {} // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/subvec.cc b/lib/utils/src/utils/containers/subvec.cc deleted file mode 100644 index 93c7de31c5..0000000000 --- a/lib/utils/src/utils/containers/subvec.cc +++ /dev/null @@ -1 +0,0 @@ -#include "utils/containers/subvec.h" diff --git a/lib/utils/src/utils/full_binary_tree/binary_tree_path.cc b/lib/utils/src/utils/full_binary_tree/binary_tree_path.cc index 8445a2721a..8aed06ae01 100644 --- a/lib/utils/src/utils/full_binary_tree/binary_tree_path.cc +++ b/lib/utils/src/utils/full_binary_tree/binary_tree_path.cc @@ -1,5 +1,5 @@ #include "utils/full_binary_tree/binary_tree_path.h" -#include "utils/containers/subvec.h" +#include "utils/containers/slice.h" namespace FlexFlow { @@ -27,7 +27,7 @@ BinaryTreePathEntry binary_tree_path_get_top_level(BinaryTreePath const &p) { BinaryTreePath binary_tree_path_get_non_top_level(BinaryTreePath const &p) { return BinaryTreePath{ - subvec(p.entries, 1, std::nullopt), + slice(p.entries, 1, std::nullopt), }; } diff --git a/lib/utils/src/utils/graph/series_parallel/series_reduction.cc b/lib/utils/src/utils/graph/series_parallel/series_reduction.cc index 5b9b592444..459e61be71 100644 --- a/lib/utils/src/utils/graph/series_parallel/series_reduction.cc +++ b/lib/utils/src/utils/graph/series_parallel/series_reduction.cc @@ -3,7 +3,7 @@ #include "utils/containers/contains_key.h" #include "utils/containers/get_only.h" #include "utils/containers/require_same.h" 
-#include "utils/containers/subvec.h" +#include "utils/containers/slice.h" #include "utils/containers/unordered_set_of.h" #include "utils/containers/values.h" #include "utils/graph/digraph/algorithms/get_predecessors.h" @@ -103,7 +103,7 @@ MultiDiEdge Node last = g.get_multidiedge_dst(reduction.edges.back()); std::vector internal_nodes; - for (MultiDiEdge const &e : subvec(reduction.edges, std::nullopt, -1)) { + for (MultiDiEdge const &e : slice(reduction.edges, 0, -1)) { internal_nodes.push_back(g.get_multidiedge_dst(e)); } diff --git a/lib/utils/src/utils/indent.cc b/lib/utils/src/utils/indent.cc new file mode 100644 index 0000000000..2761ad1878 --- /dev/null +++ b/lib/utils/src/utils/indent.cc @@ -0,0 +1,17 @@ +#include "utils/indent.h" +#include "utils/containers/flatmap.h" + +namespace FlexFlow { + +std::string indent(std::string const &s, int indent_size) { + std::string indent_str(indent_size, ' '); + return indent_str + flatmap(s, [&](char c) -> std::string { + if (c == '\n') { + return "\n" + indent_str; + } else { + return std::string{c}; + }; + }); +} + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/stack_vector/stack_vector.cc b/lib/utils/src/utils/stack_vector/stack_vector.cc index d4fb849412..e2009d74d3 100644 --- a/lib/utils/src/utils/stack_vector/stack_vector.cc +++ b/lib/utils/src/utils/stack_vector/stack_vector.cc @@ -1,9 +1,9 @@ #include "utils/stack_vector/stack_vector.h" -#include "utils/archetypes/ordered_value_type.h" +#include "utils/archetypes/value_type.h" namespace FlexFlow { -using T = ordered_value_type<0>; +using T = value_type<0>; template struct stack_vector; template struct stack_vector; diff --git a/lib/utils/test/common/include/test/utils/doctest/check_kv.h b/lib/utils/test/common/include/test/utils/doctest/check_kv.h new file mode 100644 index 0000000000..6449b8ac87 --- /dev/null +++ b/lib/utils/test/common/include/test/utils/doctest/check_kv.h @@ -0,0 +1,12 @@ +#ifndef 
_FLEXFLOW_LIB_UTILS_TEST_COMMON_INCLUDE_TEST_UTILS_DOCTEST_CHECK_KV_H +#define _FLEXFLOW_LIB_UTILS_TEST_COMMON_INCLUDE_TEST_UTILS_DOCTEST_CHECK_KV_H + +#include + +namespace FlexFlow { + +std::string check_kv(std::string const &k, std::string const &v); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/test/common/src/main.cc b/lib/utils/test/common/src/main.cc index 9522fa7fdb..6df2d925b7 100644 --- a/lib/utils/test/common/src/main.cc +++ b/lib/utils/test/common/src/main.cc @@ -1,2 +1,15 @@ -#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN -#include "doctest/doctest.h" +#define DOCTEST_CONFIG_IMPLEMENT +#include + +#include +#include + +void libassert_throw_exception_handler(libassert::assertion_info const &info) { + throw std::runtime_error("Assertion failed:\n" + info.to_string()); +} + +int main(int argc, char **argv) { + libassert::set_failure_handler(libassert_throw_exception_handler); + + return doctest::Context(argc, argv).run(); +} diff --git a/lib/utils/test/common/src/test/utils/doctest/check_kv.cc b/lib/utils/test/common/src/test/utils/doctest/check_kv.cc new file mode 100644 index 0000000000..d3c1ee335e --- /dev/null +++ b/lib/utils/test/common/src/test/utils/doctest/check_kv.cc @@ -0,0 +1,17 @@ +#include "test/utils/doctest/check_kv.h" +#include "utils/indent.h" +#include + +namespace FlexFlow { + +std::string check_kv(std::string const &k, std::string const &v) { + std::ostringstream oss; + + oss << std::endl + << indent(k + "=", /*indent_size=*/4) << std::endl + << indent(v, /*indent_size=*/6); + + return oss.str(); +} + +} // namespace FlexFlow diff --git a/lib/utils/test/src/utils/containers/subvec.cc b/lib/utils/test/src/utils/containers/slice.cc similarity index 69% rename from lib/utils/test/src/utils/containers/subvec.cc rename to lib/utils/test/src/utils/containers/slice.cc index 610fc55b5a..4e4d840bfe 100644 --- a/lib/utils/test/src/utils/containers/subvec.cc +++ b/lib/utils/test/src/utils/containers/slice.cc @@ -1,4 +1,4 @@ -#include 
"utils/containers/subvec.h" +#include "utils/containers/slice.h" #include "test/utils/doctest/fmt/vector.h" #include #include @@ -6,57 +6,57 @@ using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("subvec") { + TEST_CASE("slice") { std::vector v = {1, 2, 3, 4, 5}; - SUBCASE("Basic subvector") { - auto result = subvec(v, 1, 4); + SUBCASE("Basic slice") { + auto result = slice(v, 1, 4); std::vector correct = {2, 3, 4}; CHECK(result == correct); } SUBCASE("From beginning to index") { - auto result = subvec(v, std::nullopt, 3); + auto result = slice(v, 0, 3); std::vector correct = {1, 2, 3}; CHECK(result == correct); } SUBCASE("From index to end") { - auto result = subvec(v, 2, std::nullopt); + auto result = slice(v, 2, std::nullopt); std::vector correct = {3, 4, 5}; CHECK(result == correct); } SUBCASE("All of the vector") { - auto result = subvec(v, std::nullopt, std::nullopt); + auto result = slice(v, 0, std::nullopt); std::vector correct = {1, 2, 3, 4, 5}; CHECK(result == correct); } SUBCASE("Start greater than end") { - auto result = subvec(v, 3, 1); + auto result = slice(v, 3, 1); std::vector correct = {}; CHECK(result == correct); } SUBCASE("Start equal to end") { - auto result = subvec(v, 3, 3); + auto result = slice(v, 3, 3); std::vector correct = {}; CHECK(result == correct); } SUBCASE("Negative indices") { - auto result = subvec(v, -3, -1); + auto result = slice(v, -3, -1); std::vector correct = {3, 4}; CHECK(result == correct); } SUBCASE("Upper index is out of bounds by 1") { - CHECK_THROWS(subvec(v, 2, 6)); + CHECK_THROWS(slice(v, 2, 6)); } SUBCASE("Lower index is out of bounds by 1") { - CHECK_THROWS(subvec(v, -6, 2)); + CHECK_THROWS(slice(v, -6, 2)); } } } diff --git a/lib/utils/test/src/utils/indent.cc b/lib/utils/test/src/utils/indent.cc new file mode 100644 index 0000000000..b137253fae --- /dev/null +++ b/lib/utils/test/src/utils/indent.cc @@ -0,0 +1,66 @@ +#include "utils/indent.h" +#include + +using namespace ::FlexFlow; + 
+TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("indent") { + SUBCASE("string is empty") { + std::string input = ""; + + std::string result = indent(input); + std::string correct = " "; + + CHECK(result == correct); + } + + SUBCASE("string is one line") { + std::string input = "hello world"; + std::string result = indent(input); + std::string correct = " hello world"; + + CHECK(result == correct); + } + + SUBCASE("string has multiple lines") { + std::string input = "\n" + "a b\n" + "c d\n" + "e f\n" + "g\n"; + + std::string result = indent(input); + std::string correct = " \n" + " a b\n" + " c d\n" + " e f\n" + " g\n" + " "; + + CHECK(result == correct); + } + + SUBCASE("leading and trailing whitespace is preserved") { + std::string input = " a b \n" + "c d e\n" + " "; + + std::string result = indent(input); + std::string correct = " a b \n" + " c d e\n" + " "; + + CHECK(result == correct); + } + + SUBCASE("allows custom indent size") { + std::string input = "hello\nworld"; + + std::string result = indent(input, /*indent_size=*/4); + std::string correct = " hello\n" + " world"; + + CHECK(result == correct); + } + } +} diff --git a/lib/utils/test/src/utils/stack_vector/stack_vector.cc b/lib/utils/test/src/utils/stack_vector/stack_vector.cc index c36de733b6..6eb2cc0d88 100644 --- a/lib/utils/test/src/utils/stack_vector/stack_vector.cc +++ b/lib/utils/test/src/utils/stack_vector/stack_vector.cc @@ -1,12 +1,97 @@ #include "utils/stack_vector/stack_vector.h" #include "test/utils/doctest/fmt/vector.h" #include "test/utils/rapidcheck.h" +#include "utils/archetypes/value_type.h" #include #include using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("operator<(stack_vector, stack_vector)") { + constexpr std::size_t MAXSIZE = 5; + + SUBCASE("T is ordered") { + SUBCASE("inputs are the same") { + std::vector input = {2, 1, 2, 3}; + + bool result = (input < input); + bool correct = false; + + CHECK(result == correct); + } + + SUBCASE("lhs is strict prefix of rhs") { + 
std::vector lhs = {2, 1, 2}; + std::vector rhs = {2, 1, 2, 3}; + + bool result = (lhs < rhs); + bool correct = true; + + CHECK(result == correct); + } + + SUBCASE("lhs is empty") { + std::vector lhs = {}; + std::vector rhs = {2, 1, 2, 3}; + + bool result = (lhs < rhs); + bool correct = true; + + CHECK(result == correct); + } + + SUBCASE("lhs has a smaller element first") { + std::vector lhs = {2, 1, 0, 3}; + std::vector rhs = {2, 1, 2}; + + bool result = (lhs < rhs); + bool correct = true; + + CHECK(result == correct); + } + + // from the definition of a strict total order, i.e., + // https://en.wikipedia.org/w/index.php?title=Total_order&oldid=1278541072#Strict_and_non-strict_total_orders + RC_SUBCASE("operator< is irreflexive", + [](stack_vector const &input) { + RC_ASSERT(!(input < input)); + }); + + RC_SUBCASE("operator< is asymmetric", + [](stack_vector const &lhs, + stack_vector const &rhs) { + RC_PRE(lhs != rhs); + + RC_ASSERT((lhs < rhs) == !(rhs < lhs)); + }); + + RC_SUBCASE("operator< is transitive", + [](stack_vector const &a, + stack_vector const &b, + stack_vector const &c) { + RC_PRE(a < b); + RC_PRE(b < c); + + RC_ASSERT(a < c); + }); + + RC_SUBCASE("operator< is connected", + [](stack_vector const &lhs, + stack_vector const &rhs) { + RC_PRE(lhs != rhs); + + RC_ASSERT((lhs < rhs) || (rhs < lhs)); + }); + } + + SUBCASE("T is not ordered") { + bool result = is_lt_comparable_v, MAXSIZE>>; + + CHECK_FALSE(result); + } + } + TEST_CASE_TEMPLATE( "stack_vector::push_back", T, int, double, char) { constexpr std::size_t MAXSIZE = 5;