Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
7617bd2
pr for debugging kernel driver issues
dylanllim May 6, 2024
9a79f6b
Commit flake files
reyna-abhyankar May 17, 2024
d2df4bc
current kernel tests
dylanllim May 30, 2024
3442e62
softmax, flat, transpose kernel tests
dylanllim May 31, 2024
eec114e
clang formatting kernel tests
dylanllim May 31, 2024
c82d3c2
reverse, split, full dropout kernels
dylanllim May 31, 2024
02099d5
rest of kernel-tests
dylanllim Jun 2, 2024
25d75c7
minor cleanup
dylanllim Jun 2, 2024
2d6d3fc
Restore .proj.toml
dylanllim Jun 2, 2024
ee3f80a
Delete misadded directory
dylanllim Jun 2, 2024
9ecc218
merge w/ repo-refactor
dylanllim Jun 7, 2024
6022388
merge fix
dylanllim Jun 7, 2024
2e9b4ca
more merge fixes
dylanllim Jun 7, 2024
bd8c8a9
resolved merge conflicts with repo-refactor
dylanllim Jun 13, 2024
cfff16d
code review changes
dylanllim Jun 13, 2024
f8075b4
allocator updates
dylanllim Jun 14, 2024
9e4bda2
allocation util updates
dylanllim Jun 16, 2024
e7dad32
test clean up and review fixes
dylanllim Jun 18, 2024
d0a3ea9
fixed forward backward pass consistencies, added filler tests for all…
dylanllim Jun 19, 2024
35071af
unnested test subcases and more review changes
dylanllim Jun 20, 2024
f92d046
added managed_stream and handle classes, other minor clean up
dylanllim Jun 23, 2024
25c38b7
fix accessor and corresponding shape clarity, other clean up
dylanllim Jun 25, 2024
66b0736
merge w/ repo-refactor
dylanllim Jun 25, 2024
3276252
merge error fixes
dylanllim Jun 25, 2024
f75b22e
managed handle and stream fixes, removed datatype dispatch from cuda_…
dylanllim Jun 25, 2024
8f36830
managed handle and stream updates
dylanllim Jun 27, 2024
dcd9f9b
Merge branch 'repo-refactor' into kernel-tests
reyna-abhyankar Jul 2, 2024
ca09037
fixed deallocator
dylanllim Jul 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .proj.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,9 @@ build_targets = [
# "substitutions",
# "compiler",
"substitution-generator",
"local-execution",
"local-execution",
]

test_targets = [
"utils-tests",
"op-attrs-tests",
Expand Down
2 changes: 1 addition & 1 deletion flake.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion flake.nix
Original file line number Diff line number Diff line change
Expand Up @@ -152,4 +152,4 @@
};
}
);
}
}
4 changes: 4 additions & 0 deletions lib/kernels/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ file(GLOB_RECURSE SRC
CONFIGURE_DEPENDS
LIST_DIRECTORIES False
src/*.cc
src/cuda/cuda_helper.cu
src/cuda/ops/*.cu
)

Expand All @@ -28,6 +29,7 @@ target_link_libraries(
cuda
cudnn
nccl
utils
)

define_ff_vars(${project_target})
Expand All @@ -37,3 +39,5 @@ set_target_properties(
PROPERTIES
CUDA_STANDARD 17
)

add_subdirectory(test)
3 changes: 3 additions & 0 deletions lib/kernels/include/kernels/accessor.h
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,9 @@ std::vector<real_type<DT> const *>
return out;
}

// Produces a GenericTensorAccessorR from the given GenericTensorAccessorW.
// NOTE(review): presumably the result refers to the same buffer as
// write_accessor rather than a copy — confirm against the definition.
GenericTensorAccessorR read_only_accessor_from_write_accessor(
GenericTensorAccessorW const &write_accessor);

} // namespace FlexFlow

namespace FlexFlow {
Expand Down
2 changes: 0 additions & 2 deletions lib/kernels/include/kernels/array_shape.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,6 @@ struct ArrayShape {

std::optional<std::size_t> at_maybe(std::size_t) const;

ArrayShape reversed_dim_order() const;

ArrayShape
sub_shape(std::optional<std::variant<ff_dim_t, legion_dim_t>> start,
std::optional<std::variant<ff_dim_t, legion_dim_t>> end) const;
Expand Down
1 change: 1 addition & 0 deletions lib/kernels/include/kernels/attention_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include "device.h"
#include "kernels/allocation.h"
#include "kernels/device.h"
#include "kernels/ff_handle.h"
#include "op-attrs/ops/attention.h"
#include <memory>
Expand Down
2 changes: 1 addition & 1 deletion lib/kernels/include/kernels/conv_2d_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle,
int padding_w,
int stride_h,
int stride_w,
GenericTensorAccessorR const &input,
GenericTensorAccessorW const &input,
GenericTensorAccessorW const &output,
float const *filter_ptr,
float *filter_grad_ptr);
Expand Down
8 changes: 6 additions & 2 deletions lib/kernels/include/kernels/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,12 @@
#include <iostream>
#include <sstream>

namespace FlexFlow {
cudaError_t get_legion_stream(cudaStream_t *stream);
} // namespace FlexFlow

#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
typedef cudaStream_t ffStream_t;
cudaError_t get_legion_stream(cudaStream_t *stream);
typedef cudnnTensorDescriptor_t ffTensorDescriptor_t;
typedef cudnnActivationDescriptor_t ffActivationDescriptor_t;
typedef cudnnPoolingDescriptor_t ffPoolingDescriptor_t;
Expand Down Expand Up @@ -96,7 +99,8 @@ using coord_t = long long;
do { \
std::stringstream _error; \
if (status != 0) { \
_error << "Cuda failure: " << status; \
_error << "CUDA failure: " << cudaGetErrorString(status) << " (" \
<< status << ")"; \
FatalError(_error.str()); \
} \
} while (0)
Expand Down
4 changes: 2 additions & 2 deletions lib/kernels/include/kernels/element_unary_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,14 @@ ElementUnaryPerDeviceState init_kernel(ArrayShape const &input_shape,
void forward_kernel(ffStream_t stream,
ElementUnaryPerDeviceState const &device_state,
ElementUnaryAttrs const &attrs,
PerDeviceFFHandle &handle,
PerDeviceFFHandle const &handle,
GenericTensorAccessorR const &input,
GenericTensorAccessorW const &output);

void backward_kernel(ffStream_t stream,
ElementUnaryPerDeviceState const &device_state,
ElementUnaryAttrs const &attrs,
PerDeviceFFHandle &handle,
PerDeviceFFHandle const &handle,
GenericTensorAccessorR const &input,
GenericTensorAccessorW const &input_grad,
GenericTensorAccessorR const &output,
Expand Down
4 changes: 2 additions & 2 deletions lib/kernels/include/kernels/layer_norm_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ namespace Kernels {
namespace LayerNorm {

// todo: this may have some problem.
LayerNormPerDeviceState init_kernel(PerDeviceFFHandle const &,
Allocator const &,
LayerNormPerDeviceState init_kernel(PerDeviceFFHandle const &handle,
Allocator &allocator,
bool elementwise_affine,
int64_t effective_batch_size,
int64_t effective_num_elements,
Expand Down
2 changes: 1 addition & 1 deletion lib/kernels/include/kernels/legion_dim.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

namespace FlexFlow {

legion_dim_t add_to_legion_dim(legion_dim_t, int);
legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value);

legion_dim_t legion_dim_from_ff_dim(ff_dim_t, int num_dimensions);

Expand Down
2 changes: 2 additions & 0 deletions lib/kernels/include/kernels/linear_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ namespace Linear {

LinearPerDeviceState init_kernel(PerDeviceFFHandle handle,
float *one_ptr,
std::optional<Activation> activation,
std::optional<RegularizerAttrs> regularizer,
bool use_bias,
DataType input_type,
Expand All @@ -57,6 +58,7 @@ void forward_kernel(ffStream_t stream,
int in_dim,
int out_dim,
int batch_size);

void backward_kernel(ffStream_t stream,
LinearPerDeviceState const &m,
void const *input_ptr,
Expand Down
22 changes: 22 additions & 0 deletions lib/kernels/include/kernels/local_cuda_allocator.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#ifndef _FLEXFLOW_KERNELS_LOCAL_CUDA_ALLOCATOR_H
#define _FLEXFLOW_KERNELS_LOCAL_CUDA_ALLOCATOR_H

#include "kernels/allocation.h"
#include <unordered_set>

namespace FlexFlow {

// IAllocator implementation that keeps a set of raw pointers.
// NOTE(review): presumably `ptrs` tracks the live allocations so that the
// destructor can release whatever was not explicitly deallocated — confirm
// against the implementation file.
struct LocalCudaAllocator : public IAllocator {
  LocalCudaAllocator() = default;

  // Neither copy- nor move-constructible.
  LocalCudaAllocator(LocalCudaAllocator const &) = delete;
  LocalCudaAllocator(LocalCudaAllocator &&) = delete;

  ~LocalCudaAllocator() override;

  // IAllocator overrides. The size_t argument is presumably a byte count —
  // TODO confirm against IAllocator's documentation.
  void *allocate(size_t) override;
  void deallocate(void *) override;

private:
  std::unordered_set<void *> ptrs;
};
CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalCudaAllocator);

// Factory returning an Allocator.
// NOTE(review): presumably backed by a LocalCudaAllocator — confirm in the
// implementation file.
Allocator create_local_cuda_memory_allocator();

} // namespace FlexFlow

#endif
28 changes: 28 additions & 0 deletions lib/kernels/include/kernels/managed_ff_stream.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#ifndef _FLEXFLOW_KERNELS_MANAGED_FF_STREAM_H
#define _FLEXFLOW_KERNELS_MANAGED_FF_STREAM_H

#include "device.h"

namespace FlexFlow {

// Move-only type holding a pointer to an ffStream_t.
// NOTE(review): presumably the constructor creates the stream and the
// destructor releases it — confirm against the implementation file.
struct ManagedFFStream {
public:
ManagedFFStream();

// Copy construction and copy assignment are disabled.
ManagedFFStream(ManagedFFStream const &) = delete;
ManagedFFStream &operator=(ManagedFFStream const &) = delete;

// Move construction and move assignment are declared noexcept.
ManagedFFStream(ManagedFFStream &&other) noexcept;
ManagedFFStream &operator=(ManagedFFStream &&other) noexcept;

~ManagedFFStream();

// Returns a const reference to an ffStream_t (presumably the one `stream`
// points to — confirm in the implementation).
ffStream_t const &raw_stream() const;

private:
// NOTE(review): ownership of the pointee is not visible from this header.
ffStream_t *stream;
};

} // namespace FlexFlow

#endif
30 changes: 30 additions & 0 deletions lib/kernels/include/kernels/managed_per_device_ff_handle.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#ifndef _FLEXFLOW_KERNELS_MANAGED_HANDLE_H
#define _FLEXFLOW_KERNELS_MANAGED_HANDLE_H

#include "kernels/ff_handle.h"

namespace FlexFlow {

// Move-only type holding a pointer to a PerDeviceFFHandle.
// NOTE(review): presumably the constructor sets up the handle and the
// destructor tears it down — confirm against the implementation file.
struct ManagedPerDeviceFFHandle {
public:
ManagedPerDeviceFFHandle();

// Copy construction and copy assignment are disabled.
ManagedPerDeviceFFHandle(ManagedPerDeviceFFHandle const &) = delete;
ManagedPerDeviceFFHandle &
operator=(ManagedPerDeviceFFHandle const &) = delete;

// Move construction and move assignment are declared noexcept.
ManagedPerDeviceFFHandle(ManagedPerDeviceFFHandle &&other) noexcept;
ManagedPerDeviceFFHandle &
operator=(ManagedPerDeviceFFHandle &&other) noexcept;

~ManagedPerDeviceFFHandle();

// Returns a const reference to a PerDeviceFFHandle (presumably the one
// `handle` points to — confirm in the implementation).
PerDeviceFFHandle const &raw_handle() const;

private:
// NOTE(review): ownership of the pointee is not visible from this header.
PerDeviceFFHandle *handle;
};

} // namespace FlexFlow

#endif
4 changes: 2 additions & 2 deletions lib/kernels/include/kernels/reduce_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ namespace Reduce {
ReducePerDeviceState init_kernel(PerDeviceFFHandle const &,
OperatorType const &,
size_t const &,
ArrayShape input_shape,
ArrayShape output_shape);
ArrayShape const &input_shape,
ArrayShape const &output_shape);

void forward_kernel(ffStream_t stream,
ReducePerDeviceState const &m,
Expand Down
4 changes: 2 additions & 2 deletions lib/kernels/include/kernels/replicate_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ void forward_kernel(ffStream_t stream,
GenericTensorAccessorW const &output);

void backward_kernel(ffStream_t stream,
GenericTensorAccessorR const &input,
GenericTensorAccessorW const &output,
GenericTensorAccessorW const &input,
GenericTensorAccessorR const &output,
size_t num_replicas);

} // namespace Replicate
Expand Down
8 changes: 7 additions & 1 deletion lib/kernels/include/kernels/softmax_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,18 @@ FF_VISITABLE_STRUCT(SoftmaxPerDeviceState, handle, inputTensor, dim);
namespace Kernels {
namespace Softmax {

SoftmaxPerDeviceState init_kernel(PerDeviceFFHandle const &, int);
SoftmaxPerDeviceState init_kernel(PerDeviceFFHandle const &handle,
int dim,
int input_n,
int input_c,
int input_h,
int input_w);

void forward_kernel(ffStream_t stream,
SoftmaxPerDeviceState const &m,
float const *input_ptr,
float *output_ptr);

void backward_kernel(ffStream_t stream,
float *input_grad_ptr,
float const *output_grad_ptr,
Expand Down
1 change: 1 addition & 0 deletions lib/kernels/include/kernels/transpose_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#define _FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H

#include "device.h"
#include "kernels/accessor.h"
#include <vector>

namespace FlexFlow {
Expand Down
46 changes: 46 additions & 0 deletions lib/kernels/src/accessor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,46 @@

namespace FlexFlow {

// Forwards to this accessor's get<DataType::INT32>() and hands back the
// resulting int32_t pointer.
// NOTE(review): presumably get<> checks the accessor's data_type — confirm in
// accessor.h.
int32_t *GenericTensorAccessorW::get_int32_ptr() const {
  int32_t *const typed_ptr = this->get<DataType::INT32>();
  return typed_ptr;
}

// Forwards to this accessor's get<DataType::INT64>() and hands back the
// resulting int64_t pointer.
// NOTE(review): presumably get<> checks the accessor's data_type — confirm in
// accessor.h.
int64_t *GenericTensorAccessorW::get_int64_ptr() const {
  int64_t *const typed_ptr = this->get<DataType::INT64>();
  return typed_ptr;
}

// Forwards to this accessor's get<DataType::FLOAT>() and hands back the
// resulting float pointer.
// NOTE(review): presumably get<> checks the accessor's data_type — confirm in
// accessor.h.
float *GenericTensorAccessorW::get_float_ptr() const {
  float *const typed_ptr = this->get<DataType::FLOAT>();
  return typed_ptr;
}

// Forwards to this accessor's get<DataType::DOUBLE>() and hands back the
// resulting double pointer.
// NOTE(review): presumably get<> checks the accessor's data_type — confirm in
// accessor.h.
double *GenericTensorAccessorW::get_double_ptr() const {
  double *const typed_ptr = this->get<DataType::DOUBLE>();
  return typed_ptr;
}

// Forwards to this accessor's get<DataType::HALF>() and hands back the
// resulting half pointer.
// NOTE(review): presumably get<> checks the accessor's data_type — confirm in
// accessor.h.
half *GenericTensorAccessorW::get_half_ptr() const {
  half *const typed_ptr = this->get<DataType::HALF>();
  return typed_ptr;
}

// Forwards to this accessor's get<DataType::INT32>() and hands back the
// resulting pointer-to-const int32_t.
// NOTE(review): presumably get<> checks the accessor's data_type — confirm in
// accessor.h.
int32_t const *GenericTensorAccessorR::get_int32_ptr() const {
  int32_t const *const typed_ptr = this->get<DataType::INT32>();
  return typed_ptr;
}

// Forwards to this accessor's get<DataType::INT64>() and hands back the
// resulting pointer-to-const int64_t.
// NOTE(review): presumably get<> checks the accessor's data_type — confirm in
// accessor.h.
int64_t const *GenericTensorAccessorR::get_int64_ptr() const {
  int64_t const *const typed_ptr = this->get<DataType::INT64>();
  return typed_ptr;
}

// Forwards to this accessor's get<DataType::FLOAT>() and hands back the
// resulting pointer-to-const float.
// NOTE(review): presumably get<> checks the accessor's data_type — confirm in
// accessor.h.
float const *GenericTensorAccessorR::get_float_ptr() const {
  float const *const typed_ptr = this->get<DataType::FLOAT>();
  return typed_ptr;
}

// Forwards to this accessor's get<DataType::DOUBLE>() and hands back the
// resulting pointer-to-const double.
// NOTE(review): presumably get<> checks the accessor's data_type — confirm in
// accessor.h.
double const *GenericTensorAccessorR::get_double_ptr() const {
  double const *const typed_ptr = this->get<DataType::DOUBLE>();
  return typed_ptr;
}

// Forwards to this accessor's get<DataType::HALF>() and hands back the
// resulting pointer-to-const half.
// The member template is called through `this->`, as in the other typed
// getters of GenericTensorAccessorR and GenericTensorAccessorW, so it is
// explicit that the member get<>() is meant rather than the free function
// get<>(accessor) used elsewhere in this file.
// NOTE(review): presumably get<> checks the accessor's data_type — confirm in
// accessor.h.
half const *GenericTensorAccessorR::get_half_ptr() const {
  half const *const typed_ptr = this->get<DataType::HALF>();
  return typed_ptr;
}

// Free-function counterpart of the typed getter: forwards the accessor to
// get<DataType::INT32>() and returns the resulting int32_t pointer.
int32_t *get_int32_ptr(GenericTensorAccessorW const &a) {
  int32_t *const typed_ptr = get<DataType::INT32>(a);
  return typed_ptr;
}
Expand Down Expand Up @@ -92,4 +132,10 @@ std::vector<half const *>
return get<DataType::HALF>(a);
}

// Builds a GenericTensorAccessorR from a writable accessor by carrying over
// its data_type and shape and wrapping its ptr as a `void const *`. The
// pointer value itself is passed through, not the pointed-to data.
GenericTensorAccessorR read_only_accessor_from_write_accessor(
    GenericTensorAccessorW const &writable) {
  GenericTensorAccessorR read_only{
      writable.data_type, writable.shape, req<void const *>(writable.ptr)};
  return read_only;
}

} // namespace FlexFlow
Loading