diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
index 5da439e94e0..184cd419554 100644
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -660,8 +660,9 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|--------------------|---------------------------------------|---------------------------------------------|
| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.
FP32 path - recommended for better perforemance than FP16 on quantized model|
| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. |
-| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
+| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
+| GGML_SYCL_GRAPH | ON *(default)* \|OFF *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
@@ -671,6 +672,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features based on Intel GPU type, to compare the performance increase |
+| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.
Recommended to use when --split-mode = layer |
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 9a4ee4992d0..740f9f69cf2 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -186,6 +186,7 @@ option(GGML_OPENMP "ggml: use OpenMP"
option(GGML_RPC "ggml: use RPC" OFF)
option(GGML_SYCL "ggml: use SYCL" OFF)
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
+option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON)
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
"ggml: sycl target device")
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt
index 3ad044432a2..271413ca414 100644
--- a/ggml/src/ggml-sycl/CMakeLists.txt
+++ b/ggml/src/ggml-sycl/CMakeLists.txt
@@ -66,6 +66,9 @@ if (WIN32)
find_package(MKL REQUIRED)
target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
else()
+ if (GGML_SYCL_GRAPH)
+ add_compile_definitions(GGML_SYCL_GRAPH)
+ endif()
if (GGML_SYCL_TARGET STREQUAL "INTEL")
target_link_libraries(ggml-sycl PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index a92988b7dbd..3b86d8b2a79 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -301,6 +301,7 @@ inline optimize_feature check_gpu_optimize_feature(syclex::architecture &arch) {
return opt;
}
+namespace sycl_ex = sycl::ext::oneapi::experimental;
struct ggml_backend_sycl_context {
int device;
std::string name;
@@ -392,6 +393,10 @@ struct ggml_backend_sycl_context {
return pool(device);
}
+#ifdef GGML_SYCL_GRAPH
+ std::unique_ptr> exec_graph = nullptr;
+#endif
+
ggml_sycl_pool & host_pool(int device) {
if (host_pools[device] == nullptr) {
host_pools[device] = new_pool_for_host(stream(device, 0), device);
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 6977b705e48..1bbfb4b2415 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -46,6 +46,7 @@
static bool g_sycl_loaded = false;
int g_ggml_sycl_debug = 0;
int g_ggml_sycl_disable_optimize = 0;
+int g_ggml_sycl_disable_graph = 0;
static ggml_sycl_device_info ggml_sycl_init() {
ggml_sycl_device_info info = {};
@@ -95,7 +96,7 @@ const ggml_sycl_device_info & ggml_sycl_info() {
return info;
}
-void print_device_detail(int id, sycl::device &device, std::string device_type) {
+static void print_device_detail(int id, sycl::device &device, std::string device_type) {
dpct::device_info prop;
SYCL_CHECK(CHECK_TRY_ERROR(
@@ -118,7 +119,7 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
global_mem_size, device.get_info().c_str());
}
-void print_device_opt_feature(int device_count) {
+static void print_device_opt_feature(int device_count) {
GGML_LOG_INFO("SYCL Optimization Feature:\n");
GGML_LOG_INFO(
"|ID| Device Type|Reorder|\n");
@@ -191,10 +192,12 @@ static void ggml_check_sycl() try {
if (!initialized) {
g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
+ g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
GGML_LOG_INFO("Running with Environment Variables:\n");
GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
GGML_LOG_INFO(" GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize);
+ GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: %d\n", g_ggml_sycl_disable_graph);
GGML_LOG_INFO("Build with Macros:\n");
#if defined(GGML_SYCL_FORCE_MMQ)
GGML_LOG_INFO(" GGML_SYCL_FORCE_MMQ: yes\n");
@@ -400,7 +403,7 @@ catch (sycl::exception const &exc) {
std::exit(1);
}
-void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
+static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
const void *ptr_src, size_t size) {
char *host_buf = (char *)malloc(size);
q_src.memcpy(host_buf, (const char *)ptr_src, size).wait();
@@ -604,7 +607,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
return &ggml_backend_sycl_buffer_types[device];
}
-ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_context * ctx) {
+static ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_context * ctx) {
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
int device = ctx->device;
@@ -1666,7 +1669,7 @@ static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
stream->parallel_for(
sycl::nd_range<3>(num_blocks * block_size, block_size),
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
quantize_q8_1(x, vy, kx, kx_padded, item_ct1);
});
}
@@ -1687,7 +1690,7 @@ static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y,
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_p021_f16_f32(vx, y, dst, ncols_x, nrows_x, nchannels_x,
nchannels_y, item_ct1);
});
@@ -1707,7 +1710,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl(
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x,
row_stride_x, channel_stride_x,
nchannels_y / nchannels_x, item_ct1);
@@ -1748,7 +1751,7 @@ static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
const sycl::range<3> block_nums(1, nrows, 1);
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+ [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
k_sum_rows_f32(x, dst, ncols, item_ct1);
});
}
@@ -2898,7 +2901,7 @@ inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
return false;
}
-bool ggml_sycl_supports_dmmv(enum ggml_type type) {
+static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
switch (type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
@@ -3271,7 +3274,7 @@ static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
}
-void ggml_sycl_set_main_device(const int main_device) try {
+static void ggml_sycl_set_main_device(const int main_device) try {
if (dpct::get_current_device_id() == static_cast (main_device)) {
return;
}
@@ -3292,7 +3295,7 @@ catch (sycl::exception const &exc) {
std::exit(1);
}
-bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tensor * dst) {
+static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tensor * dst) {
if (!g_sycl_loaded) return false;
if (dst->src[0] != nullptr && ggml_backend_buffer_is_sycl_split(dst->src[0]->buffer)) {
@@ -3610,7 +3613,7 @@ catch (sycl::exception const &exc) {
std::exit(1);
}
-void reorder_qw(char *data_device, const int ncols, const int nrows,
+static void reorder_qw(char *data_device, const int ncols, const int nrows,
size_t size, size_t offset, dpct::queue_ptr stream) {
auto tmp_buf = sycl::malloc_shared(size, *stream);
SYCL_CHECK(
@@ -3624,7 +3627,7 @@ void reorder_qw(char *data_device, const int ncols, const int nrows,
stream->parallel_for(
size / sizeof(block_q4_0),
- [=](auto i) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+ [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
const block_q4_0* x = (const block_q4_0*)tmp_buf;
const int ib = i;
@@ -3638,7 +3641,7 @@ void reorder_qw(char *data_device, const int ncols, const int nrows,
sycl::free(tmp_buf, *stream);
}
-void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
+static void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
char*data_device = (char*)src0->data;
size_t ncols = src0->ne[0];
size_t nrows = src0->ne[1];
@@ -3647,7 +3650,7 @@ void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
reorder_qw(data_device, ncols, nrows, size, 0, stream);
}
-void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
+static void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
ggml_tensor *src0 = dst->src[0];
ggml_tensor *src1 = dst->src[1];
@@ -3660,7 +3663,7 @@ void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
}
}
-void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) {
+static void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) {
dpct::queue_ptr stream = ctx->stream();
if (ctx->optimized_graph) {
return;
@@ -3671,10 +3674,9 @@ void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx)
if (ctx->opt_feature.reorder) opt_for_reorder(cgraph->nodes[i], stream);
}
}
-static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
- ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
- ggml_sycl_set_main_device(sycl_ctx->device);
+static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * sycl_ctx, ggml_cgraph * cgraph) {
+ ggml_sycl_set_main_device(sycl_ctx->device);
if (!g_ggml_sycl_disable_optimize) optimize_graph_once(cgraph, sycl_ctx);
for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -3696,7 +3698,46 @@ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_
}
GGML_ASSERT(ok);
}
+}
+
+static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+ auto * sycl_ctx = static_cast(backend->context);
+
+#ifdef GGML_SYCL_GRAPH
+ if (!g_ggml_sycl_disable_graph) {
+ if (!sycl_ctx->exec_graph && !dpct::get_device(sycl_ctx->device).has(sycl::aspect::ext_oneapi_graph)) {
+ GGML_SYCL_DEBUG("[SYCL-GRAPH] can not use graphs on device:%d\n", sycl_ctx->device);
+ ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);
+ return GGML_STATUS_SUCCESS;
+ }
+
+ sycl_ex::command_graph model_sycl_graph(*(sycl_ctx->stream()));
+ model_sycl_graph.begin_recording(*(sycl_ctx->stream()));
+ ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);
+ model_sycl_graph.end_recording();
+ if (!sycl_ctx->exec_graph) {
+ auto exec_graph = model_sycl_graph.finalize({sycl_ex::property::graph::updatable{}});
+ sycl_ctx->exec_graph = std::make_unique<
+ sycl_ex::command_graph>(exec_graph);
+ } else {
+ try {
+ sycl_ctx->exec_graph->update(model_sycl_graph);
+ GGML_SYCL_DEBUG("[SYCL-GRAPH] update success\n");
+ } catch (sycl::exception const & e) {
+ GGML_SYCL_DEBUG("[SYCL-GRAPH] Exception when updating graph, %s\n", e.what());
+ auto exec_graph = model_sycl_graph.finalize({sycl_ex::property::graph::updatable{}});
+ sycl_ctx->exec_graph = std::make_unique<
+ sycl_ex::command_graph>(exec_graph);
+ }
+ }
+
+ sycl_ctx->stream()->ext_oneapi_graph(*(sycl_ctx->exec_graph));
+ } else
+#endif
+ {
+ ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);
+ }
return GGML_STATUS_SUCCESS;
}
@@ -3851,7 +3892,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
return true;
}
return false;
- } break;
+ }
case GGML_OP_UNARY:
switch (ggml_get_unary_op(op)) {
case GGML_UNARY_OP_NEG:
@@ -3869,7 +3910,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
default:
return false;
}
- break;
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
{
@@ -3900,7 +3940,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
return false;
}
return true;
- } break;
+ }
case GGML_OP_OUT_PROD:
return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
case GGML_OP_GET_ROWS:
@@ -3917,7 +3957,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
default:
return false;
}
- } break;
+ }
case GGML_OP_CPY:
{
ggml_type src0_type = op->src[0]->type;
@@ -3968,12 +4008,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
return true;
}
return false;
- } break;
+ }
case GGML_OP_CONCAT:
{
ggml_type src0_type = op->src[0]->type;
return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
- } break;
+ }
case GGML_OP_DUP:
case GGML_OP_ARGMAX:
case GGML_OP_NONE: