From 2896987bc2efcd0668b4bf7849944d843ff5473e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20=C5=9Alusarczyk?= Date: Tue, 4 Mar 2025 17:58:14 +0100 Subject: [PATCH 1/7] alberto changes --- ggml/CMakeLists.txt | 1 + ggml/src/ggml-sycl/CMakeLists.txt | 3 +++ ggml/src/ggml-sycl/common.hpp | 5 +++++ ggml/src/ggml-sycl/ggml-sycl.cpp | 27 +++++++++++++++++++++++++++ 4 files changed, 36 insertions(+) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 9a4ee4992d0..c33f229660b 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -186,6 +186,7 @@ option(GGML_OPENMP "ggml: use OpenMP" option(GGML_RPC "ggml: use RPC" OFF) option(GGML_SYCL "ggml: use SYCL" OFF) option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF) +option(GGML_USE_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" OFF) set (GGML_SYCL_TARGET "INTEL" CACHE STRING "ggml: sycl target device") set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index 3ad044432a2..9f59dffe343 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -72,6 +72,9 @@ else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") add_compile_definitions(GGML_SYCL_NVIDIA) target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl_blas_cublas) + if (GGML_USE_SYCL_GRAPH) + add_compile_definitions(GGML_USE_SYCL_GRAPH) + endif() elseif (GGML_SYCL_TARGET STREQUAL "AMD") if (NOT GGML_SYCL_DEVICE_ARCH) message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.") diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index a92988b7dbd..1981c1e0f99 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -301,6 +301,7 @@ inline optimize_feature check_gpu_optimize_feature(syclex::architecture &arch) { return opt; } +namespace sycl_ex = sycl::ext::oneapi::experimental; struct ggml_backend_sycl_context { int device; std::string name; @@ -392,6 +393,10 @@ struct ggml_backend_sycl_context { return pool(device); } +#ifdef GGML_USE_SYCL_GRAPH + std::unique_ptr> exec_graph = nullptr; +#endif + ggml_sycl_pool & host_pool(int device) { if (host_pools[device] == nullptr) { host_pools[device] = new_pool_for_host(stream(device, 0), device); diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 6977b705e48..0c0a5063165 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -3676,6 +3676,12 @@ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_ ggml_sycl_set_main_device(sycl_ctx->device); if (!g_ggml_sycl_disable_optimize) optimize_graph_once(cgraph, sycl_ctx); +#ifdef GGML_USE_SYCL_GRAPH + namespace sycl_ex = sycl::ext::oneapi::experimental; + sycl_ex::command_graph model_sycl_graph(*(sycl_ctx->stream())); + + model_sycl_graph.begin_recording(*(sycl_ctx->stream())); +#endif for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -3697,6 +3703,27 @@ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_ GGML_ASSERT(ok); } +#ifdef GGML_USE_SYCL_GRAPH + model_sycl_graph.end_recording(); + + if (!sycl_ctx->exec_graph) { + auto exec_graph = model_sycl_graph.finalize({sycl_ex::property::graph::updatable{}}); + sycl_ctx->exec_graph = std::make_unique< + sycl_ex::command_graph>(exec_graph); + } else { + try { + sycl_ctx->exec_graph->update(model_sycl_graph); + } catch (sycl::exception e) { + GGML_SYCL_DEBUG("[SYCL-GRAPH] Exception when updating graph.\n"); + auto exec_graph = model_sycl_graph.finalize({sycl_ex::property::graph::updatable{}}); + sycl_ctx->exec_graph = std::make_unique< + sycl_ex::command_graph>(exec_graph); + } + } + + sycl_ctx->stream()->ext_oneapi_graph(*(sycl_ctx->exec_graph)); +#endif + return GGML_STATUS_SUCCESS; } From 4efab98a658ea4a9040d4aafabfe47c5dd30123f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20=C5=9Alusarczyk?= Date: Thu, 13 Mar 2025 15:05:46 +0100 Subject: [PATCH 2/7] enable sycl graphs by env variable --- ggml/src/ggml-sycl/CMakeLists.txt | 6 +-- ggml/src/ggml-sycl/ggml-sycl.cpp | 65 +++++++++++++++++++------------ 2 files changed, 43 insertions(+), 28 deletions(-) diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index 9f59dffe343..b1dab13b250 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -66,15 +66,15 @@ if (WIN32) find_package(MKL REQUIRED) target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL) else() + if (GGML_USE_SYCL_GRAPH) + add_compile_definitions(GGML_USE_SYCL_GRAPH) + endif() if (GGML_SYCL_TARGET STREQUAL "INTEL") target_link_libraries(ggml-sycl PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") add_compile_definitions(GGML_SYCL_NVIDIA) target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl_blas_cublas) - if (GGML_USE_SYCL_GRAPH) - add_compile_definitions(GGML_USE_SYCL_GRAPH) - endif() elseif (GGML_SYCL_TARGET STREQUAL "AMD") if (NOT GGML_SYCL_DEVICE_ARCH) message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.") diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 0c0a5063165..8c46c7f0566 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -46,6 +46,7 @@ static bool g_sycl_loaded = false; int g_ggml_sycl_debug = 0; int g_ggml_sycl_disable_optimize = 0; +int g_ggml_sycl_graphs = 0; static ggml_sycl_device_info ggml_sycl_init() { ggml_sycl_device_info info = {}; @@ -191,10 +192,12 @@ static void ggml_check_sycl() try { if (!initialized) { g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0); g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 0); + g_ggml_sycl_graphs = get_sycl_env("GGML_SYCL_GRAPHS", 0); GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n"); GGML_LOG_INFO("Running with Environment Variables:\n"); GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug); GGML_LOG_INFO(" GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize); + GGML_LOG_INFO(" GGML_SYCL_GRAPHS: %d\n", g_ggml_sycl_graphs); GGML_LOG_INFO("Build with Macros:\n"); #if defined(GGML_SYCL_FORCE_MMQ) GGML_LOG_INFO(" GGML_SYCL_FORCE_MMQ: yes\n"); @@ -3671,17 +3674,10 @@ void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) if (ctx->opt_feature.reorder) opt_for_reorder(cgraph->nodes[i], stream); } } -static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; - ggml_sycl_set_main_device(sycl_ctx->device); +static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * sycl_ctx, ggml_cgraph * cgraph) { + ggml_sycl_set_main_device(sycl_ctx->device); if (!g_ggml_sycl_disable_optimize) optimize_graph_once(cgraph, sycl_ctx); -#ifdef GGML_USE_SYCL_GRAPH - namespace sycl_ex = sycl::ext::oneapi::experimental; - sycl_ex::command_graph model_sycl_graph(*(sycl_ctx->stream())); - - model_sycl_graph.begin_recording(*(sycl_ctx->stream())); -#endif for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -3702,28 +3698,47 @@ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_ } GGML_ASSERT(ok); } +} + +static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + auto * sycl_ctx = static_cast(backend->context); #ifdef GGML_USE_SYCL_GRAPH - model_sycl_graph.end_recording(); + if (g_ggml_sycl_graphs) { + if (!sycl_ctx->exec_graph && !dpct::get_device(sycl_ctx->device).has(sycl::aspect::ext_oneapi_graph)) { + GGML_SYCL_DEBUG("[SYCL-GRAPH] can not use graphs on device:%d\n", sycl_ctx->device); + ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph); + return GGML_STATUS_SUCCESS; + } - if (!sycl_ctx->exec_graph) { - auto exec_graph = model_sycl_graph.finalize({sycl_ex::property::graph::updatable{}}); - sycl_ctx->exec_graph = std::make_unique< - sycl_ex::command_graph>(exec_graph); - } else { - try { - sycl_ctx->exec_graph->update(model_sycl_graph); - } catch (sycl::exception e) { - GGML_SYCL_DEBUG("[SYCL-GRAPH] Exception when updating graph.\n"); - auto exec_graph = model_sycl_graph.finalize({sycl_ex::property::graph::updatable{}}); - sycl_ctx->exec_graph = std::make_unique< - sycl_ex::command_graph>(exec_graph); + namespace sycl_ex = sycl::ext::oneapi::experimental; + sycl_ex::command_graph model_sycl_graph(*(sycl_ctx->stream())); + model_sycl_graph.begin_recording(*(sycl_ctx->stream())); + ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph); + model_sycl_graph.end_recording(); + + if (!sycl_ctx->exec_graph) { + auto exec_graph = model_sycl_graph.finalize({sycl_ex::property::graph::updatable{}}); + sycl_ctx->exec_graph = std::make_unique< + sycl_ex::command_graph>(exec_graph); + } else { + try { + sycl_ctx->exec_graph->update(model_sycl_graph); + GGML_SYCL_DEBUG("[SYCL-GRAPH] update success\n"); + } catch (sycl::exception const & e) { + GGML_SYCL_DEBUG("[SYCL-GRAPH] Exception when updating graph, %s\n", e.what()); + auto exec_graph = model_sycl_graph.finalize({sycl_ex::property::graph::updatable{}}); + sycl_ctx->exec_graph = std::make_unique< + sycl_ex::command_graph>(exec_graph); + } } - } - sycl_ctx->stream()->ext_oneapi_graph(*(sycl_ctx->exec_graph)); + sycl_ctx->stream()->ext_oneapi_graph(*(sycl_ctx->exec_graph)); + } else #endif - + { + ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph); + } return GGML_STATUS_SUCCESS; } From d02a0d2872f0a66d2d63584fbfff9b8d935951e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20=C5=9Alusarczyk?= Date: Thu, 13 Mar 2025 15:06:17 +0100 Subject: [PATCH 3/7] fixed compilation warnings in ggml-sycl.cpp --- ggml/src/ggml-sycl/ggml-sycl.cpp | 43 ++++++++++++++++---------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 8c46c7f0566..d2dddfaf043 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -96,7 +96,7 @@ const ggml_sycl_device_info & ggml_sycl_info() { return info; } -void print_device_detail(int id, sycl::device &device, std::string device_type) { +static void print_device_detail(int id, sycl::device &device, std::string device_type) { dpct::device_info prop; SYCL_CHECK(CHECK_TRY_ERROR( @@ -119,7 +119,7 @@ void print_device_detail(int id, sycl::device &device, std::string device_type) global_mem_size, device.get_info().c_str()); } -void print_device_opt_feature(int device_count) { +static void print_device_opt_feature(int device_count) { GGML_LOG_INFO("SYCL Optimization Feature:\n"); GGML_LOG_INFO( "|ID| Device Type|Reorder|\n"); @@ -403,7 +403,7 @@ catch (sycl::exception const &exc) { std::exit(1); } -void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst, +static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst, const void *ptr_src, size_t size) { char *host_buf = (char *)malloc(size); q_src.memcpy(host_buf, (const char *)ptr_src, size).wait(); @@ -607,7 +607,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) { return &ggml_backend_sycl_buffer_types[device]; } -ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_context * ctx) { +static ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_context * ctx) { GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n"); int device = ctx->device; @@ -1669,7 +1669,7 @@ static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx, stream->parallel_for( sycl::nd_range<3>(num_blocks * block_size, block_size), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { quantize_q8_1(x, vy, kx, kx_padded, item_ct1); }); } @@ -1690,7 +1690,7 @@ static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y, stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_p021_f16_f32(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y, item_ct1); }); @@ -1710,7 +1710,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl( stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y / nchannels_x, item_ct1); @@ -1751,7 +1751,7 @@ static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols, const sycl::range<3> block_nums(1, nrows, 1); stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { k_sum_rows_f32(x, dst, ncols, item_ct1); }); } @@ -2901,7 +2901,7 @@ inline bool ggml_sycl_supports_mmq(enum ggml_type type) { return false; } -bool ggml_sycl_supports_dmmv(enum ggml_type type) { +static bool ggml_sycl_supports_dmmv(enum ggml_type type) { switch (type) { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: @@ -3274,7 +3274,7 @@ static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) } -void ggml_sycl_set_main_device(const int main_device) try { +static void ggml_sycl_set_main_device(const int main_device) try { if (dpct::get_current_device_id() == static_cast (main_device)) { return; } @@ -3295,7 +3295,7 @@ catch (sycl::exception const &exc) { std::exit(1); } -bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tensor * dst) { +static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tensor * dst) { if (!g_sycl_loaded) return false; if (dst->src[0] != nullptr && ggml_backend_buffer_is_sycl_split(dst->src[0]->buffer)) { @@ -3613,7 +3613,7 @@ catch (sycl::exception const &exc) { std::exit(1); } -void reorder_qw(char *data_device, const int ncols, const int nrows, +static void reorder_qw(char *data_device, const int ncols, const int nrows, size_t size, size_t offset, dpct::queue_ptr stream) { auto tmp_buf = sycl::malloc_shared(size, *stream); SYCL_CHECK( @@ -3627,7 +3627,7 @@ void reorder_qw(char *data_device, const int ncols, const int nrows, stream->parallel_for( size / sizeof(block_q4_0), - [=](auto i) [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { const block_q4_0* x = (const block_q4_0*)tmp_buf; const int ib = i; @@ -3641,7 +3641,7 @@ void reorder_qw(char *data_device, const int ncols, const int nrows, sycl::free(tmp_buf, *stream); } -void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) { +static void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) { char*data_device = (char*)src0->data; size_t ncols = src0->ne[0]; size_t nrows = src0->ne[1]; @@ -3650,7 +3650,7 @@ void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) { reorder_qw(data_device, ncols, nrows, size, 0, stream); } -void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) { +static void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) { ggml_tensor *src0 = dst->src[0]; ggml_tensor *src1 = dst->src[1]; @@ -3663,7 +3663,7 @@ void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) { } } -void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) { +static void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) { dpct::queue_ptr stream = ctx->stream(); if (ctx->optimized_graph) { return; @@ -3893,7 +3893,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g return true; } return false; - } break; + } case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { case GGML_UNARY_OP_NEG: @@ -3911,7 +3911,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g default: return false; } - break; case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: { @@ -3942,7 +3941,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g return false; } return true; - } break; + } case GGML_OP_OUT_PROD: return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1; case GGML_OP_GET_ROWS: @@ -3959,7 +3958,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g default: return false; } - } break; + } case GGML_OP_CPY: { ggml_type src0_type = op->src[0]->type; @@ -4010,12 +4009,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g return true; } return false; - } break; + } case GGML_OP_CONCAT: { ggml_type src0_type = op->src[0]->type; return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16; - } break; + } case GGML_OP_DUP: case GGML_OP_ARGMAX: case GGML_OP_NONE: From 89ac171a8355016c62b2394d1df90c5adfe83803 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20=C5=9Alusarczyk?= Date: Fri, 14 Mar 2025 10:50:27 +0100 Subject: [PATCH 4/7] renamed graph variables --- docs/backend/SYCL.md | 2 ++ ggml/CMakeLists.txt | 2 +- ggml/src/ggml-sycl/CMakeLists.txt | 4 ++-- ggml/src/ggml-sycl/common.hpp | 2 +- ggml/src/ggml-sycl/ggml-sycl.cpp | 11 +++++------ 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 5da439e94e0..1497d6ed1a1 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -662,6 +662,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512 | GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. | | GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. | | GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. | +| GGML_SYCL_GRAPH | OFF *(default) \|ON *(Optional)* | Enable build with (SYCL Graph extension)[https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc]. | | CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. | | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. | @@ -671,6 +672,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512 |-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------| | GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG | | GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features based on Intel GPU type, to compare the performance increase | +| GGML_SYCL_ENABLE_GRAPH | 0 (default) or 1 | Enable running computations through SYCL Graphs feature. Requires also the code be be compiled with GGML_SYCL_GRAPH=ON. | | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.
Recommended to use when --split-mode = layer | diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index c33f229660b..09b6f630c52 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -186,7 +186,7 @@ option(GGML_OPENMP "ggml: use OpenMP" option(GGML_RPC "ggml: use RPC" OFF) option(GGML_SYCL "ggml: use SYCL" OFF) option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF) -option(GGML_USE_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" OFF) +option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" OFF) set (GGML_SYCL_TARGET "INTEL" CACHE STRING "ggml: sycl target device") set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index b1dab13b250..271413ca414 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -66,8 +66,8 @@ if (WIN32) find_package(MKL REQUIRED) target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL) else() - if (GGML_USE_SYCL_GRAPH) - add_compile_definitions(GGML_USE_SYCL_GRAPH) + if (GGML_SYCL_GRAPH) + add_compile_definitions(GGML_SYCL_GRAPH) endif() if (GGML_SYCL_TARGET STREQUAL "INTEL") target_link_libraries(ggml-sycl PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 1981c1e0f99..3b86d8b2a79 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -393,7 +393,7 @@ struct ggml_backend_sycl_context { return pool(device); } -#ifdef GGML_USE_SYCL_GRAPH +#ifdef GGML_SYCL_GRAPH std::unique_ptr> exec_graph = nullptr; #endif diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index d2dddfaf043..92b298193a4 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -46,7 +46,7 @@ static bool g_sycl_loaded = false; int g_ggml_sycl_debug = 0; int g_ggml_sycl_disable_optimize = 0; -int g_ggml_sycl_graphs = 0; +int g_ggml_sycl_enable_graph = 0; static ggml_sycl_device_info ggml_sycl_init() { ggml_sycl_device_info info = {}; @@ -192,12 +192,12 @@ static void ggml_check_sycl() try { if (!initialized) { g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0); g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 0); - g_ggml_sycl_graphs = get_sycl_env("GGML_SYCL_GRAPHS", 0); + g_ggml_sycl_enable_graph = get_sycl_env("GGML_SYCL_ENABLE_GRAPH", 0); GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n"); GGML_LOG_INFO("Running with Environment Variables:\n"); GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug); GGML_LOG_INFO(" GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize); - GGML_LOG_INFO(" GGML_SYCL_GRAPHS: %d\n", g_ggml_sycl_graphs); + GGML_LOG_INFO(" GGML_SYCL_ENABLE_GRAPH: %d\n", g_ggml_sycl_enable_graph); GGML_LOG_INFO("Build with Macros:\n"); #if defined(GGML_SYCL_FORCE_MMQ) GGML_LOG_INFO(" GGML_SYCL_FORCE_MMQ: yes\n"); @@ -3703,15 +3703,14 @@ static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * syc static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { auto * sycl_ctx = static_cast(backend->context); -#ifdef GGML_USE_SYCL_GRAPH - if (g_ggml_sycl_graphs) { +#ifdef GGML_SYCL_GRAPH + if (g_ggml_sycl_enable_graph) { if (!sycl_ctx->exec_graph && !dpct::get_device(sycl_ctx->device).has(sycl::aspect::ext_oneapi_graph)) { GGML_SYCL_DEBUG("[SYCL-GRAPH] can not use graphs on device:%d\n", sycl_ctx->device); ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph); return GGML_STATUS_SUCCESS; } - namespace sycl_ex = sycl::ext::oneapi::experimental; sycl_ex::command_graph model_sycl_graph(*(sycl_ctx->stream())); model_sycl_graph.begin_recording(*(sycl_ctx->stream())); ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph); From d75f29acdb92eb20435a9050002ed4d91f740734 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20=C5=9Alusarczyk?= <112692748+lslusarczyk@users.noreply.github.com> Date: Fri, 14 Mar 2025 12:38:54 +0100 Subject: [PATCH 5/7] fix markdown in docs/backend/SYCL.md Co-authored-by: Romain Biessy --- docs/backend/SYCL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 1497d6ed1a1..49d7ffcf4d3 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -662,7 +662,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512 | GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. | | GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. | | GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. | -| GGML_SYCL_GRAPH | OFF *(default) \|ON *(Optional)* | Enable build with (SYCL Graph extension)[https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc]. | +| GGML_SYCL_GRAPH | OFF *(default) \|ON *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). | | CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. | | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. | From 55a49ba4b8595c29c16f4a2712632e9379dfb4cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20=C5=9Alusarczyk?= Date: Fri, 14 Mar 2025 12:42:20 +0100 Subject: [PATCH 6/7] fix markdown in docs/backend/SYCL.md again --- docs/backend/SYCL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 49d7ffcf4d3..573b115489c 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -662,7 +662,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512 | GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. | | GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. | | GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. | -| GGML_SYCL_GRAPH | OFF *(default) \|ON *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). | +| GGML_SYCL_GRAPH | OFF *(default)* \|ON *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). | | CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. | | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. | From 1693dde78a76c6beea089eea39bc93321e3f514c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20=C5=9Alusarczyk?= Date: Mon, 17 Mar 2025 13:33:17 +0100 Subject: [PATCH 7/7] compiling graphs by default, renamed graph_enable to graph_disable --- docs/backend/SYCL.md | 6 +++--- ggml/CMakeLists.txt | 2 +- ggml/src/ggml-sycl/ggml-sycl.cpp | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 573b115489c..184cd419554 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -660,9 +660,9 @@ use 1 SYCL GPUs: [0] with Max compute units:512 |--------------------|---------------------------------------|---------------------------------------------| | GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.
FP32 path - recommended for better perforemance than FP16 on quantized model| | GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. | -| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. | +| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. | | GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. | -| GGML_SYCL_GRAPH | OFF *(default)* \|ON *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). | +| GGML_SYCL_GRAPH | ON *(default)* \|OFF *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). | | CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. | | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. | @@ -672,7 +672,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512 |-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------| | GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG | | GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features based on Intel GPU type, to compare the performance increase | -| GGML_SYCL_ENABLE_GRAPH | 0 (default) or 1 | Enable running computations through SYCL Graphs feature. Requires also the code be be compiled with GGML_SYCL_GRAPH=ON. | +| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. | | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.
Recommended to use when --split-mode = layer | diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 09b6f630c52..740f9f69cf2 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -186,7 +186,7 @@ option(GGML_OPENMP "ggml: use OpenMP" option(GGML_RPC "ggml: use RPC" OFF) option(GGML_SYCL "ggml: use SYCL" OFF) option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF) -option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" OFF) +option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON) set (GGML_SYCL_TARGET "INTEL" CACHE STRING "ggml: sycl target device") set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 92b298193a4..1bbfb4b2415 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -46,7 +46,7 @@ static bool g_sycl_loaded = false; int g_ggml_sycl_debug = 0; int g_ggml_sycl_disable_optimize = 0; -int g_ggml_sycl_enable_graph = 0; +int g_ggml_sycl_disable_graph = 0; static ggml_sycl_device_info ggml_sycl_init() { ggml_sycl_device_info info = {}; @@ -192,12 +192,12 @@ static void ggml_check_sycl() try { if (!initialized) { g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0); g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 0); - g_ggml_sycl_enable_graph = get_sycl_env("GGML_SYCL_ENABLE_GRAPH", 0); + g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1); GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n"); GGML_LOG_INFO("Running with Environment Variables:\n"); GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug); GGML_LOG_INFO(" GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize); - GGML_LOG_INFO(" GGML_SYCL_ENABLE_GRAPH: %d\n", g_ggml_sycl_enable_graph); + GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: %d\n", g_ggml_sycl_disable_graph); GGML_LOG_INFO("Build with Macros:\n"); #if defined(GGML_SYCL_FORCE_MMQ) GGML_LOG_INFO(" GGML_SYCL_FORCE_MMQ: yes\n"); @@ -3704,7 +3704,7 @@ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_ auto * sycl_ctx = static_cast(backend->context); #ifdef GGML_SYCL_GRAPH - if (g_ggml_sycl_enable_graph) { + if (!g_ggml_sycl_disable_graph) { if (!sycl_ctx->exec_graph && !dpct::get_device(sycl_ctx->device).has(sycl::aspect::ext_oneapi_graph)) { GGML_SYCL_DEBUG("[SYCL-GRAPH] can not use graphs on device:%d\n", sycl_ctx->device); ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);