From 2896987bc2efcd0668b4bf7849944d843ff5473e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=81ukasz=20=C5=9Alusarczyk?=
 <lukasz.slusarczyk@intel.com>
Date: Tue, 4 Mar 2025 17:58:14 +0100
Subject: [PATCH 1/7] alberto changes

---
 ggml/CMakeLists.txt               |  1 +
 ggml/src/ggml-sycl/CMakeLists.txt |  3 +++
 ggml/src/ggml-sycl/common.hpp     |  5 +++++
 ggml/src/ggml-sycl/ggml-sycl.cpp  | 27 +++++++++++++++++++++++++++
 4 files changed, 36 insertions(+)

diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 9a4ee4992d0..c33f229660b 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -186,6 +186,7 @@ option(GGML_OPENMP                          "ggml: use OpenMP"
 option(GGML_RPC                             "ggml: use RPC"                                   OFF)
 option(GGML_SYCL                            "ggml: use SYCL"                                  OFF)
 option(GGML_SYCL_F16                        "ggml: use 16 bit floats for sycl calculations"   OFF)
+option(GGML_USE_SYCL_GRAPH                  "ggml: enable graphs in the SYCL backend"         OFF)
 set   (GGML_SYCL_TARGET "INTEL" CACHE STRING
                                             "ggml: sycl target device")
 set   (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt
index 3ad044432a2..9f59dffe343 100644
--- a/ggml/src/ggml-sycl/CMakeLists.txt
+++ b/ggml/src/ggml-sycl/CMakeLists.txt
@@ -72,6 +72,9 @@ else()
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
         add_compile_definitions(GGML_SYCL_NVIDIA)
         target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl_blas_cublas)
+        if (GGML_USE_SYCL_GRAPH)
+            add_compile_definitions(GGML_USE_SYCL_GRAPH)
+        endif()
     elseif (GGML_SYCL_TARGET STREQUAL "AMD")
         if (NOT GGML_SYCL_DEVICE_ARCH)
             message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.")
diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index a92988b7dbd..1981c1e0f99 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -301,6 +301,7 @@ inline optimize_feature check_gpu_optimize_feature(syclex::architecture &arch) {
     return opt;
 }
 
+namespace sycl_ex = sycl::ext::oneapi::experimental;
 struct ggml_backend_sycl_context {
     int device;
     std::string name;
@@ -392,6 +393,10 @@ struct ggml_backend_sycl_context {
         return pool(device);
     }
 
+#ifdef GGML_USE_SYCL_GRAPH
+    std::unique_ptr<sycl_ex::command_graph<sycl_ex::graph_state::executable>> exec_graph = nullptr;
+#endif
+
     ggml_sycl_pool & host_pool(int device) {
         if (host_pools[device] == nullptr) {
             host_pools[device] = new_pool_for_host(stream(device, 0), device);
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 6977b705e48..0c0a5063165 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -3676,6 +3676,12 @@ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_
     ggml_sycl_set_main_device(sycl_ctx->device);
 
     if (!g_ggml_sycl_disable_optimize) optimize_graph_once(cgraph, sycl_ctx);
+#ifdef GGML_USE_SYCL_GRAPH
+    namespace sycl_ex = sycl::ext::oneapi::experimental;
+    sycl_ex::command_graph model_sycl_graph(*(sycl_ctx->stream()));
+
+    model_sycl_graph.begin_recording(*(sycl_ctx->stream()));
+#endif
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
@@ -3697,6 +3703,27 @@ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_
         GGML_ASSERT(ok);
     }
 
+#ifdef GGML_USE_SYCL_GRAPH
+    model_sycl_graph.end_recording();
+
+    if (!sycl_ctx->exec_graph) {
+        auto exec_graph = model_sycl_graph.finalize({sycl_ex::property::graph::updatable{}});
+        sycl_ctx->exec_graph = std::make_unique<
+            sycl_ex::command_graph<sycl_ex::graph_state::executable>>(exec_graph);
+    } else {
+        try {
+            sycl_ctx->exec_graph->update(model_sycl_graph);
+        } catch (sycl::exception e) {
+          GGML_SYCL_DEBUG("[SYCL-GRAPH] Exception when updating graph.\n");
+          auto exec_graph = model_sycl_graph.finalize({sycl_ex::property::graph::updatable{}});
+          sycl_ctx->exec_graph = std::make_unique<
+              sycl_ex::command_graph<sycl_ex::graph_state::executable>>(exec_graph);
+        }
+    }
+
+    sycl_ctx->stream()->ext_oneapi_graph(*(sycl_ctx->exec_graph));
+#endif
+
     return GGML_STATUS_SUCCESS;
 }
 

From 4efab98a658ea4a9040d4aafabfe47c5dd30123f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=81ukasz=20=C5=9Alusarczyk?=
 <lukasz.slusarczyk@intel.com>
Date: Thu, 13 Mar 2025 15:05:46 +0100
Subject: [PATCH 2/7] enable sycl graphs by env variable

---
 ggml/src/ggml-sycl/CMakeLists.txt |  6 +--
 ggml/src/ggml-sycl/ggml-sycl.cpp  | 65 +++++++++++++++++++------------
 2 files changed, 43 insertions(+), 28 deletions(-)

diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt
index 9f59dffe343..b1dab13b250 100644
--- a/ggml/src/ggml-sycl/CMakeLists.txt
+++ b/ggml/src/ggml-sycl/CMakeLists.txt
@@ -66,15 +66,15 @@ if (WIN32)
     find_package(MKL REQUIRED)
     target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
 else()
+    if (GGML_USE_SYCL_GRAPH)
+        add_compile_definitions(GGML_USE_SYCL_GRAPH)
+    endif()
     if (GGML_SYCL_TARGET STREQUAL "INTEL")
         target_link_libraries(ggml-sycl PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
     elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
         add_compile_definitions(GGML_SYCL_NVIDIA)
         target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl_blas_cublas)
-        if (GGML_USE_SYCL_GRAPH)
-            add_compile_definitions(GGML_USE_SYCL_GRAPH)
-        endif()
     elseif (GGML_SYCL_TARGET STREQUAL "AMD")
         if (NOT GGML_SYCL_DEVICE_ARCH)
             message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.")
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 0c0a5063165..8c46c7f0566 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -46,6 +46,7 @@
 static bool g_sycl_loaded = false;
 int g_ggml_sycl_debug = 0;
 int g_ggml_sycl_disable_optimize = 0;
+int g_ggml_sycl_graphs = 0;
 
 static ggml_sycl_device_info ggml_sycl_init() {
     ggml_sycl_device_info info = {};
@@ -191,10 +192,12 @@ static void ggml_check_sycl() try {
     if (!initialized) {
         g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
         g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
+        g_ggml_sycl_graphs = get_sycl_env("GGML_SYCL_GRAPHS", 0);
         GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
         GGML_LOG_INFO("Running with Environment Variables:\n");
         GGML_LOG_INFO("  GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
         GGML_LOG_INFO("  GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize);
+        GGML_LOG_INFO("  GGML_SYCL_GRAPHS: %d\n", g_ggml_sycl_graphs);
         GGML_LOG_INFO("Build with Macros:\n");
 #if defined(GGML_SYCL_FORCE_MMQ)
         GGML_LOG_INFO("  GGML_SYCL_FORCE_MMQ: yes\n");
@@ -3671,17 +3674,10 @@ void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx)
         if (ctx->opt_feature.reorder) opt_for_reorder(cgraph->nodes[i], stream);
     }
 }
-static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
-    ggml_sycl_set_main_device(sycl_ctx->device);
 
+static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * sycl_ctx, ggml_cgraph * cgraph) {
+    ggml_sycl_set_main_device(sycl_ctx->device);
     if (!g_ggml_sycl_disable_optimize) optimize_graph_once(cgraph, sycl_ctx);
-#ifdef GGML_USE_SYCL_GRAPH
-    namespace sycl_ex = sycl::ext::oneapi::experimental;
-    sycl_ex::command_graph model_sycl_graph(*(sycl_ctx->stream()));
-
-    model_sycl_graph.begin_recording(*(sycl_ctx->stream()));
-#endif
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
@@ -3702,28 +3698,47 @@ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_
         }
         GGML_ASSERT(ok);
     }
+}
+
+static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    auto * sycl_ctx = static_cast<ggml_backend_sycl_context *>(backend->context);
 
 #ifdef GGML_USE_SYCL_GRAPH
-    model_sycl_graph.end_recording();
+    if (g_ggml_sycl_graphs) {
+        if (!sycl_ctx->exec_graph && !dpct::get_device(sycl_ctx->device).has(sycl::aspect::ext_oneapi_graph)) {
+            GGML_SYCL_DEBUG("[SYCL-GRAPH] can not use graphs on device:%d\n", sycl_ctx->device);
+            ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);
+            return GGML_STATUS_SUCCESS;
+        }
 
-    if (!sycl_ctx->exec_graph) {
-        auto exec_graph = model_sycl_graph.finalize({sycl_ex::property::graph::updatable{}});
-        sycl_ctx->exec_graph = std::make_unique<
-            sycl_ex::command_graph<sycl_ex::graph_state::executable>>(exec_graph);
-    } else {
-        try {
-            sycl_ctx->exec_graph->update(model_sycl_graph);
-        } catch (sycl::exception e) {
-          GGML_SYCL_DEBUG("[SYCL-GRAPH] Exception when updating graph.\n");
-          auto exec_graph = model_sycl_graph.finalize({sycl_ex::property::graph::updatable{}});
-          sycl_ctx->exec_graph = std::make_unique<
-              sycl_ex::command_graph<sycl_ex::graph_state::executable>>(exec_graph);
+        namespace sycl_ex = sycl::ext::oneapi::experimental;
+        sycl_ex::command_graph model_sycl_graph(*(sycl_ctx->stream()));
+        model_sycl_graph.begin_recording(*(sycl_ctx->stream()));
+        ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);
+        model_sycl_graph.end_recording();
+
+        if (!sycl_ctx->exec_graph) {
+            auto exec_graph = model_sycl_graph.finalize({sycl_ex::property::graph::updatable{}});
+            sycl_ctx->exec_graph = std::make_unique<
+                sycl_ex::command_graph<sycl_ex::graph_state::executable>>(exec_graph);
+        } else {
+            try {
+                sycl_ctx->exec_graph->update(model_sycl_graph);
+                GGML_SYCL_DEBUG("[SYCL-GRAPH] update success\n");
+            } catch (sycl::exception const & e) {
+                GGML_SYCL_DEBUG("[SYCL-GRAPH] Exception when updating graph, %s\n", e.what());
+                auto exec_graph = model_sycl_graph.finalize({sycl_ex::property::graph::updatable{}});
+                sycl_ctx->exec_graph = std::make_unique<
+                    sycl_ex::command_graph<sycl_ex::graph_state::executable>>(exec_graph);
+            }
         }
-    }
 
-    sycl_ctx->stream()->ext_oneapi_graph(*(sycl_ctx->exec_graph));
+        sycl_ctx->stream()->ext_oneapi_graph(*(sycl_ctx->exec_graph));
+    } else
 #endif
-
+    {
+        ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);
+    }
     return GGML_STATUS_SUCCESS;
 }
 

From d02a0d2872f0a66d2d63584fbfff9b8d935951e6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=81ukasz=20=C5=9Alusarczyk?=
 <lukasz.slusarczyk@intel.com>
Date: Thu, 13 Mar 2025 15:06:17 +0100
Subject: [PATCH 3/7] fixed compilation warnings in ggml-sycl.cpp

---
 ggml/src/ggml-sycl/ggml-sycl.cpp | 43 ++++++++++++++++----------------
 1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 8c46c7f0566..d2dddfaf043 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -96,7 +96,7 @@ const ggml_sycl_device_info & ggml_sycl_info() {
     return info;
 }
 
-void print_device_detail(int id, sycl::device &device, std::string device_type) {
+static void print_device_detail(int id, sycl::device &device, std::string device_type) {
 
     dpct::device_info prop;
     SYCL_CHECK(CHECK_TRY_ERROR(
@@ -119,7 +119,7 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
             global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
 }
 
-void print_device_opt_feature(int device_count) {
+static void print_device_opt_feature(int device_count) {
     GGML_LOG_INFO("SYCL Optimization Feature:\n");
     GGML_LOG_INFO(
         "|ID|        Device Type|Reorder|\n");
@@ -403,7 +403,7 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
+static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
                     const void *ptr_src, size_t size) {
     char *host_buf = (char *)malloc(size);
     q_src.memcpy(host_buf, (const char *)ptr_src, size).wait();
@@ -607,7 +607,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
     return &ggml_backend_sycl_buffer_types[device];
 }
 
-ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_context * ctx) {
+static ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_context * ctx) {
     GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
 
     int device = ctx->device;
@@ -1669,7 +1669,7 @@ static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
 
         stream->parallel_for(
             sycl::nd_range<3>(num_blocks * block_size, block_size),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                 quantize_q8_1<QUANT_BLOCK_TILE>(x, vy, kx, kx_padded, item_ct1);
             });
     }
@@ -1690,7 +1690,7 @@ static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y,
 
         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                 mul_mat_p021_f16_f32(vx, y, dst, ncols_x, nrows_x, nchannels_x,
                                      nchannels_y, item_ct1);
             });
@@ -1710,7 +1710,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl(
 
         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                 mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x,
                                        row_stride_x, channel_stride_x,
                                        nchannels_y / nchannels_x, item_ct1);
@@ -1751,7 +1751,7 @@ static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
     const sycl::range<3> block_nums(1, nrows, 1);
     stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
                          [=](sycl::nd_item<3> item_ct1)
-                             [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                             [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                                  k_sum_rows_f32(x, dst, ncols, item_ct1);
                              });
 }
@@ -2901,7 +2901,7 @@ inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
     return false;
 }
 
-bool ggml_sycl_supports_dmmv(enum ggml_type type) {
+static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
@@ -3274,7 +3274,7 @@ static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
 }
 
 
-void ggml_sycl_set_main_device(const int main_device) try {
+static void ggml_sycl_set_main_device(const int main_device) try {
     if (dpct::get_current_device_id() == static_cast<unsigned int> (main_device)) {
         return;
     }
@@ -3295,7 +3295,7 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tensor * dst) {
+static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tensor * dst) {
     if (!g_sycl_loaded) return false;
 
     if (dst->src[0] != nullptr && ggml_backend_buffer_is_sycl_split(dst->src[0]->buffer)) {
@@ -3613,7 +3613,7 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-void reorder_qw(char *data_device, const int ncols, const int nrows,
+static void reorder_qw(char *data_device, const int ncols, const int nrows,
                 size_t size, size_t offset, dpct::queue_ptr stream) {
     auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
     SYCL_CHECK(
@@ -3627,7 +3627,7 @@ void reorder_qw(char *data_device, const int ncols, const int nrows,
 
     stream->parallel_for(
         size / sizeof(block_q4_0),
-            [=](auto i) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+            [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
             const block_q4_0* x = (const block_q4_0*)tmp_buf;
             const int ib = i;
 
@@ -3641,7 +3641,7 @@ void reorder_qw(char *data_device, const int ncols, const int nrows,
     sycl::free(tmp_buf, *stream);
 }
 
-void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
+static void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
     char*data_device = (char*)src0->data;
     size_t ncols = src0->ne[0];
     size_t nrows = src0->ne[1];
@@ -3650,7 +3650,7 @@ void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
     reorder_qw(data_device, ncols, nrows, size, 0, stream);
 }
 
-void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
+static void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
     ggml_tensor *src0 = dst->src[0];
     ggml_tensor *src1 = dst->src[1];
 
@@ -3663,7 +3663,7 @@ void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
     }
 }
 
-void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) {
+static void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) {
     dpct::queue_ptr stream = ctx->stream();
     if (ctx->optimized_graph) {
        return;
@@ -3893,7 +3893,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                     return true;
                 }
                 return false;
-            } break;
+            }
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
                 case GGML_UNARY_OP_NEG:
@@ -3911,7 +3911,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                 default:
                     return false;
             }
-            break;
         case GGML_OP_MUL_MAT:
         case GGML_OP_MUL_MAT_ID:
             {
@@ -3942,7 +3941,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                     return false;
                 }
                 return true;
-            } break;
+            }
         case GGML_OP_OUT_PROD:
             return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
         case GGML_OP_GET_ROWS:
@@ -3959,7 +3958,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                     default:
                         return false;
                 }
-            } break;
+            }
         case GGML_OP_CPY:
             {
                 ggml_type src0_type = op->src[0]->type;
@@ -4010,12 +4009,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                     return true;
                 }
                 return false;
-            } break;
+            }
         case GGML_OP_CONCAT:
             {
                 ggml_type src0_type = op->src[0]->type;
                 return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
-            } break;
+            }
         case GGML_OP_DUP:
         case GGML_OP_ARGMAX:
         case GGML_OP_NONE:

From 89ac171a8355016c62b2394d1df90c5adfe83803 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=81ukasz=20=C5=9Alusarczyk?=
 <lukasz.slusarczyk@intel.com>
Date: Fri, 14 Mar 2025 10:50:27 +0100
Subject: [PATCH 4/7] renamed graph variables

---
 docs/backend/SYCL.md              |  2 ++
 ggml/CMakeLists.txt               |  2 +-
 ggml/src/ggml-sycl/CMakeLists.txt |  4 ++--
 ggml/src/ggml-sycl/common.hpp     |  2 +-
 ggml/src/ggml-sycl/ggml-sycl.cpp  | 11 +++++------
 5 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
index 5da439e94e0..1497d6ed1a1 100644
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -662,6 +662,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | GGML_SYCL_TARGET   | INTEL *(default)* \| NVIDIA \| AMD    | Set the SYCL target device type.            |
 | GGML_SYCL_DEVICE_ARCH | Optional (except for AMD)          | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
 | GGML_SYCL_F16      | OFF *(default)* \|ON *(optional)*     | Enable FP16 build with SYCL code path.      |
+| GGML_SYCL_GRAPH    | OFF *(default) \|ON *(Optional)*      | Enable build with (SYCL Graph extension)[https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc]. |
 | CMAKE_C_COMPILER   | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path.      |
 | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)*   | Set `icpx/icx` compiler for SYCL code path. |
 
@@ -671,6 +672,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 |-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
 | GGML_SYCL_DEBUG   | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG                                                                             |
 | GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features based on Intel GPU type, to compare the performance increase |
+| GGML_SYCL_ENABLE_GRAPH | 0 (default) or 1 | Enable running computations through SYCL Graphs feature. Requires also the code be be compiled with GGML_SYCL_GRAPH=ON. |
 | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
 
 
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index c33f229660b..09b6f630c52 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -186,7 +186,7 @@ option(GGML_OPENMP                          "ggml: use OpenMP"
 option(GGML_RPC                             "ggml: use RPC"                                   OFF)
 option(GGML_SYCL                            "ggml: use SYCL"                                  OFF)
 option(GGML_SYCL_F16                        "ggml: use 16 bit floats for sycl calculations"   OFF)
-option(GGML_USE_SYCL_GRAPH                  "ggml: enable graphs in the SYCL backend"         OFF)
+option(GGML_SYCL_GRAPH                      "ggml: enable graphs in the SYCL backend"         OFF)
 set   (GGML_SYCL_TARGET "INTEL" CACHE STRING
                                             "ggml: sycl target device")
 set   (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt
index b1dab13b250..271413ca414 100644
--- a/ggml/src/ggml-sycl/CMakeLists.txt
+++ b/ggml/src/ggml-sycl/CMakeLists.txt
@@ -66,8 +66,8 @@ if (WIN32)
     find_package(MKL REQUIRED)
     target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
 else()
-    if (GGML_USE_SYCL_GRAPH)
-        add_compile_definitions(GGML_USE_SYCL_GRAPH)
+    if (GGML_SYCL_GRAPH)
+        add_compile_definitions(GGML_SYCL_GRAPH)
     endif()
     if (GGML_SYCL_TARGET STREQUAL "INTEL")
         target_link_libraries(ggml-sycl PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index 1981c1e0f99..3b86d8b2a79 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -393,7 +393,7 @@ struct ggml_backend_sycl_context {
         return pool(device);
     }
 
-#ifdef GGML_USE_SYCL_GRAPH
+#ifdef GGML_SYCL_GRAPH
     std::unique_ptr<sycl_ex::command_graph<sycl_ex::graph_state::executable>> exec_graph = nullptr;
 #endif
 
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index d2dddfaf043..92b298193a4 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -46,7 +46,7 @@
 static bool g_sycl_loaded = false;
 int g_ggml_sycl_debug = 0;
 int g_ggml_sycl_disable_optimize = 0;
-int g_ggml_sycl_graphs = 0;
+int g_ggml_sycl_enable_graph = 0;
 
 static ggml_sycl_device_info ggml_sycl_init() {
     ggml_sycl_device_info info = {};
@@ -192,12 +192,12 @@ static void ggml_check_sycl() try {
     if (!initialized) {
         g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
         g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
-        g_ggml_sycl_graphs = get_sycl_env("GGML_SYCL_GRAPHS", 0);
+        g_ggml_sycl_enable_graph = get_sycl_env("GGML_SYCL_ENABLE_GRAPH", 0);
         GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
         GGML_LOG_INFO("Running with Environment Variables:\n");
         GGML_LOG_INFO("  GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
         GGML_LOG_INFO("  GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize);
-        GGML_LOG_INFO("  GGML_SYCL_GRAPHS: %d\n", g_ggml_sycl_graphs);
+        GGML_LOG_INFO("  GGML_SYCL_ENABLE_GRAPH: %d\n", g_ggml_sycl_enable_graph);
         GGML_LOG_INFO("Build with Macros:\n");
 #if defined(GGML_SYCL_FORCE_MMQ)
         GGML_LOG_INFO("  GGML_SYCL_FORCE_MMQ: yes\n");
@@ -3703,15 +3703,14 @@ static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * syc
 static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     auto * sycl_ctx = static_cast<ggml_backend_sycl_context *>(backend->context);
 
-#ifdef GGML_USE_SYCL_GRAPH
-    if (g_ggml_sycl_graphs) {
+#ifdef GGML_SYCL_GRAPH
+    if (g_ggml_sycl_enable_graph) {
         if (!sycl_ctx->exec_graph && !dpct::get_device(sycl_ctx->device).has(sycl::aspect::ext_oneapi_graph)) {
             GGML_SYCL_DEBUG("[SYCL-GRAPH] can not use graphs on device:%d\n", sycl_ctx->device);
             ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);
             return GGML_STATUS_SUCCESS;
         }
 
-        namespace sycl_ex = sycl::ext::oneapi::experimental;
         sycl_ex::command_graph model_sycl_graph(*(sycl_ctx->stream()));
         model_sycl_graph.begin_recording(*(sycl_ctx->stream()));
         ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);

From d75f29acdb92eb20435a9050002ed4d91f740734 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=81ukasz=20=C5=9Alusarczyk?=
 <112692748+lslusarczyk@users.noreply.github.com>
Date: Fri, 14 Mar 2025 12:38:54 +0100
Subject: [PATCH 5/7] fix markdown in docs/backend/SYCL.md

Co-authored-by: Romain Biessy <romain.biessy@codeplay.com>
---
 docs/backend/SYCL.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
index 1497d6ed1a1..49d7ffcf4d3 100644
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -662,7 +662,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | GGML_SYCL_TARGET   | INTEL *(default)* \| NVIDIA \| AMD    | Set the SYCL target device type.            |
 | GGML_SYCL_DEVICE_ARCH | Optional (except for AMD)          | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
 | GGML_SYCL_F16      | OFF *(default)* \|ON *(optional)*     | Enable FP16 build with SYCL code path.      |
-| GGML_SYCL_GRAPH    | OFF *(default) \|ON *(Optional)*      | Enable build with (SYCL Graph extension)[https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc]. |
+| GGML_SYCL_GRAPH    | OFF *(default) \|ON *(Optional)*      | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
 | CMAKE_C_COMPILER   | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path.      |
 | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)*   | Set `icpx/icx` compiler for SYCL code path. |
 

From 55a49ba4b8595c29c16f4a2712632e9379dfb4cf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=81ukasz=20=C5=9Alusarczyk?=
 <lukasz.slusarczyk@intel.com>
Date: Fri, 14 Mar 2025 12:42:20 +0100
Subject: [PATCH 6/7] fix markdown in docs/backend/SYCL.md again

---
 docs/backend/SYCL.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
index 49d7ffcf4d3..573b115489c 100644
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -662,7 +662,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | GGML_SYCL_TARGET   | INTEL *(default)* \| NVIDIA \| AMD    | Set the SYCL target device type.            |
 | GGML_SYCL_DEVICE_ARCH | Optional (except for AMD)          | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
 | GGML_SYCL_F16      | OFF *(default)* \|ON *(optional)*     | Enable FP16 build with SYCL code path.      |
-| GGML_SYCL_GRAPH    | OFF *(default) \|ON *(Optional)*      | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
+| GGML_SYCL_GRAPH    | OFF *(default)* \|ON *(Optional)*     | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
 | CMAKE_C_COMPILER   | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path.      |
 | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)*   | Set `icpx/icx` compiler for SYCL code path. |
 

From 1693dde78a76c6beea089eea39bc93321e3f514c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=81ukasz=20=C5=9Alusarczyk?=
 <lukasz.slusarczyk@intel.com>
Date: Mon, 17 Mar 2025 13:33:17 +0100
Subject: [PATCH 7/7] compiling graphs by default, renamed graph_enable to
 graph_disable

---
 docs/backend/SYCL.md             | 6 +++---
 ggml/CMakeLists.txt              | 2 +-
 ggml/src/ggml-sycl/ggml-sycl.cpp | 8 ++++----
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
index 573b115489c..184cd419554 100644
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -660,9 +660,9 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 |--------------------|---------------------------------------|---------------------------------------------|
 | GGML_SYCL          | ON (mandatory)                        | Enable build with SYCL code path.<br>FP32 path - recommended for better perforemance than FP16 on quantized model|
 | GGML_SYCL_TARGET   | INTEL *(default)* \| NVIDIA \| AMD    | Set the SYCL target device type.            |
-| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD)          | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
+| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD)             | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
 | GGML_SYCL_F16      | OFF *(default)* \|ON *(optional)*     | Enable FP16 build with SYCL code path.      |
-| GGML_SYCL_GRAPH    | OFF *(default)* \|ON *(Optional)*     | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
+| GGML_SYCL_GRAPH    | ON *(default)* \|OFF *(Optional)*     | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
 | CMAKE_C_COMPILER   | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path.      |
 | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)*   | Set `icpx/icx` compiler for SYCL code path. |
 
@@ -672,7 +672,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 |-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
 | GGML_SYCL_DEBUG   | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG                                                                             |
 | GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features based on Intel GPU type, to compare the performance increase |
-| GGML_SYCL_ENABLE_GRAPH | 0 (default) or 1 | Enable running computations through SYCL Graphs feature. Requires also the code be be compiled with GGML_SYCL_GRAPH=ON. |
+| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
 | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
 
 
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 09b6f630c52..740f9f69cf2 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -186,7 +186,7 @@ option(GGML_OPENMP                          "ggml: use OpenMP"
 option(GGML_RPC                             "ggml: use RPC"                                   OFF)
 option(GGML_SYCL                            "ggml: use SYCL"                                  OFF)
 option(GGML_SYCL_F16                        "ggml: use 16 bit floats for sycl calculations"   OFF)
-option(GGML_SYCL_GRAPH                      "ggml: enable graphs in the SYCL backend"         OFF)
+option(GGML_SYCL_GRAPH                      "ggml: enable graphs in the SYCL backend"         ON)
 set   (GGML_SYCL_TARGET "INTEL" CACHE STRING
                                             "ggml: sycl target device")
 set   (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 92b298193a4..1bbfb4b2415 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -46,7 +46,7 @@
 static bool g_sycl_loaded = false;
 int g_ggml_sycl_debug = 0;
 int g_ggml_sycl_disable_optimize = 0;
-int g_ggml_sycl_enable_graph = 0;
+int g_ggml_sycl_disable_graph = 0;
 
 static ggml_sycl_device_info ggml_sycl_init() {
     ggml_sycl_device_info info = {};
@@ -192,12 +192,12 @@ static void ggml_check_sycl() try {
     if (!initialized) {
         g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
         g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
-        g_ggml_sycl_enable_graph = get_sycl_env("GGML_SYCL_ENABLE_GRAPH", 0);
+        g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
         GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
         GGML_LOG_INFO("Running with Environment Variables:\n");
         GGML_LOG_INFO("  GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
         GGML_LOG_INFO("  GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize);
-        GGML_LOG_INFO("  GGML_SYCL_ENABLE_GRAPH: %d\n", g_ggml_sycl_enable_graph);
+        GGML_LOG_INFO("  GGML_SYCL_DISABLE_GRAPH: %d\n", g_ggml_sycl_disable_graph);
         GGML_LOG_INFO("Build with Macros:\n");
 #if defined(GGML_SYCL_FORCE_MMQ)
         GGML_LOG_INFO("  GGML_SYCL_FORCE_MMQ: yes\n");
@@ -3704,7 +3704,7 @@ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_
     auto * sycl_ctx = static_cast<ggml_backend_sycl_context *>(backend->context);
 
 #ifdef GGML_SYCL_GRAPH
-    if (g_ggml_sycl_enable_graph) {
+    if (!g_ggml_sycl_disable_graph) {
         if (!sycl_ctx->exec_graph && !dpct::get_device(sycl_ctx->device).has(sycl::aspect::ext_oneapi_graph)) {
             GGML_SYCL_DEBUG("[SYCL-GRAPH] can not use graphs on device:%d\n", sycl_ctx->device);
             ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);