From e9a70b10c2ab0ef4acef49a17f41c5c7603a3c2c Mon Sep 17 00:00:00 2001
From: msy-kato <62578291+msy-kato@users.noreply.github.com>
Date: Wed, 29 May 2024 13:33:37 +0900
Subject: [PATCH 1/9] ggml: Added OpenMP for multi-threads processing

---
 CMakeLists.txt | 12 +++++++
 Makefile       |  6 ++++
 ggml.c         | 91 ++++++++++++++++++++++++++++++--------------------
 3 files changed, 73 insertions(+), 36 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fbbc38644ef..22b13c192b1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -125,6 +125,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
 set(LLAMA_METAL_STD "" CACHE STRING          "llama: metal standard version (-std flag)")
 option(LLAMA_KOMPUTE                         "llama: use Kompute"                               OFF)
 option(LLAMA_RPC                             "llama: use RPC"                                   OFF)
+option(LLAMA_OPENMP                          "llama: use OpenMP"                                OFF)
 option(LLAMA_SYCL                            "llama: use SYCL"                                  OFF)
 option(LLAMA_SYCL_F16                        "llama: use 16 bit floats for sycl calculations"   OFF)
 set(LLAMA_SYCL_TARGET   "INTEL" CACHE STRING "llama: sycl target device")
@@ -295,6 +296,17 @@ if (LLAMA_METAL)
         )
 endif()
 
+if (LLAMA_OPENMP)
+    find_package(OpenMP)
+    if(OpenMP_FOUND)
+        message(STATUS "OpenMP found")
+        add_compile_definitions(GGML_USE_OPENMP)
+
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+    endif()
+endif()
+
 if (LLAMA_BLAS)
     if (LLAMA_STATIC)
         set(BLA_STATIC ON)
diff --git a/Makefile b/Makefile
index 5caf31cdf37..9d7d7f965c2 100644
--- a/Makefile
+++ b/Makefile
@@ -400,6 +400,12 @@ ifndef LLAMA_NO_ACCELERATE
 	endif
 endif # LLAMA_NO_ACCELERATE
 
+ifdef LLAMA_OPENMP
+	MK_CPPFLAGS += -DGGML_USE_OPENMP
+	MK_CFLAGS   += -fopenmp
+	MK_CXXFLAGS += -fopenmp
+endif # LLAMA_OPENMP
+
 ifdef LLAMA_OPENBLAS
 	MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
 	MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas)
diff --git a/ggml.c b/ggml.c
index 8bfb9531eb8..0bef7ac852e 100644
--- a/ggml.c
+++ b/ggml.c
@@ -28,6 +28,10 @@
 #include <syscall.h>
 #endif
 
+#ifdef GGML_USE_OPENMP
+#include <omp.h>
+#endif
+
 #ifdef GGML_USE_METAL
 #include <unistd.h>
 #endif
@@ -19661,6 +19665,48 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
     return cplan;
 }
 
+static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state * workers, int n_threads){
+    enum ggml_status compute_status = GGML_STATUS_SUCCESS;
+
+#ifdef GGML_USE_OPENMP
+#pragma omp parallel num_threads(n_threads)
+    {
+        ggml_graph_compute_thread(&workers[omp_get_thread_num()]);
+    }
+#else
+    // create thread pool
+    if (n_threads > 1) {
+        for (int j = 1; j < n_threads; ++j) {
+            const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
+            GGML_ASSERT(rc == 0);
+            UNUSED(rc);
+        }
+    }
+
+    // this is a work thread too
+    ggml_graph_compute_thread(&workers[0]);
+
+    // don't leave affinity set on the main thread
+    clear_numa_thread_affinity();
+
+    // join or kill thread pool
+    if (n_threads > 1) {
+        for (int j = 1; j < n_threads; j++) {
+            const int rc = ggml_thread_join(workers[j].thrd, NULL);
+            GGML_ASSERT(rc == 0);
+            UNUSED(rc);
+        }
+    }
+#endif
+    for (int j = 0; j < n_threads; j++) {
+        if (workers[j].ec != GGML_STATUS_SUCCESS) {
+            compute_status = workers[j].ec;
+            break;
+        }
+    }
+    return compute_status;
+}
+
 enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
     {
         GGML_ASSERT(cplan);
@@ -19687,47 +19733,20 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         /*.current_chunk;          =*/ 0,
     };
     struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
-
-    // create thread pool
-    if (n_threads > 1) {
-        for (int j = 1; j < n_threads; ++j) {
-            workers[j] = (struct ggml_compute_state) {
-                .thrd   = 0,
-                .ith = j,
-                .shared = &state_shared,
-                .ec = GGML_STATUS_SUCCESS,
-            };
-
-            const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
-            GGML_ASSERT(rc == 0);
-            UNUSED(rc);
-        }
-    }
-
-    workers[0].ith = 0;
-    workers[0].shared = &state_shared;
-    workers[0].ec = GGML_STATUS_SUCCESS;
-
     const int64_t perf_start_cycles  = ggml_perf_cycles();
     const int64_t perf_start_time_us = ggml_perf_time_us();
 
-    // this is a work thread too
-    ggml_graph_compute_thread(&workers[0]);
-    enum ggml_status compute_status = workers[0].ec;
-
-    // don't leave affinity set on the main thread
-    clear_numa_thread_affinity();
-
-    // join or kill thread pool
-    if (n_threads > 1) {
-        for (int j = 1; j < n_threads; j++) {
-            const int rc = ggml_thread_join(workers[j].thrd, NULL);
-            GGML_ASSERT(rc == 0);
-            if (workers[j].ec != GGML_STATUS_SUCCESS)
-                compute_status = workers[j].ec;
-        }
+    for (int j = 0; j < n_threads; ++j) {
+        workers[j] = (struct ggml_compute_state) {
+            .thrd   = 0,
+            .ith    = j,
+            .shared = &state_shared,
+            .ec     = GGML_STATUS_SUCCESS,
+        };
     }
 
+    enum ggml_status compute_status = ggml_graph_compute_parallel(workers, n_threads);
+
     // performance stats (graph)
     {
         int64_t perf_cycles_cur  = ggml_perf_cycles()  - perf_start_cycles;

From 7918ed7f2ca522fb0426e2eb5bbdba28172f8a0d Mon Sep 17 00:00:00 2001
From: msy-kato <62578291+msy-kato@users.noreply.github.com>
Date: Thu, 30 May 2024 14:05:52 +0900
Subject: [PATCH 2/9] ggml : Limit the number of threads used to avoid deadlock

---
 CMakeLists.txt | 2 ++
 ggml.c         | 7 +++++++
 2 files changed, 9 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 22b13c192b1..0abc78f4c7a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -304,6 +304,8 @@ if (LLAMA_OPENMP)
 
         set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+    else()
+        message(WARNING "OpenMP not found")
     endif()
 endif()
 
diff --git a/ggml.c b/ggml.c
index 0bef7ac852e..0b27712a338 100644
--- a/ggml.c
+++ b/ggml.c
@@ -5,6 +5,7 @@
 #include "ggml-quants.h"
 #include "ggml.h"
 
+
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
 #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -19485,6 +19486,12 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
     if (n_threads <= 0) {
         n_threads = GGML_DEFAULT_N_THREADS;
     }
+#if defined(GGML_USE_OPENMP)
+    // Limit the number of threads used to avoid deadlock
+    // ref: https://github.com/ggerganov/llama.cpp/pull/7606
+    n_threads = MIN(n_threads, omp_get_max_threads());
+    n_threads = MIN(n_threads, omp_get_thread_limit());
+#endif
 
     size_t work_size = 0;
 

From fa864af945c19816dd073425d62b62e851d52ca2 Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Thu, 30 May 2024 09:47:29 +0200
Subject: [PATCH 3/9] update shared state n_threads in parallel region

---
 ggml.c | 32 +++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/ggml.c b/ggml.c
index 0b27712a338..1ec3e144c37 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1751,7 +1751,7 @@ struct ggml_compute_state_shared {
     int64_t perf_node_start_cycles;
     int64_t perf_node_start_time_us;
 
-    const int n_threads;
+    int n_threads;
 
     // synchronization primitives
     atomic_int n_active;  // num active threads
@@ -19486,12 +19486,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
     if (n_threads <= 0) {
         n_threads = GGML_DEFAULT_N_THREADS;
     }
-#if defined(GGML_USE_OPENMP)
-    // Limit the number of threads used to avoid deadlock
-    // ref: https://github.com/ggerganov/llama.cpp/pull/7606
-    n_threads = MIN(n_threads, omp_get_max_threads());
-    n_threads = MIN(n_threads, omp_get_thread_limit());
-#endif
 
     size_t work_size = 0;
 
@@ -19676,9 +19670,20 @@ static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state *
     enum ggml_status compute_status = GGML_STATUS_SUCCESS;
 
 #ifdef GGML_USE_OPENMP
-#pragma omp parallel num_threads(n_threads)
-    {
-        ggml_graph_compute_thread(&workers[omp_get_thread_num()]);
+    if (n_threads > 1) {
+        #pragma omp parallel num_threads(n_threads)
+        {
+            #pragma omp single
+            {
+                // update the number of threads from the actual number of threads that we got from OpenMP
+                n_threads = omp_get_num_threads();
+                workers[0].shared->n_threads = n_threads;
+                workers[0].shared->n_active  = n_threads;
+            }
+            ggml_graph_compute_thread(&workers[omp_get_thread_num()]);
+        }
+    } else {
+        ggml_graph_compute_thread(&workers[0]);
     }
 #else
     // create thread pool
@@ -19724,7 +19729,12 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         }
     }
 
-    const int n_threads = cplan->n_threads;
+    int n_threads = cplan->n_threads;
+
+#if defined(GGML_USE_OPENMP)
+    n_threads = MIN(n_threads, omp_get_max_threads());
+    n_threads = MIN(n_threads, omp_get_thread_limit());
+#endif
 
     struct ggml_compute_state_shared state_shared = {
         /*.cgraph                  =*/ cgraph,

From 377bc783414eab809aa4dd3043e855fd4df1b877 Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Thu, 30 May 2024 09:58:08 +0200
Subject: [PATCH 4/9] clear numa affinity for main thread even with openmp

---
 ggml.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ggml.c b/ggml.c
index 1ec3e144c37..eb894a21f78 100644
--- a/ggml.c
+++ b/ggml.c
@@ -19698,9 +19698,6 @@ static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state *
     // this is a work thread too
     ggml_graph_compute_thread(&workers[0]);
 
-    // don't leave affinity set on the main thread
-    clear_numa_thread_affinity();
-
     // join or kill thread pool
     if (n_threads > 1) {
         for (int j = 1; j < n_threads; j++) {
@@ -19710,6 +19707,9 @@ static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state *
         }
     }
 #endif
+    // don't leave affinity set on the main thread
+    clear_numa_thread_affinity();
+
     for (int j = 0; j < n_threads; j++) {
         if (workers[j].ec != GGML_STATUS_SUCCESS) {
             compute_status = workers[j].ec;

From 5ddbd1843db1a775f12f74fb233a2ef193364fe9 Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Thu, 30 May 2024 10:01:37 +0200
Subject: [PATCH 5/9] enable openmp by default

---
 CMakeLists.txt | 2 +-
 Makefile       | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0abc78f4c7a..7ae457a215e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -125,7 +125,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
 set(LLAMA_METAL_STD "" CACHE STRING          "llama: metal standard version (-std flag)")
 option(LLAMA_KOMPUTE                         "llama: use Kompute"                               OFF)
 option(LLAMA_RPC                             "llama: use RPC"                                   OFF)
-option(LLAMA_OPENMP                          "llama: use OpenMP"                                OFF)
+option(LLAMA_OPENMP                          "llama: use OpenMP"                                ON)
 option(LLAMA_SYCL                            "llama: use SYCL"                                  OFF)
 option(LLAMA_SYCL_F16                        "llama: use 16 bit floats for sycl calculations"   OFF)
 set(LLAMA_SYCL_TARGET   "INTEL" CACHE STRING "llama: sycl target device")
diff --git a/Makefile b/Makefile
index 9d7d7f965c2..6ca9ea4b6b5 100644
--- a/Makefile
+++ b/Makefile
@@ -400,11 +400,11 @@ ifndef LLAMA_NO_ACCELERATE
 	endif
 endif # LLAMA_NO_ACCELERATE
 
-ifdef LLAMA_OPENMP
+ifndef LLAMA_NO_OPENMP
 	MK_CPPFLAGS += -DGGML_USE_OPENMP
 	MK_CFLAGS   += -fopenmp
 	MK_CXXFLAGS += -fopenmp
-endif # LLAMA_OPENMP
+endif # LLAMA_NO_OPENMP
 
 ifdef LLAMA_OPENBLAS
 	MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)

From 5970a26d6637cce19e8093f770d8c5e75cafcab0 Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Thu, 30 May 2024 10:26:02 +0200
Subject: [PATCH 6/9] fix msvc build

---
 CMakeLists.txt | 4 +---
 ggml.c         | 1 -
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7ae457a215e..5355adaa441 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -301,9 +301,7 @@ if (LLAMA_OPENMP)
     if(OpenMP_FOUND)
         message(STATUS "OpenMP found")
         add_compile_definitions(GGML_USE_OPENMP)
-
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
     else()
         message(WARNING "OpenMP not found")
     endif()
diff --git a/ggml.c b/ggml.c
index eb894a21f78..f07b6b83f1d 100644
--- a/ggml.c
+++ b/ggml.c
@@ -19733,7 +19733,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
 
 #if defined(GGML_USE_OPENMP)
     n_threads = MIN(n_threads, omp_get_max_threads());
-    n_threads = MIN(n_threads, omp_get_thread_limit());
 #endif
 
     struct ggml_compute_state_shared state_shared = {

From f1772c9973f89532f61ecf6a9956aa5b1aeb4237 Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Thu, 30 May 2024 11:46:28 +0200
Subject: [PATCH 7/9] disable openmp on macos

---
 CMakeLists.txt | 2 +-
 Makefile       | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5355adaa441..0bfa2d4c378 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -298,7 +298,7 @@ endif()
 
 if (LLAMA_OPENMP)
     find_package(OpenMP)
-    if(OpenMP_FOUND)
+    if (OpenMP_FOUND)
         message(STATUS "OpenMP found")
         add_compile_definitions(GGML_USE_OPENMP)
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
diff --git a/Makefile b/Makefile
index 6ca9ea4b6b5..39ba2eb9cff 100644
--- a/Makefile
+++ b/Makefile
@@ -57,6 +57,8 @@ ifeq ($(UNAME_S),Darwin)
 		LLAMA_METAL := 1
 	endif
 
+	LLAMA_NO_OPENMP := 1
+
 	ifneq ($(UNAME_P),arm)
 		SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
 		ifeq ($(SYSCTL_M),1)

From 62855ca3f63d260310590d7c62b983e0dcec8d09 Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Thu, 30 May 2024 16:55:16 +0200
Subject: [PATCH 8/9] ci : disable openmp with thread sanitizer

---
 .github/workflows/build.yml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 7b616281b6f..e824136a56e 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -294,12 +294,22 @@ jobs:
 
       - name: Build
         id: cmake_build
+        if: ${{ matrix.sanitizer != 'THREAD' }}
         run: |
           mkdir build
           cd build
           cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
           cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
 
+      - name: Build (no OpenMP)
+        id: cmake_build_no_openmp
+        if: ${{ matrix.sanitizer == 'THREAD' }}
+        run: |
+          mkdir build
+          cd build
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DLLAMA_OPENMP=OFF
+          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
+
       - name: Test
         id: cmake_test
         run: |

From e0b077d4daf9a3003e80bcddafda2da34bbcd1da Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Mon, 3 Jun 2024 14:18:14 +0200
Subject: [PATCH 9/9] Update ggml.c

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 ggml.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml.c b/ggml.c
index f07b6b83f1d..623eba3aa9a 100644
--- a/ggml.c
+++ b/ggml.c
@@ -19666,7 +19666,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
     return cplan;
 }
 
-static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state * workers, int n_threads){
+static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state * workers, int n_threads) {
     enum ggml_status compute_status = GGML_STATUS_SUCCESS;
 
 #ifdef GGML_USE_OPENMP