From e9a70b10c2ab0ef4acef49a17f41c5c7603a3c2c Mon Sep 17 00:00:00 2001 From: msy-kato <62578291+msy-kato@users.noreply.github.com> Date: Wed, 29 May 2024 13:33:37 +0900 Subject: [PATCH 1/9] ggml: Added OpenMP for multi-threads processing --- CMakeLists.txt | 12 +++++++ Makefile | 6 ++++ ggml.c | 91 ++++++++++++++++++++++++++++++-------------------- 3 files changed, 73 insertions(+), 36 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fbbc38644ef..22b13c192b1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -125,6 +125,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)") option(LLAMA_KOMPUTE "llama: use Kompute" OFF) option(LLAMA_RPC "llama: use RPC" OFF) +option(LLAMA_OPENMP "llama: use OpenMP" OFF) option(LLAMA_SYCL "llama: use SYCL" OFF) option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF) set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device") @@ -295,6 +296,17 @@ if (LLAMA_METAL) ) endif() +if (LLAMA_OPENMP) + find_package(OpenMP) + if(OpenMP_FOUND) + message(STATUS "OpenMP found") + add_compile_definitions(GGML_USE_OPENMP) + + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + endif() +endif() + if (LLAMA_BLAS) if (LLAMA_STATIC) set(BLA_STATIC ON) diff --git a/Makefile b/Makefile index 5caf31cdf37..9d7d7f965c2 100644 --- a/Makefile +++ b/Makefile @@ -400,6 +400,12 @@ ifndef LLAMA_NO_ACCELERATE endif endif # LLAMA_NO_ACCELERATE +ifdef LLAMA_OPENMP + MK_CPPFLAGS += -DGGML_USE_OPENMP + MK_CFLAGS += -fopenmp + MK_CXXFLAGS += -fopenmp +endif # LLAMA_OPENMP + ifdef LLAMA_OPENBLAS MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas) MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas) diff --git a/ggml.c b/ggml.c index 8bfb9531eb8..0bef7ac852e 100644 --- a/ggml.c +++ b/ggml.c @@ -28,6 +28,10 @@ #include #endif +#ifdef GGML_USE_OPENMP +#include +#endif + #ifdef GGML_USE_METAL #include #endif @@ -19661,6 +19665,48 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa return cplan; } +static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state * workers, int n_threads){ + enum ggml_status compute_status = GGML_STATUS_SUCCESS; + +#ifdef GGML_USE_OPENMP +#pragma omp parallel num_threads(n_threads) + { + ggml_graph_compute_thread(&workers[omp_get_thread_num()]); + } +#else + // create thread pool + if (n_threads > 1) { + for (int j = 1; j < n_threads; ++j) { + const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); + GGML_ASSERT(rc == 0); + UNUSED(rc); + } + } + + // this is a work thread too + ggml_graph_compute_thread(&workers[0]); + + // don't leave affinity set on the main thread + clear_numa_thread_affinity(); + + // join or kill thread pool + if (n_threads > 1) { + for (int j = 1; j < n_threads; j++) { + const int rc = ggml_thread_join(workers[j].thrd, NULL); + GGML_ASSERT(rc == 0); + UNUSED(rc); + } + } +#endif + for (int j = 0; j < n_threads; j++) { + if (workers[j].ec != GGML_STATUS_SUCCESS) { + compute_status = workers[j].ec; + break; + } + } + return compute_status; +} + enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { { GGML_ASSERT(cplan); @@ -19687,47 +19733,20 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl /*.current_chunk; =*/ 0, }; struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); - - // create thread pool - if (n_threads > 1) { - for (int j = 1; j < n_threads; ++j) { - workers[j] = (struct ggml_compute_state) { - .thrd = 0, - .ith = j, - .shared = &state_shared, - .ec = GGML_STATUS_SUCCESS, - }; - - const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); - GGML_ASSERT(rc == 0); - UNUSED(rc); - } - } - - workers[0].ith = 0; - workers[0].shared = &state_shared; - workers[0].ec = GGML_STATUS_SUCCESS; - const int64_t perf_start_cycles = ggml_perf_cycles(); const int64_t perf_start_time_us = ggml_perf_time_us(); - // this is a work thread too - ggml_graph_compute_thread(&workers[0]); - enum ggml_status compute_status = workers[0].ec; - - // don't leave affinity set on the main thread - clear_numa_thread_affinity(); - - // join or kill thread pool - if (n_threads > 1) { - for (int j = 1; j < n_threads; j++) { - const int rc = ggml_thread_join(workers[j].thrd, NULL); - GGML_ASSERT(rc == 0); - if (workers[j].ec != GGML_STATUS_SUCCESS) - compute_status = workers[j].ec; - } + for (int j = 0; j < n_threads; ++j) { + workers[j] = (struct ggml_compute_state) { + .thrd = 0, + .ith = j, + .shared = &state_shared, + .ec = GGML_STATUS_SUCCESS, + }; } + enum ggml_status compute_status = ggml_graph_compute_parallel(workers, n_threads); + // performance stats (graph) { int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles; From 7918ed7f2ca522fb0426e2eb5bbdba28172f8a0d Mon Sep 17 00:00:00 2001 From: msy-kato <62578291+msy-kato@users.noreply.github.com> Date: Thu, 30 May 2024 14:05:52 +0900 Subject: [PATCH 2/9] ggml : Limit the number of threads used to avoid deadlock --- CMakeLists.txt | 2 ++ ggml.c | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 22b13c192b1..0abc78f4c7a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -304,6 +304,8 @@ if (LLAMA_OPENMP) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + else() + message(WARNING "OpenMP not found") endif() endif() diff --git a/ggml.c b/ggml.c index 0bef7ac852e..0b27712a338 100644 --- a/ggml.c +++ b/ggml.c @@ -5,6 +5,7 @@ #include "ggml-quants.h" #include "ggml.h" + #if defined(_MSC_VER) || defined(__MINGW32__) #include // using malloc.h with MSC/MINGW #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) @@ -19485,6 +19486,12 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa if (n_threads <= 0) { n_threads = GGML_DEFAULT_N_THREADS; } +#if defined(GGML_USE_OPENMP) + // Limit the number of threads used to avoid deadlock + // ref: https://github.com/ggerganov/llama.cpp/pull/7606 + n_threads = MIN(n_threads, omp_get_max_threads()); + n_threads = MIN(n_threads, omp_get_thread_limit()); +#endif size_t work_size = 0; From fa864af945c19816dd073425d62b62e851d52ca2 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 30 May 2024 09:47:29 +0200 Subject: [PATCH 3/9] update shared state n_threads in parallel region --- ggml.c | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/ggml.c b/ggml.c index 0b27712a338..1ec3e144c37 100644 --- a/ggml.c +++ b/ggml.c @@ -1751,7 +1751,7 @@ struct ggml_compute_state_shared { int64_t perf_node_start_cycles; int64_t perf_node_start_time_us; - const int n_threads; + int n_threads; // synchronization primitives atomic_int n_active; // num active threads @@ -19486,12 +19486,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa if (n_threads <= 0) { n_threads = GGML_DEFAULT_N_THREADS; } -#if defined(GGML_USE_OPENMP) - // Limit the number of threads used to avoid deadlock - // ref: https://github.com/ggerganov/llama.cpp/pull/7606 - n_threads = MIN(n_threads, omp_get_max_threads()); - n_threads = MIN(n_threads, omp_get_thread_limit()); -#endif size_t work_size = 0; @@ -19676,9 +19670,20 @@ static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state * enum ggml_status compute_status = GGML_STATUS_SUCCESS; #ifdef GGML_USE_OPENMP -#pragma omp parallel num_threads(n_threads) - { - ggml_graph_compute_thread(&workers[omp_get_thread_num()]); + if (n_threads > 1) { + #pragma omp parallel num_threads(n_threads) + { + #pragma omp single + { + // update the number of threads from the actual number of threads that we got from OpenMP + n_threads = omp_get_num_threads(); + workers[0].shared->n_threads = n_threads; + workers[0].shared->n_active = n_threads; + } + ggml_graph_compute_thread(&workers[omp_get_thread_num()]); + } + } else { + ggml_graph_compute_thread(&workers[0]); } #else // create thread pool @@ -19724,7 +19729,12 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl } } - const int n_threads = cplan->n_threads; + int n_threads = cplan->n_threads; + +#if defined(GGML_USE_OPENMP) + n_threads = MIN(n_threads, omp_get_max_threads()); + n_threads = MIN(n_threads, omp_get_thread_limit()); +#endif struct ggml_compute_state_shared state_shared = { /*.cgraph =*/ cgraph, From 377bc783414eab809aa4dd3043e855fd4df1b877 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 30 May 2024 09:58:08 +0200 Subject: [PATCH 4/9] clear numa affinity for main thread even with openmp --- ggml.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml.c b/ggml.c index 1ec3e144c37..eb894a21f78 100644 --- a/ggml.c +++ b/ggml.c @@ -19698,9 +19698,6 @@ static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state * // this is a work thread too ggml_graph_compute_thread(&workers[0]); - // don't leave affinity set on the main thread - clear_numa_thread_affinity(); - // join or kill thread pool if (n_threads > 1) { for (int j = 1; j < n_threads; j++) { @@ -19710,6 +19707,9 @@ static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state * } } #endif + // don't leave affinity set on the main thread + clear_numa_thread_affinity(); + for (int j = 0; j < n_threads; j++) { if (workers[j].ec != GGML_STATUS_SUCCESS) { compute_status = workers[j].ec; From 5ddbd1843db1a775f12f74fb233a2ef193364fe9 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 30 May 2024 10:01:37 +0200 Subject: [PATCH 5/9] enable openmp by default --- CMakeLists.txt | 2 +- Makefile | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0abc78f4c7a..7ae457a215e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -125,7 +125,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)") option(LLAMA_KOMPUTE "llama: use Kompute" OFF) option(LLAMA_RPC "llama: use RPC" OFF) -option(LLAMA_OPENMP "llama: use OpenMP" OFF) +option(LLAMA_OPENMP "llama: use OpenMP" ON) option(LLAMA_SYCL "llama: use SYCL" OFF) option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF) set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device") diff --git a/Makefile b/Makefile index 9d7d7f965c2..6ca9ea4b6b5 100644 --- a/Makefile +++ b/Makefile @@ -400,11 +400,11 @@ ifndef LLAMA_NO_ACCELERATE endif endif # LLAMA_NO_ACCELERATE -ifdef LLAMA_OPENMP +ifndef LLAMA_NO_OPENMP MK_CPPFLAGS += -DGGML_USE_OPENMP MK_CFLAGS += -fopenmp MK_CXXFLAGS += -fopenmp -endif # LLAMA_OPENMP +endif # LLAMA_NO_OPENMP ifdef LLAMA_OPENBLAS MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas) From 5970a26d6637cce19e8093f770d8c5e75cafcab0 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 30 May 2024 10:26:02 +0200 Subject: [PATCH 6/9] fix msvc build --- CMakeLists.txt | 4 +--- ggml.c | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7ae457a215e..5355adaa441 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -301,9 +301,7 @@ if (LLAMA_OPENMP) if(OpenMP_FOUND) message(STATUS "OpenMP found") add_compile_definitions(GGML_USE_OPENMP) - - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX) else() message(WARNING "OpenMP not found") endif() diff --git a/ggml.c b/ggml.c index eb894a21f78..f07b6b83f1d 100644 --- a/ggml.c +++ b/ggml.c @@ -19733,7 +19733,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl #if defined(GGML_USE_OPENMP) n_threads = MIN(n_threads, omp_get_max_threads()); - n_threads = MIN(n_threads, omp_get_thread_limit()); #endif struct ggml_compute_state_shared state_shared = { From f1772c9973f89532f61ecf6a9956aa5b1aeb4237 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 30 May 2024 11:46:28 +0200 Subject: [PATCH 7/9] disable openmp on macos --- CMakeLists.txt | 2 +- Makefile | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5355adaa441..0bfa2d4c378 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -298,7 +298,7 @@ endif() if (LLAMA_OPENMP) find_package(OpenMP) - if(OpenMP_FOUND) + if (OpenMP_FOUND) message(STATUS "OpenMP found") add_compile_definitions(GGML_USE_OPENMP) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX) diff --git a/Makefile b/Makefile index 6ca9ea4b6b5..39ba2eb9cff 100644 --- a/Makefile +++ b/Makefile @@ -57,6 +57,8 @@ ifeq ($(UNAME_S),Darwin) LLAMA_METAL := 1 endif + LLAMA_NO_OPENMP := 1 + ifneq ($(UNAME_P),arm) SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null) ifeq ($(SYSCTL_M),1) From 62855ca3f63d260310590d7c62b983e0dcec8d09 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 30 May 2024 16:55:16 +0200 Subject: [PATCH 8/9] ci : disable openmp with thread sanitizer --- .github/workflows/build.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 7b616281b6f..e824136a56e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -294,12 +294,22 @@ jobs: - name: Build id: cmake_build + if: ${{ matrix.sanitizer != 'THREAD' }} run: | mkdir build cd build cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} cmake --build . --config ${{ matrix.build_type }} -j $(nproc) + - name: Build (no OpenMP) + id: cmake_build_no_openmp + if: ${{ matrix.sanitizer == 'THREAD' }} + run: | + mkdir build + cd build + cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DLLAMA_OPENMP=OFF + cmake --build . --config ${{ matrix.build_type }} -j $(nproc) + - name: Test id: cmake_test run: | From e0b077d4daf9a3003e80bcddafda2da34bbcd1da Mon Sep 17 00:00:00 2001 From: slaren Date: Mon, 3 Jun 2024 14:18:14 +0200 Subject: [PATCH 9/9] Update ggml.c Co-authored-by: Georgi Gerganov --- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index f07b6b83f1d..623eba3aa9a 100644 --- a/ggml.c +++ b/ggml.c @@ -19666,7 +19666,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa return cplan; } -static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state * workers, int n_threads){ +static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state * workers, int n_threads) { enum ggml_status compute_status = GGML_STATUS_SUCCESS; #ifdef GGML_USE_OPENMP