From d89f8dd0ea120c0d7701d83f12f69baaa7fdd16b Mon Sep 17 00:00:00 2001 From: Yuri Khrustalev Date: Mon, 30 Mar 2026 18:43:11 -0700 Subject: [PATCH 1/6] server: respect the ignore eos flag --- tools/server/server-context.cpp | 3 ++ tools/server/server-context.h | 3 ++ tools/server/server-task.cpp | 3 +- tools/server/server-task.h | 1 + tools/server/tests/unit/test_ignore_eos.py | 43 ++++++++++++++++++++++ 5 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 tools/server/tests/unit/test_ignore_eos.py diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 6f737d94d02..d002ea1c3b4 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -3000,6 +3000,8 @@ server_context_meta server_context::get_meta() const { /* fim_rep_token */ llama_vocab_fim_rep(impl->vocab), /* fim_sep_token */ llama_vocab_fim_sep(impl->vocab), + /* logit_bias_eog */ impl->params_base.sampling.logit_bias_eog, + /* model_vocab_type */ llama_vocab_type(impl->vocab), /* model_vocab_n_tokens */ llama_vocab_n_tokens(impl->vocab), /* model_n_ctx_train */ llama_model_n_ctx_train(impl->model), @@ -3084,6 +3086,7 @@ std::unique_ptr<server_http_res> server_routes::handle_completions_impl( ctx_server.vocab, params, meta->slot_n_ctx, + meta->logit_bias_eog, data); task.id_slot = json_value(data, "id_slot", -1); diff --git a/tools/server/server-context.h b/tools/server/server-context.h index a4d2201cbed..fa71ace9785 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -39,6 +39,9 @@ struct server_context_meta { llama_token fim_rep_token; llama_token fim_sep_token; + // sampling + std::vector<llama_logit_bias> logit_bias_eog; + // model meta enum llama_vocab_type model_vocab_type; int32_t model_vocab_n_tokens; diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 3018ac90f8c..8dada579941 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -239,6 +239,7 @@ task_params 
server_task::params_from_json_cmpl( const llama_vocab * vocab, const common_params & params_base, const int n_ctx_slot, + const std::vector<llama_logit_bias> & logit_bias_eog, const json & data) { task_params params; @@ -562,7 +563,7 @@ task_params server_task::params_from_json_cmpl( if (params.sampling.ignore_eos) { params.sampling.logit_bias.insert( params.sampling.logit_bias.end(), - defaults.sampling.logit_bias_eog.begin(), defaults.sampling.logit_bias_eog.end()); + logit_bias_eog.begin(), logit_bias_eog.end()); } } diff --git a/tools/server/server-task.h b/tools/server/server-task.h index a49ddb594b9..0b319142aa5 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -209,6 +209,7 @@ struct server_task { const llama_vocab * vocab, const common_params & params_base, const int n_ctx_slot, + const std::vector<llama_logit_bias> & logit_bias_eog, const json & data); // utility function diff --git a/tools/server/tests/unit/test_ignore_eos.py b/tools/server/tests/unit/test_ignore_eos.py new file mode 100644 index 00000000000..f40faf5a829 --- /dev/null +++ b/tools/server/tests/unit/test_ignore_eos.py @@ -0,0 +1,43 @@ +import pytest +from utils import * + +server = ServerPreset.tinyllama2() + + +@pytest.fixture(autouse=True) +def create_server(): + global server + server = ServerPreset.tinyllama2() + + +def test_ignore_eos_populates_logit_bias(): + """ignore_eos=true must add EOG logit biases to generation_settings.""" + global server + server.start() + res = server.make_request("POST", "/completion", data={ + "n_predict": 8, + "prompt": "Once upon a time", + "ignore_eos": True, + "temperature": 0.0, + }) + assert res.status_code == 200 + # EOG token biases must be present with -inf bias + logit_bias = res.body["generation_settings"]["logit_bias"] + assert len(logit_bias) > 0 + for entry in logit_bias: + assert entry["bias"] is None # null in JSON represents -inf + + +def test_ignore_eos_false_no_logit_bias(): + """ignore_eos=false (default) must NOT add EOG logit biases.""" + global 
server + server.start() + res = server.make_request("POST", "/completion", data={ + "n_predict": 8, + "prompt": "Once upon a time", + "ignore_eos": False, + "temperature": 0.0, + }) + assert res.status_code == 200 + logit_bias = res.body["generation_settings"]["logit_bias"] + assert len(logit_bias) == 0 From 851c31e4e641d60b24f35afd6e4f20b11b2d0ede Mon Sep 17 00:00:00 2001 From: Yuri Khrustalev Date: Wed, 8 Apr 2026 16:12:21 -0400 Subject: [PATCH 2/6] ci: add android arm64 build and release --- .github/workflows/build.yml | 50 ++++++++++++++++++++++++ .github/workflows/release.yml | 73 +++++++++++++++++++++++++++++++++++ 2 files changed, 123 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index f4ae3675602..5e0b2df7db4 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -267,6 +267,56 @@ jobs: wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf ./bin/llama-completion -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 + android-arm64: + runs-on: ubuntu-latest + + env: + NDK_VERSION: "29.0.14206865" + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: ccache + uses: ggml-org/ccache-action@v1.2.21 + with: + key: android-arm64 + evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + + - name: Set up JDK + uses: actions/setup-java@v5 + with: + java-version: 17 + distribution: temurin + + - name: Setup Android SDK + uses: android-actions/setup-android@9fc6c4e9069bf8d3d10b2204b1fb8f6ef7065407 # v3 + with: + log-accepted-android-sdk-licenses: false + + - name: Install NDK + run: | + sdkmanager "ndk;${{ env.NDK_VERSION }}" + echo "ANDROID_NDK=${ANDROID_SDK_ROOT}/ndk/${{ env.NDK_VERSION }}" >> $GITHUB_ENV + + - name: Build + id: cmake_build + run: | + cmake -B build \ + -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=arm64-v8a \ + 
-DANDROID_PLATFORM=android-28 \ + -DLLAMA_FATAL_WARNINGS=ON \ + -DGGML_BACKEND_DL=ON \ + -DGGML_NATIVE=OFF \ + -DGGML_CPU_ALL_VARIANTS=ON \ + -DGGML_OPENMP=OFF \ + -DLLAMA_OPENSSL=OFF \ + -DGGML_RPC=ON + time cmake --build build --config Release -j $(nproc) + ubuntu-latest-rpc: runs-on: ubuntu-latest diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 8263c55ac5f..8046732a21d 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -236,6 +236,75 @@ jobs: path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz + android-arm64: + runs-on: ubuntu-latest + + env: + NDK_VERSION: "29.0.14206865" + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: ccache + uses: ggml-org/ccache-action@v1.2.21 + with: + key: android-arm64 + evict-old-files: 1d + + - name: Set up JDK + uses: actions/setup-java@v5 + with: + java-version: 17 + distribution: temurin + + - name: Setup Android SDK + uses: android-actions/setup-android@9fc6c4e9069bf8d3d10b2204b1fb8f6ef7065407 # v3 + with: + log-accepted-android-sdk-licenses: false + + - name: Install NDK + run: | + sdkmanager "ndk;${{ env.NDK_VERSION }}" + echo "ANDROID_NDK=${ANDROID_SDK_ROOT}/ndk/${{ env.NDK_VERSION }}" >> $GITHUB_ENV + + - name: Build + id: cmake_build + run: | + cmake -B build \ + -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_PLATFORM=android-28 \ + -DCMAKE_INSTALL_RPATH='$ORIGIN' \ + -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \ + -DGGML_BACKEND_DL=ON \ + -DGGML_NATIVE=OFF \ + -DGGML_CPU_ALL_VARIANTS=ON \ + -DLLAMA_FATAL_WARNINGS=ON \ + -DGGML_OPENMP=OFF \ + -DLLAMA_OPENSSL=OFF \ + ${{ env.CMAKE_ARGS }} + cmake --build build --config Release -j $(nproc) + + - name: Determine tag name + id: tag + uses: ./.github/actions/get-tag-name + + - name: Pack artifacts + id: 
pack_artifacts + run: | + cp LICENSE ./build/bin/ + tar -czvf llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin . + + - name: Upload artifacts + uses: actions/upload-artifact@v6 + with: + path: llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz + name: llama-bin-android-arm64.tar.gz + ubuntu-24-openvino: runs-on: ubuntu-24.04 @@ -971,6 +1040,7 @@ jobs: - ubuntu-cpu - ubuntu-vulkan - ubuntu-24-openvino + - android-arm64 - macOS-cpu - ios-xcode-build - openEuler-cann @@ -1059,6 +1129,9 @@ jobs: - [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz) - [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz) + **Android:** + - [Android arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz) + **Windows:** - [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip) - [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip) From d606ba0d82672c0b38c7f5e88eda32076f2c9f52 Mon Sep 17 00:00:00 2001 From: Yuri Khrustalev Date: Wed, 15 Apr 2026 12:28:49 -0400 Subject: [PATCH 3/6] patch --- tools/mtmd/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 3bafde178de..bf09ca3ea43 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -81,6 +81,11 @@ if (NOT MSVC) 
target_compile_options(mtmd PRIVATE -Wno-cast-qual) endif() +if (ANDROID) + # miniaudio.h defines ma_android_sdk_version() without a prior prototype + target_compile_options(mtmd PRIVATE -Wno-missing-prototypes) +endif() + if (TARGET BUILD_INFO) add_dependencies(mtmd BUILD_INFO) add_dependencies(mtmd-helper BUILD_INFO) From 2d9336c27a850486722d1f60ee2860d5fd9609ae Mon Sep 17 00:00:00 2001 From: Yuri Khrustalev Date: Wed, 15 Apr 2026 14:23:18 -0400 Subject: [PATCH 4/6] pin android-setup actions to v4 --- .github/workflows/build-android.yml | 2 +- .github/workflows/build.yml | 2 +- .github/workflows/release.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-android.yml b/.github/workflows/build-android.yml index 5fc24d8d349..b38a793f186 100644 --- a/.github/workflows/build-android.yml +++ b/.github/workflows/build-android.yml @@ -51,7 +51,7 @@ jobs: distribution: zulu - name: Setup Android SDK - uses: android-actions/setup-android@9fc6c4e9069bf8d3d10b2204b1fb8f6ef7065407 # v3 + uses: android-actions/setup-android@40fd30fb8d7440372e1316f5d1809ec01dcd3699 # v4.0.1 with: log-accepted-android-sdk-licenses: false diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b081542be7f..863b2e5739b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -292,7 +292,7 @@ jobs: distribution: temurin - name: Setup Android SDK - uses: android-actions/setup-android@9fc6c4e9069bf8d3d10b2204b1fb8f6ef7065407 # v3 + uses: android-actions/setup-android@40fd30fb8d7440372e1316f5d1809ec01dcd3699 # v4.0.1 with: log-accepted-android-sdk-licenses: false diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 1fe8ad914e0..9804aafeb4c 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -262,7 +262,7 @@ jobs: distribution: temurin - name: Setup Android SDK - uses: android-actions/setup-android@9fc6c4e9069bf8d3d10b2204b1fb8f6ef7065407 # v3 + uses: 
android-actions/setup-android@40fd30fb8d7440372e1316f5d1809ec01dcd3699 # v4.0.1 with: log-accepted-android-sdk-licenses: false From b92d00614fef371d21099c3654051ceebea2bf37 Mon Sep 17 00:00:00 2001 From: Yuri Khrustalev Date: Thu, 16 Apr 2026 16:05:42 -0400 Subject: [PATCH 5/6] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- .github/workflows/build.yml | 2 +- .github/workflows/release.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 863b2e5739b..bb722a76b57 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -313,7 +313,7 @@ jobs: -DGGML_NATIVE=OFF \ -DGGML_CPU_ALL_VARIANTS=ON \ -DGGML_OPENMP=OFF \ - -DLLAMA_OPENSSL=OFF \ + -DLLAMA_BUILD_BORINGSSL=ON \ -DGGML_RPC=ON time cmake --build build --config Release -j $(nproc) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 9804aafeb4c..9ab269c44ed 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -285,7 +285,7 @@ jobs: -DGGML_CPU_ALL_VARIANTS=ON \ -DLLAMA_FATAL_WARNINGS=ON \ -DGGML_OPENMP=OFF \ - -DLLAMA_OPENSSL=OFF \ + -DLLAMA_BUILD_BORINGSSL=ON \ ${{ env.CMAKE_ARGS }} cmake --build build --config Release -j $(nproc) From cdba641015611405ae180370d32a2338e99364ee Mon Sep 17 00:00:00 2001 From: Yuri Khrustalev Date: Thu, 16 Apr 2026 20:07:49 -0400 Subject: [PATCH 6/6] lf in the suggestion --- .github/workflows/build.yml | 2 +- .github/workflows/release.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 7433da76318..28c8665bd8b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -313,7 +313,7 @@ jobs: -DGGML_NATIVE=OFF \ -DGGML_CPU_ALL_VARIANTS=ON \ -DGGML_OPENMP=OFF \ - -DLLAMA_BUILD_BORINGSSL=ON \ + 
-DLLAMA_BUILD_BORINGSSL=ON \ -DGGML_RPC=ON time cmake --build build --config Release -j $(nproc) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 9ab269c44ed..8a49715b395 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -285,7 +285,7 @@ jobs: -DGGML_CPU_ALL_VARIANTS=ON \ -DLLAMA_FATAL_WARNINGS=ON \ -DGGML_OPENMP=OFF \ - -DLLAMA_BUILD_BORINGSSL=ON \ + -DLLAMA_BUILD_BORINGSSL=ON \ ${{ env.CMAKE_ARGS }} cmake --build build --config Release -j $(nproc)