refactor mlock/mmap/directio into load-mode #20834
taronaeo wants to merge 33 commits into ggml-org:master
Conversation
I think this makes sense. The modes are mutually exclusive, so just flags creates a lot of overlap or impossible configurations. mlock means mmap+mlock, right? Opinions from other maintainers?
Yep, I've updated the PR description to showcase the tests that ensure feature parity. The latest push also includes some bugfixes I found while doing the feature parity check, and updated all the documentation to the latest with

@ggml-org/maintainers What do you think about this change?
Conceptually fine I'd say.
I think in the long-run it makes sense to implement some kind of "backend-specific" loading functionality instead of keeping the modes mutually exclusive (i.e. the CPU backend can use
I've just pushed changes to rebase; PTAL again. RE the unrelated changes in this PR, these are artifacts from running
Yep, the deprecated flags all still work as per normal.

For llama-cli / llama-completion:

```shell
$ build/bin/llama-completion -m ~/Documents/hf_models/deepseek-r1-distill-qwen-1.5b-bf16.gguf --mmap -lm none 2>&1 | grep mmap
DEPRECATED: --mmap and --no-mmap are deprecated. use --load-mode mmap instead
load_tensors: loading model tensors, this can take a while... (mmap = false, direct_io = false)
```

For llama-bench:

```shell
$ build/bin/llama-bench -m ~/Documents/hf_models/deepseek-r1-distill-qwen-1.5b-bf16.gguf --mmap 1 -lm none
ggml_metal_device_init: tensor API disabled for pre-M5 and pre-A19 devices
ggml_metal_library_init: using embedded metal library
ggml_metal_library_init: loaded in 0.012 sec
ggml_metal_rsets_init: creating a residency set collection (keep_alive = 180 s)
ggml_metal_device_init: GPU name: MTL0
ggml_metal_device_init: GPU family: MTLGPUFamilyApple7 (1007)
ggml_metal_device_init: GPU family: MTLGPUFamilyCommon3 (3003)
ggml_metal_device_init: GPU family: MTLGPUFamilyMetal4 (5002)
ggml_metal_device_init: simdgroup reduction   = true
ggml_metal_device_init: simdgroup matrix mul. = true
ggml_metal_device_init: has unified memory    = true
ggml_metal_device_init: has bfloat            = true
ggml_metal_device_init: has tensor            = false
ggml_metal_device_init: use residency sets    = true
ggml_metal_device_init: use shared buffers    = true
ggml_metal_device_init: recommendedMaxWorkingSetSize = 26800.60 MB
DEPRECATED: -mmp and --mmap are deprecated. use --load-mode mmap instead
```
| model | size | params | backend | threads | lm | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | ------: | ----: | --------------: | -------------------: |
| qwen2 1.5B BF16 | 3.31 GiB | 1.78 B | MTL,BLAS | 8 | mmap | pp512 | 1217.39 ± 26.36 |
| qwen2 1.5B BF16 | 3.31 GiB | 1.78 B | MTL,BLAS | 8 | mmap | tg128 | 49.02 ± 0.91 |
| qwen2 1.5B BF16 | 3.31 GiB | 1.78 B | MTL,BLAS | 8 | none | pp512 | 1231.30 ± 3.41 |
| qwen2 1.5B BF16 | 3.31 GiB | 1.78 B | MTL,BLAS | 8 | none | tg128 | 48.87 ± 0.18 |
I agree. Conceptually I think it should go like this:
Maybe in another discussion/PR? :) Edit: Addressed the usage of deprecated flags.
We've previously tried defaulting to DirectIO and this was a bad idea. It seems to fail on a not insignificant number of configurations out there.
This is a very rare configuration. I think it makes sense to disable mmap by default, or at least if a GPU is detected, but probably not DirectIO.
Overall this looks good; I tried it out locally as well. There are some unrelated diffs which should be fixed. Also, you need to change this log line and probably the surrounding code:
Lines 2969 to 2970 in e21cdc1
I think my reply got buried in the PR history, but to quote my previous comments,
Nevertheless, I've manually gone through the documentation changes made by

I've updated it to log this way. Let me know if this is the desired log message.
What I meant was: Lines 9303 to 9305 in 006809f
```cpp
/*.load_mode                   =*/ LLAMA_LOAD_MODE_MMAP,
/*.main_gpu                    =*/ 0,
/*.tensor_split                =*/ nullptr,
/*.progress_callback           =*/ nullptr,
/*.progress_callback_user_data =*/ nullptr,
/*.kv_overrides                =*/ nullptr,
/*.vocab_only                  =*/ false,
/*.use_mmap                    =*/ true,
/*.use_direct_io               =*/ false,
/*.use_mlock                   =*/ false,
```
What I meant was that `llama_model_params` hasn't gone through this change. I think we intend to keep that as-is?
It's changed here. I don't think it's beneficial to keep the respective parameters available, as the code has been refactored to use `load_mode` instead.
JohannesGaessler left a comment
Due to the changes in llama-bench.cpp it is also necessary to change scripts/compare-llama-bench.py.
Thanks! I forgot about that. The output for `compare-llama-bench.py`:

| Model | Load mode | Test | t/s master | t/s master | Speedup |
|:----------------|:------------------|:-------|-------------:|-------------:|----------:|
| qwen2 1.5B BF16 | mmap | pp512 | 1232.05 | 1232.05 | 1.00 |
| qwen2 1.5B BF16 | mmap | tg128 | 49.37 | 49.37 | 1.00 |
| qwen2 1.5B BF16 | none | pp512 | 1233.24 | 1233.24 | 1.00 |
| qwen2 1.5B BF16 | none | tg128 | 49.09 | 49.09 | 1.00 |
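One hypothetical way such a comparison script could normalise the new column while staying compatible with results from older builds (illustrative only; field names are assumptions, not the actual `compare-llama-bench.py` code):

```python
# Hypothetical sketch: derive a "load mode" label from a benchmark row,
# falling back to the legacy boolean field for results from old builds.
def load_mode_label(row: dict) -> str:
    if "lm" in row:  # assumed name of the new llama-bench field
        return str(row["lm"])
    # old results only carried a use_mmap boolean (mmap was the default)
    return "mmap" if row.get("use_mmap", True) else "none"

print(load_mode_label({"lm": "mmap"}))       # mmap
print(load_mode_label({"use_mmap": False}))  # none
```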
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
I had considered this while writing this refactor. IMO it would not be the best option here, since mmap/mlock are mutually exclusive with direct-io. If we were to implement this as bit flags, I think we would run into the same problem we are trying to fix in this refactor, albeit now in a bit mask instead of separate feature flags. I still think an enum is more appropriate in this scenario. Let me know what you think.
I have just rebased this PR with
Ref: #20211 (comment)
Obsoletes: #20461
This PR overhauls the three separate loading modes (mlock, mmap, and direct-io) into a single `-lm`/`--load-mode` option to simplify the logic. While working on #20461, I realised that it became quite complex to maintain multiple loading modes when they are mutually exclusive of one another. This PR solves that by allowing only one loading mode to exist at a time.

**Flags**

`--mlock`, `--mmap`, `--direct-io` and their negative flags have been marked as deprecated, with help messages informing the user to use the new `--load-mode`.

**Verification**
To verify that this refactor did not break any existing codepaths, I have added the following debug statements to verify that the corresponding system calls are registered correctly.
Click to expand patch file for codepath verification
- `--mmap` or `--load-mode mmap`: `$ build/bin/llama-completion -hf ibm-granite/granite-3.3-2b-instruct-GGUF:Q4_K_M -n 15 --seed 42 --temp 0 -p "Sing me a birthday song" -no-cnv --mmap` (outputs compared between `upstream/master` and `pr/20834`)
- `--mlock` or `--load-mode mlock`: `$ build/bin/llama-completion -hf ibm-granite/granite-3.3-2b-instruct-GGUF:Q4_K_M -n 15 --seed 42 --temp 0 -p "Sing me a birthday song" -no-cnv --mlock` (outputs compared between `upstream/master` and `pr/20834`)
- `--direct-io` or `--load-mode dio`: `$ build/bin/llama-completion -hf ibm-granite/granite-3.3-2b-instruct-GGUF:Q4_K_M -n 15 --seed 42 --temp 0 -p "Sing me a birthday song" -no-cnv --direct-io` (outputs compared between `upstream/master` and `pr/20834`)

Responsible AI Disclosure: AI was used to write debugger code for `llama-mmap.cpp` to ensure feature parity between `upstream/master` and this refactor. AI was also used to identify affected lines within the refactor, but changes were made by a human.