From e4d7c6cb2ed37e97153e8834f65ba91bdbf9fe93 Mon Sep 17 00:00:00 2001
From: Martin Evans <martindevans@gmail.com>
Date: Sat, 14 Mar 2026 14:30:13 +0000
Subject: [PATCH 1/2] - Updated binaries to
 73c9eb8ceda397b651dbb6661b2935f0283a2b1d (Qwen3.5 support!)  - Removed
 deprecated native func `llama_adapter_lora_free` and related managed method
 `LoraAdapter.Unload`

---
 LLama/Batched/BatchedExecutor.cs         | 17 ++++---
 LLama/LLamaEmbedder.cs                   |  5 +-
 LLama/LLamaReranker.cs                   |  6 +--
 LLama/LLamaSharp.csproj                  |  2 +-
 LLama/Native/LLamaFtype.cs               |  7 ++-
 LLama/Native/LLamaModelQuantizeParams.cs | 10 ++++
 LLama/Native/LoraAdapter.cs              | 20 --------
 LLama/Native/NativeApi.Mtmd.cs           |  2 +-
 LLama/Native/NativeApi.cs                | 36 ++++++--------
 LLama/Native/SafeLLamaContextHandle.cs   | 61 +++++++++++++++++++++---
 LLama/Native/SafeLlamaModelHandle.cs     |  6 +--
 llama.cpp                                |  2 +-
 12 files changed, 101 insertions(+), 73 deletions(-)
diff --git a/LLama/Batched/BatchedExecutor.cs b/LLama/Batched/BatchedExecutor.cs
index 1a6698b1a..db9be6a7a 100644
--- a/LLama/Batched/BatchedExecutor.cs
+++ b/LLama/Batched/BatchedExecutor.cs
@@ -43,7 +43,12 @@ public sealed class BatchedExecutor
     /// The <see cref="LLamaWeights"/> this executor is using
     /// </summary>
     public LLamaWeights Model { get; }
-    
+
+    /// <summary>
+    /// The optional <see cref="MtmdWeights"/> this executor is using
+    /// </summary>
+    public MtmdWeights? ClipModel { get; }
+
     /// <summary>
     /// Get the number of tokens in the batch, waiting for <see cref="Infer"/> to be called
     /// </summary>
@@ -79,12 +84,8 @@ public int BatchedTokenCount
     /// </summary>
     /// <param name="model">The model to use</param>
     /// <param name="contextParams">Parameters to create a new context</param>
-    public BatchedExecutor(LLamaWeights model, IContextParams contextParams)
-        : this(model, contextParams, null)
-    {
-    }
-
-    public BatchedExecutor(LLamaWeights model, IContextParams contextParams, MtmdWeights? clipModel)
+    /// <param name="clipModel">Clip model to use for multimodal capabilities</param>
+    public BatchedExecutor(LLamaWeights model, IContextParams contextParams, MtmdWeights? clipModel = null)
     {
         Model = model;
         Context = model.CreateContext(contextParams);
@@ -92,8 +93,6 @@ public BatchedExecutor(LLamaWeights model, IContextParams contextParams, MtmdWei
         Epoch = 1;
     }
 
-    public MtmdWeights? ClipModel { get; }
-
     /// <summary>
     /// Start a new <see cref="Conversation"/>
     /// </summary>
diff --git a/LLama/LLamaEmbedder.cs b/LLama/LLamaEmbedder.cs
index eee9a01e9..5f6a2878d 100644
--- a/LLama/LLamaEmbedder.cs
+++ b/LLama/LLamaEmbedder.cs
@@ -1,14 +1,11 @@
 using System;
 using System.Collections.Generic;
-using System.Linq;
 using System.Threading;
 using System.Threading.Tasks;
 using LLama.Abstractions;
 using LLama.Exceptions;
 using LLama.Native;
-using Microsoft.Extensions.AI;
 using Microsoft.Extensions.Logging;
-using static System.Net.Mime.MediaTypeNames;
 
 namespace LLama;
 
@@ -79,7 +76,7 @@ public async Task<IReadOnlyList<float[]>> GetEmbeddings(string input, Cancellati
             Context.Dispose();
 
         Context = _weights.CreateContext(_params, _logger);
-        NativeApi.llama_set_embeddings(Context.NativeHandle, true);
+        Context.NativeHandle.SetEmbeddings(true);
 
         // Add all of the tokens to the batch
         var tokens = Context.Tokenize(input, special: true);
diff --git a/LLama/LLamaReranker.cs b/LLama/LLamaReranker.cs
index 16a206c40..58919fd0a 100644
--- a/LLama/LLamaReranker.cs
+++ b/LLama/LLamaReranker.cs
@@ -1,11 +1,7 @@
 using System;
 using System.Collections.Generic;
-using System.IO;
-using System.Linq;
-using System.Text;
 using System.Threading;
 using System.Threading.Tasks;
-using System.Xml.Linq;
 using LLama.Abstractions;
 using LLama.Exceptions;
 using LLama.Native;
@@ -44,7 +40,7 @@ public LLamaReranker(LLamaWeights weights, IContextParams @params, ILogger? logg
         if (@params.PoolingType != LLamaPoolingType.Rank)
             throw new NotSupportedException("Computing rank score, PoolingType must be equal to LLamaPoolingType.Rank");
         Context = weights.CreateContext(@params, logger);
-        NativeApi.llama_set_embeddings(Context.NativeHandle, true);
+        Context.NativeHandle.SetEmbeddings(true);
     }
 
     /// <inheritdoc />
diff --git a/LLama/LLamaSharp.csproj b/LLama/LLamaSharp.csproj
index 4bdf9289a..9eff34c4e 100644
--- a/LLama/LLamaSharp.csproj
+++ b/LLama/LLamaSharp.csproj
@@ -57,7 +57,7 @@
   </ItemGroup>
 
   <PropertyGroup>
-    <BinaryReleaseId>ff4affb4c1aa7eb4_v3</BinaryReleaseId>
+    <BinaryReleaseId>73c9eb8ceda397b</BinaryReleaseId>
   </PropertyGroup>
 
   <PropertyGroup>
diff --git a/LLama/Native/LLamaFtype.cs b/LLama/Native/LLamaFtype.cs
index 813bad1ae..d48be1855 100644
--- a/LLama/Native/LLamaFtype.cs
+++ b/LLama/Native/LLamaFtype.cs
@@ -206,7 +206,12 @@ public enum LLamaFtype
         /// except 1d tensors 
         /// </summary>
         LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38,
-        
+
+        /// <summary>
+        /// Except 1d tensors
+        /// </summary>
+        LLAMA_FTYPE_MOSTLY_NVFP4 = 39,
+
         /// <summary>
         /// File type was not specified
         /// </summary>
diff --git a/LLama/Native/LLamaModelQuantizeParams.cs b/LLama/Native/LLamaModelQuantizeParams.cs
index 857f0cfb9..d47c78f11 100644
--- a/LLama/Native/LLamaModelQuantizeParams.cs
+++ b/LLama/Native/LLamaModelQuantizeParams.cs
@@ -79,6 +79,16 @@ public bool keep_split
         }
         private sbyte _keep_split;
 
+        /// <summary>
+        /// calculate and show the final quantization size without performing quantization
+        /// </summary>
+        public bool dry_run
+        {
+            get => Convert.ToBoolean(_dry_run);
+            set => _dry_run = Convert.ToSByte(value);
+        }
+        private sbyte _dry_run;
+
         /// <summary>
         /// pointer to importance matrix data
         /// </summary>
diff --git a/LLama/Native/LoraAdapter.cs b/LLama/Native/LoraAdapter.cs
index 8fdd649a3..9bc24ce17 100644
--- a/LLama/Native/LoraAdapter.cs
+++ b/LLama/Native/LoraAdapter.cs
@@ -22,30 +22,10 @@ public class LoraAdapter
     /// </summary>
     internal IntPtr Pointer { get; }
 
-    /// <summary>
-    /// Indicates if this adapter has been unloaded
-    /// </summary>
-    internal bool Loaded { get; private set; }
-
     internal LoraAdapter(SafeLlamaModelHandle model, string path, IntPtr nativePtr)
     {
         Model = model;
         Path = path;
         Pointer = nativePtr;
-        Loaded = true;
-    }
-
-    /// <summary>
-    /// Unload this adapter
-    /// </summary>
-    public void Unload()
-    {
-        Loaded = false;
-        llama_adapter_lora_free(Pointer);
-
-        // Manually free a LoRA adapter. loaded adapters will be free when the associated model is deleted
-        [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
-        [Obsolete("adapters are now freed together with the associated model")]
-        static extern void llama_adapter_lora_free(IntPtr adapter);
     }
 }
\ No newline at end of file
diff --git a/LLama/Native/NativeApi.Mtmd.cs b/LLama/Native/NativeApi.Mtmd.cs
index 827c0e1b0..5eb75028f 100644
--- a/LLama/Native/NativeApi.Mtmd.cs
+++ b/LLama/Native/NativeApi.Mtmd.cs
@@ -168,7 +168,7 @@ internal static unsafe void mtmd_bitmap_set_id(SafeMtmdEmbed bitmap, string? id)
     // tokenize ----------------------------------------------------------
 
     /// <summary>
-    /// Native text structure consumed by <see cref="mtmd_tokenize"/>.
+    /// Native text structure consumed by <see cref="NativeApi.mtmd_tokenize(LLama.Native.SafeMtmdModelHandle,System.IntPtr,in LLama.Native.NativeApi.mtmd_input_text_native,System.IntPtr[],System.UIntPtr)"/>.
     /// </summary>
     internal unsafe struct mtmd_input_text_native
     {
diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs
index ce8c36197..381754103 100644
--- a/LLama/Native/NativeApi.cs
+++ b/LLama/Native/NativeApi.cs
@@ -132,19 +132,7 @@ public static void llama_empty_call()
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
         public static extern unsafe nuint llama_state_seq_load_file(SafeLLamaContextHandle ctx, string filepath, LLamaSeqId dest_seq_id, LLamaToken* tokens_out, nuint n_token_capacity, out nuint n_token_count_out);
 
-        /// <summary>
-        /// Set whether to use causal attention or not. If set to true, the model will only attend to the past tokens
-        /// </summary>
-        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-        public static extern void llama_set_causal_attn(SafeLLamaContextHandle ctx, [MarshalAs(UnmanagedType.U1)] bool causalAttn);
-
-        /// <summary>
-        /// Set whether the context outputs embeddings or not
-        /// </summary>
-        /// <param name="ctx"></param>
-        /// <param name="embeddings">If true, embeddings will be returned but logits will not</param>
-        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-        public static extern void llama_set_embeddings(SafeLLamaContextHandle ctx, [MarshalAs(UnmanagedType.U1)] bool embeddings);
+        
 
         /// <summary>
         /// Set abort callback
@@ -152,14 +140,6 @@ public static void llama_empty_call()
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
         public static extern void llama_set_abort_callback(SafeLLamaContextHandle ctx, IntPtr /* ggml_abort_callback */ abortCallback, IntPtr abortCallbackData);
 
-        /// <summary>
-        /// Get the n_seq_max for this context
-        /// </summary>
-        /// <param name="ctx"></param>
-        /// <returns></returns>
-        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-        public static extern uint llama_n_seq_max(SafeLLamaContextHandle ctx);
-
         /// <summary>
         /// Get all output token embeddings.
         /// When pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model, the embeddings for which
@@ -515,6 +495,18 @@ public static extern unsafe LLamaParamsFitStatus llama_params_fit(
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
         public static extern long llama_time_us();
 
-        
+        /* Directly exposes `ggml_tensor` and `gguf_context` which LLamaSharp does not currently support!
+         
+        typedef void (* llama_model_set_tensor_data_t) (struct ggml_tensor * tensor, void* userdata);
+
+        // Create a new model from GGUF metadata as well as a function to set the tensor data
+        //   - tensors are created as GGML_TYPE_F32 by default,
+        //     override by adding a tensor with the same name but a different type to the context
+        LLAMA_API struct llama_model * llama_model_init_from_user(
+        struct gguf_context * metadata,
+        llama_model_set_tensor_data_t set_tensor_data,    // function to initialize tensor data with
+        void* set_tensor_data_ud, // userdata for function
+        struct llama_model_params   params);
+        */
     }
 }
diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs
index 5ec78f053..71261eefb 100644
--- a/LLama/Native/SafeLLamaContextHandle.cs
+++ b/LLama/Native/SafeLLamaContextHandle.cs
@@ -37,7 +37,7 @@ public sealed class SafeLLamaContextHandle
         /// <summary>
         /// Get the number of maximum sequences allowed
         /// </summary>
-        public uint MaxSeq => NativeApi.llama_n_seq_max(this);
+        public uint MaxSeq => llama_n_seq_max(this);
 
         /// <summary>
         /// Get or set the number of threads used for generation of a single token.
@@ -355,6 +355,7 @@ static SafeLLamaContextHandle()
         /// <param name="buf_size"></param>
         /// <returns>The length of the value string (on success) -1 otherwise </returns>
         [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+        // ReSharper disable once InconsistentNaming
         private static extern int llama_adapter_meta_val_str(IntPtr adapter, string key, StringBuilder buf, UIntPtr buf_size);
         
         /// <summary>
@@ -374,6 +375,7 @@ static SafeLLamaContextHandle()
         /// <param name="buf_size"></param>
         /// <returns>The length of string i.e meta key (on success) -1 otherwise</returns>
         [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+        // ReSharper disable once InconsistentNaming
         private static extern int llama_adapter_meta_key_by_index(IntPtr adapter, int i, StringBuilder buf, UIntPtr buf_size);
         
         /// <summary>
@@ -385,6 +387,7 @@ static SafeLLamaContextHandle()
         /// <param name="buf_size"></param>
         /// <returns>The length of value string (on success) -1 otherwise</returns>
         [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+        // ReSharper disable once InconsistentNaming
         private static extern int llama_adapter_meta_val_by_index(IntPtr adapter, int i, StringBuilder buf,  UIntPtr buf_size);
 
         /// <summary>
@@ -424,6 +427,56 @@ static SafeLLamaContextHandle()
         /// <param name="warmup"></param>
         [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
         private static extern void llama_set_warmup(SafeLLamaContextHandle ctx, [MarshalAs(UnmanagedType.U1)] bool warmup);
+
+        /// <summary>
+        /// Set whether to use causal attention or not. If set to true, the model will only attend to the past tokens
+        /// </summary>
+        [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+        private static extern void llama_set_causal_attn(SafeLLamaContextHandle ctx, [MarshalAs(UnmanagedType.U1)] bool causalAttn);
+
+        /// <summary>
+        /// Set whether the context outputs embeddings or not
+        /// </summary>
+        /// <param name="ctx"></param>
+        /// <param name="embeddings">If true, embeddings will be returned but logits will not</param>
+        [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+        private static extern void llama_set_embeddings(SafeLLamaContextHandle ctx, [MarshalAs(UnmanagedType.U1)] bool embeddings);
+
+        /// <summary>
+        /// Get the n_seq_max for this context
+        /// </summary>
+        /// <param name="ctx"></param>
+        /// <returns></returns>
+        [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+        private static extern uint llama_n_seq_max(SafeLLamaContextHandle ctx);
+        #endregion
+
+        #region Setters
+        /// <summary>
+        /// Set whether the model is in warmup mode or not
+        /// If true, all model tensors are activated during <see cref="Decode(LLamaBatch)"/> to load and cache their weights.
+        /// </summary>
+        public void SetWarmup(bool value)
+        {
+            llama_set_warmup(this, value);
+        }
+
+        /// <summary>
+        /// Set whether to use causal attention or not. If set to true, the model will only attend to the past tokens
+        /// </summary>
+        public void SetCausalAttention(bool value)
+        {
+            llama_set_causal_attn(this, value);
+        }
+
+        /// <summary>
+        /// Set whether the context outputs embeddings or not
+        /// </summary>
+        /// <param name="value">If true, embeddings will be returned but logits will not</param>
+        public void SetEmbeddings(bool value)
+        {
+            llama_set_embeddings(this, value);
+        }
         #endregion
 
         #region LoRA
@@ -434,14 +487,10 @@ static SafeLLamaContextHandle()
         /// <exception cref="ArgumentException"></exception>
         public void SetLoraAdapters(params Span<(LoraAdapter Adapter, float Scale)> adapters)
         {
-            // Check adapters are all valid
+            // Check adapters are all valid and attached to this model
             foreach (var adapter in adapters)
-            {
                 if (adapter.Adapter.Model != ModelHandle)
                     throw new ArgumentException("Cannot add LoRA adapter which was loaded for a different model");
-                if (!adapter.Adapter.Loaded)
-                    throw new ArgumentException("Cannot add LoRA adapter which has been unloaded");
-            }
 
             // Copy data into buffers
             Span<IntPtr> adapterPtrs = stackalloc IntPtr[adapters.Length];
diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs
index 668074090..2a6855741 100644
--- a/LLama/Native/SafeLlamaModelHandle.cs
+++ b/LLama/Native/SafeLlamaModelHandle.cs
@@ -175,7 +175,7 @@ static SafeLlamaModelHandle()
         private static extern unsafe byte* llama_model_chat_template(SafeLlamaModelHandle model, string? name);
 
         /// <summary>
-        /// Load the model from a file
+        /// Load a model from a file
         /// If the file is split into multiple parts, the file name must follow this pattern: {name}-%05d-of-%05d.gguf
         /// If the split file name does not follow this pattern, use llama_model_load_from_splits
         /// </summary>
@@ -186,7 +186,7 @@ static SafeLlamaModelHandle()
         private static extern SafeLlamaModelHandle llama_model_load_from_file(string path, LLamaModelParams @params);
 
         /// <summary>
-        /// Load the model from multiple splits (support custom naming scheme)
+        /// Load a model from multiple splits (support custom naming scheme)
         /// The paths must be in the correct order
         /// </summary>
         /// <returns></returns>
@@ -460,7 +460,7 @@ private static int llama_model_meta_val_str(SafeLlamaModelHandle model, string k
         /// <param name="i"></param>
         /// <returns></returns>
         [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
-        private static extern string? llama_model_cls_label(SafeLlamaModelHandle model, uint i);
+        private static extern IntPtr /* char* */ llama_model_cls_label(SafeLlamaModelHandle model, uint i);
         #endregion
 
         #region LoRA
diff --git a/llama.cpp b/llama.cpp
index ff4affb4c..73c9eb8ce 160000
--- a/llama.cpp
+++ b/llama.cpp
@@ -1 +1 @@
-Subproject commit ff4affb4c1aa7eb4f28a0d9de1b205bd719802f2
+Subproject commit 73c9eb8ceda397b651dbb6661b2935f0283a2b1d

From 2a2d6d990a363d96133e619393a31559ec675c80 Mon Sep 17 00:00:00 2001
From: Martin Evans <martindevans@gmail.com>
Date: Fri, 17 Apr 2026 14:06:44 +0100
Subject: [PATCH 2/2] Updated to:
 https://github.com/ggml-org/llama.cpp/commit/3f7c29d318e317b63f54c558bc69803963d7d88c
 Build: https://github.com/SciSharp/LLamaSharp/actions/runs/24541043843

---
 LLama.Unittest/MtmdWeightsTests.cs       |  6 +--
 LLama/LLamaSharp.csproj                  |  2 +-
 LLama/MtmdWeights.cs                     |  4 +-
 LLama/Native/GPUSplitMode.cs             |  6 +++
 LLama/Native/LLamaFtype.cs               |  5 ++
 LLama/Native/LLamaModelImatrixData.cs    | 12 +++++
 LLama/Native/LLamaModelQuantizeParams.cs |  8 ++--
 LLama/Native/LLamaModelTensorOverride.cs |  9 ++++
 LLama/Native/LoraAdapter.cs              | 32 +++++++++++++
 LLama/Native/NativeApi.Mtmd.cs           | 59 +++++++++++++++++++++++-
 LLama/Native/NativeApi.cs                |  8 ++++
 LLama/Native/SafeLLamaContextHandle.cs   |  4 ++
 LLama/Native/SafeMtmdModelHandle.cs      |  2 +-
 13 files changed, 144 insertions(+), 13 deletions(-)
 create mode 100644 LLama/Native/LLamaModelImatrixData.cs
 create mode 100644 LLama/Native/LLamaModelTensorOverride.cs

diff --git a/LLama.Unittest/MtmdWeightsTests.cs b/LLama.Unittest/MtmdWeightsTests.cs
index 4a1a59c75..42ba52356 100644
--- a/LLama.Unittest/MtmdWeightsTests.cs
+++ b/LLama.Unittest/MtmdWeightsTests.cs
@@ -81,7 +81,7 @@ public void BasicPropertyChecks()
             Assert.True(_mtmdWeights.SupportsVision);
             Assert.False(_mtmdWeights.UsesMRope);
             Assert.True(_mtmdWeights.UsesNonCausalAttention);
-            Assert.Equal(-1, _mtmdWeights.AudioBitrate);
+            Assert.Equal(-1, _mtmdWeights.SampleRate);
         }
 
         [Fact,Trait("Category", "NoCI")]
@@ -143,8 +143,8 @@ public void TokenizeProvidesChunkMetadata()
             Assert.True(_mtmdWeights.SupportsVision);
             Assert.False(_mtmdWeights.SupportsAudio);
 
-            var audioBitrate = _mtmdWeights.AudioBitrate;
-            Assert.True(audioBitrate <= 0);
+            var audioSampleRate = _mtmdWeights.SampleRate;
+            Assert.True(audioSampleRate <= 0);
         }
     }
 }
diff --git a/LLama/LLamaSharp.csproj b/LLama/LLamaSharp.csproj
index 9eff34c4e..02c3e11c4 100644
--- a/LLama/LLamaSharp.csproj
+++ b/LLama/LLamaSharp.csproj
@@ -57,7 +57,7 @@
   </ItemGroup>
 
   <PropertyGroup>
-    <BinaryReleaseId>73c9eb8ceda397b</BinaryReleaseId>
+    <BinaryReleaseId>3f7c29d318e317b6</BinaryReleaseId>
   </PropertyGroup>
 
   <PropertyGroup>
diff --git a/LLama/MtmdWeights.cs b/LLama/MtmdWeights.cs
index 07e739a61..32dd6b572 100644
--- a/LLama/MtmdWeights.cs
+++ b/LLama/MtmdWeights.cs
@@ -137,9 +137,9 @@ public int DecodeImageChunk(IntPtr chunkPtr, SafeLLamaContextHandle llamaContext
     public bool UsesMRope => NativeHandle.DecodeUseMRope();
 
     /// <summary>
-    /// Gets the audio bitrate advertised by the model.
+    /// Gets the audio sample rate advertised by the model.
     /// </summary>
-    public int AudioBitrate => NativeHandle.GetAudioBitrate();
+    public int SampleRate => NativeHandle.GetAudioSampleRate();
 
     /// <inheritdoc />
     public void Dispose() => NativeHandle.Dispose();
diff --git a/LLama/Native/GPUSplitMode.cs b/LLama/Native/GPUSplitMode.cs
index 27ee7ae49..338131b52 100644
--- a/LLama/Native/GPUSplitMode.cs
+++ b/LLama/Native/GPUSplitMode.cs
@@ -20,4 +20,10 @@ public enum GPUSplitMode
     /// split layers and KV across GPUs, use tensor parallelism if supported
     /// </summary>
     Row = 2,
+    
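+    /// <summary>
+    /// Tensor split mode. This value is undocumented in llama.h, so its exact
+    /// behaviour is unconfirmed - check the llama.cpp source before relying on it.
+    /// </summary>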
+    Tensor = 3,
 }
\ No newline at end of file
diff --git a/LLama/Native/LLamaFtype.cs b/LLama/Native/LLamaFtype.cs
index d48be1855..78a682130 100644
--- a/LLama/Native/LLamaFtype.cs
+++ b/LLama/Native/LLamaFtype.cs
@@ -212,6 +212,11 @@ public enum LLamaFtype
         /// </summary>
         LLAMA_FTYPE_MOSTLY_NVFP4 = 39,
 
+        /// <summary>
+        /// Except 1d tensors
+        /// </summary>
+        LLAMA_FTYPE_MOSTLY_Q1_0 = 40,
+
         /// <summary>
         /// File type was not specified
         /// </summary>
diff --git a/LLama/Native/LLamaModelImatrixData.cs b/LLama/Native/LLamaModelImatrixData.cs
new file mode 100644
index 000000000..df0eb3856
--- /dev/null
+++ b/LLama/Native/LLamaModelImatrixData.cs
@@ -0,0 +1,12 @@
+namespace LLama.Native;
+
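+/* /// <summary>
+/// Importance matrix data entry (not wrapped yet - kept commented out for reference)
+/// </summary>
+/// <remarks>llama_model_imatrix_data</remarks>
+public unsafe struct LLamaModelImatrixData
+{
+    byte* name; // native C string; C# `char` is 16-bit so `char*` would be the wrong element size
+    float* data;
+    nuint size;
+} */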
\ No newline at end of file
diff --git a/LLama/Native/LLamaModelQuantizeParams.cs b/LLama/Native/LLamaModelQuantizeParams.cs
index d47c78f11..9cd04b3f8 100644
--- a/LLama/Native/LLamaModelQuantizeParams.cs
+++ b/LLama/Native/LLamaModelQuantizeParams.cs
@@ -92,22 +92,22 @@ public bool dry_run
         /// <summary>
         /// pointer to importance matrix data
         /// </summary>
-        public IntPtr imatrix;
+        public IntPtr imatrix; // LLamaModelImatrixData *
 
         /// <summary>
         /// pointer to vector containing overrides
         /// </summary>
-        public IntPtr kv_overrides;
+        public IntPtr kv_overrides; // llama_model_kv_override *
 
         /// <summary>
         /// pointer to vector containing tensor types
         /// </summary>
-        public IntPtr tensor_types;
+        public IntPtr tensor_types; // llama_model_tensor_override *
 
         /// <summary>
         /// Pointer to vector containing layer indices to prune
         /// </summary>
-        public IntPtr prune_layers;
+        public IntPtr prune_layers; // int32 *
 
         /// <summary>
         /// Create a LLamaModelQuantizeParams with default values
diff --git a/LLama/Native/LLamaModelTensorOverride.cs b/LLama/Native/LLamaModelTensorOverride.cs
new file mode 100644
index 000000000..82f44b530
--- /dev/null
+++ b/LLama/Native/LLamaModelTensorOverride.cs
@@ -0,0 +1,9 @@
+namespace LLama.Native;
+
+// Unsupported - we can't handle ggml_type since LlamaSharp doesn't wrap/expose ggml
+/*
+ * struct llama_model_tensor_override {
+       const char * pattern;
+       enum ggml_type type; // GGMLType might work?
+   };
+*/
\ No newline at end of file
diff --git a/LLama/Native/LoraAdapter.cs b/LLama/Native/LoraAdapter.cs
index 9bc24ce17..2aa96eb13 100644
--- a/LLama/Native/LoraAdapter.cs
+++ b/LLama/Native/LoraAdapter.cs
@@ -22,10 +22,42 @@ public class LoraAdapter
     /// </summary>
     internal IntPtr Pointer { get; }
 
+    /// <summary>
+    /// Indicates if this adapter is currently loaded (false once it has been unloaded)
+    /// </summary>
+    internal bool Loaded { get; private set; }
+
     internal LoraAdapter(SafeLlamaModelHandle model, string path, IntPtr nativePtr)
     {
         Model = model;
         Path = path;
         Pointer = nativePtr;
+        Loaded = true;
+    }
+
+    /// <summary>
+    /// Unload this adapter
+    /// </summary>
+    public void Unload()
+    {
+        // Early exit if already unloaded
+        if (!Loaded)
+            return;
+        
+        // If the model has been unloaded this handle will have been auto unloaded
+        if (Model.IsClosed)
+        {
+            Loaded = false;
+            return;
+        }
+
+        // Unload
+        Loaded = false;
+        llama_adapter_lora_free(Pointer);
+
+        // Manually free a LoRA adapter. loaded adapters which have not been
+        // freed will be automatically freed when the associated model is deleted
+        [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+        static extern void llama_adapter_lora_free(IntPtr adapter);
     }
 }
\ No newline at end of file
diff --git a/LLama/Native/NativeApi.Mtmd.cs b/LLama/Native/NativeApi.Mtmd.cs
index 5eb75028f..1bf2b5f9a 100644
--- a/LLama/Native/NativeApi.Mtmd.cs
+++ b/LLama/Native/NativeApi.Mtmd.cs
@@ -41,24 +41,50 @@ internal struct mtmd_context_params
     [DllImport(mtmdLibraryName, EntryPoint = "mtmd_context_params_default", CallingConvention = CallingConvention.Cdecl)]
     internal static extern mtmd_context_params mtmd_context_params_default();
 
+    /// <summary>
+    /// whether we need to set non-causal mask before llama_decode
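+    /// if chunk is nullptr, we assume the default case where chunk is an image chunk (NOTE(review): this binding declares no `chunk` parameter - confirm it still matches the native signature)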
+    /// </summary>
+    /// <param name="ctx"></param>
+    /// <returns></returns>
     [DllImport(mtmdLibraryName, EntryPoint = "mtmd_decode_use_non_causal", CallingConvention = CallingConvention.Cdecl)]
     [return: MarshalAs(UnmanagedType.I1)]
     internal static extern bool mtmd_decode_use_non_causal(SafeMtmdModelHandle ctx);
 
+    /// <summary>
+    /// whether the current model use M-RoPE for llama_decode
+    /// </summary>
+    /// <param name="ctx"></param>
+    /// <returns></returns>
     [DllImport(mtmdLibraryName, EntryPoint = "mtmd_decode_use_mrope", CallingConvention = CallingConvention.Cdecl)]
     [return: MarshalAs(UnmanagedType.I1)]
     internal static extern bool mtmd_decode_use_mrope(SafeMtmdModelHandle ctx);
 
+    /// <summary>
+    /// whether the current model supports vision input
+    /// </summary>
+    /// <param name="ctx"></param>
+    /// <returns></returns>
     [DllImport(mtmdLibraryName, EntryPoint = "mtmd_support_vision", CallingConvention = CallingConvention.Cdecl)]
     [return: MarshalAs(UnmanagedType.I1)]
     internal static extern bool mtmd_support_vision(SafeMtmdModelHandle ctx);
 
+    /// <summary>
+    /// whether the current model supports audio input
+    /// </summary>
+    /// <param name="ctx"></param>
+    /// <returns></returns>
     [DllImport(mtmdLibraryName, EntryPoint = "mtmd_support_audio", CallingConvention = CallingConvention.Cdecl)]
     [return: MarshalAs(UnmanagedType.I1)]
     internal static extern bool mtmd_support_audio(SafeMtmdModelHandle ctx);
 
-    [DllImport(mtmdLibraryName, EntryPoint = "mtmd_get_audio_bitrate", CallingConvention = CallingConvention.Cdecl)]
-    internal static extern int mtmd_get_audio_bitrate(SafeMtmdModelHandle ctx);
+    /// <summary>
+    /// get audio sample rate in Hz, for example 16000 for Whisper
+    /// </summary>
+    /// <param name="ctx"></param>
+    /// <returns></returns>
+    [DllImport(mtmdLibraryName, EntryPoint = "mtmd_get_audio_sample_rate", CallingConvention = CallingConvention.Cdecl)]
+    internal static extern int mtmd_get_audio_sample_rate(SafeMtmdModelHandle ctx);
 
     // bitmap ------------------------------------------------------------
 
@@ -153,9 +179,11 @@ internal static unsafe void mtmd_bitmap_set_id(SafeMtmdEmbed bitmap, string? id)
     [DllImport(mtmdLibraryName, EntryPoint = "mtmd_image_tokens_get_n_tokens", CallingConvention = CallingConvention.Cdecl)]
     internal static extern UIntPtr mtmd_image_tokens_get_n_tokens(IntPtr image_tokens);
 
+    [Obsolete("use mtmd_image_tokens_get_decoder_pos() instead")]
     [DllImport(mtmdLibraryName, EntryPoint = "mtmd_image_tokens_get_nx", CallingConvention = CallingConvention.Cdecl)]
     internal static extern UIntPtr mtmd_image_tokens_get_nx(IntPtr image_tokens);
 
+    [Obsolete("use mtmd_image_tokens_get_decoder_pos() instead")]
     [DllImport(mtmdLibraryName, EntryPoint = "mtmd_image_tokens_get_ny", CallingConvention = CallingConvention.Cdecl)]
     internal static extern UIntPtr mtmd_image_tokens_get_ny(IntPtr image_tokens);
 
@@ -165,6 +193,28 @@ internal static unsafe void mtmd_bitmap_set_id(SafeMtmdEmbed bitmap, string? id)
     [DllImport(mtmdLibraryName, EntryPoint = "mtmd_image_tokens_get_n_pos", CallingConvention = CallingConvention.Cdecl)]
     internal static extern int mtmd_image_tokens_get_n_pos(IntPtr image_tokens);
 
+    [StructLayout(LayoutKind.Explicit)] // three consecutive 32-bit components, 12 bytes in total
+    internal struct mtmd_decoder_pos
+    {
+        [FieldOffset(0)]
+        public uint t; // first position component; must be public or managed callers cannot read the result
+
+        [FieldOffset(4)]
+        public uint x; // second position component
+
+        [FieldOffset(8)]
+        public uint y; // third position component
+    }
+
+    /// <summary>
+    /// get position for decoder attention, to be used by M-RoPE models
+    /// </summary>
+    /// <param name="image_tokens"></param>
+    /// <param name="i">i is the index of the embedding token, ranging from 0 to mtmd_image_tokens_get_n_tokens() - 1</param>
+    /// <returns>return relative position (for example, embedding 0 will have position (0, 0, 0); remember to adjust it to the current absolute position)</returns>
+    [DllImport(mtmdLibraryName, EntryPoint = "mtmd_image_tokens_get_decoder_pos", CallingConvention = CallingConvention.Cdecl)]
+    internal static extern mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(IntPtr image_tokens, nuint i);
+
     // tokenize ----------------------------------------------------------
 
     /// <summary>
@@ -259,6 +309,11 @@ internal static unsafe IntPtr mtmd_helper_bitmap_init_from_file(SafeMtmdModelHan
     [DllImport(mtmdLibraryName, EntryPoint = "mtmd_helper_get_n_pos", CallingConvention = CallingConvention.Cdecl)]
     internal static extern int mtmd_helper_get_n_pos(SafeMtmdInputChunks chunks);
 
+    [DllImport(mtmdLibraryName, EntryPoint = "mtmd_helper_image_get_decoder_pos", CallingConvention = CallingConvention.Cdecl)]
+    // helper to get the list of relative positions corresponding to the embedding tokens, to be used by M-RoPE
+    // out_pos must have length == mtmd_helper_get_n_tokens(image)
+    internal static extern void mtmd_helper_image_get_decoder_pos(IntPtr /* mtmd_image_tokens* */ image, IntPtr /* mtmd_decoder_pos* */ out_pos);
+
     [DllImport(mtmdLibraryName, EntryPoint = "mtmd_helper_eval_chunks", CallingConvention = CallingConvention.Cdecl)]
     internal static extern int mtmd_helper_eval_chunks(
         SafeMtmdModelHandle ctx,
diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs
index 381754103..cbdd05a53 100644
--- a/LLama/Native/NativeApi.cs
+++ b/LLama/Native/NativeApi.cs
@@ -508,5 +508,13 @@ struct gguf_context * metadata,
         void* set_tensor_data_ud, // userdata for function
         struct llama_model_params   params);
         */
+
+        /*
+         // Cannot be wrapped, pinvoke doesn't understand FILE
+         // Load a model from an open FILE pointer
+        LLAMA_API struct llama_model * llama_model_load_from_file_ptr(
+            FILE* file,
+        struct llama_model_params   params);
+        */
     }
 }
diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs
index 71261eefb..0041fbf5e 100644
--- a/LLama/Native/SafeLLamaContextHandle.cs
+++ b/LLama/Native/SafeLLamaContextHandle.cs
@@ -489,8 +489,12 @@ public void SetLoraAdapters(params Span<(LoraAdapter Adapter, float Scale)> adap
         {
             // Check adapters are all valid and attached to this model
             foreach (var adapter in adapters)
+            {
                 if (adapter.Adapter.Model != ModelHandle)
                     throw new ArgumentException("Cannot add LoRA adapter which was loaded for a different model");
+                if (!adapter.Adapter.Loaded)
+                    throw new ArgumentException("Cannot add LoRA adapter which has been unloaded");
+            }
 
             // Copy data into buffers
             Span<IntPtr> adapterPtrs = stackalloc IntPtr[adapters.Length];
diff --git a/LLama/Native/SafeMtmdModelHandle.cs b/LLama/Native/SafeMtmdModelHandle.cs
index 4a356f637..24ccc8ee2 100644
--- a/LLama/Native/SafeMtmdModelHandle.cs
+++ b/LLama/Native/SafeMtmdModelHandle.cs
@@ -348,7 +348,7 @@ public int DecodeImageChunk(IntPtr chunkPtr, SafeLLamaContextHandle llamaContext
         /// <summary>
-        /// Gets the audio bitrate advertised by the model.
+        /// Gets the audio sample rate advertised by the model.
         /// </summary>
-        public int GetAudioBitrate() => NativeApi.mtmd_get_audio_bitrate(this);
+        public int GetAudioSampleRate() => NativeApi.mtmd_get_audio_sample_rate(this);
 
         private void EnsureNotDisposed()
         {