From e4d7c6cb2ed37e97153e8834f65ba91bdbf9fe93 Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Sat, 14 Mar 2026 14:30:13 +0000 Subject: [PATCH 1/2] - Updated binaries to 73c9eb8ceda397b651dbb6661b2935f0283a2b1d (Qwen3.5 support!) - Removed deprecated native func `llama_adapter_lora_free` and related managed method `LoraAdapter.Unload` --- LLama/Batched/BatchedExecutor.cs | 17 ++++--- LLama/LLamaEmbedder.cs | 5 +- LLama/LLamaReranker.cs | 6 +-- LLama/LLamaSharp.csproj | 2 +- LLama/Native/LLamaFtype.cs | 7 ++- LLama/Native/LLamaModelQuantizeParams.cs | 10 ++++ LLama/Native/LoraAdapter.cs | 20 -------- LLama/Native/NativeApi.Mtmd.cs | 2 +- LLama/Native/NativeApi.cs | 36 ++++++-------- LLama/Native/SafeLLamaContextHandle.cs | 61 +++++++++++++++++++++--- LLama/Native/SafeLlamaModelHandle.cs | 6 +-- llama.cpp | 2 +- 12 files changed, 101 insertions(+), 73 deletions(-) diff --git a/LLama/Batched/BatchedExecutor.cs b/LLama/Batched/BatchedExecutor.cs index 1a6698b1a..db9be6a7a 100644 --- a/LLama/Batched/BatchedExecutor.cs +++ b/LLama/Batched/BatchedExecutor.cs @@ -43,7 +43,12 @@ public sealed class BatchedExecutor /// The this executor is using /// public LLamaWeights Model { get; } - + + /// + /// The optional this executor is using + /// + public MtmdWeights? ClipModel { get; } + /// /// Get the number of tokens in the batch, waiting for to be called /// @@ -79,12 +84,8 @@ public int BatchedTokenCount /// /// The model to use /// Parameters to create a new context - public BatchedExecutor(LLamaWeights model, IContextParams contextParams) - : this(model, contextParams, null) - { - } - - public BatchedExecutor(LLamaWeights model, IContextParams contextParams, MtmdWeights? clipModel) + /// Clip model to use for multimodal capabilities + public BatchedExecutor(LLamaWeights model, IContextParams contextParams, MtmdWeights? 
clipModel = null) { Model = model; Context = model.CreateContext(contextParams); @@ -92,8 +93,6 @@ public BatchedExecutor(LLamaWeights model, IContextParams contextParams, MtmdWei Epoch = 1; } - public MtmdWeights? ClipModel { get; } - /// /// Start a new /// diff --git a/LLama/LLamaEmbedder.cs b/LLama/LLamaEmbedder.cs index eee9a01e9..5f6a2878d 100644 --- a/LLama/LLamaEmbedder.cs +++ b/LLama/LLamaEmbedder.cs @@ -1,14 +1,11 @@ using System; using System.Collections.Generic; -using System.Linq; using System.Threading; using System.Threading.Tasks; using LLama.Abstractions; using LLama.Exceptions; using LLama.Native; -using Microsoft.Extensions.AI; using Microsoft.Extensions.Logging; -using static System.Net.Mime.MediaTypeNames; namespace LLama; @@ -79,7 +76,7 @@ public async Task> GetEmbeddings(string input, Cancellati Context.Dispose(); Context = _weights.CreateContext(_params, _logger); - NativeApi.llama_set_embeddings(Context.NativeHandle, true); + Context.NativeHandle.SetEmbeddings(true); // Add all of the tokens to the batch var tokens = Context.Tokenize(input, special: true); diff --git a/LLama/LLamaReranker.cs b/LLama/LLamaReranker.cs index 16a206c40..58919fd0a 100644 --- a/LLama/LLamaReranker.cs +++ b/LLama/LLamaReranker.cs @@ -1,11 +1,7 @@ using System; using System.Collections.Generic; -using System.IO; -using System.Linq; -using System.Text; using System.Threading; using System.Threading.Tasks; -using System.Xml.Linq; using LLama.Abstractions; using LLama.Exceptions; using LLama.Native; @@ -44,7 +40,7 @@ public LLamaReranker(LLamaWeights weights, IContextParams @params, ILogger? 
logg if (@params.PoolingType != LLamaPoolingType.Rank) throw new NotSupportedException("Computing rank score, PoolingType must be equal to LLamaPoolingType.Rank"); Context = weights.CreateContext(@params, logger); - NativeApi.llama_set_embeddings(Context.NativeHandle, true); + Context.NativeHandle.SetEmbeddings(true); } /// diff --git a/LLama/LLamaSharp.csproj b/LLama/LLamaSharp.csproj index 4bdf9289a..9eff34c4e 100644 --- a/LLama/LLamaSharp.csproj +++ b/LLama/LLamaSharp.csproj @@ -57,7 +57,7 @@ - ff4affb4c1aa7eb4_v3 + 73c9eb8ceda397b diff --git a/LLama/Native/LLamaFtype.cs b/LLama/Native/LLamaFtype.cs index 813bad1ae..d48be1855 100644 --- a/LLama/Native/LLamaFtype.cs +++ b/LLama/Native/LLamaFtype.cs @@ -206,7 +206,12 @@ public enum LLamaFtype /// except 1d tensors /// LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, - + + /// + /// Except 1d tensors + /// + LLAMA_FTYPE_MOSTLY_NVFP4 = 39, + /// /// File type was not specified /// diff --git a/LLama/Native/LLamaModelQuantizeParams.cs b/LLama/Native/LLamaModelQuantizeParams.cs index 857f0cfb9..d47c78f11 100644 --- a/LLama/Native/LLamaModelQuantizeParams.cs +++ b/LLama/Native/LLamaModelQuantizeParams.cs @@ -79,6 +79,16 @@ public bool keep_split } private sbyte _keep_split; + /// + /// calculate and show the final quantization size without performing quantization + /// + public bool dry_run + { + get => Convert.ToBoolean(_dry_run); + set => _dry_run = Convert.ToSByte(value); + } + private sbyte _dry_run; + /// /// pointer to importance matrix data /// diff --git a/LLama/Native/LoraAdapter.cs b/LLama/Native/LoraAdapter.cs index 8fdd649a3..9bc24ce17 100644 --- a/LLama/Native/LoraAdapter.cs +++ b/LLama/Native/LoraAdapter.cs @@ -22,30 +22,10 @@ public class LoraAdapter /// internal IntPtr Pointer { get; } - /// - /// Indicates if this adapter has been unloaded - /// - internal bool Loaded { get; private set; } - internal LoraAdapter(SafeLlamaModelHandle model, string path, IntPtr nativePtr) { Model = model; Path = path; Pointer = 
nativePtr; - Loaded = true; - } - - /// - /// Unload this adapter - /// - public void Unload() - { - Loaded = false; - llama_adapter_lora_free(Pointer); - - // Manually free a LoRA adapter. loaded adapters will be free when the associated model is deleted - [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - [Obsolete("adapters are now freed together with the associated model")] - static extern void llama_adapter_lora_free(IntPtr adapter); } } \ No newline at end of file diff --git a/LLama/Native/NativeApi.Mtmd.cs b/LLama/Native/NativeApi.Mtmd.cs index 827c0e1b0..5eb75028f 100644 --- a/LLama/Native/NativeApi.Mtmd.cs +++ b/LLama/Native/NativeApi.Mtmd.cs @@ -168,7 +168,7 @@ internal static unsafe void mtmd_bitmap_set_id(SafeMtmdEmbed bitmap, string? id) // tokenize ---------------------------------------------------------- /// - /// Native text structure consumed by . + /// Native text structure consumed by . /// internal unsafe struct mtmd_input_text_native { diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs index ce8c36197..381754103 100644 --- a/LLama/Native/NativeApi.cs +++ b/LLama/Native/NativeApi.cs @@ -132,19 +132,7 @@ public static void llama_empty_call() [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] public static extern unsafe nuint llama_state_seq_load_file(SafeLLamaContextHandle ctx, string filepath, LLamaSeqId dest_seq_id, LLamaToken* tokens_out, nuint n_token_capacity, out nuint n_token_count_out); - /// - /// Set whether to use causal attention or not. 
If set to true, the model will only attend to the past tokens - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern void llama_set_causal_attn(SafeLLamaContextHandle ctx, [MarshalAs(UnmanagedType.U1)] bool causalAttn); - - /// - /// Set whether the context outputs embeddings or not - /// - /// - /// If true, embeddings will be returned but logits will not - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern void llama_set_embeddings(SafeLLamaContextHandle ctx, [MarshalAs(UnmanagedType.U1)] bool embeddings); + /// /// Set abort callback @@ -152,14 +140,6 @@ public static void llama_empty_call() [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] public static extern void llama_set_abort_callback(SafeLLamaContextHandle ctx, IntPtr /* ggml_abort_callback */ abortCallback, IntPtr abortCallbackData); - /// - /// Get the n_seq_max for this context - /// - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern uint llama_n_seq_max(SafeLLamaContextHandle ctx); - /// /// Get all output token embeddings. /// When pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model, the embeddings for which @@ -515,6 +495,18 @@ public static extern unsafe LLamaParamsFitStatus llama_params_fit( [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] public static extern long llama_time_us(); - + /* Directly exposes `ggml_tensor` and `gguf_context` which LLamaSharp does not currently support! 
+ + typedef void (* llama_model_set_tensor_data_t) (struct ggml_tensor * tensor, void* userdata); + + // Create a new model from GGUF metadata as well as a function to set the tensor data + // - tensors are created as GGML_TYPE_F32 by default, + // override by adding a tensor with the same name but a different name to the context + LLAMA_API struct llama_model * llama_model_init_from_user( + struct gguf_context * metadata, + llama_model_set_tensor_data_t set_tensor_data, // function to initialize tensor data with + void* set_tensor_data_ud, // userdata for function + struct llama_model_params params); + */ } } diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs index 5ec78f053..71261eefb 100644 --- a/LLama/Native/SafeLLamaContextHandle.cs +++ b/LLama/Native/SafeLLamaContextHandle.cs @@ -37,7 +37,7 @@ public sealed class SafeLLamaContextHandle /// /// Get the number of maximum sequences allowed /// - public uint MaxSeq => NativeApi.llama_n_seq_max(this); + public uint MaxSeq => llama_n_seq_max(this); /// /// Get or set the number of threads used for generation of a single token. 
@@ -355,6 +355,7 @@ static SafeLLamaContextHandle() /// /// The length of the value string (on success) -1 otherwise [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + // ReSharper disable once InconsistentNaming private static extern int llama_adapter_meta_val_str(IntPtr adapter, string key, StringBuilder buf, UIntPtr buf_size); /// @@ -374,6 +375,7 @@ static SafeLLamaContextHandle() /// /// The length of string i.e meta key (on success) -1 otherwise [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + // ReSharper disable once InconsistentNaming private static extern int llama_adapter_meta_key_by_index(IntPtr adapter, int i, StringBuilder buf, UIntPtr buf_size); /// @@ -385,6 +387,7 @@ static SafeLLamaContextHandle() /// /// The length of value string (on success) -1 otherwise [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + // ReSharper disable once InconsistentNaming private static extern int llama_adapter_meta_val_by_index(IntPtr adapter, int i, StringBuilder buf, UIntPtr buf_size); /// @@ -424,6 +427,56 @@ static SafeLLamaContextHandle() /// [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] private static extern void llama_set_warmup(SafeLLamaContextHandle ctx, [MarshalAs(UnmanagedType.U1)] bool warmup); + + /// + /// Set whether to use causal attention or not. 
If set to true, the model will only attend to the past tokens + /// + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + private static extern void llama_set_causal_attn(SafeLLamaContextHandle ctx, [MarshalAs(UnmanagedType.U1)] bool causalAttn); + + /// + /// Set whether the context outputs embeddings or not + /// + /// + /// If true, embeddings will be returned but logits will not + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + private static extern void llama_set_embeddings(SafeLLamaContextHandle ctx, [MarshalAs(UnmanagedType.U1)] bool embeddings); + + /// + /// Get the n_seq_max for this context + /// + /// + /// + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + private static extern uint llama_n_seq_max(SafeLLamaContextHandle ctx); + #endregion + + #region Setters + /// + /// Set whether the model is in warmup mode or not + /// If true, all model tensors are activated during to load and cache their weights. + /// + public void SetWarmup(bool value) + { + llama_set_warmup(this, value); + } + + /// + /// Set whether to use causal attention or not. 
If set to true, the model will only attend to the past tokens + /// + public void SetCausalAttention(bool value) + { + llama_set_causal_attn(this, value); + } + + /// + /// Set whether the context outputs embeddings or not + /// + /// If true, embeddings will be returned but logits will not + public void SetEmbeddings(bool value) + { + llama_set_embeddings(this, value); + } #endregion #region LoRA @@ -434,14 +487,10 @@ static SafeLLamaContextHandle() /// public void SetLoraAdapters(params Span<(LoraAdapter Adapter, float Scale)> adapters) { - // Check adapters are all valid + // Check adapters are all valid and attached to this model foreach (var adapter in adapters) - { if (adapter.Adapter.Model != ModelHandle) throw new ArgumentException("Cannot add LoRA adapter which was loaded for a different model"); - if (!adapter.Adapter.Loaded) - throw new ArgumentException("Cannot add LoRA adapter which has been unloaded"); - } // Copy data into buffers Span adapterPtrs = stackalloc IntPtr[adapters.Length]; diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs index 668074090..2a6855741 100644 --- a/LLama/Native/SafeLlamaModelHandle.cs +++ b/LLama/Native/SafeLlamaModelHandle.cs @@ -175,7 +175,7 @@ static SafeLlamaModelHandle() private static extern unsafe byte* llama_model_chat_template(SafeLlamaModelHandle model, string? 
name); /// - /// Load the model from a file + /// Load a model from a file /// If the file is split into multiple parts, the file name must follow this pattern: {name}-%05d-of-%05d.gguf /// If the split file name does not follow this pattern, use llama_model_load_from_splits /// @@ -186,7 +186,7 @@ static SafeLlamaModelHandle() private static extern SafeLlamaModelHandle llama_model_load_from_file(string path, LLamaModelParams @params); /// - /// Load the model from multiple splits (support custom naming scheme) + /// Load a model from multiple splits (support custom naming scheme) /// The paths must be in the correct order /// /// @@ -460,7 +460,7 @@ private static int llama_model_meta_val_str(SafeLlamaModelHandle model, string k /// /// [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - private static extern string? llama_model_cls_label(SafeLlamaModelHandle model, uint i); + private static extern IntPtr /* char* */ llama_model_cls_label(SafeLlamaModelHandle model, uint i); #endregion #region LoRA diff --git a/llama.cpp b/llama.cpp index ff4affb4c..73c9eb8ce 160000 --- a/llama.cpp +++ b/llama.cpp @@ -1 +1 @@ -Subproject commit ff4affb4c1aa7eb4f28a0d9de1b205bd719802f2 +Subproject commit 73c9eb8ceda397b651dbb6661b2935f0283a2b1d From 2a2d6d990a363d96133e619393a31559ec675c80 Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Fri, 17 Apr 2026 14:06:44 +0100 Subject: [PATCH 2/2] Updated to: https://github.com/ggml-org/llama.cpp/commit/3f7c29d318e317b63f54c558bc69803963d7d88c Build: https://github.com/SciSharp/LLamaSharp/actions/runs/24541043843 --- LLama.Unittest/MtmdWeightsTests.cs | 6 +-- LLama/LLamaSharp.csproj | 2 +- LLama/MtmdWeights.cs | 4 +- LLama/Native/GPUSplitMode.cs | 6 +++ LLama/Native/LLamaFtype.cs | 5 ++ LLama/Native/LLamaModelImatrixData.cs | 12 +++++ LLama/Native/LLamaModelQuantizeParams.cs | 8 ++-- LLama/Native/LLamaModelTensorOverride.cs | 9 ++++ LLama/Native/LoraAdapter.cs | 32 +++++++++++++ LLama/Native/NativeApi.Mtmd.cs 
| 59 +++++++++++++++++++++++- LLama/Native/NativeApi.cs | 8 ++++ LLama/Native/SafeLLamaContextHandle.cs | 4 ++ LLama/Native/SafeMtmdModelHandle.cs | 2 +- 13 files changed, 144 insertions(+), 13 deletions(-) create mode 100644 LLama/Native/LLamaModelImatrixData.cs create mode 100644 LLama/Native/LLamaModelTensorOverride.cs diff --git a/LLama.Unittest/MtmdWeightsTests.cs b/LLama.Unittest/MtmdWeightsTests.cs index 4a1a59c75..42ba52356 100644 --- a/LLama.Unittest/MtmdWeightsTests.cs +++ b/LLama.Unittest/MtmdWeightsTests.cs @@ -81,7 +81,7 @@ public void BasicPropertyChecks() Assert.True(_mtmdWeights.SupportsVision); Assert.False(_mtmdWeights.UsesMRope); Assert.True(_mtmdWeights.UsesNonCausalAttention); - Assert.Equal(-1, _mtmdWeights.AudioBitrate); + Assert.Equal(-1, _mtmdWeights.SampleRate); } [Fact,Trait("Category", "NoCI")] @@ -143,8 +143,8 @@ public void TokenizeProvidesChunkMetadata() Assert.True(_mtmdWeights.SupportsVision); Assert.False(_mtmdWeights.SupportsAudio); - var audioBitrate = _mtmdWeights.AudioBitrate; - Assert.True(audioBitrate <= 0); + var audioSampleRate = _mtmdWeights.SampleRate; + Assert.True(audioSampleRate <= 0); } } } diff --git a/LLama/LLamaSharp.csproj b/LLama/LLamaSharp.csproj index 9eff34c4e..02c3e11c4 100644 --- a/LLama/LLamaSharp.csproj +++ b/LLama/LLamaSharp.csproj @@ -57,7 +57,7 @@ - 73c9eb8ceda397b + 3f7c29d318e317b6 diff --git a/LLama/MtmdWeights.cs b/LLama/MtmdWeights.cs index 07e739a61..32dd6b572 100644 --- a/LLama/MtmdWeights.cs +++ b/LLama/MtmdWeights.cs @@ -137,9 +137,9 @@ public int DecodeImageChunk(IntPtr chunkPtr, SafeLLamaContextHandle llamaContext public bool UsesMRope => NativeHandle.DecodeUseMRope(); /// - /// Gets the audio bitrate advertised by the model. + /// Gets the audio sample rate advertised by the model. 
/// - public int AudioBitrate => NativeHandle.GetAudioBitrate(); + public int SampleRate => NativeHandle.GetAudioSampleRate(); /// public void Dispose() => NativeHandle.Dispose(); diff --git a/LLama/Native/GPUSplitMode.cs b/LLama/Native/GPUSplitMode.cs index 27ee7ae49..338131b52 100644 --- a/LLama/Native/GPUSplitMode.cs +++ b/LLama/Native/GPUSplitMode.cs @@ -20,4 +20,10 @@ public enum GPUSplitMode /// split layers and KV across GPUs, use tensor parallelism if supported /// Row = 2, + + // Undocumented in llama.h + /// + /// + /// + Tensor = 3, } \ No newline at end of file diff --git a/LLama/Native/LLamaFtype.cs b/LLama/Native/LLamaFtype.cs index d48be1855..78a682130 100644 --- a/LLama/Native/LLamaFtype.cs +++ b/LLama/Native/LLamaFtype.cs @@ -212,6 +212,11 @@ public enum LLamaFtype /// LLAMA_FTYPE_MOSTLY_NVFP4 = 39, + /// + /// Except 1d tensors + /// + LLAMA_FTYPE_MOSTLY_Q1_0 = 40, + /// /// File type was not specified /// diff --git a/LLama/Native/LLamaModelImatrixData.cs b/LLama/Native/LLamaModelImatrixData.cs new file mode 100644 index 000000000..df0eb3856 --- /dev/null +++ b/LLama/Native/LLamaModelImatrixData.cs @@ -0,0 +1,12 @@ +namespace LLama.Native; + +/* /// +/// +/// +/// llama_model_imatrix_data +public unsafe struct LLamaModelImatrixData +{ + char* name; + float* data; + nuint size; +} */ \ No newline at end of file diff --git a/LLama/Native/LLamaModelQuantizeParams.cs b/LLama/Native/LLamaModelQuantizeParams.cs index d47c78f11..9cd04b3f8 100644 --- a/LLama/Native/LLamaModelQuantizeParams.cs +++ b/LLama/Native/LLamaModelQuantizeParams.cs @@ -92,22 +92,22 @@ public bool dry_run /// /// pointer to importance matrix data /// - public IntPtr imatrix; + public IntPtr imatrix; // LLamaModelImatrixData * /// /// pointer to vector containing overrides /// - public IntPtr kv_overrides; + public IntPtr kv_overrides; // llama_model_kv_override * /// /// pointer to vector containing tensor types /// - public IntPtr tensor_types; + public IntPtr tensor_types; // 
llama_model_tensor_override * /// /// Pointer to vector containing layer indices to prune /// - public IntPtr prune_layers; + public IntPtr prune_layers; // int32 * /// /// Create a LLamaModelQuantizeParams with default values diff --git a/LLama/Native/LLamaModelTensorOverride.cs b/LLama/Native/LLamaModelTensorOverride.cs new file mode 100644 index 000000000..82f44b530 --- /dev/null +++ b/LLama/Native/LLamaModelTensorOverride.cs @@ -0,0 +1,9 @@ +namespace LLama.Native; + +// Unsupported - we can't handle ggml_type since LlamaSharp doesn't wrap/expose ggml +/* + * struct llama_model_tensor_override { + const char * pattern; + enum ggml_type type; // GGMLType might work? + }; +*/ \ No newline at end of file diff --git a/LLama/Native/LoraAdapter.cs b/LLama/Native/LoraAdapter.cs index 9bc24ce17..2aa96eb13 100644 --- a/LLama/Native/LoraAdapter.cs +++ b/LLama/Native/LoraAdapter.cs @@ -22,10 +22,42 @@ public class LoraAdapter /// internal IntPtr Pointer { get; } + /// + /// Indicates if this adapter has been unloaded + /// + internal bool Loaded { get; private set; } + internal LoraAdapter(SafeLlamaModelHandle model, string path, IntPtr nativePtr) { Model = model; Path = path; Pointer = nativePtr; + Loaded = true; + } + + /// + /// Unload this adapter + /// + public void Unload() + { + // Early exit if already unloaded + if (!Loaded) + return; + + // If the model has been unloaded this handle will have been auto unloaded + if (Model.IsClosed) + { + Loaded = false; + return; + } + + // Unload + Loaded = false; + llama_adapter_lora_free(Pointer); + + // Manually free a LoRA adapter. 
loaded adapters which have not been + // freed will be automatically freed when the associated model is deleted + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + static extern void llama_adapter_lora_free(IntPtr adapter); } } \ No newline at end of file diff --git a/LLama/Native/NativeApi.Mtmd.cs b/LLama/Native/NativeApi.Mtmd.cs index 5eb75028f..1bf2b5f9a 100644 --- a/LLama/Native/NativeApi.Mtmd.cs +++ b/LLama/Native/NativeApi.Mtmd.cs @@ -41,24 +41,50 @@ internal struct mtmd_context_params [DllImport(mtmdLibraryName, EntryPoint = "mtmd_context_params_default", CallingConvention = CallingConvention.Cdecl)] internal static extern mtmd_context_params mtmd_context_params_default(); + /// + /// whether we need to set non-causal mask before llama_decode + /// if chunk is nullptr, we assume the default case where chunk is an image chunk + /// + /// + /// [DllImport(mtmdLibraryName, EntryPoint = "mtmd_decode_use_non_causal", CallingConvention = CallingConvention.Cdecl)] [return: MarshalAs(UnmanagedType.I1)] internal static extern bool mtmd_decode_use_non_causal(SafeMtmdModelHandle ctx); + /// + /// whether the current model use M-RoPE for llama_decode + /// + /// + /// [DllImport(mtmdLibraryName, EntryPoint = "mtmd_decode_use_mrope", CallingConvention = CallingConvention.Cdecl)] [return: MarshalAs(UnmanagedType.I1)] internal static extern bool mtmd_decode_use_mrope(SafeMtmdModelHandle ctx); + /// + /// whether the current model supports vision input + /// + /// + /// [DllImport(mtmdLibraryName, EntryPoint = "mtmd_support_vision", CallingConvention = CallingConvention.Cdecl)] [return: MarshalAs(UnmanagedType.I1)] internal static extern bool mtmd_support_vision(SafeMtmdModelHandle ctx); + /// + /// whether the current model supports audio input + /// + /// + /// [DllImport(mtmdLibraryName, EntryPoint = "mtmd_support_audio", CallingConvention = CallingConvention.Cdecl)] [return: MarshalAs(UnmanagedType.I1)] internal static extern bool 
mtmd_support_audio(SafeMtmdModelHandle ctx); - [DllImport(mtmdLibraryName, EntryPoint = "mtmd_get_audio_bitrate", CallingConvention = CallingConvention.Cdecl)] - internal static extern int mtmd_get_audio_bitrate(SafeMtmdModelHandle ctx); + /// + /// get audio sample rate in Hz, for example 16000 for Whisper + /// + /// + /// + [DllImport(mtmdLibraryName, EntryPoint = "mtmd_get_audio_sample_rate", CallingConvention = CallingConvention.Cdecl)] + internal static extern int mtmd_get_audio_sample_rate(SafeMtmdModelHandle ctx); // bitmap ------------------------------------------------------------ @@ -153,9 +179,11 @@ internal static unsafe void mtmd_bitmap_set_id(SafeMtmdEmbed bitmap, string? id) [DllImport(mtmdLibraryName, EntryPoint = "mtmd_image_tokens_get_n_tokens", CallingConvention = CallingConvention.Cdecl)] internal static extern UIntPtr mtmd_image_tokens_get_n_tokens(IntPtr image_tokens); + [Obsolete("use mtmd_image_tokens_get_decoder_pos() instead")] [DllImport(mtmdLibraryName, EntryPoint = "mtmd_image_tokens_get_nx", CallingConvention = CallingConvention.Cdecl)] internal static extern UIntPtr mtmd_image_tokens_get_nx(IntPtr image_tokens); + [Obsolete("use mtmd_image_tokens_get_decoder_pos() instead")] [DllImport(mtmdLibraryName, EntryPoint = "mtmd_image_tokens_get_ny", CallingConvention = CallingConvention.Cdecl)] internal static extern UIntPtr mtmd_image_tokens_get_ny(IntPtr image_tokens); @@ -165,6 +193,28 @@ internal static unsafe void mtmd_bitmap_set_id(SafeMtmdEmbed bitmap, string? 
id) [DllImport(mtmdLibraryName, EntryPoint = "mtmd_image_tokens_get_n_pos", CallingConvention = CallingConvention.Cdecl)] internal static extern int mtmd_image_tokens_get_n_pos(IntPtr image_tokens); + [StructLayout(LayoutKind.Explicit)] + internal struct mtmd_decoder_pos // returned by value from mtmd_image_tokens_get_decoder_pos; fields are public so callers can read the position + { + [FieldOffset(0)] + public uint t; + + [FieldOffset(4)] + public uint x; + + [FieldOffset(8)] + public uint y; + }; + + /// + /// get position for decoder attention, to be used by M-RoPE models + /// + /// + /// i is the index of the embedding token, ranging from 0 to mtmd_image_tokens_get_n_tokens() - 1 + /// return relative position (for example, embedding 0 will have position (0, 0, 0); remember to adjust it to the current absolute position) + [DllImport(mtmdLibraryName, EntryPoint = "mtmd_image_tokens_get_decoder_pos", CallingConvention = CallingConvention.Cdecl)] + internal static extern mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(IntPtr image_tokens, nuint i); + // tokenize ---------------------------------------------------------- /// @@ -259,6 +309,11 @@ internal static unsafe IntPtr mtmd_helper_bitmap_init_from_file(SafeMtmdModelHan [DllImport(mtmdLibraryName, EntryPoint = "mtmd_helper_get_n_pos", CallingConvention = CallingConvention.Cdecl)] internal static extern int mtmd_helper_get_n_pos(SafeMtmdInputChunks chunks); + [DllImport(mtmdLibraryName, EntryPoint = "mtmd_helper_image_get_decoder_pos", CallingConvention = CallingConvention.Cdecl)] + // helper to get the list of relative positions corresponding to the embedding tokens, to be used by M-RoPE + // out_pos must have length == mtmd_helper_get_n_tokens(image) + internal static extern void mtmd_helper_image_get_decoder_pos(IntPtr /* mtmd_image_tokens* */ image, IntPtr /* mtmd_decoder_pos* */ out_pos); + [DllImport(mtmdLibraryName, EntryPoint = "mtmd_helper_eval_chunks", CallingConvention = CallingConvention.Cdecl)] internal static extern int mtmd_helper_eval_chunks( SafeMtmdModelHandle ctx, diff --git a/LLama/Native/NativeApi.cs
b/LLama/Native/NativeApi.cs index 381754103..cbdd05a53 100644 --- a/LLama/Native/NativeApi.cs +++ b/LLama/Native/NativeApi.cs @@ -508,5 +508,13 @@ struct gguf_context * metadata, void* set_tensor_data_ud, // userdata for function struct llama_model_params params); */ + + /* + // Cannot be wrapped, pinvoke doesn't understand FILE + // Load a model from an open FILE pointer + LLAMA_API struct llama_model * llama_model_load_from_file_ptr( + FILE* file, + struct llama_model_params params); + */ } } diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs index 71261eefb..0041fbf5e 100644 --- a/LLama/Native/SafeLLamaContextHandle.cs +++ b/LLama/Native/SafeLLamaContextHandle.cs @@ -489,8 +489,12 @@ public void SetLoraAdapters(params Span<(LoraAdapter Adapter, float Scale)> adap { // Check adapters are all valid and attached to this model foreach (var adapter in adapters) + { if (adapter.Adapter.Model != ModelHandle) throw new ArgumentException("Cannot add LoRA adapter which was loaded for a different model"); + if (!adapter.Adapter.Loaded) + throw new ArgumentException("Cannot add LoRA adapter which has been unloaded"); + } // Copy data into buffers Span adapterPtrs = stackalloc IntPtr[adapters.Length]; diff --git a/LLama/Native/SafeMtmdModelHandle.cs b/LLama/Native/SafeMtmdModelHandle.cs index 4a356f637..24ccc8ee2 100644 --- a/LLama/Native/SafeMtmdModelHandle.cs +++ b/LLama/Native/SafeMtmdModelHandle.cs @@ -348,7 +348,7 @@ public int DecodeImageChunk(IntPtr chunkPtr, SafeLLamaContextHandle llamaContext /// - /// Gets the audio bitrate advertised by the model. + /// Gets the audio sample rate advertised by the model. /// - public int GetAudioBitrate() => NativeApi.mtmd_get_audio_bitrate(this); + public int GetAudioSampleRate() => NativeApi.mtmd_get_audio_sample_rate(this); private void EnsureNotDisposed() {