8 changes: 8 additions & 0 deletions LLama.Web/Common/AppType.cs
@@ -0,0 +1,8 @@
namespace LLama.Web.Common
{
    public enum AppType
    {
        Web = 0,
        WebApi = 1
    }
}
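A quick sketch of how this flag might steer startup (hypothetical Program.cs wiring, not part of this diff; the "LLamaOptions" configuration section name is an assumption):

using LLama.Web.Common;

var builder = WebApplication.CreateBuilder(args);
builder.Services.AddRazorPages();
builder.Services.AddControllers();

// Assumed configuration section name; adjust to match appsettings.json.
var options = builder.Configuration.GetSection("LLamaOptions").Get<LLamaOptions>() ?? new LLamaOptions();

var app = builder.Build();
if (options.AppType == AppType.WebApi)
    app.MapControllers();   // expose HTTP API endpoints only
else
    app.MapRazorPages();    // serve the interactive web UI
app.Run();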
103 changes: 103 additions & 0 deletions LLama.Web/Common/InferenceOptions.cs
@@ -0,0 +1,103 @@
using LLama.Common;
using LLama.Abstractions;
using LLama.Native;
using System.Text.Json.Serialization;

namespace LLama.Web.Common
{
    public class InferenceOptions : IInferenceParams
    {
        public string Name { get; set; }

        /// <summary>
        /// number of tokens to keep from the initial prompt
        /// </summary>
        public int TokensKeep { get; set; } = 0;
        /// <summary>
        /// how many new tokens to predict (n_predict); set to -1 to generate indefinitely
        /// until the response completes.
        /// </summary>
        public int MaxTokens { get; set; } = -1;
        /// <summary>
        /// logit bias for specific tokens
        /// </summary>
        public Dictionary<int, float>? LogitBias { get; set; } = null;

        /// <summary>
        /// Sequences where the model will stop generating further tokens.
        /// </summary>
        public IEnumerable<string> AntiPrompts { get; set; } = Array.Empty<string>();
        /// <summary>
        /// path to file for saving/loading model eval state
        /// </summary>
        public string PathSession { get; set; } = string.Empty;
        /// <summary>
        /// string to suffix user inputs with
        /// </summary>
        public string InputSuffix { get; set; } = string.Empty;
        /// <summary>
        /// string to prefix user inputs with
        /// </summary>
        public string InputPrefix { get; set; } = string.Empty;
        /// <summary>
        /// 0 or lower to use vocab size
        /// </summary>
        public int TopK { get; set; } = 40;
        /// <summary>
        /// 1.0 = disabled
        /// </summary>
        public float TopP { get; set; } = 0.95f;
        /// <summary>
        /// 1.0 = disabled
        /// </summary>
        public float TfsZ { get; set; } = 1.0f;
        /// <summary>
        /// 1.0 = disabled
        /// </summary>
        public float TypicalP { get; set; } = 1.0f;
        /// <summary>
        /// 1.0 = disabled
        /// </summary>
        public float Temperature { get; set; } = 0.8f;
        /// <summary>
        /// 1.0 = disabled
        /// </summary>
        public float RepeatPenalty { get; set; } = 1.1f;
        /// <summary>
        /// last n tokens to penalize (0 = disable penalty, -1 = context size) (repeat_last_n)
        /// </summary>
        public int RepeatLastTokensCount { get; set; } = 64;
        /// <summary>
        /// frequency penalty coefficient
        /// 0.0 = disabled
        /// </summary>
        public float FrequencyPenalty { get; set; } = 0.0f;
        /// <summary>
        /// presence penalty coefficient
        /// 0.0 = disabled
        /// </summary>
        public float PresencePenalty { get; set; } = 0.0f;
        /// <summary>
        /// Mirostat uses tokens instead of words.
        /// Algorithm described in the paper https://arxiv.org/abs/2007.14966.
        /// 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0
        /// </summary>
        public MirostatType Mirostat { get; set; } = MirostatType.Disable;
        /// <summary>
        /// target entropy
        /// </summary>
        public float MirostatTau { get; set; } = 5.0f;
        /// <summary>
        /// learning rate
        /// </summary>
        public float MirostatEta { get; set; } = 0.1f;
        /// <summary>
        /// consider newlines as a repeatable token (penalize_nl)
        /// </summary>
        public bool PenalizeNL { get; set; } = true;

        [JsonIgnore]
        public SafeLLamaGrammarHandle Grammar { get; set; }
    }
}
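As a rough usage sketch (the values below are illustrative, not defaults mandated by this PR):

using LLama.Common;
using LLama.Web.Common;

// Illustrative only: common sampling settings for a chat-style session.
var inference = new InferenceOptions
{
    Name = "default",
    MaxTokens = 256,                    // stop after 256 generated tokens
    Temperature = 0.7f,                 // lower = more deterministic output
    TopP = 0.9f,
    RepeatPenalty = 1.1f,
    AntiPrompts = new[] { "User:" },    // stop when the model starts a new user turn
    Mirostat = MirostatType.Disable
};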
11 changes: 2 additions & 9 deletions LLama.Web/Common/LLamaOptions.cs
@@ -2,19 +2,12 @@
{
    public class LLamaOptions
    {
        public AppType AppType { get; set; }
        public ModelCacheType ModelCacheType { get; set; }
        public List<ModelOptions> Models { get; set; }
        public List<PromptOptions> Prompts { get; set; } = new List<PromptOptions>();
        public List<ParameterOptions> Parameters { get; set; } = new List<ParameterOptions>();

        public void Initialize()
        {
            foreach (var prompt in Prompts)
            {
                if (File.Exists(prompt.Path))
                {
                    prompt.Prompt = File.ReadAllText(prompt.Path).Trim();
                }
            }
        }
    }
}
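A hedged sketch of how these options might be registered at startup, continuing the hypothetical Program.cs above (the section name and Options-pattern wiring are assumptions):

// Hypothetical startup registration: bind the "LLamaOptions" section,
// then resolve prompt file contents once before first use.
builder.Services.Configure<LLamaOptions>(builder.Configuration.GetSection("LLamaOptions"));

var llamaOptions = builder.Configuration.GetSection("LLamaOptions").Get<LLamaOptions>();
llamaOptions?.Initialize();   // reads each prompt.Path file into prompt.Prompt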
10 changes: 10 additions & 0 deletions LLama.Web/Common/ModelCacheType.cs
@@ -0,0 +1,10 @@
namespace LLama.Web.Common
{
    public enum ModelCacheType
    {
        Single = 0,
        Multiple = 1,
        PreloadSingle = 2,
        PreloadMultiple = 3,
    }
}
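The diff does not show how these modes are consumed; one plausible reading of the names, as a minimal sketch (the helper below is hypothetical, not part of this PR):

using LLama.Web.Common;

// Hypothetical policy helper: split each cache mode into its two axes,
// eager preloading and whether multiple models may be cached at once.
public static class ModelCachePolicy
{
    public static bool IsPreload(ModelCacheType type) =>
        type is ModelCacheType.PreloadSingle or ModelCacheType.PreloadMultiple;

    public static bool AllowsMultipleModels(ModelCacheType type) =>
        type is ModelCacheType.Multiple or ModelCacheType.PreloadMultiple;
}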
187 changes: 93 additions & 94 deletions LLama.Web/Common/ModelOptions.cs
@@ -5,112 +5,111 @@ namespace LLama.Web.Common
    public class ModelOptions
        : IModelParams
    {
        public string Name { get; set; }
        public int MaxInstances { get; set; }

        /// <summary>
        /// Model context size (n_ctx)
        /// </summary>
        public int ContextSize { get; set; } = 512;
        /// <summary>
        /// the GPU that is used for scratch and small tensors
        /// </summary>
        public int MainGpu { get; set; } = 0;
        /// <summary>
        /// if true, reduce VRAM usage at the cost of performance
        /// </summary>
        public bool LowVram { get; set; } = false;
        /// <summary>
        /// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
        /// </summary>
        public int GpuLayerCount { get; set; } = 20;
        /// <summary>
        /// Seed for the random number generator (seed)
        /// </summary>
        public int Seed { get; set; } = 1686349486;
        /// <summary>
        /// Use f16 instead of f32 for memory kv (memory_f16)
        /// </summary>
        public bool UseFp16Memory { get; set; } = true;
        /// <summary>
        /// Use mmap for faster loads (use_mmap)
        /// </summary>
        public bool UseMemorymap { get; set; } = true;
        /// <summary>
        /// Use mlock to keep model in memory (use_mlock)
        /// </summary>
        public bool UseMemoryLock { get; set; } = false;
        /// <summary>
        /// Compute perplexity over the prompt (perplexity)
        /// </summary>
        public bool Perplexity { get; set; } = false;
        /// <summary>
        /// Model path (model)
        /// </summary>
        public string ModelPath { get; set; }
        /// <summary>
        /// model alias
        /// </summary>
        public string ModelAlias { get; set; } = "unknown";
        /// <summary>
        /// lora adapter path (lora_adapter)
        /// </summary>
        public string LoraAdapter { get; set; } = string.Empty;
        /// <summary>
        /// base model path for the lora adapter (lora_base)
        /// </summary>
        public string LoraBase { get; set; } = string.Empty;
        /// <summary>
        /// Number of threads (-1 = autodetect) (n_threads)
        /// </summary>
        public int Threads { get; set; } = Math.Max(Environment.ProcessorCount / 2, 1);
        /// <summary>
        /// batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
        /// </summary>
        public int BatchSize { get; set; } = 512;

        /// <summary>
        /// Whether to convert eos to newline during the inference.
        /// </summary>
        public bool ConvertEosToNewLine { get; set; } = false;

        /// <summary>
        /// Whether to use embedding mode. (embedding) Note that if this is set to true,
        /// the LLamaModel won't produce text responses anymore.
        /// </summary>
        public bool EmbeddingMode { get; set; } = false;

        /// <summary>
        /// how split tensors should be distributed across GPUs
        /// </summary>
        public float[] TensorSplits { get; set; }

        /// <summary>
        /// Grouped-Query Attention
        /// </summary>
        public int GroupedQueryAttention { get; set; } = 1;

        /// <summary>
        /// RMS Norm Epsilon
        /// </summary>
        public float RmsNormEpsilon { get; set; } = 5e-6f;

        /// <summary>
        /// RoPE base frequency
        /// </summary>
        public float RopeFrequencyBase { get; set; } = 10000.0f;

        /// <summary>
        /// RoPE frequency scaling factor
        /// </summary>
        public float RopeFrequencyScale { get; set; } = 1.0f;

        /// <summary>
        /// Use experimental mul_mat_q kernels
        /// </summary>
        public bool MulMatQ { get; set; }

        /// <summary>
        /// The encoding to use for models
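As a rough sketch tying these options together (names, paths, and values are illustrative, not from this PR):

using LLama.Web.Common;

// Illustrative only: a single local model entry as it might appear in
// the Models list of LLamaOptions.
var model = new ModelOptions
{
    Name = "llama2-7b",
    ModelPath = "models/llama-2-7b.bin",   // hypothetical path
    MaxInstances = 2,
    ContextSize = 2048,
    GpuLayerCount = 20,
    Threads = Math.Max(Environment.ProcessorCount / 2, 1)
};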