8 changes: 8 additions & 0 deletions LLama.Web/Common/AppType.cs
@@ -0,0 +1,8 @@
namespace LLama.Web.Common
{
    public enum AppType
    {
        Web = 0,
        WebApi = 1
    }
}
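A quick sketch of how this flag might steer startup (hypothetical Program.cs wiring, not part of this diff; the "LLamaOptions" configuration section name is an assumption):

using LLama.Web.Common;

var builder = WebApplication.CreateBuilder(args);
builder.Services.AddRazorPages();
builder.Services.AddControllers();

// Assumed configuration section name; adjust to match appsettings.json.
var options = builder.Configuration.GetSection("LLamaOptions").Get<LLamaOptions>() ?? new LLamaOptions();

var app = builder.Build();
if (options.AppType == AppType.WebApi)
    app.MapControllers();   // expose HTTP API endpoints only
else
    app.MapRazorPages();    // serve the interactive web UI
app.Run();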
103 changes: 103 additions & 0 deletions LLama.Web/Common/InferenceOptions.cs
@@ -0,0 +1,103 @@
using LLama.Common;
using LLama.Abstractions;
using LLama.Native;
using System.Text.Json.Serialization;

namespace LLama.Web.Common
{
    public class InferenceOptions : IInferenceParams
    {
        public string Name { get; set; }

        /// <summary>
        /// number of tokens to keep from the initial prompt
        /// </summary>
        public int TokensKeep { get; set; } = 0;
        /// <summary>
        /// how many new tokens to predict (n_predict); set to -1 to generate indefinitely
        /// until the response completes.
        /// </summary>
        public int MaxTokens { get; set; } = -1;
        /// <summary>
        /// logit bias for specific tokens
        /// </summary>
        public Dictionary<int, float>? LogitBias { get; set; } = null;

        /// <summary>
        /// Sequences where the model will stop generating further tokens.
        /// </summary>
        public IEnumerable<string> AntiPrompts { get; set; } = Array.Empty<string>();
        /// <summary>
        /// path to file for saving/loading model eval state
        /// </summary>
        public string PathSession { get; set; } = string.Empty;
        /// <summary>
        /// string to suffix user inputs with
        /// </summary>
        public string InputSuffix { get; set; } = string.Empty;
        /// <summary>
        /// string to prefix user inputs with
        /// </summary>
        public string InputPrefix { get; set; } = string.Empty;
        /// <summary>
        /// 0 or lower to use vocab size
        /// </summary>
        public int TopK { get; set; } = 40;
        /// <summary>
        /// 1.0 = disabled
        /// </summary>
        public float TopP { get; set; } = 0.95f;
        /// <summary>
        /// 1.0 = disabled
        /// </summary>
        public float TfsZ { get; set; } = 1.0f;
        /// <summary>
        /// 1.0 = disabled
        /// </summary>
        public float TypicalP { get; set; } = 1.0f;
        /// <summary>
        /// 1.0 = disabled
        /// </summary>
        public float Temperature { get; set; } = 0.8f;
        /// <summary>
        /// 1.0 = disabled
        /// </summary>
        public float RepeatPenalty { get; set; } = 1.1f;
        /// <summary>
        /// last n tokens to penalize (0 = disable penalty, -1 = context size) (repeat_last_n)
        /// </summary>
        public int RepeatLastTokensCount { get; set; } = 64;
        /// <summary>
        /// frequency penalty coefficient
        /// 0.0 = disabled
        /// </summary>
        public float FrequencyPenalty { get; set; } = 0.0f;
        /// <summary>
        /// presence penalty coefficient
        /// 0.0 = disabled
        /// </summary>
        public float PresencePenalty { get; set; } = 0.0f;
        /// <summary>
        /// Mirostat uses tokens instead of words.
        /// Algorithm described in the paper https://arxiv.org/abs/2007.14966.
        /// 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0
        /// </summary>
        public MirostatType Mirostat { get; set; } = MirostatType.Disable;
        /// <summary>
        /// target entropy
        /// </summary>
        public float MirostatTau { get; set; } = 5.0f;
        /// <summary>
        /// learning rate
        /// </summary>
        public float MirostatEta { get; set; } = 0.1f;
        /// <summary>
        /// consider newlines as a repeatable token (penalize_nl)
        /// </summary>
        public bool PenalizeNL { get; set; } = true;

        [JsonIgnore]
        public SafeLLamaGrammarHandle Grammar { get; set; }
    }
}
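As a rough usage sketch (the values below are illustrative, not defaults mandated by this PR):

using LLama.Common;
using LLama.Web.Common;

// Illustrative only: common sampling settings for a chat-style session.
var inference = new InferenceOptions
{
    Name = "default",
    MaxTokens = 256,                    // stop after 256 generated tokens
    Temperature = 0.7f,                 // lower = more deterministic output
    TopP = 0.9f,
    RepeatPenalty = 1.1f,
    AntiPrompts = new[] { "User:" },    // stop when the model starts a new user turn
    Mirostat = MirostatType.Disable
};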
11 changes: 2 additions & 9 deletions LLama.Web/Common/LLamaOptions.cs
@@ -2,19 +2,12 @@
{
    public class LLamaOptions
    {
        public AppType AppType { get; set; }
        public ModelCacheType ModelCacheType { get; set; }
        public List<ModelOptions> Models { get; set; }
        public List<PromptOptions> Prompts { get; set; } = new List<PromptOptions>();
        public List<ParameterOptions> Parameters { get; set; } = new List<ParameterOptions>();

        public void Initialize()
        {
            foreach (var prompt in Prompts)
            {
                if (File.Exists(prompt.Path))
                {
                    prompt.Prompt = File.ReadAllText(prompt.Path).Trim();
                }
            }
        }
    }
}
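A hedged sketch of how these options might be registered at startup, continuing the hypothetical Program.cs above (the section name and Options-pattern wiring are assumptions):

// Hypothetical startup registration: bind the "LLamaOptions" section,
// then resolve prompt file contents once before first use.
builder.Services.Configure<LLamaOptions>(builder.Configuration.GetSection("LLamaOptions"));

var llamaOptions = builder.Configuration.GetSection("LLamaOptions").Get<LLamaOptions>();
llamaOptions?.Initialize();   // reads each prompt.Path file into prompt.Prompt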
10 changes: 10 additions & 0 deletions LLama.Web/Common/ModelCacheType.cs
@@ -0,0 +1,10 @@
namespace LLama.Web.Common
{
    public enum ModelCacheType
    {
        Single = 0,
        Multiple = 1,
        PreloadSingle = 2,
        PreloadMultiple = 3,
    }
}
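The diff does not show how these modes are consumed; one plausible reading of the names, as a minimal sketch (the helper below is hypothetical, not part of this PR):

using LLama.Web.Common;

// Hypothetical policy helper: split each cache mode into its two axes,
// eager preloading and whether multiple models may be cached at once.
public static class ModelCachePolicy
{
    public static bool IsPreload(ModelCacheType type) =>
        type is ModelCacheType.PreloadSingle or ModelCacheType.PreloadMultiple;

    public static bool AllowsMultipleModels(ModelCacheType type) =>
        type is ModelCacheType.Multiple or ModelCacheType.PreloadMultiple;
}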
187 changes: 93 additions & 94 deletions LLama.Web/Common/ModelOptions.cs
@@ -5,112 +5,111 @@ namespace LLama.Web.Common
    public class ModelOptions
        : IModelParams
    {
        public string Name { get; set; }
        public int MaxInstances { get; set; }

        /// <summary>
        /// Model context size (n_ctx)
        /// </summary>
        public int ContextSize { get; set; } = 512;
        /// <summary>
        /// the GPU that is used for scratch and small tensors
        /// </summary>
        public int MainGpu { get; set; } = 0;
        /// <summary>
        /// if true, reduce VRAM usage at the cost of performance
        /// </summary>
        public bool LowVram { get; set; } = false;
        /// <summary>
        /// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
        /// </summary>
        public int GpuLayerCount { get; set; } = 20;
        /// <summary>
        /// Seed for the random number generator (seed)
        /// </summary>
        public int Seed { get; set; } = 1686349486;
        /// <summary>
        /// Use f16 instead of f32 for memory kv (memory_f16)
        /// </summary>
        public bool UseFp16Memory { get; set; } = true;
        /// <summary>
        /// Use mmap for faster loads (use_mmap)
        /// </summary>
        public bool UseMemorymap { get; set; } = true;
        /// <summary>
        /// Use mlock to keep model in memory (use_mlock)
        /// </summary>
        public bool UseMemoryLock { get; set; } = false;
        /// <summary>
        /// Compute perplexity over the prompt (perplexity)
        /// </summary>
        public bool Perplexity { get; set; } = false;
        /// <summary>
        /// Model path (model)
        /// </summary>
        public string ModelPath { get; set; }
        /// <summary>
        /// model alias
        /// </summary>
        public string ModelAlias { get; set; } = "unknown";
        /// <summary>
        /// lora adapter path (lora_adapter)
        /// </summary>
        public string LoraAdapter { get; set; } = string.Empty;
        /// <summary>
        /// base model path for the lora adapter (lora_base)
        /// </summary>
        public string LoraBase { get; set; } = string.Empty;
        /// <summary>
        /// Number of threads (-1 = autodetect) (n_threads)
        /// </summary>
        public int Threads { get; set; } = Math.Max(Environment.ProcessorCount / 2, 1);
        /// <summary>
        /// batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
        /// </summary>
        public int BatchSize { get; set; } = 512;

        /// <summary>
        /// Whether to convert eos to newline during the inference.
        /// </summary>
        public bool ConvertEosToNewLine { get; set; } = false;

        /// <summary>
        /// Whether to use embedding mode. (embedding) Note that if this is set to true,
        /// the LLamaModel won't produce text responses anymore.
        /// </summary>
        public bool EmbeddingMode { get; set; } = false;

        /// <summary>
        /// how split tensors should be distributed across GPUs
        /// </summary>
        public float[] TensorSplits { get; set; }

        /// <summary>
        /// Grouped-Query Attention
        /// </summary>
        public int GroupedQueryAttention { get; set; } = 1;

        /// <summary>
        /// RMS Norm Epsilon
        /// </summary>
        public float RmsNormEpsilon { get; set; } = 5e-6f;

        /// <summary>
        /// RoPE base frequency
        /// </summary>
        public float RopeFrequencyBase { get; set; } = 10000.0f;

        /// <summary>
        /// RoPE frequency scaling factor
        /// </summary>
        public float RopeFrequencyScale { get; set; } = 1.0f;

        /// <summary>
        /// Use experimental mul_mat_q kernels
        /// </summary>
        public bool MulMatQ { get; set; }

        /// <summary>
        /// The encoding to use for models
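As a rough sketch tying these options together (names, paths, and values are illustrative, not from this PR):

using LLama.Web.Common;

// Illustrative only: a single local model entry as it might appear in
// the Models list of LLamaOptions.
var model = new ModelOptions
{
    Name = "llama2-7b",
    ModelPath = "models/llama-2-7b.bin",   // hypothetical path
    MaxInstances = 2,
    ContextSize = 2048,
    GpuLayerCount = 20,
    Threads = Math.Max(Environment.ProcessorCount / 2, 1)
};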