From baf96d319093fbf8d9c971e27d6be6ebbbacac67 Mon Sep 17 00:00:00 2001 From: Devis Lucato Date: Tue, 26 Nov 2024 23:22:29 -0800 Subject: [PATCH 1/4] Allow to select tokenizers via configuration - allow to configure tokenizer dependencies - auto detect tokenizer when using OpenAI - fix a few DI missing params --- KernelMemory.sln.DotSettings | 1 + .../002-dotnet-Serverless/appsettings.json | 31 +++++++- examples/210-KM-without-builder/Program.cs | 7 +- .../210-KM-without-builder/appsettings.json | 20 ++++- examples/212-dotnet-ollama/Program.cs | 4 +- extensions/Anthropic/AnthropicConfig.cs | 6 ++ .../Anthropic/AnthropicTextGeneration.cs | 5 +- .../AzureOpenAI/AzureOpenAIConfig.cs | 6 ++ .../AzureOpenAITextEmbeddingGenerator.cs | 5 +- .../AzureOpenAI/AzureOpenAITextGenerator.cs | 5 +- .../Internals/KernelMemoryComposer.cs | 31 +++----- extensions/ONNX/Onnx/OnnxTextGenerator.cs | 4 +- extensions/Ollama/Ollama/OllamaModelConfig.cs | 6 ++ .../Ollama/OllamaTextEmbeddingGenerator.cs | 5 +- .../Ollama/Ollama/OllamaTextGenerator.cs | 5 +- .../OpenAI.UnitTests/GPTTokenizersTests.cs | 6 +- .../OpenAI/OpenAI/DependencyInjection.cs | 6 +- extensions/OpenAI/OpenAI/OpenAIConfig.cs | 14 +++- .../OpenAI/OpenAITextEmbeddingGenerator.cs | 10 ++- .../OpenAI/OpenAI/OpenAITextGenerator.cs | 10 ++- .../{GPT4Tokenizer.cs => CL100KTokenizer.cs} | 7 +- .../OpenAI/OpenAI/Tokenizers/GPT3Tokenizer.cs | 29 -------- .../OpenAI/OpenAI/Tokenizers/GPTTokenizers.cs | 34 +++++++++ .../{GPT4oTokenizer.cs => O200KTokenizer.cs} | 8 +- .../{GPT2Tokenizer.cs => P50KTokenizer.cs} | 7 +- .../OpenAI/Tokenizers/TiktokenTokenizer.cs | 41 +++++++++++ .../OpenAI/Tokenizers/TokenizerFactory.cs | 73 +++++++++++++++++++ service/Service/appsettings.json | 23 ++++++ .../Services/AzureOpenAIEmbedding.cs | 2 + .../Services/AzureOpenAIText.cs | 2 + 30 files changed, 312 insertions(+), 101 deletions(-) rename extensions/OpenAI/OpenAI/Tokenizers/{GPT4Tokenizer.cs => CL100KTokenizer.cs} (74%) delete mode 100644 extensions/OpenAI/OpenAI/Tokenizers/GPT3Tokenizer.cs create mode 100644 extensions/OpenAI/OpenAI/Tokenizers/GPTTokenizers.cs rename extensions/OpenAI/OpenAI/Tokenizers/{GPT4oTokenizer.cs => O200KTokenizer.cs} (70%) rename extensions/OpenAI/OpenAI/Tokenizers/{GPT2Tokenizer.cs => P50KTokenizer.cs} (73%) create mode 100644 extensions/OpenAI/OpenAI/Tokenizers/TiktokenTokenizer.cs create mode 100644 extensions/OpenAI/OpenAI/Tokenizers/TokenizerFactory.cs diff --git a/KernelMemory.sln.DotSettings b/KernelMemory.sln.DotSettings index e0cd1b828..4d9e859e9 100644 --- a/KernelMemory.sln.DotSettings +++ b/KernelMemory.sln.DotSettings @@ -94,6 +94,7 @@ AMQP API BOM + CL CORS DB DI diff --git a/examples/002-dotnet-Serverless/appsettings.json b/examples/002-dotnet-Serverless/appsettings.json index 09477c1be..c76954983 100644 --- a/examples/002-dotnet-Serverless/appsettings.json +++ b/examples/002-dotnet-Serverless/appsettings.json @@ -78,20 +78,30 @@ "Auth": "AzureIdentity", "Endpoint": "https://<...>.openai.azure.com/", "APIKey": "", + // Your Azure Deployment name "Deployment": "", // The max number of tokens supported by model deployed // See https://learn.microsoft.com/azure/ai-services/openai/concepts/models "MaxTokenTotal": 8191, + // Which tokenizer to use to correctly measure the size of chunks. + // Supported values: "p50k", "cl100k", "o200k". Leave it empty if unsure. 
+ // - Use p50k for the old text-davinci-003 models + // - Use cl100k for the old gpt-3.4 and gpt-4 family, and for text embedding models + // - Use o200k for the most recent gpt-4o family + "Tokenizer": "cl100k", // The number of dimensions output embeddings should have. // Only supported in "text-embedding-3" and later models developed with // MRL, see https://arxiv.org/abs/2205.13147 "EmbeddingDimensions": null, // How many embeddings to calculate in parallel. The max value depends on // the model and deployment in use. - // See also hhttps://learn.microsoft.com/azure/ai-services/openai/reference#embeddings - "MaxEmbeddingBatchSize": 10, + // See https://learn.microsoft.com/azure/ai-services/openai/reference#embeddings + "MaxEmbeddingBatchSize": 1, // How many times to retry in case of throttling. - "MaxRetries": 10 + "MaxRetries": 10, + // Thumbprints of certificates that should be trusted for HTTPS requests when SSL policy errors are detected. + // This should only be used for local development when using a proxy to call the OpenAI endpoints. + "TrustedCertificateThumbprints": [] }, "AzureOpenAIText": { // "ApiKey" or "AzureIdentity" @@ -104,16 +114,27 @@ // The max number of tokens supported by model deployed // See https://learn.microsoft.com/azure/ai-services/openai/concepts/models "MaxTokenTotal": 16384, + // Which tokenizer to use to correctly measure the size of chunks. + // Supported values: "p50k", "cl100k", "o200k". Leave it empty if unsure. + // - Use p50k for the old text-davinci-003 models + // - Use cl100k for the old gpt-3.4 and gpt-4 family, and for text embedding models + // - Use o200k for the most recent gpt-4o family + "Tokenizer": "o200k", // "ChatCompletion" or "TextCompletion" "APIType": "ChatCompletion", // How many times to retry in case of throttling. - "MaxRetries": 10 + "MaxRetries": 10, + // Thumbprints of certificates that should be trusted for HTTPS requests when SSL policy errors are detected. + // This should only be used for local development when using a proxy to call the OpenAI endpoints. + "TrustedCertificateThumbprints": [] }, "OpenAI": { // Name of the model used to generate text (text completion or chat completion) "TextModel": "gpt-4o-mini", // The max number of tokens supported by the text model. "TextModelMaxTokenTotal": 16384, + // Supported values: "p50k", "cl100k", "o200k". Leave it empty for autodetect. + "TextModelTokenizer": "", // What type of text generation, by default autodetect using the model name. // Possible values: "Auto", "TextCompletion", "Chat" "TextGenerationType": "Auto", @@ -122,6 +143,8 @@ // The max number of tokens supported by the embedding model // See https://platform.openai.com/docs/guides/embeddings/what-are-embeddings "EmbeddingModelMaxTokenTotal": 8191, + // Supported values: "p50k", "cl100k", "o200k". Leave it empty for autodetect. 
+ "EmbeddingModelTokenizer": "", // OpenAI API Key "APIKey": "", // OpenAI Organization ID (usually empty, unless you have multiple accounts on different orgs) diff --git a/examples/210-KM-without-builder/Program.cs b/examples/210-KM-without-builder/Program.cs index 888446b9b..3324b439c 100644 --- a/examples/210-KM-without-builder/Program.cs +++ b/examples/210-KM-without-builder/Program.cs @@ -79,11 +79,12 @@ public static async Task Main() var promptProvider = new EmbeddedPromptProvider(); // AI dependencies - var tokenizer = new GPT4oTokenizer(); + var tokenizerForChat = new O200KTokenizer(); + var tokenizerForEmbeddings = new CL100KTokenizer(); var embeddingGeneratorHttpClient = new HttpClient(); - var embeddingGenerator = new AzureOpenAITextEmbeddingGenerator(azureOpenAIEmbeddingConfig, tokenizer, loggerFactory, embeddingGeneratorHttpClient); + var embeddingGenerator = new AzureOpenAITextEmbeddingGenerator(azureOpenAIEmbeddingConfig, tokenizerForEmbeddings, loggerFactory, embeddingGeneratorHttpClient); var textGeneratorHttpClient = new HttpClient(); - var textGenerator = new AzureOpenAITextGenerator(azureOpenAITextConfig, tokenizer, loggerFactory, textGeneratorHttpClient); + var textGenerator = new AzureOpenAITextGenerator(azureOpenAITextConfig, tokenizerForChat, loggerFactory, textGeneratorHttpClient); var contentModeration = new AzureAIContentSafetyModeration(azureAIContentSafetyModerationConfig, loggerFactory); // Storage diff --git a/examples/210-KM-without-builder/appsettings.json b/examples/210-KM-without-builder/appsettings.json index 331025b97..e283025ed 100644 --- a/examples/210-KM-without-builder/appsettings.json +++ b/examples/210-KM-without-builder/appsettings.json @@ -217,7 +217,7 @@ // the same line verbatim. "FrequencyPenalty": 0, // Sequences where the completion will stop generating further tokens. - "StopSequences": [] + "StopSequences": [], // Modify the likelihood of specified tokens appearing in the completion. //"TokenSelectionBiases": { } // Whether to check is the generated answers are safe. @@ -232,6 +232,8 @@ "ApiKey": "", // See https://docs.anthropic.com/claude/docs/models-overview for list of models and details "TextModelName": "claude-3-haiku-20240307", + // Supported values: "p50k", "cl100k", "o200k". Leave it empty if unsure. + "Tokenizer": "cl100k", // How many tokens the model can receive in input and generate in output // See https://docs.anthropic.com/claude/docs/models-overview "MaxTokenIn": 200000, @@ -297,6 +299,12 @@ // The max number of tokens supported by model deployed // See https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models "MaxTokenTotal": 8191, + // Which tokenizer to use to correctly measure the size of chunks. + // Supported values: "p50k", "cl100k", "o200k". Leave it empty if unsure. + // - Use p50k for the old text-davinci-003 models + // - Use cl100k for the old gpt-3.4 and gpt-4 family, and for text embedding models + // - Use o200k for the most recent gpt-4o family + "Tokenizer": "cl100k", // The number of dimensions output embeddings should have. // Only supported in "text-embedding-3" and later models developed with // MRL, see https://arxiv.org/abs/2205.13147 @@ -319,6 +327,12 @@ // The max number of tokens supported by model deployed // See https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models "MaxTokenTotal": 16384, + // Which tokenizer to use to correctly measure the size of chunks. + // Supported values: "p50k", "cl100k", "o200k". Leave it empty if unsure. 
+ // - Use p50k for the old text-davinci-003 models + // - Use cl100k for the old gpt-3.4 and gpt-4 family, and for text embedding models + // - Use o200k for the most recent gpt-4o family + "Tokenizer": "o200k", // "ChatCompletion" or "TextCompletion" "APIType": "ChatCompletion", // How many times to retry in case of throttling. @@ -389,6 +403,8 @@ "TextModel": "gpt-4o-mini", // The max number of tokens supported by the text model. "TextModelMaxTokenTotal": 16384, + // Supported values: "p50k", "cl100k", "o200k". Leave it empty for autodetect. + "TextModelTokenizer": "", // What type of text generation, by default autodetect using the model name. // Possible values: "Auto", "TextCompletion", "Chat" "TextGenerationType": "Auto", @@ -397,6 +413,8 @@ // The max number of tokens supported by the embedding model // See https://platform.openai.com/docs/guides/embeddings/what-are-embeddings "EmbeddingModelMaxTokenTotal": 8191, + // Supported values: "p50k", "cl100k", "o200k". Leave it empty for autodetect. + "EmbeddingModelTokenizer": "", // OpenAI API Key "APIKey": "", // OpenAI Organization ID (usually empty, unless you have multiple accounts on different orgs) diff --git a/examples/212-dotnet-ollama/Program.cs b/examples/212-dotnet-ollama/Program.cs index c7492564c..4d149fcca 100644 --- a/examples/212-dotnet-ollama/Program.cs +++ b/examples/212-dotnet-ollama/Program.cs @@ -36,8 +36,8 @@ public static async Task Main() }; var memory = new KernelMemoryBuilder() - .WithOllamaTextGeneration(config, new GPT4oTokenizer()) - .WithOllamaTextEmbeddingGeneration(config, new GPT4oTokenizer()) + .WithOllamaTextGeneration(config, new CL100KTokenizer()) + .WithOllamaTextEmbeddingGeneration(config, new CL100KTokenizer()) .Configure(builder => builder.Services.AddLogging(l => { l.SetMinimumLevel(logLevel); diff --git a/extensions/Anthropic/AnthropicConfig.cs b/extensions/Anthropic/AnthropicConfig.cs index da6a1baf1..111de13a0 100644 --- a/extensions/Anthropic/AnthropicConfig.cs +++ b/extensions/Anthropic/AnthropicConfig.cs @@ -48,6 +48,12 @@ public class AnthropicConfig /// public int MaxTokenOut { get; set; } = 4096; + /// + /// Name of the tokenizer used to count tokens. + /// Supported values: "p50k", "cl100k", "o200k". Leave it empty if unsure. + /// + public string Tokenizer { get; set; } = "cl100k"; + /// /// System prompt used when generating text /// diff --git a/extensions/Anthropic/AnthropicTextGeneration.cs b/extensions/Anthropic/AnthropicTextGeneration.cs index df0252ce4..8121b34da 100644 --- a/extensions/Anthropic/AnthropicTextGeneration.cs +++ b/extensions/Anthropic/AnthropicTextGeneration.cs @@ -70,12 +70,13 @@ public AnthropicTextGeneration( var endpointVersion = string.IsNullOrWhiteSpace(config.Endpoint) ? DefaultEndpointVersion : config.EndpointVersion; this._client = new RawAnthropicClient(this._httpClient, endpoint, endpointVersion, config.ApiKey); + textTokenizer ??= TokenizerFactory.GetTokenizerForEncoding(config.Tokenizer); if (textTokenizer == null) { this._log.LogWarning( "Tokenizer not specified, will use {0}. 
The token count might be incorrect, causing unexpected errors", - nameof(GPT4oTokenizer)); - textTokenizer = new GPT4oTokenizer(); + nameof(CL100KTokenizer)); + textTokenizer = new CL100KTokenizer(); } this._textTokenizer = textTokenizer; diff --git a/extensions/AzureOpenAI/AzureOpenAI/AzureOpenAIConfig.cs b/extensions/AzureOpenAI/AzureOpenAI/AzureOpenAIConfig.cs index a50938d59..f86a53083 100644 --- a/extensions/AzureOpenAI/AzureOpenAI/AzureOpenAIConfig.cs +++ b/extensions/AzureOpenAI/AzureOpenAI/AzureOpenAIConfig.cs @@ -63,6 +63,12 @@ public enum APITypes /// public int MaxTokenTotal { get; set; } = 8191; + /// + /// Name of the tokenizer used to count tokens. + /// Supported values: "p50k", "cl100k", "o200k". Leave it empty if unsure. + /// + public string Tokenizer { get; set; } = "cl100k"; + /// /// The number of dimensions output embeddings should have. /// Only supported in "text-embedding-3" and later models developed with diff --git a/extensions/AzureOpenAI/AzureOpenAI/AzureOpenAITextEmbeddingGenerator.cs b/extensions/AzureOpenAI/AzureOpenAI/AzureOpenAITextEmbeddingGenerator.cs index 7aa954d36..b4777b8cc 100644 --- a/extensions/AzureOpenAI/AzureOpenAI/AzureOpenAITextEmbeddingGenerator.cs +++ b/extensions/AzureOpenAI/AzureOpenAI/AzureOpenAITextEmbeddingGenerator.cs @@ -95,12 +95,13 @@ public AzureOpenAITextEmbeddingGenerator( this.MaxTokens = config.MaxTokenTotal; this.MaxBatchSize = config.MaxEmbeddingBatchSize; + textTokenizer ??= TokenizerFactory.GetTokenizerForEncoding(config.Tokenizer); if (textTokenizer == null) { this._log.LogWarning( "Tokenizer not specified, will use {0}. The token count might be incorrect, causing unexpected errors", - nameof(GPT4oTokenizer)); - textTokenizer = new GPT4oTokenizer(); + nameof(CL100KTokenizer)); + textTokenizer = new CL100KTokenizer(); } this._textTokenizer = textTokenizer; diff --git a/extensions/AzureOpenAI/AzureOpenAI/AzureOpenAITextGenerator.cs b/extensions/AzureOpenAI/AzureOpenAI/AzureOpenAITextGenerator.cs index a95e48c45..2606c3cb4 100644 --- a/extensions/AzureOpenAI/AzureOpenAI/AzureOpenAITextGenerator.cs +++ b/extensions/AzureOpenAI/AzureOpenAI/AzureOpenAITextGenerator.cs @@ -90,12 +90,13 @@ public AzureOpenAITextGenerator( this._log = (loggerFactory ?? DefaultLogger.Factory).CreateLogger(); this.MaxTokenTotal = config.MaxTokenTotal; + textTokenizer ??= TokenizerFactory.GetTokenizerForEncoding(config.Tokenizer); if (textTokenizer == null) { this._log.LogWarning( "Tokenizer not specified, will use {0}. 
The token count might be incorrect, causing unexpected errors", - nameof(GPT4oTokenizer)); - textTokenizer = new GPT4oTokenizer(); + nameof(O200KTokenizer)); + textTokenizer = new O200KTokenizer(); } this._textTokenizer = textTokenizer; diff --git a/extensions/KM/KernelMemory/Internals/KernelMemoryComposer.cs b/extensions/KM/KernelMemory/Internals/KernelMemoryComposer.cs index a850099cb..9068c9e77 100644 --- a/extensions/KM/KernelMemory/Internals/KernelMemoryComposer.cs +++ b/extensions/KM/KernelMemory/Internals/KernelMemoryComposer.cs @@ -7,7 +7,6 @@ using Microsoft.KernelMemory.AI; using Microsoft.KernelMemory.AI.Anthropic; using Microsoft.KernelMemory.AI.Ollama; -using Microsoft.KernelMemory.AI.OpenAI; using Microsoft.KernelMemory.DocumentStorage.DevTools; using Microsoft.KernelMemory.MemoryDb.SQLServer; using Microsoft.KernelMemory.MemoryStorage; @@ -201,8 +200,7 @@ private void ConfigureIngestionEmbeddingGenerators() { var instance = this.GetServiceInstance( s => s.AddAzureOpenAIEmbeddingGeneration( - config: this.GetServiceConfig("AzureOpenAIEmbedding"), - textTokenizer: new GPT4oTokenizer())); + config: this.GetServiceConfig("AzureOpenAIEmbedding"))); this._builder.AddIngestionEmbeddingGenerator(instance); break; } @@ -211,8 +209,7 @@ private void ConfigureIngestionEmbeddingGenerators() { var instance = this.GetServiceInstance( s => s.AddOpenAITextEmbeddingGeneration( - config: this.GetServiceConfig("OpenAI"), - textTokenizer: new GPT4oTokenizer())); + config: this.GetServiceConfig("OpenAI"))); this._builder.AddIngestionEmbeddingGenerator(instance); break; } @@ -221,8 +218,7 @@ private void ConfigureIngestionEmbeddingGenerators() { var instance = this.GetServiceInstance( s => s.AddOllamaTextEmbeddingGeneration( - config: this.GetServiceConfig("Ollama"), - textTokenizer: new GPT4oTokenizer())); + config: this.GetServiceConfig("Ollama"))); this._builder.AddIngestionEmbeddingGenerator(instance); break; } @@ -371,20 +367,17 @@ private void ConfigureRetrievalEmbeddingGenerator() case string x when x.Equals("AzureOpenAI", StringComparison.OrdinalIgnoreCase): case string y when y.Equals("AzureOpenAIEmbedding", StringComparison.OrdinalIgnoreCase): this._builder.Services.AddAzureOpenAIEmbeddingGeneration( - config: this.GetServiceConfig("AzureOpenAIEmbedding"), - textTokenizer: new GPT4oTokenizer()); + config: this.GetServiceConfig("AzureOpenAIEmbedding")); break; case string x when x.Equals("OpenAI", StringComparison.OrdinalIgnoreCase): this._builder.Services.AddOpenAITextEmbeddingGeneration( - config: this.GetServiceConfig("OpenAI"), - textTokenizer: new GPT4oTokenizer()); + config: this.GetServiceConfig("OpenAI")); break; case string x when x.Equals("Ollama", StringComparison.OrdinalIgnoreCase): this._builder.Services.AddOllamaTextEmbeddingGeneration( - config: this.GetServiceConfig("Ollama"), - textTokenizer: new GPT4oTokenizer()); + config: this.GetServiceConfig("Ollama")); break; case string x when x.Equals("LlamaSharp", StringComparison.OrdinalIgnoreCase): @@ -453,26 +446,22 @@ private void ConfigureTextGenerator() case string x when x.Equals("AzureOpenAI", StringComparison.OrdinalIgnoreCase): case string y when y.Equals("AzureOpenAIText", StringComparison.OrdinalIgnoreCase): this._builder.Services.AddAzureOpenAITextGeneration( - config: this.GetServiceConfig("AzureOpenAIText"), - textTokenizer: new GPT4oTokenizer()); + config: this.GetServiceConfig("AzureOpenAIText")); break; case string x when x.Equals("OpenAI", StringComparison.OrdinalIgnoreCase): 
this._builder.Services.AddOpenAITextGeneration( - config: this.GetServiceConfig("OpenAI"), - textTokenizer: new GPT4oTokenizer()); + config: this.GetServiceConfig("OpenAI")); break; case string x when x.Equals("Anthropic", StringComparison.OrdinalIgnoreCase): this._builder.Services.AddAnthropicTextGeneration( - config: this.GetServiceConfig("Anthropic"), - textTokenizer: new GPT4oTokenizer()); + config: this.GetServiceConfig("Anthropic")); break; case string x when x.Equals("Ollama", StringComparison.OrdinalIgnoreCase): this._builder.Services.AddOllamaTextGeneration( - config: this.GetServiceConfig("Ollama"), - textTokenizer: new GPT4oTokenizer()); + config: this.GetServiceConfig("Ollama")); break; case string x when x.Equals("LlamaSharp", StringComparison.OrdinalIgnoreCase): diff --git a/extensions/ONNX/Onnx/OnnxTextGenerator.cs b/extensions/ONNX/Onnx/OnnxTextGenerator.cs index adfa0c8cc..fc6705c18 100644 --- a/extensions/ONNX/Onnx/OnnxTextGenerator.cs +++ b/extensions/ONNX/Onnx/OnnxTextGenerator.cs @@ -64,8 +64,8 @@ public OnnxTextGenerator( { this._log.LogWarning( "Tokenizer not specified, will use {0}. The token count might be incorrect, causing unexpected errors", - nameof(GPT4oTokenizer)); - textTokenizer = new GPT4oTokenizer(); + nameof(O200KTokenizer)); + textTokenizer = new O200KTokenizer(); } config.Validate(); diff --git a/extensions/Ollama/Ollama/OllamaModelConfig.cs b/extensions/Ollama/Ollama/OllamaModelConfig.cs index c1c3af561..6f14a9306 100644 --- a/extensions/Ollama/Ollama/OllamaModelConfig.cs +++ b/extensions/Ollama/Ollama/OllamaModelConfig.cs @@ -15,6 +15,12 @@ public class OllamaModelConfig /// public int? MaxTokenTotal { get; set; } + /// + /// Name of the tokenizer used to count tokens. + /// Supported values: "p50k", "cl100k", "o200k". Leave it empty if unsure. + /// + public string Tokenizer { get; set; } = "cl100k"; + /// /// Enable Mirostat sampling for controlling perplexity. /// (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) diff --git a/extensions/Ollama/Ollama/OllamaTextEmbeddingGenerator.cs b/extensions/Ollama/Ollama/OllamaTextEmbeddingGenerator.cs index 7e4a5ae9b..77afc0ac8 100644 --- a/extensions/Ollama/Ollama/OllamaTextEmbeddingGenerator.cs +++ b/extensions/Ollama/Ollama/OllamaTextEmbeddingGenerator.cs @@ -41,12 +41,13 @@ public OllamaTextEmbeddingGenerator( this.MaxBatchSize = modelConfig.MaxBatchSize; this._log = (loggerFactory ?? DefaultLogger.Factory).CreateLogger(); + textTokenizer ??= TokenizerFactory.GetTokenizerForEncoding(modelConfig.Tokenizer); if (textTokenizer == null) { this._log.LogWarning( "Tokenizer not specified, will use {0}. The token count might be incorrect, causing unexpected errors", - nameof(GPT4oTokenizer)); - textTokenizer = new GPT4oTokenizer(); + nameof(CL100KTokenizer)); + textTokenizer = new CL100KTokenizer(); } this._textTokenizer = textTokenizer; diff --git a/extensions/Ollama/Ollama/OllamaTextGenerator.cs b/extensions/Ollama/Ollama/OllamaTextGenerator.cs index 5f15d7428..694123ed0 100644 --- a/extensions/Ollama/Ollama/OllamaTextGenerator.cs +++ b/extensions/Ollama/Ollama/OllamaTextGenerator.cs @@ -38,12 +38,13 @@ public OllamaTextGenerator( this._modelConfig = modelConfig; this._log = (loggerFactory ?? DefaultLogger.Factory).CreateLogger(); + textTokenizer ??= TokenizerFactory.GetTokenizerForEncoding(modelConfig.Tokenizer); if (textTokenizer == null) { this._log.LogWarning( "Tokenizer not specified, will use {0}. 
The token count might be incorrect, causing unexpected errors", - nameof(GPT4oTokenizer)); - textTokenizer = new GPT4oTokenizer(); + nameof(O200KTokenizer)); + textTokenizer = new O200KTokenizer(); } this._textTokenizer = textTokenizer; diff --git a/extensions/OpenAI/OpenAI.UnitTests/GPTTokenizersTests.cs b/extensions/OpenAI/OpenAI.UnitTests/GPTTokenizersTests.cs index 0d93d1336..025fcbbea 100644 --- a/extensions/OpenAI/OpenAI.UnitTests/GPTTokenizersTests.cs +++ b/extensions/OpenAI/OpenAI.UnitTests/GPTTokenizersTests.cs @@ -15,12 +15,9 @@ public class GPTTokenizersTests(ITestOutputHelper output) : BaseUnitTestCase(out public void CanTokenize() { const string helloWorld = "hello world"; - var gpt2 = new GPT2Tokenizer(); - var tokens = gpt2.GetTokens(helloWorld); - Assert.Equal(["hello", " world"], tokens); var gpt3 = new GPT3Tokenizer(); - tokens = gpt3.GetTokens(helloWorld); + var tokens = gpt3.GetTokens(helloWorld); Assert.Equal(["hello", " world"], tokens); var gpt4 = new GPT4Tokenizer(); @@ -39,7 +36,6 @@ public void TheyCountTokens() { const string text = "{'bos_token': '<|endoftext|>',\n 'eos_token': '<|endoftext|>',\n 'unk_token': '<|endoftext|>'}"; - Assert.Equal(29, new GPT2Tokenizer().CountTokens(text)); Assert.Equal(29, new GPT3Tokenizer().CountTokens(text)); Assert.Equal(21, new GPT4Tokenizer().CountTokens(text)); Assert.Equal(22, new GPT4oTokenizer().CountTokens(text)); diff --git a/extensions/OpenAI/OpenAI/DependencyInjection.cs b/extensions/OpenAI/OpenAI/DependencyInjection.cs index d9c64984b..11ee2a92e 100644 --- a/extensions/OpenAI/OpenAI/DependencyInjection.cs +++ b/extensions/OpenAI/OpenAI/DependencyInjection.cs @@ -92,7 +92,7 @@ public static IKernelMemoryBuilder WithOpenAI( { config.Validate(); builder.WithOpenAITextEmbeddingGeneration(config, textEmbeddingTokenizer, onlyForRetrieval, httpClient); - builder.WithOpenAITextGeneration(config, textGenerationTokenizer); + builder.WithOpenAITextGeneration(config, textGenerationTokenizer, httpClient); return builder; } @@ -137,7 +137,7 @@ public static IKernelMemoryBuilder WithOpenAITextEmbeddingGeneration( HttpClient? httpClient = null) { config.Validate(); - builder.Services.AddOpenAITextEmbeddingGeneration(config, httpClient: httpClient); + builder.Services.AddOpenAITextEmbeddingGeneration(config, textTokenizer, httpClient: httpClient); if (!onlyForRetrieval) { builder.AddIngestionEmbeddingGenerator( @@ -164,7 +164,7 @@ public static IKernelMemoryBuilder WithOpenAITextEmbeddingGeneration( bool onlyForRetrieval = false) { config.Validate(); - builder.Services.AddOpenAITextEmbeddingGeneration(config, openAIClient); + builder.Services.AddOpenAITextEmbeddingGeneration(config, openAIClient, textTokenizer); if (!onlyForRetrieval) { builder.AddIngestionEmbeddingGenerator( diff --git a/extensions/OpenAI/OpenAI/OpenAIConfig.cs b/extensions/OpenAI/OpenAI/OpenAIConfig.cs index 2b8e234ff..8e5eee5dc 100644 --- a/extensions/OpenAI/OpenAI/OpenAIConfig.cs +++ b/extensions/OpenAI/OpenAI/OpenAIConfig.cs @@ -53,7 +53,13 @@ public enum TextGenerationTypes public int TextModelMaxTokenTotal { get; set; } = 8192; /// - /// Model used to embedding generation/ + /// Name of the tokenizer used to count tokens. + /// Supported values: "p50k", "cl100k", "o200k". Leave it empty for autodetect. + /// + public string TextModelTokenizer { get; set; } = string.Empty; + + /// + /// Model used to embedding generation. 
/// public string EmbeddingModel { get; set; } = string.Empty; @@ -63,6 +69,12 @@ public enum TextGenerationTypes /// public int EmbeddingModelMaxTokenTotal { get; set; } = 8191; + /// + /// Name of the tokenizer used to count tokens. + /// Supported values: "p50k", "cl100k", "o200k". Leave it empty for autodetect. + /// + public string EmbeddingModelTokenizer { get; set; } = string.Empty; + /// /// The number of dimensions output embeddings should have. /// Only supported in "text-embedding-3" and later models developed with diff --git a/extensions/OpenAI/OpenAI/OpenAITextEmbeddingGenerator.cs b/extensions/OpenAI/OpenAI/OpenAITextEmbeddingGenerator.cs index d9582d70c..5823ace2f 100644 --- a/extensions/OpenAI/OpenAI/OpenAITextEmbeddingGenerator.cs +++ b/extensions/OpenAI/OpenAI/OpenAITextEmbeddingGenerator.cs @@ -96,12 +96,18 @@ public OpenAITextEmbeddingGenerator( this.MaxTokens = config.EmbeddingModelMaxTokenTotal; this.MaxBatchSize = config.MaxEmbeddingBatchSize; + if (textTokenizer == null && !string.IsNullOrEmpty(config.EmbeddingModelTokenizer)) + { + textTokenizer = TokenizerFactory.GetTokenizerForEncoding(config.EmbeddingModelTokenizer); + } + + textTokenizer ??= TokenizerFactory.GetTokenizerForModel(config.EmbeddingModel); if (textTokenizer == null) { this._log.LogWarning( "Tokenizer not specified, will use {0}. The token count might be incorrect, causing unexpected errors", - nameof(GPT4oTokenizer)); - textTokenizer = new GPT4oTokenizer(); + nameof(CL100KTokenizer)); + textTokenizer = new CL100KTokenizer(); } this._textTokenizer = textTokenizer; diff --git a/extensions/OpenAI/OpenAI/OpenAITextGenerator.cs b/extensions/OpenAI/OpenAI/OpenAITextGenerator.cs index c0cd6b0b7..7e39986c7 100644 --- a/extensions/OpenAI/OpenAI/OpenAITextGenerator.cs +++ b/extensions/OpenAI/OpenAI/OpenAITextGenerator.cs @@ -89,12 +89,18 @@ public OpenAITextGenerator( this._log = (loggerFactory ?? DefaultLogger.Factory).CreateLogger(); this.MaxTokenTotal = config.TextModelMaxTokenTotal; + if (textTokenizer == null && !string.IsNullOrEmpty(config.TextModelTokenizer)) + { + textTokenizer = TokenizerFactory.GetTokenizerForEncoding(config.TextModelTokenizer); + } + + textTokenizer ??= TokenizerFactory.GetTokenizerForModel(config.TextModel); if (textTokenizer == null) { this._log.LogWarning( "Tokenizer not specified, will use {0}. 
The token count might be incorrect, causing unexpected errors", - nameof(GPT4oTokenizer)); - textTokenizer = new GPT4oTokenizer(); + nameof(O200KTokenizer)); + textTokenizer = new O200KTokenizer(); } this._textTokenizer = textTokenizer; diff --git a/extensions/OpenAI/OpenAI/Tokenizers/GPT4Tokenizer.cs b/extensions/OpenAI/OpenAI/Tokenizers/CL100KTokenizer.cs similarity index 74% rename from extensions/OpenAI/OpenAI/Tokenizers/GPT4Tokenizer.cs rename to extensions/OpenAI/OpenAI/Tokenizers/CL100KTokenizer.cs index 5cef0f5cc..7b27d3cba 100644 --- a/extensions/OpenAI/OpenAI/Tokenizers/GPT4Tokenizer.cs +++ b/extensions/OpenAI/OpenAI/Tokenizers/CL100KTokenizer.cs @@ -8,12 +8,9 @@ // ReSharper disable once CheckNamespace namespace Microsoft.KernelMemory.AI.OpenAI; -/// -/// GPT 3.5 and GPT 4 tokenizer (cl100k_base.tiktoken + special tokens) -/// -public sealed class GPT4Tokenizer : ITextTokenizer +public class CL100KTokenizer : ITextTokenizer { - private static readonly Tokenizer s_tokenizer = TiktokenTokenizer.CreateForModel("gpt-4", + private static readonly Tokenizer s_tokenizer = ML.Tokenizers.TiktokenTokenizer.CreateForEncoding("cl100k_base", new Dictionary { { "<|im_start|>", 100264 }, { "<|im_end|>", 100265 } }); /// diff --git a/extensions/OpenAI/OpenAI/Tokenizers/GPT3Tokenizer.cs b/extensions/OpenAI/OpenAI/Tokenizers/GPT3Tokenizer.cs deleted file mode 100644 index e7d03d721..000000000 --- a/extensions/OpenAI/OpenAI/Tokenizers/GPT3Tokenizer.cs +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright (c) Microsoft. All rights reserved. - -using System.Collections.Generic; -using System.Linq; -using Microsoft.ML.Tokenizers; - -#pragma warning disable IDE0130 // reduce number of "using" statements -// ReSharper disable once CheckNamespace -namespace Microsoft.KernelMemory.AI.OpenAI; - -/// -/// TikToken GPT3 tokenizer (p50k_base.tiktoken) -/// -public sealed class GPT3Tokenizer : ITextTokenizer -{ - private static readonly Tokenizer s_tokenizer = TiktokenTokenizer.CreateForModel("text-davinci-003"); - - /// - public int CountTokens(string text) - { - return s_tokenizer.CountTokens(text); - } - - /// - public IReadOnlyList GetTokens(string text) - { - return s_tokenizer.EncodeToTokens(text, out string? _).Select(t => t.Value).ToList(); - } -} diff --git a/extensions/OpenAI/OpenAI/Tokenizers/GPTTokenizers.cs b/extensions/OpenAI/OpenAI/Tokenizers/GPTTokenizers.cs new file mode 100644 index 000000000..ff8dc8ec5 --- /dev/null +++ b/extensions/OpenAI/OpenAI/Tokenizers/GPTTokenizers.cs @@ -0,0 +1,34 @@ +// Copyright (c) Microsoft. All rights reserved. 
+ +#pragma warning disable IDE0130 // reduce number of "using" statements +// ReSharper disable once CheckNamespace +namespace Microsoft.KernelMemory.AI.OpenAI; + +/// +/// GPT3 tokenizer +/// +public sealed class GPT3Tokenizer : P50KTokenizer +{ +} + +/// +/// gpt-3.5-turbo +/// gpt-3.5-turbo-* +/// gpt-4 +/// text-embedding-ada-002 +/// text-embedding-3-small +/// text-embedding-3-large +/// +public sealed class GPT4Tokenizer : CL100KTokenizer +{ +} + +/// +/// GPT 4o / 4o mini tokenizer +/// gpt-4o +/// gpt-4o-* +/// +// ReSharper disable once InconsistentNaming +public sealed class GPT4oTokenizer : O200KTokenizer +{ +} diff --git a/extensions/OpenAI/OpenAI/Tokenizers/GPT4oTokenizer.cs b/extensions/OpenAI/OpenAI/Tokenizers/O200KTokenizer.cs similarity index 70% rename from extensions/OpenAI/OpenAI/Tokenizers/GPT4oTokenizer.cs rename to extensions/OpenAI/OpenAI/Tokenizers/O200KTokenizer.cs index a0052c803..4613948f7 100644 --- a/extensions/OpenAI/OpenAI/Tokenizers/GPT4oTokenizer.cs +++ b/extensions/OpenAI/OpenAI/Tokenizers/O200KTokenizer.cs @@ -8,13 +8,9 @@ // ReSharper disable once CheckNamespace namespace Microsoft.KernelMemory.AI.OpenAI; -/// -/// GPT 4o / 4o mini tokenizer (cl200k_base.tiktoken + special tokens) -/// -// ReSharper disable once InconsistentNaming -public sealed class GPT4oTokenizer : ITextTokenizer +public class O200KTokenizer : ITextTokenizer { - private static readonly Tokenizer s_tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o", + private static readonly Tokenizer s_tokenizer = ML.Tokenizers.TiktokenTokenizer.CreateForEncoding("o200k_base", new Dictionary { { "<|im_start|>", 100264 }, { "<|im_end|>", 100265 } }); /// diff --git a/extensions/OpenAI/OpenAI/Tokenizers/GPT2Tokenizer.cs b/extensions/OpenAI/OpenAI/Tokenizers/P50KTokenizer.cs similarity index 73% rename from extensions/OpenAI/OpenAI/Tokenizers/GPT2Tokenizer.cs rename to extensions/OpenAI/OpenAI/Tokenizers/P50KTokenizer.cs index 8b3df3559..2c6faf0d8 100644 --- a/extensions/OpenAI/OpenAI/Tokenizers/GPT2Tokenizer.cs +++ b/extensions/OpenAI/OpenAI/Tokenizers/P50KTokenizer.cs @@ -8,12 +8,9 @@ // ReSharper disable once CheckNamespace namespace Microsoft.KernelMemory.AI.OpenAI; -/// -/// TikToken GPT2 tokenizer (gpt2.tiktoken) -/// -public sealed class GPT2Tokenizer : ITextTokenizer +public class P50KTokenizer : ITextTokenizer { - private static readonly Tokenizer s_tokenizer = TiktokenTokenizer.CreateForModel("gpt2"); + private static readonly Tokenizer s_tokenizer = ML.Tokenizers.TiktokenTokenizer.CreateForEncoding("p50k_base"); /// public int CountTokens(string text) diff --git a/extensions/OpenAI/OpenAI/Tokenizers/TiktokenTokenizer.cs b/extensions/OpenAI/OpenAI/Tokenizers/TiktokenTokenizer.cs new file mode 100644 index 000000000..18173225d --- /dev/null +++ b/extensions/OpenAI/OpenAI/Tokenizers/TiktokenTokenizer.cs @@ -0,0 +1,41 @@ +// Copyright (c) Microsoft. All rights reserved. 
+ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML.Tokenizers; + +#pragma warning disable IDE0130 // reduce number of "using" statements +// ReSharper disable once CheckNamespace +namespace Microsoft.KernelMemory.AI.OpenAI; + +public class TiktokenTokenizer : ITextTokenizer +{ + private readonly Tokenizer _tokenizer; + + public TiktokenTokenizer(string modelId) + { + try + { + this._tokenizer = Microsoft.ML.Tokenizers.TiktokenTokenizer.CreateForModel(modelId); + } + catch (NotSupportedException) + { + throw new KernelMemoryException("Autodetect failed"); + } + catch (ArgumentNullException) + { + throw new KernelMemoryException("Autodetect failed"); + } + } + + public int CountTokens(string text) + { + return this._tokenizer.CountTokens(text); + } + + public IReadOnlyList GetTokens(string text) + { + return this._tokenizer.EncodeToTokens(text, out string? _).Select(t => t.Value).ToList(); + } +} diff --git a/extensions/OpenAI/OpenAI/Tokenizers/TokenizerFactory.cs b/extensions/OpenAI/OpenAI/Tokenizers/TokenizerFactory.cs new file mode 100644 index 000000000..6fb619897 --- /dev/null +++ b/extensions/OpenAI/OpenAI/Tokenizers/TokenizerFactory.cs @@ -0,0 +1,73 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System; + +#pragma warning disable IDE0130 // reduce number of "using" statements +// ReSharper disable once CheckNamespace +namespace Microsoft.KernelMemory.AI.OpenAI; + +public static class TokenizerFactory +{ + public static ITextTokenizer? GetTokenizerForEncoding(string encodingId) + { + encodingId = encodingId.ToLowerInvariant(); + + switch (encodingId.ToLowerInvariant()) + { + case "p50k": + return new P50KTokenizer(); + + case "cl100k": + return new CL100KTokenizer(); + + case "o200k": + return new O200KTokenizer(); + } + + return null; + } + + public static ITextTokenizer? GetTokenizerForModel(string modelId) + { + try + { + return new TiktokenTokenizer(modelId); + } + catch (KernelMemoryException) + { + // ignore + } + + modelId = modelId.ToLowerInvariant(); + + if (modelId.StartsWith("text-embedding-", StringComparison.Ordinal) + || modelId.StartsWith("gpt-3.5-", StringComparison.Ordinal) + || modelId.StartsWith("gpt-4-", StringComparison.Ordinal)) + { + return new CL100KTokenizer(); + } + + if (modelId.StartsWith("gpt-4o-", StringComparison.Ordinal)) + { + return new O200KTokenizer(); + } + + switch (modelId.ToLowerInvariant()) + { + case "code-davinci-001": + case "code-davinci-002": + case "text-davinci-002": + case "text-davinci-003": + return new P50KTokenizer(); + + case "gpt-3.5-turbo": + case "gpt-4": + return new CL100KTokenizer(); + + case "gpt-4o": + return new O200KTokenizer(); + } + + return null; + } +} diff --git a/service/Service/appsettings.json b/service/Service/appsettings.json index a9940b9b5..506f52c2d 100644 --- a/service/Service/appsettings.json +++ b/service/Service/appsettings.json @@ -238,6 +238,8 @@ "ApiKey": "", // See https://docs.anthropic.com/claude/docs/models-overview for list of models and details "TextModelName": "claude-3-haiku-20240307", + // Supported values: "p50k", "cl100k", "o200k". Leave it empty if unsure. 
+ "Tokenizer": "cl100k", // How many tokens the model can receive in input and generate in output // See https://docs.anthropic.com/claude/docs/models-overview "MaxTokenIn": 200000, @@ -326,10 +328,17 @@ "Auth": "AzureIdentity", "Endpoint": "https://<...>.openai.azure.com/", "APIKey": "", + // Your Azure Deployment name "Deployment": "", // The max number of tokens supported by model deployed // See https://learn.microsoft.com/azure/ai-services/openai/concepts/models "MaxTokenTotal": 8191, + // Which tokenizer to use to correctly measure the size of chunks. + // Supported values: "p50k", "cl100k", "o200k". Leave it empty if unsure. + // - Use p50k for the old text-davinci-003 models + // - Use cl100k for the old gpt-3.4 and gpt-4 family, and for text embedding models + // - Use o200k for the most recent gpt-4o family + "Tokenizer": "cl100k", // The number of dimensions output embeddings should have. // Only supported in "text-embedding-3" and later models developed with // MRL, see https://arxiv.org/abs/2205.13147 @@ -355,6 +364,12 @@ // The max number of tokens supported by model deployed // See https://learn.microsoft.com/azure/ai-services/openai/concepts/models "MaxTokenTotal": 16384, + // Which tokenizer to use to correctly measure the size of chunks. + // Supported values: "p50k", "cl100k", "o200k". Leave it empty if unsure. + // - Use p50k for the old text-davinci-003 models + // - Use cl100k for the old gpt-3.4 and gpt-4 family, and for text embedding models + // - Use o200k for the most recent gpt-4o family + "Tokenizer": "o200k", // "ChatCompletion" or "TextCompletion" "APIType": "ChatCompletion", // How many times to retry in case of throttling. @@ -427,6 +442,8 @@ "Endpoint": "http://localhost:11434", "TextModel": { "ModelName": "phi3:medium-128k", + // Supported values: "p50k", "cl100k", "o200k". Leave it empty if unsure. + "Tokenizer": "cl100k", "MaxTokenTotal": 131072, // How many requests can be processed in parallel "MaxBatchSize": 1 @@ -482,6 +499,8 @@ }, "EmbeddingModel": { "ModelName": "nomic-embed-text", + // Supported values: "p50k", "cl100k", "o200k". Leave it empty if unsure. + "Tokenizer": "cl100k", "MaxTokenTotal": 2048, // How many requests can be processed in parallel "MaxBatchSize": 1 @@ -541,6 +560,8 @@ "TextModel": "gpt-4o-mini", // The max number of tokens supported by the text model. "TextModelMaxTokenTotal": 16384, + // Supported values: "p50k", "cl100k", "o200k". Leave it empty for autodetect. + "TextModelTokenizer": "", // What type of text generation, by default autodetect using the model name. // Possible values: "Auto", "TextCompletion", "Chat" "TextGenerationType": "Auto", @@ -549,6 +570,8 @@ // The max number of tokens supported by the embedding model // See https://platform.openai.com/docs/guides/embeddings/what-are-embeddings "EmbeddingModelMaxTokenTotal": 8191, + // Supported values: "p50k", "cl100k", "o200k". Leave it empty for autodetect. 
+ "EmbeddingModelTokenizer": "", // OpenAI API Key "APIKey": "", // OpenAI Organization ID (usually empty, unless you have multiple accounts on different orgs) diff --git a/tools/InteractiveSetup/Services/AzureOpenAIEmbedding.cs b/tools/InteractiveSetup/Services/AzureOpenAIEmbedding.cs index 9635e371d..0d07c531f 100644 --- a/tools/InteractiveSetup/Services/AzureOpenAIEmbedding.cs +++ b/tools/InteractiveSetup/Services/AzureOpenAIEmbedding.cs @@ -21,6 +21,7 @@ public static void Setup(Context ctx, bool force = false) { "APIType", "EmbeddingGeneration" }, { "Endpoint", "" }, { "Deployment", "" }, + { "Tokenizer", "cl100k" }, { "Auth", "ApiKey" }, { "APIKey", "" }, }; @@ -47,5 +48,6 @@ public static void Setup(Context ctx, bool force = false) AppSettings.Change(x => x.Services[ServiceName]["APIType"] = "EmbeddingGeneration"); AppSettings.Change(x => x.Services[ServiceName]["Endpoint"] = SetupUI.AskOpenQuestion("Azure OpenAI ", config["Endpoint"].ToString())); AppSettings.Change(x => x.Services[ServiceName]["Deployment"] = SetupUI.AskOpenQuestion("Azure OpenAI ", config["Deployment"].ToString())); + AppSettings.Change(x => x.Services[ServiceName]["Tokenizer"] = SetupUI.AskOpenQuestion("Tokenizer (p50k/cl100k/o200k)", config["Tokenizer"].ToString())); } } diff --git a/tools/InteractiveSetup/Services/AzureOpenAIText.cs b/tools/InteractiveSetup/Services/AzureOpenAIText.cs index 00b9dec09..cdc3e035a 100644 --- a/tools/InteractiveSetup/Services/AzureOpenAIText.cs +++ b/tools/InteractiveSetup/Services/AzureOpenAIText.cs @@ -21,6 +21,7 @@ public static void Setup(Context ctx, bool force = false) { "APIType", "ChatCompletion" }, { "Endpoint", "" }, { "Deployment", "" }, + { "Tokenizer", "o200k" }, { "Auth", "ApiKey" }, { "APIKey", "" }, }; @@ -47,5 +48,6 @@ public static void Setup(Context ctx, bool force = false) AppSettings.Change(x => x.Services[ServiceName]["APIType"] = "ChatCompletion"); AppSettings.Change(x => x.Services[ServiceName]["Endpoint"] = SetupUI.AskOpenQuestion("Azure OpenAI ", config["Endpoint"].ToString())); AppSettings.Change(x => x.Services[ServiceName]["Deployment"] = SetupUI.AskOpenQuestion("Azure OpenAI ", config["Deployment"].ToString())); + AppSettings.Change(x => x.Services[ServiceName]["Tokenizer"] = SetupUI.AskOpenQuestion("Tokenizer (p50k/cl100k/o200k)", config["Tokenizer"].ToString())); } } From e4f5ddda1c7cf91bd46e2f89e39cd7a82eafb884 Mon Sep 17 00:00:00 2001 From: Devis Lucato Date: Wed, 27 Nov 2024 00:34:12 -0800 Subject: [PATCH 2/4] Reorganize tokenizers code and dependency tree --- Directory.Build.props | 2 +- KernelMemory.sln | 9 +++++- examples/210-KM-without-builder/Program.cs | 1 - examples/212-dotnet-ollama/Program.cs | 2 +- extensions/Anthropic/Anthropic.csproj | 2 +- .../Anthropic/AnthropicTextGeneration.cs | 5 ++- .../AzureOpenAI/AzureOpenAI.csproj | 2 +- .../AzureOpenAITextEmbeddingGenerator.cs | 5 ++- .../AzureOpenAI/AzureOpenAITextGenerator.cs | 5 ++- .../KM/KernelMemory/KernelMemory.csproj | 9 +++--- .../LlamaSharpTextGeneratorTest.cs | 2 +- extensions/ONNX/Onnx/Onnx.csproj | 17 +++++----- extensions/ONNX/Onnx/OnnxTextGenerator.cs | 5 ++- extensions/Ollama/Ollama/Ollama.csproj | 18 +++++------ .../Ollama/OllamaTextEmbeddingGenerator.cs | 5 ++- .../Ollama/Ollama/OllamaTextGenerator.cs | 5 ++- extensions/OpenAI/OpenAI/OpenAI.csproj | 1 + .../OpenAI/OpenAITextEmbeddingGenerator.cs | 4 +-- .../OpenAI/OpenAI/OpenAITextGenerator.cs | 4 +-- extensions/Tiktoken/README.md | 6 ++++ .../Tiktoken.UnitTests}/Startup.cs | 2 +- .../Tiktoken.UnitTests.csproj} | 4 +-- 
.../Tiktoken.UnitTests/TokenizersTests.cs} | 6 ++-- .../Tiktoken}/CL100KTokenizer.cs | 4 +-- .../Tiktoken}/GPTTokenizers.cs | 4 +-- .../Tiktoken}/O200KTokenizer.cs | 4 +-- .../Tiktoken}/P50KTokenizer.cs | 4 +-- extensions/Tiktoken/Tiktoken/Tiktoken.csproj | 32 +++++++++++++++++++ .../Tiktoken}/TiktokenTokenizer.cs | 4 +-- .../Tiktoken}/TokenizerFactory.cs | 4 +-- service/Core/AI/DefaultGPTTokenizer.cs | 28 ---------------- service/Core/Core.csproj | 1 + service/Core/DataFormats/Text/TextChunker.cs | 9 ++++-- .../Core/Handlers/TextPartitioningHandler.cs | 2 +- .../SemanticKernelTextEmbeddingGenerator.cs | 4 +-- .../SemanticKernelTextGenerator.cs | 4 +-- 36 files changed, 115 insertions(+), 110 deletions(-) create mode 100644 extensions/Tiktoken/README.md rename extensions/{OpenAI/OpenAI.UnitTests => Tiktoken/Tiktoken.UnitTests}/Startup.cs (89%) rename extensions/{OpenAI/OpenAI.UnitTests/OpenAI.UnitTests.csproj => Tiktoken/Tiktoken.UnitTests/Tiktoken.UnitTests.csproj} (91%) rename extensions/{OpenAI/OpenAI.UnitTests/GPTTokenizersTests.cs => Tiktoken/Tiktoken.UnitTests/TokenizersTests.cs} (87%) rename extensions/{OpenAI/OpenAI/Tokenizers => Tiktoken/Tiktoken}/CL100KTokenizer.cs (81%) rename extensions/{OpenAI/OpenAI/Tokenizers => Tiktoken/Tiktoken}/GPTTokenizers.cs (78%) rename extensions/{OpenAI/OpenAI/Tokenizers => Tiktoken/Tiktoken}/O200KTokenizer.cs (81%) rename extensions/{OpenAI/OpenAI/Tokenizers => Tiktoken/Tiktoken}/P50KTokenizer.cs (79%) create mode 100644 extensions/Tiktoken/Tiktoken/Tiktoken.csproj rename extensions/{OpenAI/OpenAI/Tokenizers => Tiktoken/Tiktoken}/TiktokenTokenizer.cs (85%) rename extensions/{OpenAI/OpenAI/Tokenizers => Tiktoken/Tiktoken}/TokenizerFactory.cs (91%) delete mode 100644 service/Core/AI/DefaultGPTTokenizer.cs diff --git a/Directory.Build.props b/Directory.Build.props index 5cde406ee..3737938e1 100644 --- a/Directory.Build.props +++ b/Directory.Build.props @@ -2,7 +2,7 @@ - 0.93.0 + 0.94.0 12 diff --git a/KernelMemory.sln b/KernelMemory.sln index 6bbcc4566..8c2a0c66a 100644 --- a/KernelMemory.sln +++ b/KernelMemory.sln @@ -265,7 +265,7 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Service.AspNetCore", "servi EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "111-dotnet-azure-ai-hybrid-search", "examples\111-dotnet-azure-ai-hybrid-search\111-dotnet-azure-ai-hybrid-search.csproj", "{28534545-CB39-446A-9EB9-A5ABBFE0CFD3}" EndProject -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "OpenAI.UnitTests", "extensions\OpenAI\OpenAI.UnitTests\OpenAI.UnitTests.csproj", "{8ADA17CD-B779-4817-B10A-E9D7B019088D}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Tiktoken.UnitTests", "extensions\Tiktoken\Tiktoken.UnitTests\Tiktoken.UnitTests.csproj", "{8ADA17CD-B779-4817-B10A-E9D7B019088D}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SQLServer", "extensions\SQLServer\SQLServer\SQLServer.csproj", "{B9BE1099-F78F-4A5F-A897-BF2C75E19C57}" EndProject @@ -335,6 +335,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "AzureOpenAI.FunctionalTests EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "007-dotnet-serverless-azure", "examples\007-dotnet-serverless-azure\007-dotnet-serverless-azure.csproj", "{AF1E12A9-D8A1-4815-995E-C6F7B2022016}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Tiktoken", "extensions\Tiktoken\Tiktoken\Tiktoken.csproj", "{830C91B5-6F8D-4DAD-B1BD-3C2F9DEEC8F6}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU 
@@ -623,6 +625,10 @@ Global {AF1E12A9-D8A1-4815-995E-C6F7B2022016}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {AF1E12A9-D8A1-4815-995E-C6F7B2022016}.Debug|Any CPU.Build.0 = Debug|Any CPU {AF1E12A9-D8A1-4815-995E-C6F7B2022016}.Release|Any CPU.ActiveCfg = Release|Any CPU + {830C91B5-6F8D-4DAD-B1BD-3C2F9DEEC8F6}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {830C91B5-6F8D-4DAD-B1BD-3C2F9DEEC8F6}.Debug|Any CPU.Build.0 = Debug|Any CPU + {830C91B5-6F8D-4DAD-B1BD-3C2F9DEEC8F6}.Release|Any CPU.ActiveCfg = Release|Any CPU + {830C91B5-6F8D-4DAD-B1BD-3C2F9DEEC8F6}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -723,6 +729,7 @@ Global {AB097B62-5A0B-4D74-9F8B-A41FE8241447} = {155DA079-E267-49AF-973A-D1D44681970F} {8E907766-4A7D-46E2-B5E3-EB2994B1AA54} = {3C17F42B-CFC8-4900-8CFB-88936311E919} {AF1E12A9-D8A1-4815-995E-C6F7B2022016} = {0A43C65C-6007-4BB4-B3FE-8D439FC91841} + {830C91B5-6F8D-4DAD-B1BD-3C2F9DEEC8F6} = {155DA079-E267-49AF-973A-D1D44681970F} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {CC136C62-115C-41D1-B414-F9473EFF6EA8} diff --git a/examples/210-KM-without-builder/Program.cs b/examples/210-KM-without-builder/Program.cs index 3324b439c..db0c3a829 100644 --- a/examples/210-KM-without-builder/Program.cs +++ b/examples/210-KM-without-builder/Program.cs @@ -4,7 +4,6 @@ using Microsoft.KernelMemory; using Microsoft.KernelMemory.AI; using Microsoft.KernelMemory.AI.AzureOpenAI; -using Microsoft.KernelMemory.AI.OpenAI; using Microsoft.KernelMemory.Configuration; using Microsoft.KernelMemory.Context; using Microsoft.KernelMemory.DataFormats; diff --git a/examples/212-dotnet-ollama/Program.cs b/examples/212-dotnet-ollama/Program.cs index 4d149fcca..bb6ae8fda 100644 --- a/examples/212-dotnet-ollama/Program.cs +++ b/examples/212-dotnet-ollama/Program.cs @@ -1,8 +1,8 @@ // Copyright (c) Microsoft. All rights reserved. using Microsoft.KernelMemory; +using Microsoft.KernelMemory.AI; using Microsoft.KernelMemory.AI.Ollama; -using Microsoft.KernelMemory.AI.OpenAI; using Microsoft.KernelMemory.Context; using Microsoft.KernelMemory.Diagnostics; diff --git a/extensions/Anthropic/Anthropic.csproj b/extensions/Anthropic/Anthropic.csproj index 7e34304d0..c8d6ae45c 100644 --- a/extensions/Anthropic/Anthropic.csproj +++ b/extensions/Anthropic/Anthropic.csproj @@ -10,7 +10,7 @@ - + diff --git a/extensions/Anthropic/AnthropicTextGeneration.cs b/extensions/Anthropic/AnthropicTextGeneration.cs index 8121b34da..9257853bd 100644 --- a/extensions/Anthropic/AnthropicTextGeneration.cs +++ b/extensions/Anthropic/AnthropicTextGeneration.cs @@ -8,7 +8,6 @@ using System.Threading.Tasks; using Microsoft.Extensions.Logging; using Microsoft.KernelMemory.AI.Anthropic.Client; -using Microsoft.KernelMemory.AI.OpenAI; using Microsoft.KernelMemory.Context; using Microsoft.KernelMemory.Diagnostics; @@ -73,10 +72,10 @@ public AnthropicTextGeneration( textTokenizer ??= TokenizerFactory.GetTokenizerForEncoding(config.Tokenizer); if (textTokenizer == null) { + textTokenizer = new CL100KTokenizer(); this._log.LogWarning( "Tokenizer not specified, will use {0}. 
The token count might be incorrect, causing unexpected errors", - nameof(CL100KTokenizer)); - textTokenizer = new CL100KTokenizer(); + textTokenizer.GetType().FullName); } this._textTokenizer = textTokenizer; diff --git a/extensions/AzureOpenAI/AzureOpenAI/AzureOpenAI.csproj b/extensions/AzureOpenAI/AzureOpenAI/AzureOpenAI.csproj index a2c44bc35..9bf95c22c 100644 --- a/extensions/AzureOpenAI/AzureOpenAI/AzureOpenAI.csproj +++ b/extensions/AzureOpenAI/AzureOpenAI/AzureOpenAI.csproj @@ -10,7 +10,7 @@ - + diff --git a/extensions/AzureOpenAI/AzureOpenAI/AzureOpenAITextEmbeddingGenerator.cs b/extensions/AzureOpenAI/AzureOpenAI/AzureOpenAITextEmbeddingGenerator.cs index b4777b8cc..0d7411f9b 100644 --- a/extensions/AzureOpenAI/AzureOpenAI/AzureOpenAITextEmbeddingGenerator.cs +++ b/extensions/AzureOpenAI/AzureOpenAI/AzureOpenAITextEmbeddingGenerator.cs @@ -10,7 +10,6 @@ using Azure.AI.OpenAI; using Microsoft.Extensions.Logging; using Microsoft.KernelMemory.AI.AzureOpenAI.Internals; -using Microsoft.KernelMemory.AI.OpenAI; using Microsoft.KernelMemory.Diagnostics; using Microsoft.SemanticKernel; using Microsoft.SemanticKernel.AI.Embeddings; @@ -98,10 +97,10 @@ public AzureOpenAITextEmbeddingGenerator( textTokenizer ??= TokenizerFactory.GetTokenizerForEncoding(config.Tokenizer); if (textTokenizer == null) { + textTokenizer = new CL100KTokenizer(); this._log.LogWarning( "Tokenizer not specified, will use {0}. The token count might be incorrect, causing unexpected errors", - nameof(CL100KTokenizer)); - textTokenizer = new CL100KTokenizer(); + textTokenizer.GetType().FullName); } this._textTokenizer = textTokenizer; diff --git a/extensions/AzureOpenAI/AzureOpenAI/AzureOpenAITextGenerator.cs b/extensions/AzureOpenAI/AzureOpenAI/AzureOpenAITextGenerator.cs index 2606c3cb4..94375aa2b 100644 --- a/extensions/AzureOpenAI/AzureOpenAI/AzureOpenAITextGenerator.cs +++ b/extensions/AzureOpenAI/AzureOpenAI/AzureOpenAITextGenerator.cs @@ -9,7 +9,6 @@ using Azure.AI.OpenAI; using Microsoft.Extensions.Logging; using Microsoft.KernelMemory.AI.AzureOpenAI.Internals; -using Microsoft.KernelMemory.AI.OpenAI; using Microsoft.KernelMemory.Diagnostics; using Microsoft.SemanticKernel; using Microsoft.SemanticKernel.Connectors.AzureOpenAI; @@ -93,10 +92,10 @@ public AzureOpenAITextGenerator( textTokenizer ??= TokenizerFactory.GetTokenizerForEncoding(config.Tokenizer); if (textTokenizer == null) { + textTokenizer = new O200KTokenizer(); this._log.LogWarning( "Tokenizer not specified, will use {0}. 
The token count might be incorrect, causing unexpected errors", - nameof(O200KTokenizer)); - textTokenizer = new O200KTokenizer(); + textTokenizer.GetType().FullName); } this._textTokenizer = textTokenizer; diff --git a/extensions/KM/KernelMemory/KernelMemory.csproj b/extensions/KM/KernelMemory/KernelMemory.csproj index ea33c0c6b..26a5c99df 100644 --- a/extensions/KM/KernelMemory/KernelMemory.csproj +++ b/extensions/KM/KernelMemory/KernelMemory.csproj @@ -9,12 +9,12 @@ - - - + + + @@ -24,8 +24,9 @@ - + + diff --git a/extensions/LlamaSharp/LlamaSharp.FunctionalTests/LlamaSharpTextGeneratorTest.cs b/extensions/LlamaSharp/LlamaSharp.FunctionalTests/LlamaSharpTextGeneratorTest.cs index 3bca592e4..285ff1425 100644 --- a/extensions/LlamaSharp/LlamaSharp.FunctionalTests/LlamaSharpTextGeneratorTest.cs +++ b/extensions/LlamaSharp/LlamaSharp.FunctionalTests/LlamaSharpTextGeneratorTest.cs @@ -40,7 +40,7 @@ public void ItCountsTokens() // Assert Console.WriteLine("Phi3 token count: " + tokenCount); - Console.WriteLine("GPT4 token count: " + DefaultGPTTokenizer.StaticCountTokens(text)); + Console.WriteLine("GPT4 token count: " + (new CL100KTokenizer()).CountTokens(text)); Console.WriteLine($"Time: {this._timer.ElapsedMilliseconds / 1000} secs"); // Expected result with Phi-3-mini-4k-instruct-q4.gguf, without BoS (https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf) diff --git a/extensions/ONNX/Onnx/Onnx.csproj b/extensions/ONNX/Onnx/Onnx.csproj index 96ec73dff..a24bba3bb 100644 --- a/extensions/ONNX/Onnx/Onnx.csproj +++ b/extensions/ONNX/Onnx/Onnx.csproj @@ -8,6 +8,15 @@ $(NoWarn);KMEXP00;KMEXP01;CA1724; + + + + + + + + + true Microsoft.KernelMemory.AI.Onnx @@ -21,12 +30,4 @@ - - - - - - - - diff --git a/extensions/ONNX/Onnx/OnnxTextGenerator.cs b/extensions/ONNX/Onnx/OnnxTextGenerator.cs index fc6705c18..67c75fdf2 100644 --- a/extensions/ONNX/Onnx/OnnxTextGenerator.cs +++ b/extensions/ONNX/Onnx/OnnxTextGenerator.cs @@ -9,7 +9,6 @@ using System.Threading; using System.Threading.Tasks; using Microsoft.Extensions.Logging; -using Microsoft.KernelMemory.AI.OpenAI; using Microsoft.KernelMemory.Diagnostics; using Microsoft.ML.OnnxRuntimeGenAI; using static Microsoft.KernelMemory.OnnxConfig; @@ -62,10 +61,10 @@ public OnnxTextGenerator( this._log = (loggerFactory ?? DefaultLogger.Factory).CreateLogger(); if (textTokenizer == null) { + textTokenizer = new O200KTokenizer(); this._log.LogWarning( "Tokenizer not specified, will use {0}. 
The token count might be incorrect, causing unexpected errors", - nameof(O200KTokenizer)); - textTokenizer = new O200KTokenizer(); + textTokenizer.GetType().FullName); } config.Validate(); diff --git a/extensions/Ollama/Ollama/Ollama.csproj b/extensions/Ollama/Ollama/Ollama.csproj index e26c65d47..6a943c693 100644 --- a/extensions/Ollama/Ollama/Ollama.csproj +++ b/extensions/Ollama/Ollama/Ollama.csproj @@ -8,6 +8,15 @@ $(NoWarn);KMEXP00;KMEXP01;CA1724; + + + + + + + + + true Microsoft.KernelMemory.AI.Ollama @@ -21,13 +30,4 @@ - - - - - - - - - diff --git a/extensions/Ollama/Ollama/OllamaTextEmbeddingGenerator.cs b/extensions/Ollama/Ollama/OllamaTextEmbeddingGenerator.cs index 77afc0ac8..a2d093c98 100644 --- a/extensions/Ollama/Ollama/OllamaTextEmbeddingGenerator.cs +++ b/extensions/Ollama/Ollama/OllamaTextEmbeddingGenerator.cs @@ -7,7 +7,6 @@ using System.Threading; using System.Threading.Tasks; using Microsoft.Extensions.Logging; -using Microsoft.KernelMemory.AI.OpenAI; using Microsoft.KernelMemory.Context; using Microsoft.KernelMemory.Diagnostics; using OllamaSharp; @@ -44,10 +43,10 @@ public OllamaTextEmbeddingGenerator( textTokenizer ??= TokenizerFactory.GetTokenizerForEncoding(modelConfig.Tokenizer); if (textTokenizer == null) { + textTokenizer = new CL100KTokenizer(); this._log.LogWarning( "Tokenizer not specified, will use {0}. The token count might be incorrect, causing unexpected errors", - nameof(CL100KTokenizer)); - textTokenizer = new CL100KTokenizer(); + textTokenizer.GetType().FullName); } this._textTokenizer = textTokenizer; diff --git a/extensions/Ollama/Ollama/OllamaTextGenerator.cs b/extensions/Ollama/Ollama/OllamaTextGenerator.cs index 694123ed0..34900713c 100644 --- a/extensions/Ollama/Ollama/OllamaTextGenerator.cs +++ b/extensions/Ollama/Ollama/OllamaTextGenerator.cs @@ -7,7 +7,6 @@ using System.Threading; using System.Threading.Tasks; using Microsoft.Extensions.Logging; -using Microsoft.KernelMemory.AI.OpenAI; using Microsoft.KernelMemory.Context; using Microsoft.KernelMemory.Diagnostics; using OllamaSharp; @@ -41,10 +40,10 @@ public OllamaTextGenerator( textTokenizer ??= TokenizerFactory.GetTokenizerForEncoding(modelConfig.Tokenizer); if (textTokenizer == null) { + textTokenizer = new O200KTokenizer(); this._log.LogWarning( "Tokenizer not specified, will use {0}. The token count might be incorrect, causing unexpected errors", - nameof(O200KTokenizer)); - textTokenizer = new O200KTokenizer(); + textTokenizer.GetType().FullName); } this._textTokenizer = textTokenizer; diff --git a/extensions/OpenAI/OpenAI/OpenAI.csproj b/extensions/OpenAI/OpenAI/OpenAI.csproj index f1879abed..145d086fd 100644 --- a/extensions/OpenAI/OpenAI/OpenAI.csproj +++ b/extensions/OpenAI/OpenAI/OpenAI.csproj @@ -10,6 +10,7 @@ + diff --git a/extensions/OpenAI/OpenAI/OpenAITextEmbeddingGenerator.cs b/extensions/OpenAI/OpenAI/OpenAITextEmbeddingGenerator.cs index 5823ace2f..58b049796 100644 --- a/extensions/OpenAI/OpenAI/OpenAITextEmbeddingGenerator.cs +++ b/extensions/OpenAI/OpenAI/OpenAITextEmbeddingGenerator.cs @@ -104,10 +104,10 @@ public OpenAITextEmbeddingGenerator( textTokenizer ??= TokenizerFactory.GetTokenizerForModel(config.EmbeddingModel); if (textTokenizer == null) { + textTokenizer = new CL100KTokenizer(); this._log.LogWarning( "Tokenizer not specified, will use {0}. 
The token count might be incorrect, causing unexpected errors", - nameof(CL100KTokenizer)); - textTokenizer = new CL100KTokenizer(); + textTokenizer.GetType().FullName); } this._textTokenizer = textTokenizer; diff --git a/extensions/OpenAI/OpenAI/OpenAITextGenerator.cs b/extensions/OpenAI/OpenAI/OpenAITextGenerator.cs index 7e39986c7..dbc9cb857 100644 --- a/extensions/OpenAI/OpenAI/OpenAITextGenerator.cs +++ b/extensions/OpenAI/OpenAI/OpenAITextGenerator.cs @@ -97,10 +97,10 @@ public OpenAITextGenerator( textTokenizer ??= TokenizerFactory.GetTokenizerForModel(config.TextModel); if (textTokenizer == null) { + textTokenizer = new O200KTokenizer(); this._log.LogWarning( "Tokenizer not specified, will use {0}. The token count might be incorrect, causing unexpected errors", - nameof(O200KTokenizer)); - textTokenizer = new O200KTokenizer(); + textTokenizer.GetType().FullName); } this._textTokenizer = textTokenizer; diff --git a/extensions/Tiktoken/README.md b/extensions/Tiktoken/README.md new file mode 100644 index 000000000..adb6e0306 --- /dev/null +++ b/extensions/Tiktoken/README.md @@ -0,0 +1,6 @@ +# Kernel Memory with Tiktoken tokenizers + +[![Nuget package](https://img.shields.io/nuget/v/Microsoft.KernelMemory.AI.Tiktoken)](https://www.nuget.org/packages/Microsoft.KernelMemory.AI.Tiktoken/) +[![Discord](https://img.shields.io/discord/1063152441819942922?label=Discord&logo=discord&logoColor=white&color=d82679)](https://aka.ms/KMdiscord) + +This project contains the Tiktoken tokenizers for Kernel Memory. diff --git a/extensions/OpenAI/OpenAI.UnitTests/Startup.cs b/extensions/Tiktoken/Tiktoken.UnitTests/Startup.cs similarity index 89% rename from extensions/OpenAI/OpenAI.UnitTests/Startup.cs rename to extensions/Tiktoken/Tiktoken.UnitTests/Startup.cs index b489661b8..c840e11ef 100644 --- a/extensions/OpenAI/OpenAI.UnitTests/Startup.cs +++ b/extensions/Tiktoken/Tiktoken.UnitTests/Startup.cs @@ -5,7 +5,7 @@ using Microsoft.Extensions.Hosting; -namespace Microsoft.OpenAI.UnitTests; +namespace Microsoft.Tiktoken.UnitTests; public class Startup { diff --git a/extensions/OpenAI/OpenAI.UnitTests/OpenAI.UnitTests.csproj b/extensions/Tiktoken/Tiktoken.UnitTests/Tiktoken.UnitTests.csproj similarity index 91% rename from extensions/OpenAI/OpenAI.UnitTests/OpenAI.UnitTests.csproj rename to extensions/Tiktoken/Tiktoken.UnitTests/Tiktoken.UnitTests.csproj index bfd523257..4b8557bf2 100644 --- a/extensions/OpenAI/OpenAI.UnitTests/OpenAI.UnitTests.csproj +++ b/extensions/Tiktoken/Tiktoken.UnitTests/Tiktoken.UnitTests.csproj @@ -1,8 +1,8 @@ - Microsoft.OpenAI.UnitTests - Microsoft.OpenAI.UnitTests + Microsoft.Tiktoken.UnitTests + Microsoft.Tiktoken.UnitTests net8.0 LatestMajor true diff --git a/extensions/OpenAI/OpenAI.UnitTests/GPTTokenizersTests.cs b/extensions/Tiktoken/Tiktoken.UnitTests/TokenizersTests.cs similarity index 87% rename from extensions/OpenAI/OpenAI.UnitTests/GPTTokenizersTests.cs rename to extensions/Tiktoken/Tiktoken.UnitTests/TokenizersTests.cs index 025fcbbea..270e4ba63 100644 --- a/extensions/OpenAI/OpenAI.UnitTests/GPTTokenizersTests.cs +++ b/extensions/Tiktoken/Tiktoken.UnitTests/TokenizersTests.cs @@ -1,13 +1,13 @@ // Copyright (c) Microsoft. All rights reserved. 
-using Microsoft.KernelMemory.AI.OpenAI; +using Microsoft.KernelMemory.AI; using Microsoft.KM.TestHelpers; using Xunit; using Xunit.Abstractions; -namespace Microsoft.OpenAI.UnitTests; +namespace Microsoft.Tiktoken.UnitTests; -public class GPTTokenizersTests(ITestOutputHelper output) : BaseUnitTestCase(output) +public class TokenizersTests(ITestOutputHelper output) : BaseUnitTestCase(output) { [Fact] [Trait("Category", "UnitTest")] diff --git a/extensions/OpenAI/OpenAI/Tokenizers/CL100KTokenizer.cs b/extensions/Tiktoken/Tiktoken/CL100KTokenizer.cs similarity index 81% rename from extensions/OpenAI/OpenAI/Tokenizers/CL100KTokenizer.cs rename to extensions/Tiktoken/Tiktoken/CL100KTokenizer.cs index 7b27d3cba..30c401864 100644 --- a/extensions/OpenAI/OpenAI/Tokenizers/CL100KTokenizer.cs +++ b/extensions/Tiktoken/Tiktoken/CL100KTokenizer.cs @@ -4,9 +4,7 @@ using System.Linq; using Microsoft.ML.Tokenizers; -#pragma warning disable IDE0130 // reduce number of "using" statements -// ReSharper disable once CheckNamespace -namespace Microsoft.KernelMemory.AI.OpenAI; +namespace Microsoft.KernelMemory.AI; public class CL100KTokenizer : ITextTokenizer { diff --git a/extensions/OpenAI/OpenAI/Tokenizers/GPTTokenizers.cs b/extensions/Tiktoken/Tiktoken/GPTTokenizers.cs similarity index 78% rename from extensions/OpenAI/OpenAI/Tokenizers/GPTTokenizers.cs rename to extensions/Tiktoken/Tiktoken/GPTTokenizers.cs index ff8dc8ec5..109cca358 100644 --- a/extensions/OpenAI/OpenAI/Tokenizers/GPTTokenizers.cs +++ b/extensions/Tiktoken/Tiktoken/GPTTokenizers.cs @@ -1,8 +1,6 @@ // Copyright (c) Microsoft. All rights reserved. -#pragma warning disable IDE0130 // reduce number of "using" statements -// ReSharper disable once CheckNamespace -namespace Microsoft.KernelMemory.AI.OpenAI; +namespace Microsoft.KernelMemory.AI; /// /// GPT3 tokenizer diff --git a/extensions/OpenAI/OpenAI/Tokenizers/O200KTokenizer.cs b/extensions/Tiktoken/Tiktoken/O200KTokenizer.cs similarity index 81% rename from extensions/OpenAI/OpenAI/Tokenizers/O200KTokenizer.cs rename to extensions/Tiktoken/Tiktoken/O200KTokenizer.cs index 4613948f7..3a33979c5 100644 --- a/extensions/OpenAI/OpenAI/Tokenizers/O200KTokenizer.cs +++ b/extensions/Tiktoken/Tiktoken/O200KTokenizer.cs @@ -4,9 +4,7 @@ using System.Linq; using Microsoft.ML.Tokenizers; -#pragma warning disable IDE0130 // reduce number of "using" statements -// ReSharper disable once CheckNamespace -namespace Microsoft.KernelMemory.AI.OpenAI; +namespace Microsoft.KernelMemory.AI; public class O200KTokenizer : ITextTokenizer { diff --git a/extensions/OpenAI/OpenAI/Tokenizers/P50KTokenizer.cs b/extensions/Tiktoken/Tiktoken/P50KTokenizer.cs similarity index 79% rename from extensions/OpenAI/OpenAI/Tokenizers/P50KTokenizer.cs rename to extensions/Tiktoken/Tiktoken/P50KTokenizer.cs index 2c6faf0d8..6a059f42f 100644 --- a/extensions/OpenAI/OpenAI/Tokenizers/P50KTokenizer.cs +++ b/extensions/Tiktoken/Tiktoken/P50KTokenizer.cs @@ -4,9 +4,7 @@ using System.Linq; using Microsoft.ML.Tokenizers; -#pragma warning disable IDE0130 // reduce number of "using" statements -// ReSharper disable once CheckNamespace -namespace Microsoft.KernelMemory.AI.OpenAI; +namespace Microsoft.KernelMemory.AI; public class P50KTokenizer : ITextTokenizer { diff --git a/extensions/Tiktoken/Tiktoken/Tiktoken.csproj b/extensions/Tiktoken/Tiktoken/Tiktoken.csproj new file mode 100644 index 000000000..839e109bf --- /dev/null +++ b/extensions/Tiktoken/Tiktoken/Tiktoken.csproj @@ -0,0 +1,32 @@ + + + + net8.0 + LatestMajor + 
Microsoft.KernelMemory.AI.Tiktoken + Microsoft.KernelMemory.AI + $(NoWarn);KMEXP00;CA1308;NU5104; + + + + + + + + + + + + true + Microsoft.KernelMemory.AI.Tiktoken + Tiktoken tokenizers for Kernel Memory + Provide tokenizers to allow counting content tokens for text and embeddings + Tiktoken, Tokenizer, RAG, Kernel Memory, AI, Artificial Intelligence, Embeddings, Vector DB, Vector Search, Memory DB + bin/$(Configuration)/$(TargetFramework)/$(AssemblyName).xml + + + + + + + diff --git a/extensions/OpenAI/OpenAI/Tokenizers/TiktokenTokenizer.cs b/extensions/Tiktoken/Tiktoken/TiktokenTokenizer.cs similarity index 85% rename from extensions/OpenAI/OpenAI/Tokenizers/TiktokenTokenizer.cs rename to extensions/Tiktoken/Tiktoken/TiktokenTokenizer.cs index 18173225d..9d441893f 100644 --- a/extensions/OpenAI/OpenAI/Tokenizers/TiktokenTokenizer.cs +++ b/extensions/Tiktoken/Tiktoken/TiktokenTokenizer.cs @@ -5,9 +5,7 @@ using System.Linq; using Microsoft.ML.Tokenizers; -#pragma warning disable IDE0130 // reduce number of "using" statements -// ReSharper disable once CheckNamespace -namespace Microsoft.KernelMemory.AI.OpenAI; +namespace Microsoft.KernelMemory.AI; public class TiktokenTokenizer : ITextTokenizer { diff --git a/extensions/OpenAI/OpenAI/Tokenizers/TokenizerFactory.cs b/extensions/Tiktoken/Tiktoken/TokenizerFactory.cs similarity index 91% rename from extensions/OpenAI/OpenAI/Tokenizers/TokenizerFactory.cs rename to extensions/Tiktoken/Tiktoken/TokenizerFactory.cs index 6fb619897..2db88d151 100644 --- a/extensions/OpenAI/OpenAI/Tokenizers/TokenizerFactory.cs +++ b/extensions/Tiktoken/Tiktoken/TokenizerFactory.cs @@ -2,9 +2,7 @@ using System; -#pragma warning disable IDE0130 // reduce number of "using" statements -// ReSharper disable once CheckNamespace -namespace Microsoft.KernelMemory.AI.OpenAI; +namespace Microsoft.KernelMemory.AI; public static class TokenizerFactory { diff --git a/service/Core/AI/DefaultGPTTokenizer.cs b/service/Core/AI/DefaultGPTTokenizer.cs deleted file mode 100644 index 9a3adb619..000000000 --- a/service/Core/AI/DefaultGPTTokenizer.cs +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (c) Microsoft. All rights reserved. - -using System.Collections.Generic; -using System.Linq; -using Microsoft.ML.Tokenizers; - -namespace Microsoft.KernelMemory.AI; - -public class DefaultGPTTokenizer : ITextTokenizer -{ - private static readonly Tokenizer s_tokenizer = TiktokenTokenizer.CreateForModel( - "gpt-4", new Dictionary { { "<|im_start|>", 100264 }, { "<|im_end|>", 100265 } }); - - public static int StaticCountTokens(string text) - { - return s_tokenizer.CountTokens(text); - } - - public int CountTokens(string text) - { - return s_tokenizer.CountTokens(text); - } - - public IReadOnlyList GetTokens(string text) - { - return s_tokenizer.EncodeToTokens(text, out string? _).Select(t => t.Value).ToList(); - } -} diff --git a/service/Core/Core.csproj b/service/Core/Core.csproj index 26aca1d01..1c5c69eaf 100644 --- a/service/Core/Core.csproj +++ b/service/Core/Core.csproj @@ -9,6 +9,7 @@ + diff --git a/service/Core/DataFormats/Text/TextChunker.cs b/service/Core/DataFormats/Text/TextChunker.cs index 374a4eb5f..a89100492 100644 --- a/service/Core/DataFormats/Text/TextChunker.cs +++ b/service/Core/DataFormats/Text/TextChunker.cs @@ -25,6 +25,9 @@ public static class TextChunker /// The number of tokens in the input string. 
public delegate int TokenCounter(string input); + // Fallback when TokenCounter is not set + private static readonly TokenCounter s_defaultTokenCounter = (new CL100KTokenizer()).CountTokens; + private static readonly char[] s_spaceChar = [' ']; private static readonly string?[] s_plaintextSplitOptions = ["\n\r", ".", "?!", ";", ":", ",", ")]}", " ", "-", null]; private static readonly string?[] s_markdownSplitOptions = [".", "?!", ";", ":", ",", ")]}", " ", "-", "\n\r", null]; @@ -47,7 +50,7 @@ public static List SplitPlainTextLines( s_plaintextSplitOptions, tokenCounter); /// - /// Split markdown text into lines. + /// Split Markdown text into lines. /// /// Text to split /// Maximum number of tokens per line. @@ -93,7 +96,7 @@ public static List SplitPlainTextParagraphs( tokenCounter); /// - /// Split markdown text into paragraphs. + /// Split Markdown text into paragraphs. /// /// Lines of text. /// Maximum number of tokens per paragraph. @@ -399,6 +402,6 @@ private static (List, bool) Split( private static int GetTokenCount(string input, TokenCounter? tokenCounter) { // Fall back to GPT tokenizer if none configured - return tokenCounter?.Invoke(input) ?? DefaultGPTTokenizer.StaticCountTokens(input); + return tokenCounter?.Invoke(input) ?? s_defaultTokenCounter(input); } } diff --git a/service/Core/Handlers/TextPartitioningHandler.cs b/service/Core/Handlers/TextPartitioningHandler.cs index 489ffbfb0..34cc47ee2 100644 --- a/service/Core/Handlers/TextPartitioningHandler.cs +++ b/service/Core/Handlers/TextPartitioningHandler.cs @@ -49,7 +49,7 @@ public TextPartitioningHandler( this._log = (loggerFactory ?? DefaultLogger.Factory).CreateLogger(); this._log.LogInformation("Handler '{0}' ready", stepName); - this._tokenCounter = DefaultGPTTokenizer.StaticCountTokens; + this._tokenCounter = (new CL100KTokenizer()).CountTokens; if (orchestrator.EmbeddingGenerationEnabled) { foreach (var gen in orchestrator.GetEmbeddingGenerators()) diff --git a/service/Core/SemanticKernel/SemanticKernelTextEmbeddingGenerator.cs b/service/Core/SemanticKernel/SemanticKernelTextEmbeddingGenerator.cs index 07c487581..5f4a61cfc 100644 --- a/service/Core/SemanticKernel/SemanticKernelTextEmbeddingGenerator.cs +++ b/service/Core/SemanticKernel/SemanticKernelTextEmbeddingGenerator.cs @@ -41,10 +41,10 @@ public SemanticKernelTextEmbeddingGenerator( if (textTokenizer == null) { + textTokenizer = new CL100KTokenizer(); this._log.LogWarning( "Tokenizer not specified, will use {0}. The token count might be incorrect, causing unexpected errors", - nameof(DefaultGPTTokenizer)); - textTokenizer = new DefaultGPTTokenizer(); + textTokenizer.GetType().FullName); } this._tokenizer = textTokenizer; diff --git a/service/Core/SemanticKernel/SemanticKernelTextGenerator.cs b/service/Core/SemanticKernel/SemanticKernelTextGenerator.cs index 5d592db33..06c4844d3 100644 --- a/service/Core/SemanticKernel/SemanticKernelTextGenerator.cs +++ b/service/Core/SemanticKernel/SemanticKernelTextGenerator.cs @@ -42,10 +42,10 @@ public SemanticKernelTextGenerator( if (textTokenizer == null) { + textTokenizer = new O200KTokenizer(); this._log.LogWarning( "Tokenizer not specified, will use {0}. 
The token count might be incorrect, causing unexpected errors", - nameof(DefaultGPTTokenizer)); - textTokenizer = new DefaultGPTTokenizer(); + textTokenizer.GetType().FullName); } this._tokenizer = textTokenizer; From 8c3a7a7916ba1883130d412cf7054eae22a4bf35 Mon Sep 17 00:00:00 2001 From: Devis Lucato Date: Wed, 27 Nov 2024 11:55:50 -0800 Subject: [PATCH 3/4] Fix ONNX and other misc --- Directory.Packages.props | 4 +++- examples/002-dotnet-Serverless/Program.cs | 2 +- examples/002-dotnet-Serverless/appsettings.json | 13 ------------- .../AzureOpenAI.FunctionalTests/Issue855Test.cs | 3 ++- .../ONNX/Onnx.FunctionalTests/appsettings.json | 4 +++- extensions/ONNX/Onnx/Onnx.csproj | 2 ++ extensions/ONNX/Onnx/OnnxConfig.cs | 6 ++++++ extensions/ONNX/Onnx/OnnxTextGenerator.cs | 2 ++ .../DefaultTestCases/DocumentUploadTest.cs | 2 +- 9 files changed, 20 insertions(+), 18 deletions(-) diff --git a/Directory.Packages.props b/Directory.Packages.props index 9947848b6..1c9cf7265 100644 --- a/Directory.Packages.props +++ b/Directory.Packages.props @@ -24,7 +24,9 @@ - + + + diff --git a/examples/002-dotnet-Serverless/Program.cs b/examples/002-dotnet-Serverless/Program.cs index f0e7d5fbe..ee5f680d6 100644 --- a/examples/002-dotnet-Serverless/Program.cs +++ b/examples/002-dotnet-Serverless/Program.cs @@ -57,7 +57,7 @@ public static async Task Main() var builder = new KernelMemoryBuilder() .Configure(builder => builder.Services.AddLogging(l => { - l.SetMinimumLevel(LogLevel.Warning); + l.SetMinimumLevel(LogLevel.Error); l.AddSimpleConsole(c => c.SingleLine = true); })) .AddSingleton(memoryConfiguration) diff --git a/examples/002-dotnet-Serverless/appsettings.json b/examples/002-dotnet-Serverless/appsettings.json index c76954983..a8cf55665 100644 --- a/examples/002-dotnet-Serverless/appsettings.json +++ b/examples/002-dotnet-Serverless/appsettings.json @@ -1,17 +1,4 @@ { - "Logging": { - "LogLevel": { - "Default": "Warning", - // Examples: how to handle logs differently by class - // "Microsoft.KernelMemory.Handlers.TextExtractionHandler": "Information", - // "Microsoft.KernelMemory.Handlers.TextPartitioningHandler": "Information", - // "Microsoft.KernelMemory.Handlers.GenerateEmbeddingsHandler": "Information", - // "Microsoft.KernelMemory.Handlers.SaveEmbeddingsHandler": "Information", - // "Microsoft.KernelMemory.DocumentStorage.AzureBlobs": "Information", - // "Microsoft.KernelMemory.Pipeline.Queue.AzureQueues": "Information", - "Microsoft.AspNetCore": "Warning" - } - }, "KernelMemory": { "Services": { "AzureAIContentSafety": { diff --git a/extensions/AzureOpenAI/AzureOpenAI.FunctionalTests/Issue855Test.cs b/extensions/AzureOpenAI/AzureOpenAI.FunctionalTests/Issue855Test.cs index f2a3ab768..e0740d134 100644 --- a/extensions/AzureOpenAI/AzureOpenAI.FunctionalTests/Issue855Test.cs +++ b/extensions/AzureOpenAI/AzureOpenAI.FunctionalTests/Issue855Test.cs @@ -22,10 +22,11 @@ public Issue855Test(IConfiguration cfg, ITestOutputHelper output) : base(cfg, ou this._target = new AzureOpenAITextEmbeddingGenerator(this.AzureOpenAIEmbeddingConfiguration); } + // [Fact] // Enable manually on a need basis [Fact(Skip = "Enable and run manually")] [Trait("Category", "Manual")] [Trait("Category", "BugFix")] - public async Task ItDoesntWhenThrottling() + public async Task ItDoesntFailWhenThrottling() { for (int i = 0; i < 50; i++) { diff --git a/extensions/ONNX/Onnx.FunctionalTests/appsettings.json b/extensions/ONNX/Onnx.FunctionalTests/appsettings.json index ec84442d4..56b9181c5 100644 --- 
a/extensions/ONNX/Onnx.FunctionalTests/appsettings.json +++ b/extensions/ONNX/Onnx.FunctionalTests/appsettings.json @@ -7,7 +7,9 @@ "Services": { "Onnx": { // Path to directory containing ONNX Model, e.g. "C:\\....\\Phi-3-mini-128k-instruct-onnx\\....\\cpu-int4-rtn-block-32" - "TextModelDir": "Z:\\tools\\LocalModels\\Phi-3-mini-128k-instruct-onnx\\cpu_and_mobile\\cpu-int4-rtn-block-32" + "TextModelDir": "Z:\\tools\\LocalModels\\Phi-3-mini-128k-instruct-onnx\\cpu_and_mobile\\cpu-int4-rtn-block-32", + // Supported values: "p50k", "cl100k", "o200k". Leave it empty if unsure. + "Tokenizer": "o200k", }, "SimpleVectorDb": { // Options: "Disk" or "Volatile". Volatile data is lost after each execution. diff --git a/extensions/ONNX/Onnx/Onnx.csproj b/extensions/ONNX/Onnx/Onnx.csproj index a24bba3bb..ecafdfed3 100644 --- a/extensions/ONNX/Onnx/Onnx.csproj +++ b/extensions/ONNX/Onnx/Onnx.csproj @@ -15,6 +15,8 @@ + + diff --git a/extensions/ONNX/Onnx/OnnxConfig.cs b/extensions/ONNX/Onnx/OnnxConfig.cs index 4a0ce66fe..3a54540ac 100644 --- a/extensions/ONNX/Onnx/OnnxConfig.cs +++ b/extensions/ONNX/Onnx/OnnxConfig.cs @@ -50,6 +50,12 @@ public enum OnnxSearchType /// public int MaxTokens { get; set; } = 2048; + /// + /// Name of the tokenizer used to count tokens. + /// Supported values: "p50k", "cl100k", "o200k". Leave it empty if unsure. + /// + public string Tokenizer { get; set; } = "o200k"; + /// /// The minimum length of the response that the model will generate. See https://onnxruntime.ai/docs/genai/reference/config.html /// diff --git a/extensions/ONNX/Onnx/OnnxTextGenerator.cs b/extensions/ONNX/Onnx/OnnxTextGenerator.cs index 67c75fdf2..7f31e49a7 100644 --- a/extensions/ONNX/Onnx/OnnxTextGenerator.cs +++ b/extensions/ONNX/Onnx/OnnxTextGenerator.cs @@ -59,6 +59,8 @@ public OnnxTextGenerator( ILoggerFactory? loggerFactory = null) { this._log = (loggerFactory ?? 
DefaultLogger.Factory).CreateLogger(); + + textTokenizer ??= TokenizerFactory.GetTokenizerForEncoding(config.Tokenizer); if (textTokenizer == null) { textTokenizer = new O200KTokenizer(); diff --git a/service/tests/Core.FunctionalTests/DefaultTestCases/DocumentUploadTest.cs b/service/tests/Core.FunctionalTests/DefaultTestCases/DocumentUploadTest.cs index 7848f36d5..7123609f1 100644 --- a/service/tests/Core.FunctionalTests/DefaultTestCases/DocumentUploadTest.cs +++ b/service/tests/Core.FunctionalTests/DefaultTestCases/DocumentUploadTest.cs @@ -53,7 +53,7 @@ await memory.ImportDocumentAsync( var count = 0; while (!await memory.IsDocumentReadyAsync(documentId: Id)) { - Assert.True(count++ <= 30, "Document import timed out"); + Assert.True(count++ <= 60, "Document import timed out"); log("Waiting for memory ingestion to complete..."); await Task.Delay(TimeSpan.FromSeconds(1)); } From 61e5837f6c329c5c9ef26230f500f13caabf492c Mon Sep 17 00:00:00 2001 From: Devis Lucato Date: Wed, 27 Nov 2024 12:31:48 -0800 Subject: [PATCH 4/4] Misc --- .../007-dotnet-serverless-azure.csproj | 1 + .../007-dotnet-serverless-azure/Program.cs | 43 +++++++++++++------ .../AzureAISearch/AzureAISearchMemory.cs | 4 +- service/Core/Search/SearchClient.cs | 7 +++ service/Core/Search/SearchClientResult.cs | 1 + 5 files changed, 43 insertions(+), 13 deletions(-) diff --git a/examples/007-dotnet-serverless-azure/007-dotnet-serverless-azure.csproj b/examples/007-dotnet-serverless-azure/007-dotnet-serverless-azure.csproj index fca07cf1e..7f9ccf1b6 100644 --- a/examples/007-dotnet-serverless-azure/007-dotnet-serverless-azure.csproj +++ b/examples/007-dotnet-serverless-azure/007-dotnet-serverless-azure.csproj @@ -3,6 +3,7 @@ net8.0 enable + CS0162;CA2007;CA1303;IDE0058;IDE0008;CA1050;CA1515; diff --git a/examples/007-dotnet-serverless-azure/Program.cs b/examples/007-dotnet-serverless-azure/Program.cs index d99e2c41f..b8c356b82 100644 --- a/examples/007-dotnet-serverless-azure/Program.cs +++ b/examples/007-dotnet-serverless-azure/Program.cs @@ -15,8 +15,13 @@ public static class Program { private static MemoryServerless? 
s_memory;
+    private const string IndexName = "example006";
+    // Set these to false if you don't want to use these Azure services
+    private const bool UseAzureAIDocIntelligence = true;
+    private const bool UseAzureAIContentSafety = true;
+
     public static async Task Main()
     {
         var memoryConfiguration = new KernelMemoryConfig();
@@ -43,37 +48,49 @@ public static async Task Main()
         var builder = new KernelMemoryBuilder()
             .WithAzureBlobsDocumentStorage(azureBlobConfig)
-            .WithAzureAIDocIntel(azureAIDocIntelConfig)
             .WithAzureOpenAITextEmbeddingGeneration(azureOpenAIEmbeddingConfig)
             .WithAzureOpenAITextGeneration(azureOpenAITextConfig)
             .WithAzureAISearchMemoryDb(azureAISearchConfig)
-            .WithAzureAIContentSafetyModeration(azureAIContentSafetyConfig)
+            // .WithAzureAIDocIntel(azureAIDocIntelConfig) // see below
+            // .WithAzureAIContentSafetyModeration(azureAIContentSafetyConfig) // see below
             .Configure(builder => builder.Services.AddLogging(l =>
             {
-                l.SetMinimumLevel(LogLevel.Warning);
+                l.SetMinimumLevel(LogLevel.Error);
                 l.AddSimpleConsole(c => c.SingleLine = true);
             }));
+        // These registrations are kept separate in case you don't have these Azure services
+        if (UseAzureAIDocIntelligence) { builder.WithAzureAIDocIntel(azureAIDocIntelConfig); }
+
+        if (UseAzureAIContentSafety) { builder.WithAzureAIContentSafetyModeration(azureAIContentSafetyConfig); }
+
         s_memory = builder.Build();
-        await StoreWebPage();
-        await StoreImage();
+        // ====== Store some data ======
-        // Test 1
+        await StoreWebPageAsync(); // Works with Azure AI Search and Azure OpenAI
+        await StoreImageAsync(); // Works only if Azure AI Document Intelligence is used
+
+        // ====== Answer some questions ======
+
+        // When using hybrid search, relevance is much lower than cosine similarity
+        var minRelevance = azureAISearchConfig.UseHybridSearch ?
0 : 0.5; + + // Test 1 (answer from the web page) var question = "What's Kernel Memory?"; Console.WriteLine($"Question: {question}"); - var answer = await s_memory.AskAsync(question, minRelevance: 0.5, index: IndexName); + var answer = await s_memory.AskAsync(question, minRelevance: minRelevance, index: IndexName); Console.WriteLine($"Answer: {answer.Result}\n\n"); - // Test 2 + // Test 2 (requires Azure AI Document Intelligence to have parsed the image) question = "Which conference is Microsoft sponsoring?"; Console.WriteLine($"Question: {question}"); - answer = await s_memory.AskAsync(question, minRelevance: 0.5, index: IndexName); + answer = await s_memory.AskAsync(question, minRelevance: minRelevance, index: IndexName); Console.WriteLine($"Answer: {answer.Result}\n\n"); } // Downloading web pages - private static async Task StoreWebPage() + private static async Task StoreWebPageAsync() { const string DocId = "webPage1"; if (!await s_memory!.IsDocumentReadyAsync(DocId, index: IndexName)) @@ -87,9 +104,11 @@ private static async Task StoreWebPage() } } - // Extract memory from images (OCR required) - private static async Task StoreImage() + // Extract memory from images (requires Azure AI Document Intelligence) + private static async Task StoreImageAsync() { + if (!UseAzureAIDocIntelligence) { return; } + const string DocId = "img001"; if (!await s_memory!.IsDocumentReadyAsync(DocId, index: IndexName)) { diff --git a/extensions/AzureAISearch/AzureAISearch/AzureAISearchMemory.cs b/extensions/AzureAISearch/AzureAISearch/AzureAISearchMemory.cs index 72eed149d..fb838e862 100644 --- a/extensions/AzureAISearch/AzureAISearch/AzureAISearchMemory.cs +++ b/extensions/AzureAISearch/AzureAISearch/AzureAISearchMemory.cs @@ -372,7 +372,9 @@ private async Task DoesIndexExistAsync(string index, CancellationToken can private SearchClient GetSearchClient(string index) { var normalIndexName = this.NormalizeIndexName(index); - this._log.LogTrace("Preparing search client, index name '{0}' normalized to '{1}'", index, normalIndexName); + + if (index != normalIndexName) { this._log.LogTrace("Preparing search client, index name '{0}' normalized to '{1}'", index, normalIndexName); } + else { this._log.LogTrace("Preparing search client, index name '{0}'", normalIndexName); } // Search an available client from the local cache if (!this._clientsByIndex.TryGetValue(normalIndexName, out SearchClient? 
client))
diff --git a/service/Core/Search/SearchClient.cs b/service/Core/Search/SearchClient.cs
index 26eb0d76c..cf410ad54 100644
--- a/service/Core/Search/SearchClient.cs
+++ b/service/Core/Search/SearchClient.cs
@@ -102,6 +102,8 @@ public async Task SearchAsync(
             if (result.State == SearchState.Stop) { break; }
         }
+        this._log.LogTrace("{Count} records processed", result.RecordCount);
+
         if (result.SearchResult.Results.Count == 0)
         {
             this._log.LogDebug("No memories found");
@@ -167,6 +169,8 @@ public async Task AskAsync(
             if (result.State == SearchState.Stop) { break; }
         }
+        this._log.LogTrace("{Count} records processed", result.RecordCount);
+
         return await this._answerGenerator.GenerateAnswerAsync(question, result, context, cancellationToken).ConfigureAwait(false);
     }
@@ -189,6 +193,9 @@ private SearchClientResult ProcessMemoryRecord(
             return result.SkipRecord();
         }
+        // Keep track of how many records have been processed
+        result.RecordCount++;
+
         // Note: a document can be composed of multiple files
         string documentId = record.GetDocumentId(this._log);
diff --git a/service/Core/Search/SearchClientResult.cs b/service/Core/Search/SearchClientResult.cs
index 605055970..c509809a7 100644
--- a/service/Core/Search/SearchClientResult.cs
+++ b/service/Core/Search/SearchClientResult.cs
@@ -21,6 +21,7 @@ internal class SearchClientResult
 {
     public SearchMode Mode { get; private init; }
     public SearchState State { get; set; }
+    public int RecordCount { get; set; }
     // Used in Search and Ask mode
     public MemoryAnswer AskResult { get; private init; } = new();