From 2c1065cbef9c1aa7825344decb9e56e47f4df17e Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Sat, 22 Feb 2025 18:54:59 -0800 Subject: [PATCH] Phi-4 Tokenizer Support --- .../Microsoft.ML.Tokenizers.Data.Gpt2.csproj | 4 ++-- ...rosoft.ML.Tokenizers.Data.O200kBase.csproj | 4 ++-- ...crosoft.ML.Tokenizers.Data.P50kBase.csproj | 4 ++-- ...crosoft.ML.Tokenizers.Data.R50kBase.csproj | 4 ++-- .../Model/TiktokenTokenizer.cs | 19 +++++++++++++++--- .../TiktokenTests.cs | 20 ++++++++++++++++++- 6 files changed, 43 insertions(+), 12 deletions(-) diff --git a/src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj index 66c89a06c1..bc90091b63 100644 --- a/src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj +++ b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj @@ -16,10 +16,10 @@ The file under MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE Gpt2 vocab data is exact as the r50k_base vocab data, but with a different name. - In the CompressFile task above we modify the file's content to elimenate the ranks, thus reducing the file size, + In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size, since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken, we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number. - After we eleminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly. + After we eliminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly. 
--> diff --git a/src/Microsoft.ML.Tokenizers.Data.O200kBase/Microsoft.ML.Tokenizers.Data.O200kBase.csproj b/src/Microsoft.ML.Tokenizers.Data.O200kBase/Microsoft.ML.Tokenizers.Data.O200kBase.csproj index b9ce1bb964..701ec1fd4e 100644 --- a/src/Microsoft.ML.Tokenizers.Data.O200kBase/Microsoft.ML.Tokenizers.Data.O200kBase.csproj +++ b/src/Microsoft.ML.Tokenizers.Data.O200kBase/Microsoft.ML.Tokenizers.Data.O200kBase.csproj @@ -15,10 +15,10 @@ The file under MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE - In the CompressFile task above we modify the file's content to elimenate the ranks, thus reducing the file size, + In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size, since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken, we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number. - After we eleminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly. + After we eliminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly. 
--> diff --git a/src/Microsoft.ML.Tokenizers.Data.P50kBase/Microsoft.ML.Tokenizers.Data.P50kBase.csproj b/src/Microsoft.ML.Tokenizers.Data.P50kBase/Microsoft.ML.Tokenizers.Data.P50kBase.csproj index 2d60f2ee5c..5464e96730 100644 --- a/src/Microsoft.ML.Tokenizers.Data.P50kBase/Microsoft.ML.Tokenizers.Data.P50kBase.csproj +++ b/src/Microsoft.ML.Tokenizers.Data.P50kBase/Microsoft.ML.Tokenizers.Data.P50kBase.csproj @@ -15,10 +15,10 @@ The file under MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE - In the CompressFile task above we modify the file's content to elimenate the ranks, thus reducing the file size, + In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size, since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken, we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number. - After we eleminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly. + After we eliminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly. 
--> diff --git a/src/Microsoft.ML.Tokenizers.Data.R50kBase/Microsoft.ML.Tokenizers.Data.R50kBase.csproj b/src/Microsoft.ML.Tokenizers.Data.R50kBase/Microsoft.ML.Tokenizers.Data.R50kBase.csproj index b61f83a489..07960c3d1c 100644 --- a/src/Microsoft.ML.Tokenizers.Data.R50kBase/Microsoft.ML.Tokenizers.Data.R50kBase.csproj +++ b/src/Microsoft.ML.Tokenizers.Data.R50kBase/Microsoft.ML.Tokenizers.Data.R50kBase.csproj @@ -15,10 +15,10 @@ The file under MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE - In the CompressFile task above we modify the file's content to elimenate the ranks, thus reducing the file size, + In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size, since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken, we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number. - After we eleminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly. + After we eliminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly. 
--> diff --git a/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs b/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs index 54d4daa9e3..8fb73a5225 100644 --- a/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs +++ b/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs @@ -1010,6 +1010,9 @@ public override OperationStatus Decode(IEnumerable ids, Span destinat private const string FimMiddle = "<|fim_middle|>"; private const string FimSuffix = "<|fim_suffix|>"; private const string EndOfPrompt = "<|endofprompt|>"; + private const string IMStart = "<|im_start|>"; + private const string IMEnd = "<|im_end|>"; + private const string IMSep = "<|im_sep|>"; private enum ModelEncoding { @@ -1022,6 +1025,8 @@ private enum ModelEncoding O200kBase } + private const string Phi4ModelName = "phi-4"; + private static readonly (string Prefix, ModelEncoding Encoding)[] _modelPrefixToEncoding = [ // chat @@ -1090,7 +1095,10 @@ private static readonly (string Prefix, ModelEncoding Encoding)[] _modelPrefixTo { "code-search-ada-code-001", ModelEncoding.R50kBase }, // open source - { "gpt2", ModelEncoding.GPT2 } + { "gpt2", ModelEncoding.GPT2 }, + + // phi-4 + { Phi4ModelName, ModelEncoding.Cl100kBase }, }; private static ModelEncoding GetModelEncoding(string modelName) @@ -1122,8 +1130,13 @@ private static (Dictionary SpecialTokens, Regex Regex, string Vocab switch (modelEncoding) { case ModelEncoding.Cl100kBase: - return (new Dictionary - { { EndOfText, 100257}, { FimPrefix, 100258}, { FimMiddle, 100259}, { FimSuffix, 100260}, { EndOfPrompt, 100276} }, Cl100kBaseRegex(), Cl100kBaseVocabFile, Type.GetType(Cl100kBaseTypeName), Cl100kBasePackageName); + return ( + Phi4ModelName.Equals(modelName, StringComparison.OrdinalIgnoreCase) ? 
+ new Dictionary { { EndOfText, 100257 }, { FimPrefix, 100258 }, { FimMiddle, 100259 }, { FimSuffix, 100260 }, { EndOfPrompt, 100276 }, { IMStart, 100264 }, + { IMEnd, 100265 }, { IMSep, 100266 }, { "<|dummy_85|>", 100349}, // <|dummy_85|> is used for padding according to the phi-4 special token mapping. + } : + new Dictionary { { EndOfText, 100257 }, { FimPrefix, 100258 }, { FimMiddle, 100259 }, { FimSuffix, 100260 }, { EndOfPrompt, 100276 } }, + Cl100kBaseRegex(), Cl100kBaseVocabFile, Type.GetType(Cl100kBaseTypeName), Cl100kBasePackageName); case ModelEncoding.GPT2: return (new Dictionary { { EndOfText, 50256 }, }, P50kBaseRegex(), GPT2File, Type.GetType(Gpt2TypeName), Gpt2PackageName); diff --git a/test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs b/test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs index 6333723e7d..049f2cd82b 100644 --- a/test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs +++ b/test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs @@ -21,6 +21,7 @@ public class TiktokenTests { const string IMStart = "<|im_start|>"; const string IMEnd = "<|im_end|>"; + const string IMSep = "<|im_sep|>"; private static readonly Dictionary _specialTokens = new Dictionary { @@ -34,11 +35,13 @@ public class TiktokenTests public static Tokenizer R50kBase { get; } = TiktokenTokenizer.CreateForModel("ada"); public static Tokenizer P50kEdit { get; } = TiktokenTokenizer.CreateForModel("text-davinci-edit-001"); public static Tokenizer GPT4o { get; } = TiktokenTokenizer.CreateForModel("gpt-4o"); + public static Tokenizer Phi4 { get; } = TiktokenTokenizer.CreateForModel("phi-4"); [Fact] public async Task TestTokenizerCreation() { TestGPT4TokenizationEncoding(GPT4); + TestGPT4TokenizationEncoding(Phi4); Assert.True(GPT4 is TiktokenTokenizer); IReadOnlyDictionary? 
specialTokens = (GPT4 as TiktokenTokenizer)!.SpecialTokens; @@ -443,6 +446,7 @@ public void TestEncodeR50kBase() [InlineData("code-search-babbage-code-001")] [InlineData("code-search-ada-code-001")] [InlineData("gpt2")] + [InlineData("phi-4")] public void TestAllSupportedModelNames(string modelName) { Tokenizer tokenizer = TiktokenTokenizer.CreateForModel(modelName); @@ -504,6 +508,7 @@ public void TestEncodingNamesNegativeCases() [InlineData("text-davinci-003")] [InlineData("text-curie-001")] [InlineData("text-davinci-edit-001")] + [InlineData("phi-4")] [ConditionalTheory(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))] public void TestCreationUsingModel(string modelName) { @@ -568,8 +573,12 @@ public static IEnumerable TokenizerTestData [MemberData(nameof(TokenizerTestData))] public void TestTokenizerEncoding(string text, string[] expectedTokens, (int Index, int Length)[] expectedOffsets, int[] expectedIds) { - Tokenizer tokenizer = GPT4; + TestTokenizerEncodingForTokenizer(GPT4, text, expectedTokens, expectedOffsets, expectedIds); + TestTokenizerEncodingForTokenizer(Phi4, text, expectedTokens, expectedOffsets, expectedIds); + } + private void TestTokenizerEncodingForTokenizer(Tokenizer tokenizer, string text, string[] expectedTokens, (int Index, int Length)[] expectedOffsets, int[] expectedIds) + { IReadOnlyList encoding = tokenizer.EncodeToTokens(text, out _); IReadOnlyList encoding1 = tokenizer.EncodeToTokens(text.AsSpan(), out _); @@ -734,6 +743,15 @@ public void TestPreciseTokenLimits(string text, string[] expectedTokens, (int In } } + [Fact] + public void TestPhi4SpecialCases() + { + string text = $"{IMStart}Hello{IMSep} World{IMEnd}<|dummy_85|>"; + IReadOnlyList encoded = Phi4.EncodeToIds(text); + Assert.Equal(new List() { 100264, 9906, 100266, 4435, 100265, 100349 }, encoded); + Assert.Equal(text, Phi4.Decode(encoded)); + } + // We are not exposing the Encoder, Decoder, or Vocabulary so far. For now, use reflection to test it. 
private static IReadOnlyDictionary<ReadOnlyMemory<byte>, int>? GetEncoder(TiktokenTokenizer tiktoken) => typeof(TiktokenTokenizer).GetProperty("Encoder", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<ReadOnlyMemory<byte>, int>;