Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@
The file under MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE

Gpt2 vocab data is exact as the r50k_base vocab data, but with a different name.
In the CompressFile task above we modify the file's content to elimenate the ranks, thus reducing the file size,
In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size,
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
After we eleminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
After we eliminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
-->
<TokenizerDataEmbeddedResource Include="Data\gpt2.tiktoken" />
</ItemGroup>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@

The file under MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE

In the CompressFile task above we modify the file's content to elimenate the ranks, thus reducing the file size,
In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size,
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
After we eleminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
After we eliminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
-->
<TokenizerDataEmbeddedResource Include="Data\o200k_base.tiktoken" />
</ItemGroup>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@

The file under MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE

In the CompressFile task above we modify the file's content to elimenate the ranks, thus reducing the file size,
In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size,
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
After we eleminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
After we eliminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
-->
<TokenizerDataEmbeddedResource Include="Data\p50k_base.tiktoken" />
</ItemGroup>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@

The file under MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE

In the CompressFile task above we modify the file's content to elimenate the ranks, thus reducing the file size,
In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size,
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
After we eleminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
After we eliminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
-->
<TokenizerDataEmbeddedResource Include="Data\r50k_base.tiktoken" />
</ItemGroup>
Expand Down
19 changes: 16 additions & 3 deletions src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1010,6 +1010,9 @@ public override OperationStatus Decode(IEnumerable<int> ids, Span<char> destinat
private const string FimMiddle = "<|fim_middle|>";
private const string FimSuffix = "<|fim_suffix|>";
private const string EndOfPrompt = "<|endofprompt|>";
private const string IMStart = "<|im_start|>";
private const string IMEnd = "<|im_end|>";
private const string IMSep = "<|im_sep|>";

private enum ModelEncoding
{
Expand All @@ -1022,6 +1025,8 @@ private enum ModelEncoding
O200kBase
}

private const string Phi4ModelName = "phi-4";

private static readonly (string Prefix, ModelEncoding Encoding)[] _modelPrefixToEncoding =
[
// chat
Expand Down Expand Up @@ -1090,7 +1095,10 @@ private static readonly (string Prefix, ModelEncoding Encoding)[] _modelPrefixTo
{ "code-search-ada-code-001", ModelEncoding.R50kBase },

// open source
{ "gpt2", ModelEncoding.GPT2 }
{ "gpt2", ModelEncoding.GPT2 },

// phi-4
{ Phi4ModelName, ModelEncoding.Cl100kBase },
};

private static ModelEncoding GetModelEncoding(string modelName)
Expand Down Expand Up @@ -1122,8 +1130,13 @@ private static (Dictionary<string, int> SpecialTokens, Regex Regex, string Vocab
switch (modelEncoding)
{
case ModelEncoding.Cl100kBase:
return (new Dictionary<string, int>
{ { EndOfText, 100257}, { FimPrefix, 100258}, { FimMiddle, 100259}, { FimSuffix, 100260}, { EndOfPrompt, 100276} }, Cl100kBaseRegex(), Cl100kBaseVocabFile, Type.GetType(Cl100kBaseTypeName), Cl100kBasePackageName);
return (
Phi4ModelName.Equals(modelName, StringComparison.OrdinalIgnoreCase) ?
new Dictionary<string, int> { { EndOfText, 100257 }, { FimPrefix, 100258 }, { FimMiddle, 100259 }, { FimSuffix, 100260 }, { EndOfPrompt, 100276 }, { IMStart, 100264 },
{ IMEnd, 100265 }, { IMSep, 100266 }, { "<|dummy_85|>", 100349}, // <|dummy_85|> is used for padding according to the phi-4 special token mapping.
} :
new Dictionary<string, int> { { EndOfText, 100257 }, { FimPrefix, 100258 }, { FimMiddle, 100259 }, { FimSuffix, 100260 }, { EndOfPrompt, 100276 } },
Cl100kBaseRegex(), Cl100kBaseVocabFile, Type.GetType(Cl100kBaseTypeName), Cl100kBasePackageName);

case ModelEncoding.GPT2:
return (new Dictionary<string, int> { { EndOfText, 50256 }, }, P50kBaseRegex(), GPT2File, Type.GetType(Gpt2TypeName), Gpt2PackageName);
Expand Down
20 changes: 19 additions & 1 deletion test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ public class TiktokenTests
{
const string IMStart = "<|im_start|>";
const string IMEnd = "<|im_end|>";
const string IMSep = "<|im_sep|>";

private static readonly Dictionary<string, int> _specialTokens = new Dictionary<string, int>
{
Expand All @@ -34,11 +35,13 @@ public class TiktokenTests
public static Tokenizer R50kBase { get; } = TiktokenTokenizer.CreateForModel("ada");
public static Tokenizer P50kEdit { get; } = TiktokenTokenizer.CreateForModel("text-davinci-edit-001");
public static Tokenizer GPT4o { get; } = TiktokenTokenizer.CreateForModel("gpt-4o");
public static Tokenizer Phi4 { get; } = TiktokenTokenizer.CreateForModel("phi-4");

[Fact]
public async Task TestTokenizerCreation()
{
TestGPT4TokenizationEncoding(GPT4);
TestGPT4TokenizationEncoding(Phi4);

Assert.True(GPT4 is TiktokenTokenizer);
IReadOnlyDictionary<string, int>? specialTokens = (GPT4 as TiktokenTokenizer)!.SpecialTokens;
Expand Down Expand Up @@ -443,6 +446,7 @@ public void TestEncodeR50kBase()
[InlineData("code-search-babbage-code-001")]
[InlineData("code-search-ada-code-001")]
[InlineData("gpt2")]
[InlineData("phi-4")]
public void TestAllSupportedModelNames(string modelName)
{
Tokenizer tokenizer = TiktokenTokenizer.CreateForModel(modelName);
Expand Down Expand Up @@ -504,6 +508,7 @@ public void TestEncodingNamesNegativeCases()
[InlineData("text-davinci-003")]
[InlineData("text-curie-001")]
[InlineData("text-davinci-edit-001")]
[InlineData("phi-4")]
[ConditionalTheory(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
public void TestCreationUsingModel(string modelName)
{
Expand Down Expand Up @@ -568,8 +573,12 @@ public static IEnumerable<object?[]> TokenizerTestData
[MemberData(nameof(TokenizerTestData))]
public void TestTokenizerEncoding(string text, string[] expectedTokens, (int Index, int Length)[] expectedOffsets, int[] expectedIds)
{
Tokenizer tokenizer = GPT4;
TestTokenizerEncodingForTokenizer(GPT4, text, expectedTokens, expectedOffsets, expectedIds);
TestTokenizerEncodingForTokenizer(Phi4, text, expectedTokens, expectedOffsets, expectedIds);
}

private void TestTokenizerEncodingForTokenizer(Tokenizer tokenizer, string text, string[] expectedTokens, (int Index, int Length)[] expectedOffsets, int[] expectedIds)
{
IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _);
IReadOnlyList<EncodedToken> encoding1 = tokenizer.EncodeToTokens(text.AsSpan(), out _);

Expand Down Expand Up @@ -734,6 +743,15 @@ public void TestPreciseTokenLimits(string text, string[] expectedTokens, (int In
}
}

[Fact]
public void TestPhi4SpecialCases()
{
string text = $"{IMStart}Hello{IMSep} World{IMEnd}<|dummy_85|>";
IReadOnlyList<int> encoded = Phi4.EncodeToIds(text);
Assert.Equal(new List<int>() { 100264, 9906, 100266, 4435, 100265, 100349 }, encoded);
Assert.Equal(text, Phi4.Decode(encoded));
}

// We are not exposing the Encoder, Decoder, or Vocabulary so far. For now, use reflection to test it.
private static IReadOnlyDictionary<ReadOnlyMemory<byte>, int>? GetEncoder(TiktokenTokenizer tiktoken)
=> typeof(TiktokenTokenizer).GetProperty("Encoder", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<ReadOnlyMemory<byte>, int>;
Expand Down