Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@
The file under MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE

Gpt2 vocab data is exact as the r50k_base vocab data, but with a different name.
In the CompressFile task above we modify the file's content to elimenate the ranks, thus reducing the file size,
In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size,
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
After we eleminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
After we eliminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
-->
<TokenizerDataEmbeddedResource Include="Data\gpt2.tiktoken" />
</ItemGroup>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@

The file under MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE

In the CompressFile task above we modify the file's content to elimenate the ranks, thus reducing the file size,
In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size,
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
After we eleminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
After we eliminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
-->
<TokenizerDataEmbeddedResource Include="Data\o200k_base.tiktoken" />
</ItemGroup>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@

The file under MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE

In the CompressFile task above we modify the file's content to elimenate the ranks, thus reducing the file size,
In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size,
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
After we eleminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
After we eliminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
-->
<TokenizerDataEmbeddedResource Include="Data\p50k_base.tiktoken" />
</ItemGroup>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@

The file under MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE

In the CompressFile task above we modify the file's content to elimenate the ranks, thus reducing the file size,
In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size,
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
After we eleminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
After we eliminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
-->
<TokenizerDataEmbeddedResource Include="Data\r50k_base.tiktoken" />
</ItemGroup>
Expand Down
19 changes: 16 additions & 3 deletions src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1010,6 +1010,9 @@ public override OperationStatus Decode(IEnumerable<int> ids, Span<char> destinat
private const string FimMiddle = "<|fim_middle|>";
private const string FimSuffix = "<|fim_suffix|>";
private const string EndOfPrompt = "<|endofprompt|>";
private const string IMStart = "<|im_start|>";
private const string IMEnd = "<|im_end|>";
private const string IMSep = "<|im_sep|>";

private enum ModelEncoding
{
Expand All @@ -1022,6 +1025,8 @@ private enum ModelEncoding
O200kBase
}

private const string Phi4ModelName = "phi-4";

private static readonly (string Prefix, ModelEncoding Encoding)[] _modelPrefixToEncoding =
[
// chat
Expand Down Expand Up @@ -1090,7 +1095,10 @@ private static readonly (string Prefix, ModelEncoding Encoding)[] _modelPrefixTo
{ "code-search-ada-code-001", ModelEncoding.R50kBase },

// open source
{ "gpt2", ModelEncoding.GPT2 }
{ "gpt2", ModelEncoding.GPT2 },

// phi-4
{ Phi4ModelName, ModelEncoding.Cl100kBase },
};

private static ModelEncoding GetModelEncoding(string modelName)
Expand Down Expand Up @@ -1122,8 +1130,13 @@ private static (Dictionary<string, int> SpecialTokens, Regex Regex, string Vocab
switch (modelEncoding)
{
case ModelEncoding.Cl100kBase:
return (new Dictionary<string, int>
{ { EndOfText, 100257}, { FimPrefix, 100258}, { FimMiddle, 100259}, { FimSuffix, 100260}, { EndOfPrompt, 100276} }, Cl100kBaseRegex(), Cl100kBaseVocabFile, Type.GetType(Cl100kBaseTypeName), Cl100kBasePackageName);
return (
Phi4ModelName.Equals(modelName, StringComparison.OrdinalIgnoreCase) ?
new Dictionary<string, int> { { EndOfText, 100257 }, { FimPrefix, 100258 }, { FimMiddle, 100259 }, { FimSuffix, 100260 }, { EndOfPrompt, 100276 }, { IMStart, 100264 },
{ IMEnd, 100265 }, { IMSep, 100266 }, { "<|dummy_85|>", 100349}, // <|dummy_85|> is used for padding according to the phi-4 special token mapping.
} :
new Dictionary<string, int> { { EndOfText, 100257 }, { FimPrefix, 100258 }, { FimMiddle, 100259 }, { FimSuffix, 100260 }, { EndOfPrompt, 100276 } },
Cl100kBaseRegex(), Cl100kBaseVocabFile, Type.GetType(Cl100kBaseTypeName), Cl100kBasePackageName);

case ModelEncoding.GPT2:
return (new Dictionary<string, int> { { EndOfText, 50256 }, }, P50kBaseRegex(), GPT2File, Type.GetType(Gpt2TypeName), Gpt2PackageName);
Expand Down
20 changes: 19 additions & 1 deletion test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ public class TiktokenTests
{
const string IMStart = "<|im_start|>";
const string IMEnd = "<|im_end|>";
const string IMSep = "<|im_sep|>";

private static readonly Dictionary<string, int> _specialTokens = new Dictionary<string, int>
{
Expand All @@ -34,11 +35,13 @@ public class TiktokenTests
public static Tokenizer R50kBase { get; } = TiktokenTokenizer.CreateForModel("ada");
public static Tokenizer P50kEdit { get; } = TiktokenTokenizer.CreateForModel("text-davinci-edit-001");
public static Tokenizer GPT4o { get; } = TiktokenTokenizer.CreateForModel("gpt-4o");
public static Tokenizer Phi4 { get; } = TiktokenTokenizer.CreateForModel("phi-4");

[Fact]
public async Task TestTokenizerCreation()
{
TestGPT4TokenizationEncoding(GPT4);
TestGPT4TokenizationEncoding(Phi4);

Assert.True(GPT4 is TiktokenTokenizer);
IReadOnlyDictionary<string, int>? specialTokens = (GPT4 as TiktokenTokenizer)!.SpecialTokens;
Expand Down Expand Up @@ -443,6 +446,7 @@ public void TestEncodeR50kBase()
[InlineData("code-search-babbage-code-001")]
[InlineData("code-search-ada-code-001")]
[InlineData("gpt2")]
[InlineData("phi-4")]
public void TestAllSupportedModelNames(string modelName)
{
Tokenizer tokenizer = TiktokenTokenizer.CreateForModel(modelName);
Expand Down Expand Up @@ -504,6 +508,7 @@ public void TestEncodingNamesNegativeCases()
[InlineData("text-davinci-003")]
[InlineData("text-curie-001")]
[InlineData("text-davinci-edit-001")]
[InlineData("phi-4")]
[ConditionalTheory(typeof(RemoteExecutor), nameof(RemoteExecutor.IsSupported))]
public void TestCreationUsingModel(string modelName)
{
Expand Down Expand Up @@ -568,8 +573,12 @@ public static IEnumerable<object?[]> TokenizerTestData
[MemberData(nameof(TokenizerTestData))]
public void TestTokenizerEncoding(string text, string[] expectedTokens, (int Index, int Length)[] expectedOffsets, int[] expectedIds)
{
Tokenizer tokenizer = GPT4;
TestTokenizerEncodingForTokenizer(GPT4, text, expectedTokens, expectedOffsets, expectedIds);
TestTokenizerEncodingForTokenizer(Phi4, text, expectedTokens, expectedOffsets, expectedIds);
}

private void TestTokenizerEncodingForTokenizer(Tokenizer tokenizer, string text, string[] expectedTokens, (int Index, int Length)[] expectedOffsets, int[] expectedIds)
{
IReadOnlyList<EncodedToken> encoding = tokenizer.EncodeToTokens(text, out _);
IReadOnlyList<EncodedToken> encoding1 = tokenizer.EncodeToTokens(text.AsSpan(), out _);

Expand Down Expand Up @@ -734,6 +743,15 @@ public void TestPreciseTokenLimits(string text, string[] expectedTokens, (int In
}
}

[Fact]
public void TestPhi4SpecialCases()
{
string text = $"{IMStart}Hello{IMSep} World{IMEnd}<|dummy_85|>";
IReadOnlyList<int> encoded = Phi4.EncodeToIds(text);
Assert.Equal(new List<int>() { 100264, 9906, 100266, 4435, 100265, 100349 }, encoded);
Assert.Equal(text, Phi4.Decode(encoded));
}

// We are not exposing the Encoder, Decoder, or Vocabulary so far. For now, use reflection to test it.
private static IReadOnlyDictionary<ReadOnlyMemory<byte>, int>? GetEncoder(TiktokenTokenizer tiktoken)
=> typeof(TiktokenTokenizer).GetProperty("Encoder", BindingFlags.Instance | BindingFlags.NonPublic)?.GetValue(tiktoken) as IReadOnlyDictionary<ReadOnlyMemory<byte>, int>;
Expand Down