-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Create SentencePieceTokenizer from options object #7403
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
tarekgh
merged 2 commits into
dotnet:main
from
tarekgh:CreateSentencePieceTokenizerFromOptionsObject
Feb 27, 2025
Merged
Changes from all commits
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
118 changes: 118 additions & 0 deletions
118
src/Microsoft.ML.Tokenizers/Model/SentencePieceOptions.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,118 @@ | ||
| // Licensed to the .NET Foundation under one or more agreements. | ||
| // The .NET Foundation licenses this file to you under the MIT license. | ||
| // See the LICENSE file in the project root for more information. | ||
|
|
||
| using System.Collections.Generic; | ||
|
|
||
| namespace Microsoft.ML.Tokenizers | ||
| { | ||
| #pragma warning disable MSML_NoInstanceInitializers | ||
| /// <summary> | ||
| /// Specifies the type of a SentencePiece model. | ||
| /// </summary> | ||
| public enum SentencePieceModelType | ||
| { | ||
| /// <summary> | ||
| /// The model type is not defined. | ||
| /// </summary> | ||
| Undefined = 0, | ||
|
|
||
| /// <summary> | ||
| /// The model is a Byte Pair Encoding (BPE) model. | ||
| /// </summary> | ||
| Bpe = 1, | ||
|
|
||
| /// <summary> | ||
| /// The model is a Unigram model. | ||
| /// </summary> | ||
| Unigram = 2, | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Options for the SentencePiece tokenizer. | ||
tarekgh marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| /// </summary> | ||
| /// <remarks> | ||
| /// The options are used to configure the SentencePiece tokenizer. Serialization is not guaranteed for this type. | ||
| /// </remarks> | ||
| public sealed class SentencePieceOptions | ||
| { | ||
| /// <summary> | ||
| /// Gets or sets the type of the SentencePiece model. | ||
| /// </summary> | ||
| public SentencePieceModelType ModelType { get; set; } | ||
|
|
||
| /// <summary> | ||
| /// Gets or sets a value indicating whether the model uses a byte fallback strategy to encode unknown tokens as byte sequences. | ||
| /// </summary> | ||
| /// <remarks> | ||
| /// The vocabulary must include a special token for each byte value (0-255) in the format &lt;0xNN&gt;, | ||
| /// where NN represents the byte's hexadecimal value (e.g., &lt;0x41&gt; for byte value 65). | ||
| /// </remarks> | ||
| public bool ByteFallback { get; set; } | ||
|
|
||
| /// <summary> | ||
| /// Gets or sets a value indicating whether the prefix character (e.g. U+2581) is emitted at the beginning of the sentence token during normalization and encoding. | ||
| /// </summary> | ||
| public bool AddDummyPrefix { get; set; } | ||
|
|
||
| /// <summary> | ||
| /// Gets or sets a value indicating whether spaces are replaced with the character U+2581 during normalization and encoding. The default value is <c>true</c>. | ||
| /// </summary> | ||
| public bool EscapeWhiteSpaces { get; set; } = true; | ||
|
|
||
| /// <summary> | ||
| /// Gets or sets a value indicating whether the character U+2581 is emitted at the end of the last sentence token instead of at the beginning of the sentence token during normalization and encoding. | ||
| /// </summary> | ||
| public bool TreatWhitespaceAsSuffix { get; set; } | ||
|
|
||
| /// <summary> | ||
| /// Gets or sets a value indicating whether extra white spaces are removed from the original string during normalization. | ||
| /// </summary> | ||
| public bool RemoveExtraWhiteSpaces { get; set; } | ||
|
|
||
| /// <summary> | ||
| /// Gets or sets a value indicating whether the beginning of sentence token is emitted during encoding. The default value is <c>true</c>. | ||
| /// </summary> | ||
| public bool AddBeginningOfSentence { get; set; } = true; | ||
|
|
||
| /// <summary> | ||
| /// Gets or sets a value indicating whether the end of sentence token is emitted during encoding. | ||
| /// </summary> | ||
| public bool AddEndOfSentence { get; set; } | ||
|
|
||
| /// <summary> | ||
| /// Gets or sets the beginning of sentence token. The default value is <c>&lt;s&gt;</c>. | ||
| /// </summary> | ||
| public string BeginningOfSentenceToken { get; set; } = "<s>"; | ||
|
|
||
| /// <summary> | ||
| /// Gets or sets the end of sentence token. The default value is <c>&lt;/s&gt;</c>. | ||
| /// </summary> | ||
| public string EndOfSentenceToken { get; set; } = "</s>"; | ||
|
|
||
| /// <summary> | ||
| /// Gets or sets the unknown token. The default value is <c>&lt;unk&gt;</c>. | ||
| /// </summary> | ||
| public string UnknownToken { get; set; } = "<unk>"; | ||
|
|
||
| /// <summary> | ||
| /// Gets or sets the precompiled data used for string normalization. | ||
| /// </summary> | ||
| public byte[]? PrecompiledNormalizationData { get; set; } | ||
|
|
||
| /// <summary> | ||
| /// Gets or sets the vocabulary. | ||
| /// The list should be sorted by token ID, with entries passed in the order that corresponds to their IDs. In other words, | ||
| /// the first entry in the list will be mapped to ID 0, the second entry to ID 1, the third to ID 2, and so on. | ||
| /// Each entry represents a token and its corresponding score. | ||
| /// </summary> | ||
| public IEnumerable<(string Token, float Score)>? Vocabulary { get; set; } | ||
|
|
||
| /// <summary> | ||
| /// Gets or sets the special tokens. | ||
| /// Special tokens remain intact during encoding and are not split into sub-tokens. | ||
| /// </summary> | ||
| public IReadOnlyDictionary<string, int>? SpecialTokens { get; set; } | ||
| } | ||
| #pragma warning restore MSML_NoInstanceInitializers | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.