diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 13085a81c..c6551cdfb 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -20,7 +20,7 @@ from .bert_tokenization import FullTokenizer as FullBertTokenizer from .gpt2_tokenization import GPT2Tokenizer - +from transformers import AutoTokenizer def build_tokenizer(args): """Initialize tokenizer.""" @@ -29,7 +29,7 @@ def build_tokenizer(args): flush=True) # Select and instantiate the tokenizer. - assert args.vocab_file is not None + assert args.vocab_file is not None or args.tokenizer_type == "PretrainedFromHF" if args.tokenizer_type == 'BertWordPieceLowerCase': tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file, lower_case=True, @@ -41,6 +41,13 @@ def build_tokenizer(args): elif args.tokenizer_type == 'GPT2BPETokenizer': assert args.merge_file is not None tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) + elif args.tokenizer_type == "PretrainedFromHF": + assert args.tokenizer_name_or_path is not None + print( + " vocab file is un-used. 
class _AutoTokenizer(AbstractTokenizer):
    """Megatron tokenizer backed by a Hugging Face ``AutoTokenizer``.

    Wraps the tokenizer loaded from ``tokenizer_name_or_path`` so that it can
    be used wherever an ``AbstractTokenizer`` is expected.
    """

    def __init__(self, tokenizer_name_or_path):
        """Load the pretrained tokenizer and cache its vocabulary mappings.

        Args:
            tokenizer_name_or_path: Model identifier or local path passed
                unchanged to ``AutoTokenizer.from_pretrained``; also used as
                the name handed to the ``AbstractTokenizer`` constructor.
        """
        super().__init__(tokenizer_name_or_path)
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
        # token string -> token id, as reported by the wrapped tokenizer.
        self.encoder = self.tokenizer.get_vocab()
        # token id -> token string (inverse of ``self.encoder``).
        self.decoder = {
            token_id: token for token, token_id in self.encoder.items()
        }

    @property
    def vocab_size(self):
        """Number of tokens in the vocabulary, including added tokens."""
        # ``tokenizer.vocab_size`` only counts the base vocabulary, while
        # ``get_vocab()``/``encode()`` may yield ids of added tokens beyond
        # it; ``len(tokenizer)`` covers the full id range.
        return len(self.tokenizer)

    @property
    def vocab(self):
        """Mapping from token string to token id."""
        # Use the map cached in ``__init__``: the wrapped tokenizer is not
        # guaranteed to expose an ``encoder`` attribute of its own.
        return self.encoder

    @property
    def inv_vocab(self):
        """Mapping from token id to token string."""
        # Same reasoning as ``vocab``: the wrapped tokenizer is not
        # guaranteed to expose a ``decoder`` attribute.
        return self.decoder

    def tokenize(self, text):
        """Return the token ids produced by the wrapped tokenizer's ``encode``."""
        return self.tokenizer.encode(text)

    def detokenize(self, token_ids):
        """Return the text produced by the wrapped tokenizer's ``decode``."""
        return self.tokenizer.decode(token_ids)

    @property
    def eod(self):
        """End-of-document id, taken from the tokenizer's ``eos_token_id``.

        NOTE(review): presumably ``None`` when the pretrained tokenizer
        defines no EOS token -- confirm callers handle that case.
        """
        return self.tokenizer.eos_token_id
tokenizer.") group = parser.add_argument_group(title='output data') group.add_argument('--output-prefix', type=str, required=True,