From 55a153408687d2d8fce64d1d70389cc4eb03846a Mon Sep 17 00:00:00 2001
From: M Saiful Bari
Date: Mon, 12 Jul 2021 20:48:24 +0000
Subject: [PATCH 1/4] hf tokenizer pipeline

---
 megatron/tokenizer/tokenizer.py | 44 +++++++++++++++++++++++++++++++--
 tools/preprocess_data.py        |  7 ++++--
 2 files changed, 47 insertions(+), 4 deletions(-)

diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py
index 13085a81c..28a484256 100644
--- a/megatron/tokenizer/tokenizer.py
+++ b/megatron/tokenizer/tokenizer.py
@@ -20,7 +20,7 @@
 
 from .bert_tokenization import FullTokenizer as FullBertTokenizer
 from .gpt2_tokenization import GPT2Tokenizer
-
+from transformers import AutoTokenizer
 
 def build_tokenizer(args):
     """Initialize tokenizer."""
@@ -29,7 +29,7 @@ def build_tokenizer(args):
               flush=True)
 
     # Select and instantiate the tokenizer.
-    assert args.vocab_file is not None
+    assert args.vocab_file is not None or args.tokenizer_type == "PretrainedFromHf"
     if args.tokenizer_type == 'BertWordPieceLowerCase':
         tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file,
                                             lower_case=True,
@@ -41,6 +41,13 @@ def build_tokenizer(args):
     elif args.tokenizer_type == 'GPT2BPETokenizer':
         assert args.merge_file is not None
         tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
+    elif args.tokenizer_type == "PretrainedFromHf":
+        assert args.tokenizer_name_or_path is not None
+        print(
+            " vocab file is un-used. loading tokenizer from pre-trained model",
+            flush=True,
+        )
+        tokenizer = _AutoTokenizer(args.tokenizer_name_or_path)
     else:
         raise NotImplementedError('{} tokenizer is not '
                                   'implemented.'.format(args.tokenizer_type))
@@ -289,3 +296,36 @@ def detokenize(self, token_ids):
     @property
     def eod(self):
         return self.eod_id
+
+
+class _AutoTokenizer(AbstractTokenizer):
+    """AutoTokenizer for Hf Pretrained model loading."""
+
+    def __init__(self, tokenizer_name_or_path):
+        name = tokenizer_name_or_path
+        super().__init__(name)
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
+        self.encoder = self.tokenizer.get_vocab()
+        self.decoder = {v: k for k, v in self.encoder.items()}
+
+    @property
+    def vocab_size(self):
+        return self.tokenizer.vocab_size
+
+    @property
+    def vocab(self):
+        return self.tokenizer.encoder
+
+    @property
+    def inv_vocab(self):
+        return self.tokenizer.decoder
+
+    def tokenize(self, text):
+        return self.tokenizer.encode(text)
+
+    def detokenize(self, token_ids):
+        return self.tokenizer.decode(token_ids)
+
+    @property
+    def eod(self):
+        return self.tokenizer.eos_token_id
\ No newline at end of file
diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py
index c9ecb6af2..7ad743654 100644
--- a/tools/preprocess_data.py
+++ b/tools/preprocess_data.py
@@ -105,7 +105,7 @@ def get_args():
     group = parser.add_argument_group(title='tokenizer')
     group.add_argument('--tokenizer-type', type=str, required=True,
                        choices=['BertWordPieceLowerCase','BertWordPieceCase',
-                                'GPT2BPETokenizer'],
+                                'GPT2BPETokenizer', 'PretrainedFromHf'],
                        help='What type of tokenizer to use.')
     group.add_argument('--vocab-file', type=str, default=None,
                        help='Path to the vocab file')
@@ -113,7 +113,10 @@ def get_args():
                        help='Path to the BPE merge file (if necessary).')
     group.add_argument('--append-eod', action='store_true',
                        help='Append an <eod> token to the end of a document.')
-
+    group.add_argument("--tokenizer-name-or-path", type=str, default=None,
+                       help="Name or path of the huggingface tokenizer.")
+    group.add_argument("--tokenizer-extra-id", type=int, default=None,
+                       help="Extra ids for T5 Tokenizer.")
 
     group = parser.add_argument_group(title='output data')
     group.add_argument('--output-prefix', type=str, required=True,

From 0421d65d272c5a7d40a25937bfd4f3374be3bb31 Mon Sep 17 00:00:00 2001
From: M Saiful Bari
Date: Mon, 12 Jul 2021 20:56:09 +0000
Subject: [PATCH 2/4] unused arg removed

---
 tools/preprocess_data.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py
index 7ad743654..a61a98988 100644
--- a/tools/preprocess_data.py
+++ b/tools/preprocess_data.py
@@ -115,8 +115,6 @@ def get_args():
                        help='Append an <eod> token to the end of a document.')
     group.add_argument("--tokenizer-name-or-path", type=str, default=None,
                        help="Name or path of the huggingface tokenizer.")
-    group.add_argument("--tokenizer-extra-id", type=int, default=None,
-                       help="Extra ids for T5 Tokenizer.")
 
     group = parser.add_argument_group(title='output data')
     group.add_argument('--output-prefix', type=str, required=True,

From 605a7e6333a31c3fad9bb7a01a32c32175ca4807 Mon Sep 17 00:00:00 2001
From: Teven
Date: Fri, 16 Jul 2021 14:38:24 +0200
Subject: [PATCH 3/4] Changing Hf to HF

---
 megatron/tokenizer/tokenizer.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py
index 28a484256..c6551cdfb 100644
--- a/megatron/tokenizer/tokenizer.py
+++ b/megatron/tokenizer/tokenizer.py
@@ -29,7 +29,7 @@ def build_tokenizer(args):
               flush=True)
 
     # Select and instantiate the tokenizer.
-    assert args.vocab_file is not None or args.tokenizer_type == "PretrainedFromHf"
+    assert args.vocab_file is not None or args.tokenizer_type == "PretrainedFromHF"
     if args.tokenizer_type == 'BertWordPieceLowerCase':
         tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file,
                                             lower_case=True,
@@ -41,7 +41,7 @@ def build_tokenizer(args):
     elif args.tokenizer_type == 'GPT2BPETokenizer':
         assert args.merge_file is not None
         tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
-    elif args.tokenizer_type == "PretrainedFromHf":
+    elif args.tokenizer_type == "PretrainedFromHF":
         assert args.tokenizer_name_or_path is not None
         print(
             " vocab file is un-used. loading tokenizer from pre-trained model",
             flush=True,
         )
@@ -328,4 +328,4 @@
 
     @property
     def eod(self):
-        return self.tokenizer.eos_token_id
\ No newline at end of file
+        return self.tokenizer.eos_token_id

From d887d203fbf1f4909c660f733d9c293ed887a8ca Mon Sep 17 00:00:00 2001
From: Teven
Date: Fri, 16 Jul 2021 14:39:07 +0200
Subject: [PATCH 4/4] Changing Hf to HF

---
 tools/preprocess_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py
index a61a98988..da05b4e4d 100644
--- a/tools/preprocess_data.py
+++ b/tools/preprocess_data.py
@@ -105,7 +105,7 @@ def get_args():
     group = parser.add_argument_group(title='tokenizer')
    group.add_argument('--tokenizer-type', type=str, required=True,
                       choices=['BertWordPieceLowerCase','BertWordPieceCase',
-                                'GPT2BPETokenizer', 'PretrainedFromHf'],
+                                'GPT2BPETokenizer', 'PretrainedFromHF'],
                        help='What type of tokenizer to use.')
     group.add_argument('--vocab-file', type=str, default=None,
                        help='Path to the vocab file')