From a41e0ebbfda5361c2fa7637434dc383b26fb686c Mon Sep 17 00:00:00 2001 From: TevenLeScao Date: Mon, 19 Jul 2021 17:39:04 +0200 Subject: [PATCH] HF tokenizers (on the training side) --- megatron/arguments.py | 5 ++++- megatron/global_vars.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 0b21ffbdb..0c2811117 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -665,8 +665,11 @@ def _add_data_args(parser): default=None, choices=['BertWordPieceLowerCase', 'BertWordPieceCase', - 'GPT2BPETokenizer'], + 'GPT2BPETokenizer', + 'PretrainedFromHF'], help='What type of tokenizer to use.') + group.add_argument("--tokenizer-name-or-path", type=str, default=None, + help="Name or path of the huggingface tokenizer.") group.add_argument('--data-impl', type=str, default='infer', choices=['lazy', 'cached', 'mmap', 'infer'], help='Implementation of indexed datasets.') diff --git a/megatron/global_vars.py b/megatron/global_vars.py index c486f0d6b..de0c12794 100644 --- a/megatron/global_vars.py +++ b/megatron/global_vars.py @@ -83,7 +83,7 @@ def set_global_variables(extra_args_provider=None, args_defaults={}, defaults=args_defaults, ignore_unknown_args=ignore_unknown_args) _build_num_microbatches_calculator(args) - if args.vocab_file: + if args.vocab_file or args.tokenizer_name_or_path: _ = _build_tokenizer(args) _set_tensorboard_writer(args) _set_adlr_autoresume(args)