From aa2901bbde1769941b9b904c19fca0c261b65ee8 Mon Sep 17 00:00:00 2001
From: Yi Dong
Date: Thu, 10 Nov 2022 20:57:58 +0000
Subject: [PATCH 1/2] Fix

Signed-off-by: MaximumEntropy
---
 .../language_modeling/megatron_gpt_eval.py    | 16 +++++++--
 .../modules/common/text_generation_utils.py   | 33 +++++++++++++++----
 2 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/examples/nlp/language_modeling/megatron_gpt_eval.py b/examples/nlp/language_modeling/megatron_gpt_eval.py
index 780def36f5d2..fb30622f7219 100644
--- a/examples/nlp/language_modeling/megatron_gpt_eval.py
+++ b/examples/nlp/language_modeling/megatron_gpt_eval.py
@@ -24,7 +24,7 @@
 from nemo.collections.nlp.modules.common.text_generation_server import MegatronServer
 from nemo.collections.nlp.modules.common.text_generation_utils import generate
 from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam
-from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy
+from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector
 from nemo.core.config import hydra_runner
 from nemo.utils.app_state import AppState
 from nemo.utils.model_utils import inject_model_parallel_rank
@@ -160,8 +160,15 @@ def main(cfg) -> None:
     ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size"
 
     if cfg.gpt_model_file:
+        save_restore_connector = NLPSaveRestoreConnector()
+        if os.path.isdir(cfg.gpt_model_file):
+            save_restore_connector.model_extracted_dir = cfg.gpt_model_file
+
         pretrained_cfg = MegatronGPTModel.restore_from(
-            restore_path=cfg.gpt_model_file, trainer=trainer, return_config=True
+            restore_path=cfg.gpt_model_file,
+            trainer=trainer,
+            return_config=True,
+            save_restore_connector=save_restore_connector,
         )
         OmegaConf.set_struct(pretrained_cfg, True)
         with open_dict(pretrained_cfg):
@@ -169,7 +176,10 @@
             pretrained_cfg.activations_checkpoint_granularity = None
             pretrained_cfg.activations_checkpoint_method = None
         model = MegatronGPTModel.restore_from(
-            restore_path=cfg.gpt_model_file, trainer=trainer, override_config_path=pretrained_cfg
+            restore_path=cfg.gpt_model_file,
+            trainer=trainer,
+            override_config_path=pretrained_cfg,
+            save_restore_connector=save_restore_connector,
         )
     elif cfg.checkpoint_dir:
         app_state = AppState()
diff --git a/nemo/collections/nlp/modules/common/text_generation_utils.py b/nemo/collections/nlp/modules/common/text_generation_utils.py
index dbacc90eeda4..889564e38e33 100644
--- a/nemo/collections/nlp/modules/common/text_generation_utils.py
+++ b/nemo/collections/nlp/modules/common/text_generation_utils.py
@@ -14,6 +14,8 @@
 
 """Utilities for generating text."""
 
+from collections.abc import Iterable
+
 import torch
 import torch.nn.functional as F
 
@@ -454,6 +456,21 @@
         repetition_penalty=repetition_penalty,
         min_tokens_to_generate=min_tokens_to_generate,
     )
+    special_tokens = set()
+    if hasattr(tokenizer, 'pad_token') and tokenizer.pad_token is not None:
+        special_tokens.add(tokenizer.pad_token)
+    if hasattr(tokenizer, 'eos_token') and tokenizer.eos_token is not None:
+        special_tokens.add(tokenizer.eos_token)
+    if hasattr(tokenizer, 'bos_token') and tokenizer.bos_token is not None:
+        special_tokens.add(tokenizer.bos_token)
+    if hasattr(tokenizer, 'cls_token') and tokenizer.cls_token is not None:
+        special_tokens.add(tokenizer.cls_token)
+    if hasattr(tokenizer, 'unk_token') and tokenizer.unk_token is not None:
+        special_tokens.add(tokenizer.unk_token)
+    if hasattr(tokenizer, 'sep_token') and tokenizer.sep_token is not None:
+        special_tokens.add(tokenizer.sep_token)
+    if hasattr(tokenizer, 'mask_token') and tokenizer.mask_token is not None:
+        special_tokens.add(tokenizer.mask_token)
     if output is not None:
         decode_tokens, output_logits, full_logits = output
         resp_sentences = []
@@ -466,13 +483,15 @@
             if not isinstance(tokenizer, TabularTokenizer):
                 words = []
                 for token in decode_token:
-                    # Skip any soft prompt pseudo tokens
-                    if token not in tokenizer.tokenizer.decoder:
-                        continue
-                    word = tokenizer.tokenizer.decoder[token]
-                    word = bytearray([tokenizer.tokenizer.byte_decoder[c] for c in word]).decode(
-                        'utf-8', errors='replace'
-                    )
+                    if not isinstance(token, Iterable):
+                        token = [token]
+                    word = tokenizer.ids_to_tokens(token)
+                    if isinstance(word, Iterable):
+                        word = word[0]
+                    if hasattr(tokenizer.tokenizer, 'byte_decoder'):
+                        word = bytearray([tokenizer.tokenizer.byte_decoder[c] for c in word]).decode(
+                            'utf-8', errors='replace'
+                        )
                     words.append(word)
                 resp_sentences_seg.append(words)
             else:

From 804f3abc96b5e0cd2254b56cf5a848a7cbee0cc1 Mon Sep 17 00:00:00 2001
From: MaximumEntropy
Date: Tue, 15 Nov 2022 10:01:24 -0800
Subject: [PATCH 2/2] Fix

Signed-off-by: MaximumEntropy
---
 .../collections/nlp/modules/common/text_generation_utils.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/nemo/collections/nlp/modules/common/text_generation_utils.py b/nemo/collections/nlp/modules/common/text_generation_utils.py
index 889564e38e33..aeeedd24148f 100644
--- a/nemo/collections/nlp/modules/common/text_generation_utils.py
+++ b/nemo/collections/nlp/modules/common/text_generation_utils.py
@@ -497,13 +497,17 @@ def generate(
             else:
                 words = tokenizer.text_to_tokens(sentence)
                 resp_sentences_seg.append(words)
 
+        # offsets calculation
         all_offsets = []
         for item in resp_sentences_seg:
             offsets = [0]
             for index, token in enumerate(item):
                 if index != len(item) - 1:
-                    offsets.append(len(token) + offsets[-1])
+                    if token in special_tokens:
+                        offsets.append(offsets[-1])
+                    else:
+                        offsets.append(len(token) + offsets[-1])
             all_offsets.append(offsets)
 
         output = {}
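For context on the second commit: the offsets list maps each detokenized token to its starting character position in the response string, and special tokens (pad/eos/bos/etc.) are treated as zero-width so they do not shift the offsets of later tokens. A minimal standalone sketch of that bookkeeping follows; the names here are illustrative only (compute_offsets is not a NeMo function), not part of the patch itself.

from typing import List, Set


def compute_offsets(tokens: List[str], special_tokens: Set[str]) -> List[int]:
    """Starting character offset of each token in the detokenized string.

    Special tokens are assumed to contribute no visible characters, so they
    keep the previous offset instead of advancing it.
    """
    offsets = [0]
    for index, token in enumerate(tokens):
        if index != len(tokens) - 1:
            if token in special_tokens:
                offsets.append(offsets[-1])
            else:
                offsets.append(len(token) + offsets[-1])
    return offsets


# Example: '<pad>' is zero-width, so the final token starts at the same offset.
print(compute_offsets(['Hello', ' world', '<pad>', '!'], {'<pad>'}))  # [0, 5, 11, 11]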