Hello,
I have been training BERT models with DeepSpeed, using code adapted from the Bing BERT example.
I train with --deepspeed_transformer_kernel=True, and after training I load the same model and run prediction.
I am facing 4 issues when running prediction with the trained model:
Some minor changes were made to the model and its forward function:
def forward(self, input_ids, valid_length=None, masked_lm_labels=None, embedding_mode=True):
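As I use it, valid_length gives the number of real (non-padding) tokens per example and is presumably mapped to an attention mask internally. A minimal sketch of that mapping, with a hypothetical helper name (this is not the actual modified code):

import torch

def lengths_to_attention_mask(valid_length, max_seq_len):
    # valid_length: LongTensor of shape [batch]; output is 1 for real tokens, 0 for padding.
    positions = torch.arange(max_seq_len, device=valid_length.device)
    return (positions.unsqueeze(0) < valid_length.unsqueeze(1)).long()  # [batch, max_seq_len]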
- When I load a DeepSpeed model trained without the kernel into a model with the transformer kernel, the embeddings produced by the two models are very different. I verified that the loaded weights match exactly by applying the conversion in both directions (see the verification sketch after this list).
- With the transformer kernel, two inputs that contain the same valid tokens but are padded to different lengths produce different embeddings: with input_ids1.shape[1] = 16, input_ids2.shape[1] = 128, valid_length = [10], and input_ids1[:, :10] == input_ids2[:, :10], the results can differ as early as the first decimal place.
- For some awkward values of input_ids.shape[1], such as 9, the generated embeddings are all NaNs.
- When input_ids.shape[1] < 5, the program crashes with a CUDA error (most of the time no error message is even displayed).
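For reference, the weight check mentioned in the first issue boils down to a key-by-key comparison of the two state dicts. A minimal sketch, assuming the parameter names already line up (the real conversion additionally remaps names between the kernel and non-kernel layouts):

import torch

def state_dicts_match(model_a, model_b, atol=0.0):
    # Compare every parameter tensor by name; assumes both models use identical key names.
    sd_a, sd_b = model_a.state_dict(), model_b.state_dict()
    if sd_a.keys() != sd_b.keys():
        print("parameter names differ")
        return False
    for key in sd_a:
        # Cast to float32 so fp16 and fp32 checkpoints can be compared directly.
        if not torch.allclose(sd_a[key].float(), sd_b[key].float(), atol=atol):
            print(f"mismatch in {key}")
            return False
    return True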
NOTE:
These issues do not exist if I use a model without the DeepSpeed transformer kernel.
Please let me know whether this behavior is expected; if so, why, and if not, how it could be fixed. Thanks!
Below is a code snippet (not complete) and its output:
import os
import sys
import time

import numpy as np
import torch


def main():
    start = time.time()
    args = construct_arguments()

    # INITIALIZE MODEL WITH DEEPSPEED TRANSFORMER KERNEL
    model_with_ds_tr_kernel, _ = prepare_model_optimizer(args)

    args.saved_embedding_path = os.path.join(args.output_dir, "embeddings",
                                             args.embedding_type)
    os.makedirs(args.saved_embedding_path, exist_ok=True)
    args.logger.info(f"Created directory to save embeddings: {args.saved_embedding_path}")

    if None in [args.load_training_checkpoint, args.load_checkpoint_id]:
        args.logger.info("load_training_checkpoint and load_checkpoint_id are missing.")
        sys.exit()

    # LOAD A SAVED MODEL
    global_step, global_data_samples, last_global_step_from_restore = \
        load_checkpoint(args, model_with_ds_tr_kernel)

    # LOAD THE SAME MODEL BUT WITHOUT THE TRANSFORMER KERNEL
    from m5_transformers.models.bert import BertForPreTrainingPreLN
    model_no_ds_tr_kernel = BertForPreTrainingPreLN.from_pretrained(
        "../conversion/dsmNoTK_500M_1024_spm")
    model_no_ds_tr_kernel.to("cuda")
    model_no_ds_tr_kernel.eval()
    model_with_ds_tr_kernel.eval()

    print("-----------------------------------ISSUE 1-------------------------------------------------")
    print("EMBEDDINGS DIFFERENT FOR MODEL WITH AND WITHOUT TRANSFORMER KERNEL")
    seq_len = 7
    ipid1 = torch.arange(512).view([1, 512])
    ipid1 += 1000
    ipid1[:, seq_len:] = 3  # everything past the valid prefix is padding (token id 3)
    ipid1 = ipid1.to("cuda")
    valid_len = torch.tensor([seq_len], dtype=torch.long).to("cuda")
    with torch.no_grad():
        s1, p1 = model_with_ds_tr_kernel.network(ipid1, valid_len, embedding_mode=True)
        s2, p2 = model_no_ds_tr_kernel(ipid1, valid_len, embedding_mode=True)
    s1 = s1.cpu().numpy()[:, :seq_len]
    s2 = s2.cpu().numpy()[:, :seq_len]
    print("EMBEDDINGS GENERATED SAME: {}".format(np.allclose(s1, s2, atol=1e-4)))
    print("EMBEDDINGS ABSOLUTE MEAN ERROR: {}".format(np.abs(s1 - s2).mean()))

    print("-----------------------------------ISSUE 2-------------------------------------------------")
    print("EMBEDDINGS FOR MODEL WITH TRANSFORMER KERNEL: 2 INPUTS DIFFERENT SIZE BATCH BUT SAME DATA")
    ipid2 = torch.arange(16).view([1, 16])
    ipid2 += 1000
    ipid2[:, seq_len:] = 3
    ipid2 = ipid2.to("cuda")
    valid_len = torch.tensor([seq_len], dtype=torch.long).to("cuda")
    with torch.no_grad():
        s1, p1 = model_with_ds_tr_kernel.network(ipid1, valid_len, embedding_mode=True)
        s2, p2 = model_with_ds_tr_kernel.network(ipid2, valid_len, embedding_mode=True)
    s1 = s1.cpu().numpy()[:, :seq_len]
    s2 = s2.cpu().numpy()[:, :seq_len]
    print("EMBEDDINGS GENERATED SAME: {}".format(np.allclose(s1, s2, atol=1e-4)))
    print("EMBEDDINGS ABSOLUTE MEAN ERROR: {}".format(np.abs(s1 - s2).mean()))

    print("-----------------------------------ISSUE 3-------------------------------------------------")
    print("EMBEDDINGS FOR MODEL WITH TRANSFORMER KERNEL: INPUTS WITH ODD BATCH LENGTH")
    ipid1 = torch.arange(9).view([1, 9])
    ipid1 += 1000
    ipid1[:, seq_len:] = 3
    ipid1 = ipid1.to("cuda")
    valid_len = torch.tensor([seq_len], dtype=torch.long).to("cuda")
    with torch.no_grad():
        s1, p1 = model_with_ds_tr_kernel.network(ipid1, valid_len, embedding_mode=True)
    s1 = s1.cpu().numpy()[:, :seq_len]
    print("EMBEDDINGS GENERATED : {}".format(s1))

    print("-----------------------------------ISSUE 4-------------------------------------------------")
    print("EMBEDDINGS FOR MODEL WITH TRANSFORMER KERNEL: INPUTS WITH VERY SMALL BATCH LENGTH")
    seq_len = 2
    ipid1 = torch.arange(3).view([1, 3])
    ipid1 += 1000
    ipid1[:, seq_len:] = 3
    ipid1 = ipid1.to("cuda")
    valid_len = torch.tensor([seq_len], dtype=torch.long).to("cuda")
    print("SYSTEM HAS CRASHED IF NOTHING IS PRINTED AFTER THIS...")
    with torch.no_grad():
        s1, p1 = model_with_ds_tr_kernel.network(ipid1, valid_len, embedding_mode=True)
    s1 = s1.cpu().numpy()[:, :seq_len]
    print("EMBEDDINGS GENERATED : {}".format(s1))

    elapsed = time.time() - start
    args.logger.info(f"Elapsed time: {elapsed} seconds")


if __name__ == "__main__":
    main()
OUTPUT:
-----------------------------------ISSUE 1-------------------------------------------------
EMBEDDINGS DIFFERENT FOR MODEL WITH AND WITHOUT TRANSFORMER KERNEL
EMBEDDINGS GENERATED SAME: False
EMBEDDINGS ABSOLUTE MEAN ERROR: 0.9891793131828308
-----------------------------------ISSUE 2-------------------------------------------------
EMBEDDINGS FOR MODEL WITH TRANSFORMER KERNEL: 2 INPUTS DIFFERENT SIZE BATCH BUT SAME DATA
EMBEDDINGS GENERATED SAME: False
EMBEDDINGS ABSOLUTE MEAN ERROR: 0.010894775390625
-----------------------------------ISSUE 3-------------------------------------------------
EMBEDDINGS FOR MODEL WITH TRANSFORMER KERNEL: INPUTS WITH ODD BATCH LENGTH
EMBEDDINGS GENERATED : [[[nan nan nan ... nan nan nan]
[nan nan nan ... nan nan nan]
[nan nan nan ... nan nan nan]
...
[nan nan nan ... nan nan nan]
[nan nan nan ... nan nan nan]
[nan nan nan ... nan nan nan]]]
-----------------------------------ISSUE 4-------------------------------------------------
EMBEDDINGS FOR MODEL WITH TRANSFORMER KERNEL: INPUTS WITH VERY SMALL BATCH LENGTH
SYSTEM HAS CRASHED IF NOTHING IS PRINTED AFTER THIS...