Can you share your experience as well? Below is the code that you can copy, paste, and run as-is. I simply changed HIDDEN_DIM to play with the number of parameters and then ran either
{
"train_batch_size": 128,
"steps_per_print": 2000,
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.001,
"betas": [
0.8,
0.999
],
"eps": 1e-8,
"weight_decay": 3e-7
}
},
"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": 0,
"warmup_max_lr": 0.001,
"warmup_num_steps": 1000
}
},
"wall_clock_breakdown": false
}
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data.dataset import Dataset
import deepspeed as ds
import argparse
import datetime
import torch.optim as optim
def add_argument():
parser = argparse.ArgumentParser(description='MyCustom')
parser.add_argument('--with_cuda', default=False, action='store_true',
help='use CPU in case there\'s no GPU support')
parser.add_argument('--use_ema', default=False, action='store_true',
help='whether use exponential moving average')
parser.add_argument('-b', '--batch_size', default=32, type=int,
help='mini-batch size (default: 32)')
parser.add_argument('-e', '--epochs', default=30, type=int,
help='number of total epochs (default: 30)')
parser.add_argument('--local_rank', type=int, default=-1,
help='local rank passed from distributed launcher')
parser = ds.add_config_arguments(parser)
args = parser.parse_args()
return args
def count_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)
class RNN(nn.Module):
def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
bidirectional, dropout, pad_idx):
super().__init__()
self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
self.rnn = nn.LSTM(embedding_dim,
hidden_dim,
num_layers=n_layers,
bidirectional=bidirectional,
dropout=dropout)
self.fc = nn.Linear(hidden_dim * 2, output_dim)
self.dropout = nn.Dropout(dropout)
def forward(self, text, text_lengths):
embedded = self.dropout(self.embedding(text))
packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
packed_output, (hidden, cell) = self.rnn(packed_embedded)
hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
return self.fc(hidden)
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
def generate_data():
d, l = [], []
for r in range(1000):
v = np.random.choice(25000, 512)
d.append(v)
l.append(float(np.random.randint(0, 2)))
return np.array(d), np.array(l)
class MyCustomDataset(Dataset):
def __init__(self):
self.data, self.label = generate_data()
def __getitem__(self, index):
return (self.data[index], self.label[index])
def __len__(self):
return self.data.shape[0]
BATCH_SIZE = 64
INPUT_DIM = 25000
EMBEDDING_DIM = 100
HIDDEN_DIM = 1350
OUTPUT_DIM = 1
N_LAYERS = 3
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = 1
train_data = MyCustomDataset()
train_loader = torch.utils.data.DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
model = RNN(INPUT_DIM,
EMBEDDING_DIM,
HIDDEN_DIM,
OUTPUT_DIM,
N_LAYERS,
BIDIRECTIONAL,
DROPOUT,
PAD_IDX)
print(f'The model has \x1b[6;30;42m {count_parameters(model):,} \x1b[0m trainable parameters')
criterion = nn.BCEWithLogitsLoss()
args = add_argument()
if args.deepspeed:
parameters = filter(lambda p: p.requires_grad, model.parameters())
model_engine, optimizer, train_loader, _ = ds.initialize(args=args, model=model, model_parameters=parameters, training_data=train_data)
for epoch in range(1):
running_loss = 0.0
start = datetime.datetime.now()
for i, data in enumerate(train_loader):
inputs, labels = data[0].to(model_engine.local_rank).T, data[1].to(model_engine.local_rank)
text_lengths = torch.ones(inputs.shape[1])
text_lengths = text_lengths.fill_(512)
text_lengths = text_lengths.to(model_engine.local_rank)
outputs = model_engine(inputs, text_lengths).squeeze(1)
loss = criterion(outputs, labels)
model_engine.backward(loss)
model_engine.step()
running_loss += loss.item()
end = datetime.datetime.now() - start
print(f'Single epoch time {end}')
else:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_engine = model.to(device)
optimizer = optim.Adam(model_engine.parameters())
for epoch in range(1):
running_loss = 0.0
start = datetime.datetime.now()
for i, data in enumerate(train_loader):
optimizer.zero_grad()
inputs, labels = data[0].to(device).T, data[1].to(device)
text_lengths = torch.ones(inputs.shape[1])
text_lengths = text_lengths.fill_(512)
text_lengths = text_lengths.to(device)
outputs = model_engine(inputs, text_lengths).squeeze(1)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
end = datetime.datetime.now() - start
print(f'Single epoch time {end}')
I have been doing some tests on GTX 2080Ti with a simple LSTM network
so, roughly 60M parameters are fixed on each device and the positive delta is 20M...
Can you share your experience as well? Below is the code that you can copy, paste, and run as-is. I simply changed HIDDEN_DIM to play with the number of parameters and then ran either
`python experiment.py` or
`deepspeed experiment.py --deepspeed --deepspeed_config ds_config.json`

File - ds_config.json
File - experiment.py