
GPU experiments - e.g. RTX 2080 Ti #155

@tomekrut

Description

I have been running some tests on an RTX 2080 Ti with a simple LSTM network:

  • 1x GPU - 84M parameters is the maximum I could fit without DeepSpeed
  • 2x GPUs with DeepSpeed - 105M parameters, i.e. only about 20M more
  • 4x GPUs with DeepSpeed - 135M parameters

So roughly 60M parameters appear to be fixed on each device, and every additional GPU only buys about 20M more (a quick fit of that linear model is sketched below).
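
To make that arithmetic explicit, here is a quick least-squares fit of max_params ≈ fixed + delta * n_gpus over the three runs above (just my own sanity-check sketch; the numbers come out in the same ballpark):

import numpy as np

# (n_gpus, max trainable parameters in millions) from the runs above
n_gpus = np.array([1.0, 2.0, 4.0])
params_m = np.array([84.0, 105.0, 135.0])

# solve params_m ~= fixed + delta * n_gpus in the least-squares sense
A = np.stack([np.ones_like(n_gpus), n_gpus], axis=1)
(fixed, delta), *_ = np.linalg.lstsq(A, params_m, rcond=None)
print(f'fixed ~ {fixed:.0f}M, delta ~ {delta:.0f}M per extra GPU')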

Can you share your experience as well? Below is the code, which you can copy-paste and run as-is. I simply changed HIDDEN_DIM to play with the number of parameters and then ran either

python experiment.py
or

deepspeed experiment.py --deepspeed --deepspeed_config ds_config.json
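
The number of GPUs for the 2x and 4x runs can be pinned with the DeepSpeed launcher's --num_gpus flag, e.g.

deepspeed --num_gpus 2 experiment.py --deepspeed --deepspeed_config ds_config.json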

File - ds_config.json

{
  "train_batch_size": 128,
  "steps_per_print": 2000,
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 0.001,
      "betas": [
        0.8,
        0.999
      ],
      "eps": 1e-8,
      "weight_decay": 3e-7
    }
  },
  "scheduler": {
    "type": "WarmupLR",
    "params": {
      "warmup_min_lr": 0,
      "warmup_max_lr": 0.001,
      "warmup_num_steps": 1000
    }
  },
  "wall_clock_breakdown": false
}
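
For what it's worth, the config above does not set a zero_optimization section, so ZeRO optimizer-state/gradient partitioning is not enabled. A minimal variant to test that (assuming a DeepSpeed version with ZeRO stage 2 support) would add:

  "zero_optimization": {
    "stage": 2
  }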

File - experiment.py


import numpy as np
import torch
import torch.nn as nn
from torch.utils.data.dataset import Dataset
import deepspeed as ds
import argparse
import datetime
import torch.optim as optim

def add_argument():
    parser = argparse.ArgumentParser(description='MyCustom')
    parser.add_argument('--with_cuda', default=False, action='store_true',
                        help='use CUDA if available')
    parser.add_argument('--use_ema', default=False, action='store_true',
                        help='whether to use exponential moving average')
    parser.add_argument('-b', '--batch_size', default=32, type=int,
                        help='mini-batch size (default: 32)')
    parser.add_argument('-e', '--epochs', default=30, type=int,
                        help='number of total epochs (default: 30)')
    parser.add_argument('--local_rank', type=int, default=-1,
                        help='local rank passed from distributed launcher')
    parser = ds.add_config_arguments(parser)
    args = parser.parse_args()
    return args

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout)
        # hidden_dim * 2 because the final forward and backward hidden states are concatenated
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        embedded = self.dropout(self.embedding(text))
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        # concatenate the final forward (hidden[-2]) and backward (hidden[-1]) hidden states
        hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        return self.fc(hidden)


SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True


def generate_data():
    # 1000 random "sentences" of 512 token ids plus a random binary label each
    d, l = [], []
    for r in range(1000):
        v = np.random.choice(25000, 512)
        d.append(v)
        l.append(float(np.random.randint(0, 2)))
    # float32 labels so they match the model output dtype in BCEWithLogitsLoss
    return np.array(d), np.array(l, dtype=np.float32)


class MyCustomDataset(Dataset):
    def __init__(self):
        self.data, self.label = generate_data()

    def __getitem__(self, index):
        return (self.data[index], self.label[index])

    def __len__(self):
        return self.data.shape[0]


BATCH_SIZE = 64
INPUT_DIM = 25000
EMBEDDING_DIM = 100
HIDDEN_DIM = 1350
OUTPUT_DIM = 1
N_LAYERS = 3
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = 1

train_data = MyCustomDataset()
train_loader = torch.utils.data.DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
model = RNN(INPUT_DIM,
            EMBEDDING_DIM,
            HIDDEN_DIM,
            OUTPUT_DIM,
            N_LAYERS,
            BIDIRECTIONAL,
            DROPOUT,
            PAD_IDX)

print(f'The model has \x1b[6;30;42m {count_parameters(model):,} \x1b[0m trainable parameters')
criterion = nn.BCEWithLogitsLoss()
args = add_argument()

if args.deepspeed:
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    model_engine, optimizer, train_loader, _ = ds.initialize(
        args=args, model=model, model_parameters=parameters, training_data=train_data)

    for epoch in range(1):
        running_loss = 0.0
        start = datetime.datetime.now()
        for i, data in enumerate(train_loader):
            # the LSTM expects (seq_len, batch), hence the transpose
            inputs, labels = data[0].to(model_engine.local_rank).T, data[1].to(model_engine.local_rank)

            # every sequence is the full 512 steps; pack_padded_sequence wants CPU int64 lengths
            text_lengths = torch.full((inputs.shape[1],), 512, dtype=torch.int64)

            outputs = model_engine(inputs, text_lengths).squeeze(1)
            loss = criterion(outputs, labels)

            model_engine.backward(loss)
            model_engine.step()

            running_loss += loss.item()

        elapsed = datetime.datetime.now() - start
        print(f'Single epoch time {elapsed}')

else:
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model_engine = model.to(device)
    optimizer = optim.Adam(model_engine.parameters())
    for epoch in range(1):
        running_loss = 0.0
        start = datetime.datetime.now()
        for i, data in enumerate(train_loader):
            optimizer.zero_grad()

            # the LSTM expects (seq_len, batch), hence the transpose
            inputs, labels = data[0].to(device).T, data[1].to(device)

            # every sequence is the full 512 steps; pack_padded_sequence wants CPU int64 lengths
            text_lengths = torch.full((inputs.shape[1],), 512, dtype=torch.int64)

            outputs = model_engine(inputs, text_lengths).squeeze(1)
            loss = criterion(outputs, labels)

            loss.backward()

            optimizer.step()

        elapsed = datetime.datetime.now() - start
        print(f'Single epoch time {elapsed}')
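
For reference, a back-of-the-envelope breakdown of where the parameter count comes from with the defaults above (bidirectional, 3 layers, HIDDEN_DIM=1350); this is just my own sketch and should agree with count_parameters(model):

EMBEDDING_DIM, HIDDEN_DIM, N_LAYERS, INPUT_DIM, OUTPUT_DIM = 100, 1350, 3, 25000, 1

def lstm_layer_params(input_size, hidden_size):
    # weight_ih, weight_hh, bias_ih, bias_hh for one direction of one layer
    return 4 * hidden_size * (input_size + hidden_size) + 8 * hidden_size

embedding = INPUT_DIM * EMBEDDING_DIM
# layer 1 sees the embedding; deeper layers see the concatenated bidirectional output
lstm = 2 * lstm_layer_params(EMBEDDING_DIM, HIDDEN_DIM) \
     + 2 * (N_LAYERS - 1) * lstm_layer_params(2 * HIDDEN_DIM, HIDDEN_DIM)
fc = 2 * HIDDEN_DIM * OUTPUT_DIM + OUTPUT_DIM

print(f'{embedding + lstm + fc:,}')  # ~105.7M with these defaults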



