Can you share your experience as well? Below is the code that you can copy, paste, and run as-is. I simply changed HIDDEN_DIM to play with the number of parameters and then ran either
{
"train_batch_size": 128,
"steps_per_print": 2000,
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.001,
"betas": [
0.8,
0.999
],
"eps": 1e-8,
"weight_decay": 3e-7
}
},
"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": 0,
"warmup_max_lr": 0.001,
"warmup_num_steps": 1000
}
},
"wall_clock_breakdown": false
}
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data.dataset import Dataset
import deepspeed as ds
import argparse
import datetime
import torch.optim as optim
def add_argument():
parser = argparse.ArgumentParser(description='MyCustom')
parser.add_argument('--with_cuda', default=False, action='store_true',
help='use CPU in case there\'s no GPU support')
parser.add_argument('--use_ema', default=False, action='store_true',
help='whether use exponential moving average')
parser.add_argument('-b', '--batch_size', default=32, type=int,
help='mini-batch size (default: 32)')
parser.add_argument('-e', '--epochs', default=30, type=int,
help='number of total epochs (default: 30)')
parser.add_argument('--local_rank', type=int, default=-1,
help='local rank passed from distributed launcher')
parser = ds.add_config_arguments(parser)
args = parser.parse_args()
return args
def count_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)
class RNN(nn.Module):
def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
bidirectional, dropout, pad_idx):
super().__init__()
self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
self.rnn = nn.LSTM(embedding_dim,
hidden_dim,
num_layers=n_layers,
bidirectional=bidirectional,
dropout=dropout)
self.fc = nn.Linear(hidden_dim * 2, output_dim)
self.dropout = nn.Dropout(dropout)
def forward(self, text, text_lengths):
embedded = self.dropout(self.embedding(text))
packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
packed_output, (hidden, cell) = self.rnn(packed_embedded)
hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
return self.fc(hidden)
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
def generate_data():
d, l = [], []
for r in range(1000):
v = np.random.choice(25000, 512)
d.append(v)
l.append(float(np.random.randint(0, 2)))
return np.array(d), np.array(l)
class MyCustomDataset(Dataset):
def __init__(self):
self.data, self.label = generate_data()
def __getitem__(self, index):
return (self.data[index], self.label[index])
def __len__(self):
return self.data.shape[0]
BATCH_SIZE = 64
INPUT_DIM = 25000
EMBEDDING_DIM = 100
HIDDEN_DIM = 1350
OUTPUT_DIM = 1
N_LAYERS = 3
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = 1
train_data = MyCustomDataset()
train_loader = torch.utils.data.DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
model = RNN(INPUT_DIM,
EMBEDDING_DIM,
HIDDEN_DIM,
OUTPUT_DIM,
N_LAYERS,
BIDIRECTIONAL,
DROPOUT,
PAD_IDX)
print(f'The model has \x1b[6;30;42m {count_parameters(model):,} \x1b[0m trainable parameters')
criterion = nn.BCEWithLogitsLoss()
args = add_argument()
if args.deepspeed:
parameters = filter(lambda p: p.requires_grad, model.parameters())
model_engine, optimizer, train_loader, _ = ds.initialize(args=args, model=model, model_parameters=parameters, training_data=train_data)
for epoch in range(1):
running_loss = 0.0
start = datetime.datetime.now()
for i, data in enumerate(train_loader):
inputs, labels = data[0].to(model_engine.local_rank).T, data[1].to(model_engine.local_rank)
text_lengths = torch.ones(inputs.shape[1])
text_lengths = text_lengths.fill_(512)
text_lengths = text_lengths.to(model_engine.local_rank)
outputs = model_engine(inputs, text_lengths).squeeze(1)
loss = criterion(outputs, labels)
model_engine.backward(loss)
model_engine.step()
running_loss += loss.item()
end = datetime.datetime.now() - start
print(f'Single epoch time {end}')
else:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_engine = model.to(device)
optimizer = optim.Adam(model_engine.parameters())
for epoch in range(1):
running_loss = 0.0
start = datetime.datetime.now()
for i, data in enumerate(train_loader):
optimizer.zero_grad()
inputs, labels = data[0].to(device).T, data[1].to(device)
text_lengths = torch.ones(inputs.shape[1])
text_lengths = text_lengths.fill_(512)
text_lengths = text_lengths.to(device)
outputs = model_engine(inputs, text_lengths).squeeze(1)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
end = datetime.datetime.now() - start
print(f'Single epoch time {end}')
I have been doing some tests on GTX 2080Ti with a simple LSTM network
so, roughly 60M parameters are fixed on each device and the positive delta is 20M...
Can you share your experience as well? Below is the code that you can copy, paste, and run as-is. I simply changed HIDDEN_DIM to play with the number of parameters and then ran either
`python experiment.py` or
`deepspeed experiment.py --deepspeed --deepspeed_config ds_config.json`

File - ds_config.json
File - experiment.py