Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion src/transformers/hf_argparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ def __init__(self, dataclass_types: Union[DataClassType, Iterable[DataClassType]

def _add_dataclass_arguments(self, dtype: DataClassType):
for field in dataclasses.fields(dtype):
if not field.init:
continue
field_name = f"--{field.name}"
kwargs = field.metadata.copy()
# field.metadata is not used at all by Data Classes,
Expand Down Expand Up @@ -142,7 +144,7 @@ def parse_args_into_dataclasses(
namespace, remaining_args = self.parse_known_args(args=args)
outputs = []
for dtype in self.dataclass_types:
keys = {f.name for f in dataclasses.fields(dtype)}
keys = {f.name for f in dataclasses.fields(dtype) if f.init}
inputs = {k: v for k, v in vars(namespace).items() if k in keys}
for k in keys:
delattr(namespace, k)
Expand Down
37 changes: 23 additions & 14 deletions src/transformers/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,15 +219,16 @@ class Trainer:
:class:`~transformers.AdamW` on your model and a scheduler given by
:func:`~transformers.get_linear_schedule_with_warmup` controlled by :obj:`args`.

Important accessors:

``self.model`` - always points to the core model. If using a transformers model, it will be a
:class:`PreTrainedModel` subclass.

``self.model_wrapped`` - always points to the most external model in case one or more other modules wrap the
original model. This is the model that should be used for the forward pass. For example, under ``DeepSpeed``,
the inner model is wrapped in ``DeepSpeed`` and then again in ``DistributedDataParallel``. If the inner model
hasn't been wrapped, then ``self.model_wrapped`` is the same as ``self.model``.
Important attributes:

- **model** -- Always points to the core model. If using a transformers model, it will be a
:class:`~transformers.PreTrainedModel` subclass.
- **model_wrapped** -- Always points to the most external model in case one or more other modules wrap the
original model. This is the model that should be used for the forward pass. For example, under ``DeepSpeed``,
the inner model is wrapped in ``DeepSpeed`` and then again in ``torch.nn.DistributedDataParallel``. If the
inner model hasn't been wrapped, then ``self.model_wrapped`` is the same as ``self.model``.
- **is_model_parallel** -- Whether or not a model has been switched to a model parallel mode (different from
data parallelism, this means some of the model layers are split on different GPUs).
"""

def __init__(
Expand Down Expand Up @@ -267,15 +268,23 @@ def __init__(
)
self.model_init = model_init

if hasattr(model, "is_parallelizable") and model.is_parallelizable and model.model_parallel:
self.is_model_parallel = True
else:
self.is_model_parallel = False
Comment on lines +271 to +274
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cool!


default_collator = default_data_collator if tokenizer is None else DataCollatorWithPadding(tokenizer)
self.data_collator = data_collator if data_collator is not None else default_collator
self.train_dataset = train_dataset
self.eval_dataset = eval_dataset
self.tokenizer = tokenizer

# Model parallel
if not self.args.model_parallel:
if not self.is_model_parallel:
model = model.to(args.device)
else:
# Force n_gpu to 1 to avoid DataParallel.
self.args._n_gpu = 1

# later use `self.model is self.model_wrapped` to check if it's wrapped or not
self.model_wrapped = model
Expand Down Expand Up @@ -669,7 +678,7 @@ def train(self, model_path: Optional[str] = None, trial: Union["optuna.Trial", D
set_seed(self.args.seed)

model = self.call_model_init(trial)
if not self.args.model_parallel:
if not self.is_model_parallel:
model = model.to(self.args.device)

self.model = model
Expand Down Expand Up @@ -719,7 +728,7 @@ def train(self, model_path: Optional[str] = None, trial: Union["optuna.Trial", D
model, self.optimizer = amp.initialize(model, self.optimizer, opt_level=self.args.fp16_opt_level)

# Multi-gpu training (should be after apex fp16 initialization)
if self.args.n_gpu > 1 and not self.args.model_parallel:
if self.args.n_gpu > 1:
model = torch.nn.DataParallel(model)

# Distributed training (should be after apex fp16 initialization)
Expand Down Expand Up @@ -930,7 +939,7 @@ def train(self, model_path: Optional[str] = None, trial: Union["optuna.Trial", D
)
if isinstance(self.model, PreTrainedModel):
self.model = self.model.from_pretrained(self.state.best_model_checkpoint)
if not self.args.model_parallel:
if not self.is_model_parallel:
self.model = self.model.to(self.args.device)
else:
state_dict = torch.load(os.path.join(self.state.best_model_checkpoint, WEIGHTS_NAME))
Expand Down Expand Up @@ -1481,7 +1490,7 @@ def prediction_loop(

model = self.model
# multi-gpu eval
if self.args.n_gpu > 1 and not self.args.model_parallel:
if self.args.n_gpu > 1:
model = torch.nn.DataParallel(model)
# Note: in torch.distributed mode, there's no point in wrapping the model
# inside a DistributedDataParallel as we'll be under `no_grad` anyways.
Expand Down
26 changes: 5 additions & 21 deletions src/transformers/training_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,9 +210,6 @@ class TrainingArguments:
- :obj:`True` if :obj:`metric_for_best_model` is set to a value that isn't :obj:`"loss"` or
:obj:`"eval_loss"`.
- :obj:`False` if :obj:`metric_for_best_model` is not set, or set to :obj:`"loss"` or :obj:`"eval_loss"`.
model_parallel (:obj:`bool`, `optional`, defaults to :obj:`False`):
If the model supports model parallelism and there is more than one device, whether to use model parallelism
to distribute the model's modules across devices or not.
ignore_skip_data (:obj:`bool`, `optional`, defaults to :obj:`False`):
When resuming training, whether or not to skip the epochs and batches to get the data loading at the same
stage as in the previous training. If set to :obj:`True`, the training will begin faster (as that skipping
Expand Down Expand Up @@ -245,15 +242,6 @@ class TrainingArguments:
do_train: bool = field(default=False, metadata={"help": "Whether to run training."})
do_eval: bool = field(default=None, metadata={"help": "Whether to run eval on the dev set."})
do_predict: bool = field(default=False, metadata={"help": "Whether to run predictions on the test set."})
model_parallel: bool = field(
default=False,
metadata={
"help": (
"If there is more than one device, whether to use model parallelism to distribute the "
"model's modules across devices."
)
},
)
evaluation_strategy: EvaluationStrategy = field(
default="no",
metadata={"help": "The evaluation strategy to use."},
Expand Down Expand Up @@ -410,6 +398,7 @@ class TrainingArguments:
default=0.0, metadata={"help": "The label smoothing epsilon to apply (zero means no label smoothing)."}
)
adafactor: bool = field(default=False, metadata={"help": "Whether or not to replace Adam by Adafactor."})
_n_gpu: int = field(init=False, repr=False, default=0)

def __post_init__(self):
if self.disable_tqdm is None:
Expand All @@ -430,6 +419,7 @@ def __post_init__(self):

if is_torch_available() and self.device.type != "cuda" and self.fp16:
raise ValueError("Mixed precision training with AMP or APEX (`--fp16`) can only be used on CUDA devices.")
self._n_gpu = torch.cuda.device_count()

def __repr__(self):
# We override the default repr to remove deprecated arguments from the repr. This method should be removed once
Expand All @@ -451,10 +441,7 @@ def train_batch_size(self) -> int:
"version. Using `--per_device_train_batch_size` is preferred."
)
per_device_batch_size = self.per_gpu_train_batch_size or self.per_device_train_batch_size
if not self.model_parallel:
train_batch_size = per_device_batch_size * max(1, self.n_gpu)
else:
train_batch_size = per_device_batch_size
train_batch_size = per_device_batch_size * max(1, self.n_gpu)
return train_batch_size

@property
Expand All @@ -468,10 +455,7 @@ def eval_batch_size(self) -> int:
"version. Using `--per_device_eval_batch_size` is preferred."
)
per_device_batch_size = self.per_gpu_eval_batch_size or self.per_device_eval_batch_size
if not self.model_parallel:
eval_batch_size = per_device_batch_size * max(1, self.n_gpu)
else:
eval_batch_size = per_device_batch_size
eval_batch_size = per_device_batch_size * max(1, self.n_gpu)
return eval_batch_size

@cached_property
Expand All @@ -492,7 +476,7 @@ def _setup_devices(self) -> Tuple["torch.device", int]:
# GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0`
# will use the first GPU in that env, i.e. GPU#1
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
n_gpu = self._n_gpu
else:
# Here, we'll use torch.distributed.
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
Expand Down