From f6c80060bdee836f4f6643ee38a6b7e5f535cc8a Mon Sep 17 00:00:00 2001
From: Tuan Nguyen <tuan@neuralmagic.com>
Date: Wed, 25 May 2022 13:38:01 -0400
Subject: [PATCH 1/7] Custom checkpoint saving for transformers trainer
 classes; refactoring

---
 .../transformers/question_answering.py        |  2 +-
 .../transformers/sparsification/__init__.py   |  1 +
 .../sparsification/question_answering.py      |  6 +--
 .../transformers/sparsification/trainer.py    | 38 ++++++++++++++++++-
 4 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/src/sparseml/transformers/question_answering.py b/src/sparseml/transformers/question_answering.py
index 0dc9ccd70f6..9b6d9e4ebfe 100644
--- a/src/sparseml/transformers/question_answering.py
+++ b/src/sparseml/transformers/question_answering.py
@@ -40,7 +40,6 @@
     EvalPrediction,
     HfArgumentParser,
     PreTrainedTokenizerFast,
-    TrainingArguments,
     default_data_collator,
     set_seed,
 )
@@ -51,6 +50,7 @@
 from sparseml.transformers.sparsification import (
     QuestionAnsweringTrainer,
     postprocess_qa_predictions,
+    TrainingArguments,
 )
 from sparseml.transformers.utils import SparseAutoModel, get_shared_tokenizer_src
 
diff --git a/src/sparseml/transformers/sparsification/__init__.py b/src/sparseml/transformers/sparsification/__init__.py
index 61a91e00a04..b8735ea4c3d 100644
--- a/src/sparseml/transformers/sparsification/__init__.py
+++ b/src/sparseml/transformers/sparsification/__init__.py
@@ -21,3 +21,4 @@
 
 from .question_answering import *
 from .trainer import *
+from .training_args import *
diff --git a/src/sparseml/transformers/sparsification/question_answering.py b/src/sparseml/transformers/sparsification/question_answering.py
index c908074ba71..ce40b587637 100644
--- a/src/sparseml/transformers/sparsification/question_answering.py
+++ b/src/sparseml/transformers/sparsification/question_answering.py
@@ -31,10 +31,10 @@
 import numpy as np
 from torch.nn import Module
 from tqdm.auto import tqdm
-from transformers import Trainer, is_torch_tpu_available
+from transformers import is_torch_tpu_available
 from transformers.trainer_utils import PredictionOutput
 
-from sparseml.transformers.sparsification.trainer import TrainerInterface
+from sparseml.transformers.sparsification.trainer import TrainerInterface, TransformersTrainer
 
 
 if is_torch_tpu_available():
@@ -51,7 +51,7 @@
 _LOGGER = logging.getLogger(__name__)
 
 
-class _QuestionAnsweringTrainer(Trainer):
+class _QuestionAnsweringTrainer(TransformersTrainer):
     """
     Trainer implementation for Question-Answering processing
     """
diff --git a/src/sparseml/transformers/sparsification/trainer.py b/src/sparseml/transformers/sparsification/trainer.py
index 2d0dcd1716e..5230150b8cf 100644
--- a/src/sparseml/transformers/sparsification/trainer.py
+++ b/src/sparseml/transformers/sparsification/trainer.py
@@ -28,7 +28,8 @@
 import torch
 from torch import distributed as dist
 from torch.nn import Module
-from transformers import Trainer as TransformersTrainer
+from torch.utils.data import RandomSampler
+from transformers import Trainer as _TransformersTrainer
 from transformers import TrainerCallback, TrainerControl, TrainingArguments
 from transformers.file_utils import WEIGHTS_NAME
 from transformers.integrations import TensorBoardCallback
@@ -51,6 +52,7 @@
     "TrainerInterface",
     "Trainer",
     "DisableHalfPrecisionCallback",
+    "TransformersTrainer"
 ]
 
 
@@ -855,6 +857,38 @@ def _generate_apply_manager_params(self, kwargs) -> Tuple[Optional[str], float]:
         return checkpoint, epoch
 
 
+class TransformersTrainer(_TransformersTrainer):
+    """
+    A transformers trainer class with customed behaviors that can be shared
+    by all trainers inside SparseML
+    """
+    def _save_checkpoint(self, model, trial, metrics=None):
+        super()._save_checkpoint(model, trial, metrics=metrics)
+        if (
+            self.args.metric_for_best_model is None
+            or self.args.best_model_after_epoch is None
+        ):
+            return
+
+        if (self.state.epoch > self.args.best_model_after_epoch):
+            metric_to_check = self.args.metric_for_best_model
+            if not metric_to_check.startswith("eval_"):
+                metric_to_check = f"eval_{metric_to_check}"
+            metric_value = metrics[metric_to_check]
+
+            operator = np.greater if self.args.greater_is_better else np.less
+            if (
+                self.state.best_metric is None
+                or self.state.best_model_checkpoint is None
+                or operator(metric_value, self.state.best_metric)
+            ):
+                self.state.best_metric = metric_value
+                self.state.best_model_checkpoint = output_dir
+        else:
+            self.state.best_metric = None
+            self.state.best_model_checkpoint = None
+    
+
 class Trainer(TrainerInterface, TransformersTrainer):
     """
     Training implementation for running sparsification recipes with transformers flows.
@@ -924,7 +958,7 @@ def _remove_unused_columns(
             self._signature_columns += ["label", "label_ids"]
 
         return super()._remove_unused_columns(dataset, description)
-
+    
 
 class DisableHalfPrecisionCallback(TrainerCallback):
     """

From 725edc572fd7ae756d67cc2d6eb13373f198d780 Mon Sep 17 00:00:00 2001
From: Tuan Nguyen <tuan@neuralmagic.com>
Date: Wed, 25 May 2022 13:54:12 -0400
Subject: [PATCH 2/7] Add transformers training args for SparseML

---
 .../sparsification/training_args.py            | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100644 src/sparseml/transformers/sparsification/training_args.py

diff --git a/src/sparseml/transformers/sparsification/training_args.py b/src/sparseml/transformers/sparsification/training_args.py
new file mode 100644
index 00000000000..19205020565
--- /dev/null
+++ b/src/sparseml/transformers/sparsification/training_args.py
@@ -0,0 +1,18 @@
+from transformers import TrainingArguments as HFTrainingArgs
+
+__all__ = ["TrainingArguments"]
+
+@dataclass
+class TrainingArguments(HFTrainingArgs):
+    """
+    Training arguments specific to SparseML Transformers workflow
+
+    :param best_model_after_epoch (`int`, *optional*, defaults to None):
+        The epoch after which best model will be saved; used in conjunction with `load_best_model_at_end` and
+        `metric_for_best_model` training arguments
+    """
+    best_model_after_epoch: int = field(
+        default=None,
+        metadata={"help": "Epoch after which best model will be saved."},
+    )
+

From 91546968c7e5ce2e5d8ddde3a355c094953e45da Mon Sep 17 00:00:00 2001
From: Tuan Nguyen <tuan@neuralmagic.com>
Date: Fri, 3 Jun 2022 00:36:50 -0400
Subject: [PATCH 3/7] Share _remove_unused_columns across trainers

---
 .../sparsification/question_answering.py      | 29 +------
 .../transformers/sparsification/trainer.py    | 81 ++++++++++---------
 2 files changed, 45 insertions(+), 65 deletions(-)

diff --git a/src/sparseml/transformers/sparsification/question_answering.py b/src/sparseml/transformers/sparsification/question_answering.py
index ce40b587637..1950607c9e0 100644
--- a/src/sparseml/transformers/sparsification/question_answering.py
+++ b/src/sparseml/transformers/sparsification/question_answering.py
@@ -34,7 +34,10 @@
 from transformers import is_torch_tpu_available
 from transformers.trainer_utils import PredictionOutput
 
-from sparseml.transformers.sparsification.trainer import TrainerInterface, TransformersTrainer
+from sparseml.transformers.sparsification.trainer import (
+    TrainerInterface,
+    TransformersTrainer,
+)
 
 
 if is_torch_tpu_available():
@@ -224,30 +227,6 @@ def __init__(
             **kwargs,
         )
 
-    def _remove_unused_columns(
-        self, dataset: "datasets.Dataset", description: Optional[str] = None
-    ):
-        if (
-            self._signature_columns is None
-            and self.teacher is not None
-            and self.teacher not in ("disable", "self")
-        ):
-            model_signature = inspect.signature(self.model.forward)
-            model_signature_columns = set(model_signature.parameters.keys())
-
-            teacher_signature = inspect.signature(self.teacher.forward)
-            teacher_signature_columns = set(teacher_signature.parameters.keys())
-
-            self._signature_columns = list(
-                model_signature_columns | teacher_signature_columns
-            )
-
-            # Labels may be named label or label_ids, the default data
-            # collator handles that.
-            self._signature_columns += ["label", "label_ids"]
-
-        return super()._remove_unused_columns(dataset, description)
-
 
 def postprocess_qa_predictions(
     examples,
diff --git a/src/sparseml/transformers/sparsification/trainer.py b/src/sparseml/transformers/sparsification/trainer.py
index 5230150b8cf..f39dd84bbe0 100644
--- a/src/sparseml/transformers/sparsification/trainer.py
+++ b/src/sparseml/transformers/sparsification/trainer.py
@@ -29,7 +29,7 @@
 from torch import distributed as dist
 from torch.nn import Module
 from torch.utils.data import RandomSampler
-from transformers import Trainer as _TransformersTrainer
+from transformers import Trainer as HFTransformersTrainer
 from transformers import TrainerCallback, TrainerControl, TrainingArguments
 from transformers.file_utils import WEIGHTS_NAME
 from transformers.integrations import TensorBoardCallback
@@ -52,7 +52,7 @@
     "TrainerInterface",
     "Trainer",
     "DisableHalfPrecisionCallback",
-    "TransformersTrainer"
+    "TransformersTrainer",
 ]
 
 
@@ -857,11 +857,12 @@ def _generate_apply_manager_params(self, kwargs) -> Tuple[Optional[str], float]:
         return checkpoint, epoch
 
 
-class TransformersTrainer(_TransformersTrainer):
+class TransformersTrainer(HFTransformersTrainer):
     """
     A transformers trainer class with customed behaviors that can be shared
     by all trainers inside SparseML
     """
+
     def _save_checkpoint(self, model, trial, metrics=None):
         super()._save_checkpoint(model, trial, metrics=metrics)
         if (
@@ -870,7 +871,7 @@ def _save_checkpoint(self, model, trial, metrics=None):
         ):
             return
 
-        if (self.state.epoch > self.args.best_model_after_epoch):
+        if self.state.epoch > self.args.best_model_after_epoch:
             metric_to_check = self.args.metric_for_best_model
             if not metric_to_check.startswith("eval_"):
                 metric_to_check = f"eval_{metric_to_check}"
@@ -887,41 +888,6 @@ def _save_checkpoint(self, model, trial, metrics=None):
         else:
             self.state.best_metric = None
             self.state.best_model_checkpoint = None
-    
-
-class Trainer(TrainerInterface, TransformersTrainer):
-    """
-    Training implementation for running sparsification recipes with transformers flows.
-    :param model: the model to use with the trainer and apply sparsification to
-    :param model_state_path: the state path to the model,
-        used to load config and tokenizer settings
-    :param recipe: the recipe, if any, to apply to the model and training
-        process
-    :param recipe_args: A json string, csv key=value string, or dictionary containing
-        arguments to override the root arguments within the recipe such as
-        learning rate or num epochs
-    :param teacher: teacher model for distillation. Set to 'self' to distill
-        from the loaded model or 'disable' to turn off distillation
-    :param kwargs: key word arguments passed to the parent class
-    """
-
-    def __init__(
-        self,
-        model: Module,
-        model_state_path: str,
-        recipe: Optional[str],
-        recipe_args: Optional[Union[Dict[str, Any], str]] = None,
-        teacher: Optional[Union[Module, str]] = None,
-        **kwargs,
-    ):
-        super().__init__(
-            model=model,
-            model_state_path=model_state_path,
-            recipe=recipe,
-            recipe_args=recipe_args,
-            teacher=teacher,
-            **kwargs,
-        )
 
     def _remove_unused_columns(
         self, dataset: "datasets.Dataset", description: Optional[str] = None
@@ -958,7 +924,42 @@ def _remove_unused_columns(
             self._signature_columns += ["label", "label_ids"]
 
         return super()._remove_unused_columns(dataset, description)
-    
+
+
+class Trainer(TrainerInterface, TransformersTrainer):
+    """
+    Training implementation for running sparsification recipes with transformers flows.
+    :param model: the model to use with the trainer and apply sparsification to
+    :param model_state_path: the state path to the model,
+        used to load config and tokenizer settings
+    :param recipe: the recipe, if any, to apply to the model and training
+        process
+    :param recipe_args: A json string, csv key=value string, or dictionary containing
+        arguments to override the root arguments within the recipe such as
+        learning rate or num epochs
+    :param teacher: teacher model for distillation. Set to 'self' to distill
+        from the loaded model or 'disable' to turn off distillation
+    :param kwargs: key word arguments passed to the parent class
+    """
+
+    def __init__(
+        self,
+        model: Module,
+        model_state_path: str,
+        recipe: Optional[str],
+        recipe_args: Optional[Union[Dict[str, Any], str]] = None,
+        teacher: Optional[Union[Module, str]] = None,
+        **kwargs,
+    ):
+        super().__init__(
+            model=model,
+            model_state_path=model_state_path,
+            recipe=recipe,
+            recipe_args=recipe_args,
+            teacher=teacher,
+            **kwargs,
+        )
+
 
 class DisableHalfPrecisionCallback(TrainerCallback):
     """

From 578ac0215bf325b326def7896505466dd4e55f9e Mon Sep 17 00:00:00 2001
From: Tuan Nguyen <tuan@neuralmagic.com>
Date: Fri, 3 Jun 2022 10:11:12 -0400
Subject: [PATCH 4/7] Format training args

---
 .../sparsification/training_args.py             | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/sparseml/transformers/sparsification/training_args.py b/src/sparseml/transformers/sparsification/training_args.py
index 19205020565..e9226e3d678 100644
--- a/src/sparseml/transformers/sparsification/training_args.py
+++ b/src/sparseml/transformers/sparsification/training_args.py
@@ -1,7 +1,22 @@
+# Copyright (c) 2022 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from transformers import TrainingArguments as HFTrainingArgs
 
 __all__ = ["TrainingArguments"]
 
+
 @dataclass
 class TrainingArguments(HFTrainingArgs):
     """
@@ -11,8 +26,8 @@ class TrainingArguments(HFTrainingArgs):
         The epoch after which best model will be saved; used in conjunction with `load_best_model_at_end` and
         `metric_for_best_model` training arguments
     """
+
     best_model_after_epoch: int = field(
         default=None,
         metadata={"help": "Epoch after which best model will be saved."},
     )
-

From e8d426ddca0b68f5f37a7ab3aa50a426e9393f9c Mon Sep 17 00:00:00 2001
From: Tuan Nguyen <tuan@neuralmagic.com>
Date: Fri, 3 Jun 2022 22:23:09 -0400
Subject: [PATCH 5/7] Move distill teacher, recipe args

---
 .../transformers/question_answering.py        | 27 +++--------------
 .../sparsification/question_answering.py      |  2 --
 .../transformers/sparsification/trainer.py    | 18 ++---------
 .../sparsification/training_args.py           | 30 +++++++++++++++++--
 .../transformers/text_classification.py       | 28 +++--------------
 5 files changed, 38 insertions(+), 67 deletions(-)

diff --git a/src/sparseml/transformers/question_answering.py b/src/sparseml/transformers/question_answering.py
index 9b6d9e4ebfe..3d6fa1705d0 100644
--- a/src/sparseml/transformers/question_answering.py
+++ b/src/sparseml/transformers/question_answering.py
@@ -49,8 +49,8 @@
 
 from sparseml.transformers.sparsification import (
     QuestionAnsweringTrainer,
-    postprocess_qa_predictions,
     TrainingArguments,
+    postprocess_qa_predictions,
 )
 from sparseml.transformers.utils import SparseAutoModel, get_shared_tokenizer_src
 
@@ -90,10 +90,6 @@ class ModelArguments:
             )
         }
     )
-    distill_teacher: Optional[str] = field(
-        default=None,
-        metadata={"help": "Teacher model which needs to be a trained QA model"},
-    )
     config_name: Optional[str] = field(
         default=None,
         metadata={
@@ -141,21 +137,6 @@ class DataTrainingArguments:
     Arguments pertaining to what data to input to our model for training and eval
     """
 
-    recipe: Optional[str] = field(
-        default=None,
-        metadata={
-            "help": (
-                "Path to a SparseML sparsification recipe, see "
-                "https://github.com/neuralmagic/sparseml for more information"
-            )
-        },
-    )
-    recipe_args: Optional[str] = field(
-        default=None,
-        metadata={
-            "help": "Recipe arguments to be overwritten",
-        },
-    )
     dataset_name: Optional[str] = field(
         default=None,
         metadata={
@@ -445,7 +426,7 @@ def main(**kwargs):
             "revision": model_args.model_revision,
             "use_auth_token": True if model_args.use_auth_token else None,
         },
-        teacher_name_or_path=model_args.distill_teacher,
+        teacher_name_or_path=training_args.distill_teacher,
         teacher_kwargs={
             "cache_dir": model_args.cache_dir,
             "use_auth_token": True if model_args.use_auth_token else None,
@@ -772,8 +753,8 @@ def compute_metrics(p: EvalPrediction):
     trainer = QuestionAnsweringTrainer(
         model=model,
         model_state_path=model_args.model_name_or_path,
-        recipe=data_args.recipe,
-        recipe_args=data_args.recipe_args,
+        recipe=training_args.recipe,
+        recipe_args=training_args.recipe_args,
         metadata_args=metadata_args,
         teacher=teacher,
         args=training_args,
diff --git a/src/sparseml/transformers/sparsification/question_answering.py b/src/sparseml/transformers/sparsification/question_answering.py
index 1950607c9e0..d2cd4778d2d 100644
--- a/src/sparseml/transformers/sparsification/question_answering.py
+++ b/src/sparseml/transformers/sparsification/question_answering.py
@@ -21,13 +21,11 @@
 """
 
 import collections
-import inspect
 import json
 import logging
 import os
 from typing import Any, Dict, List, Optional, Tuple, Union
 
-import datasets
 import numpy as np
 from torch.nn import Module
 from tqdm.auto import tqdm
diff --git a/src/sparseml/transformers/sparsification/trainer.py b/src/sparseml/transformers/sparsification/trainer.py
index f39dd84bbe0..752a43a4c40 100644
--- a/src/sparseml/transformers/sparsification/trainer.py
+++ b/src/sparseml/transformers/sparsification/trainer.py
@@ -864,6 +864,8 @@ class TransformersTrainer(HFTransformersTrainer):
     """
 
     def _save_checkpoint(self, model, trial, metrics=None):
+        # Call into the save checkpoint by HF Transformers, which saves the
+        # best metric if required
         super()._save_checkpoint(model, trial, metrics=metrics)
         if (
             self.args.metric_for_best_model is None
@@ -871,21 +873,7 @@ def _save_checkpoint(self, model, trial, metrics=None):
         ):
             return
 
-        if self.state.epoch > self.args.best_model_after_epoch:
-            metric_to_check = self.args.metric_for_best_model
-            if not metric_to_check.startswith("eval_"):
-                metric_to_check = f"eval_{metric_to_check}"
-            metric_value = metrics[metric_to_check]
-
-            operator = np.greater if self.args.greater_is_better else np.less
-            if (
-                self.state.best_metric is None
-                or self.state.best_model_checkpoint is None
-                or operator(metric_value, self.state.best_metric)
-            ):
-                self.state.best_metric = metric_value
-                self.state.best_model_checkpoint = output_dir
-        else:
+        if self.state.epoch <= self.args.best_model_after_epoch:
             self.state.best_metric = None
             self.state.best_model_checkpoint = None
 
diff --git a/src/sparseml/transformers/sparsification/training_args.py b/src/sparseml/transformers/sparsification/training_args.py
index e9226e3d678..a1aa639ad87 100644
--- a/src/sparseml/transformers/sparsification/training_args.py
+++ b/src/sparseml/transformers/sparsification/training_args.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022 - present / Neuralmagic, Inc. All Rights Reserved.
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,8 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from dataclasses import dataclass, field
+from typing import Optional
+
 from transformers import TrainingArguments as HFTrainingArgs
 
+
 __all__ = ["TrainingArguments"]
 
 
@@ -23,11 +27,31 @@ class TrainingArguments(HFTrainingArgs):
     Training arguments specific to SparseML Transformers workflow
 
     :param best_model_after_epoch (`int`, *optional*, defaults to None):
-        The epoch after which best model will be saved; used in conjunction with `load_best_model_at_end` and
-        `metric_for_best_model` training arguments
+        The epoch after which best model will be saved; used in conjunction
+        with `load_best_model_at_end` and `metric_for_best_model` training
+        arguments
     """
 
+    distill_teacher: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "Teacher model (a trained text classification model)",
+        },
+    )
     best_model_after_epoch: int = field(
         default=None,
         metadata={"help": "Epoch after which best model will be saved."},
     )
+    recipe: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": (
+                "Path to a SparseML sparsification recipe, see "
+                "https://github.com/neuralmagic/sparseml for more information"
+            ),
+        },
+    )
+    recipe_args: Optional[str] = field(
+        default=None,
+        metadata={"help": "Recipe arguments to be overwritten"},
+    )
diff --git a/src/sparseml/transformers/text_classification.py b/src/sparseml/transformers/text_classification.py
index 17ddc8aac3f..a27cc97322d 100644
--- a/src/sparseml/transformers/text_classification.py
+++ b/src/sparseml/transformers/text_classification.py
@@ -42,7 +42,6 @@
     EvalPrediction,
     HfArgumentParser,
     PretrainedConfig,
-    TrainingArguments,
     default_data_collator,
     set_seed,
 )
@@ -50,7 +49,7 @@
 from transformers.utils import check_min_version
 from transformers.utils.versions import require_version
 
-from sparseml.transformers.sparsification import Trainer
+from sparseml.transformers.sparsification import Trainer, TrainingArguments
 from sparseml.transformers.utils import SparseAutoModel, get_shared_tokenizer_src
 
 
@@ -94,19 +93,6 @@ class DataTrainingArguments:
     arguments to be able to specify them on the command line
     """
 
-    recipe: Optional[str] = field(
-        default=None,
-        metadata={
-            "help": (
-                "Path to a SparseML sparsification recipe, see "
-                "https://github.com/neuralmagic/sparseml for more information"
-            ),
-        },
-    )
-    recipe_args: Optional[str] = field(
-        default=None,
-        metadata={"help": "Recipe arguments to be overwritten"},
-    )
     task_name: Optional[str] = field(
         default=None,
         metadata={
@@ -254,12 +240,6 @@ class ModelArguments:
             )
         }
     )
-    distill_teacher: Optional[str] = field(
-        default=None,
-        metadata={
-            "help": "Teacher model which must be a trained text classification model"
-        },
-    )
     config_name: Optional[str] = field(
         default=None,
         metadata={
@@ -481,7 +461,7 @@ def main(**kwargs):
             "revision": model_args.model_revision,
             "use_auth_token": True if model_args.use_auth_token else None,
         },
-        teacher_name_or_path=model_args.distill_teacher,
+        teacher_name_or_path=training_args.distill_teacher,
         teacher_kwargs={
             "cache_dir": model_args.cache_dir,
             "use_auth_token": True if model_args.use_auth_token else None,
@@ -720,9 +700,9 @@ def compute_metrics(p: EvalPrediction):
     trainer = Trainer(
         model=model,
         model_state_path=model_args.model_name_or_path,
-        recipe=data_args.recipe,
+        recipe=training_args.recipe,
         metadata_args=metadata_args,
-        recipe_args=data_args.recipe_args,
+        recipe_args=training_args.recipe_args,
         teacher=teacher,
         args=training_args,
         data_args=data_args,

From 46259198c0d00a4df22782382f98582d76d4af0f Mon Sep 17 00:00:00 2001
From: Tuan Nguyen <tuan@neuralmagic.com>
Date: Fri, 10 Jun 2022 14:09:23 -0400
Subject: [PATCH 6/7] Simplify MLM, token classification code

---
 .../transformers/masked_language_modeling.py  | 22 ++-----------------
 .../transformers/token_classification.py      | 22 ++-----------------
 2 files changed, 4 insertions(+), 40 deletions(-)

diff --git a/src/sparseml/transformers/masked_language_modeling.py b/src/sparseml/transformers/masked_language_modeling.py
index 326fc03402c..3b4a7733672 100644
--- a/src/sparseml/transformers/masked_language_modeling.py
+++ b/src/sparseml/transformers/masked_language_modeling.py
@@ -47,14 +47,13 @@
     AutoTokenizer,
     DataCollatorForLanguageModeling,
     HfArgumentParser,
-    TrainingArguments,
     set_seed,
 )
 from transformers.trainer_utils import get_last_checkpoint
 from transformers.utils import check_min_version
 from transformers.utils.versions import require_version
 
-from sparseml.transformers.sparsification import Trainer
+from sparseml.transformers.sparsification import Trainer, TrainingArguments
 from sparseml.transformers.utils import SparseAutoModel, get_shared_tokenizer_src
 
 
@@ -108,10 +107,6 @@ class ModelArguments:
             "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index"
         },
     )
-    distill_teacher: Optional[str] = field(
-        default=None,
-        metadata={"help": "Teacher model which needs to be a trained QA model"},
-    )
     config_name: Optional[str] = field(
         default=None,
         metadata={
@@ -164,19 +159,6 @@ class DataTrainingArguments:
     training and eval
     """
 
-    recipe: Optional[str] = field(
-        default=None,
-        metadata={
-            "help": (
-                "Path to a SparseML sparsification recipe, see "
-                "https://github.com/neuralmagic/sparseml for more information"
-            ),
-        },
-    )
-    recipe_args: Optional[str] = field(
-        default=None,
-        metadata={"help": "Recipe arguments to be overwritten"},
-    )
     dataset_name: Optional[str] = field(
         default=None,
         metadata={"help": "The name of the dataset to use (via the datasets library)"},
@@ -490,7 +472,7 @@ def main(**kwargs):
             "revision": model_args.model_revision,
             "use_auth_token": True if model_args.use_auth_token else None,
         },
-        teacher_name_or_path=model_args.distill_teacher,
+        teacher_name_or_path=training_args.distill_teacher,
         teacher_kwargs={
             "cache_dir": model_args.cache_dir,
             "use_auth_token": True if model_args.use_auth_token else None,
diff --git a/src/sparseml/transformers/token_classification.py b/src/sparseml/transformers/token_classification.py
index c846ac50e1c..9692c4f8e3d 100644
--- a/src/sparseml/transformers/token_classification.py
+++ b/src/sparseml/transformers/token_classification.py
@@ -40,14 +40,13 @@
     HfArgumentParser,
     PretrainedConfig,
     PreTrainedTokenizerFast,
-    TrainingArguments,
     set_seed,
 )
 from transformers.trainer_utils import get_last_checkpoint
 from transformers.utils import check_min_version
 from transformers.utils.versions import require_version
 
-from sparseml.transformers.sparsification import Trainer
+from sparseml.transformers.sparsification import Trainer, TrainingArguments
 from sparseml.transformers.utils import SparseAutoModel, get_shared_tokenizer_src
 
 
@@ -84,10 +83,6 @@ class ModelArguments:
             )
         }
     )
-    distill_teacher: Optional[str] = field(
-        default=None,
-        metadata={"help": "Teacher model which needs to be a trained NER model"},
-    )
     config_name: Optional[str] = field(
         default=None,
         metadata={
@@ -127,19 +122,6 @@ class DataTrainingArguments:
     training and eval
     """
 
-    recipe: Optional[str] = field(
-        default=None,
-        metadata={
-            "help": (
-                "Path to a SparseML sparsification recipe, see "
-                "https://github.com/neuralmagic/sparseml for more information"
-            ),
-        },
-    )
-    recipe_args: Optional[str] = field(
-        default=None,
-        metadata={"help": "Recipe arguments to be overwritten"},
-    )
     task_name: Optional[str] = field(
         default="ner", metadata={"help": "The name of the task (ner, pos...)."}
     )
@@ -441,7 +423,7 @@ def get_label_list(labels):
             "revision": model_args.model_revision,
             "use_auth_token": True if model_args.use_auth_token else None,
         },
-        teacher_name_or_path=model_args.distill_teacher,
+        teacher_name_or_path=training_args.distill_teacher,
         teacher_kwargs={
             "cache_dir": model_args.cache_dir,
             "use_auth_token": True if model_args.use_auth_token else None,

From 15e8b3bc5aad62ffad73f8588c3376513129486c Mon Sep 17 00:00:00 2001
From: Tuan Nguyen <tuan@neuralmagic.com>
Date: Mon, 13 Jun 2022 23:13:41 -0400
Subject: [PATCH 7/7] Fix recipe calls in mlm and token cls

---
 src/sparseml/transformers/masked_language_modeling.py | 4 ++--
 src/sparseml/transformers/sparsification/trainer.py   | 3 +--
 src/sparseml/transformers/token_classification.py     | 4 ++--
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/sparseml/transformers/masked_language_modeling.py b/src/sparseml/transformers/masked_language_modeling.py
index 3b4a7733672..37c6ad3516e 100644
--- a/src/sparseml/transformers/masked_language_modeling.py
+++ b/src/sparseml/transformers/masked_language_modeling.py
@@ -664,9 +664,9 @@ def compute_metrics(eval_preds):
     trainer = Trainer(
         model=model,
         model_state_path=model_args.model_name_or_path,
-        recipe=data_args.recipe,
+        recipe=training_args.recipe,
         metadata_args=metadata_args,
-        recipe_args=data_args.recipe_args,
+        recipe_args=training_args.recipe_args,
         teacher=teacher,
         args=training_args,
         data_args=data_args,
diff --git a/src/sparseml/transformers/sparsification/trainer.py b/src/sparseml/transformers/sparsification/trainer.py
index 752a43a4c40..847c8b5b8b7 100644
--- a/src/sparseml/transformers/sparsification/trainer.py
+++ b/src/sparseml/transformers/sparsification/trainer.py
@@ -28,7 +28,6 @@
 import torch
 from torch import distributed as dist
 from torch.nn import Module
-from torch.utils.data import RandomSampler
 from transformers import Trainer as HFTransformersTrainer
 from transformers import TrainerCallback, TrainerControl, TrainingArguments
 from transformers.file_utils import WEIGHTS_NAME
@@ -859,7 +858,7 @@ def _generate_apply_manager_params(self, kwargs) -> Tuple[Optional[str], float]:
 
 class TransformersTrainer(HFTransformersTrainer):
     """
-    A transformers trainer class with customed behaviors that can be shared
+    A transformers trainer class with custom behavior that can be shared
     by all trainers inside SparseML
     """
 
diff --git a/src/sparseml/transformers/token_classification.py b/src/sparseml/transformers/token_classification.py
index 9692c4f8e3d..6a2764cbf89 100644
--- a/src/sparseml/transformers/token_classification.py
+++ b/src/sparseml/transformers/token_classification.py
@@ -625,9 +625,9 @@ def compute_metrics(p):
     trainer = Trainer(
         model=model,
         model_state_path=model_args.model_name_or_path,
-        recipe=data_args.recipe,
+        recipe=training_args.recipe,
         metadata_args=metadata_args,
-        recipe_args=data_args.recipe_args,
+        recipe_args=training_args.recipe_args,
         teacher=teacher,
         args=training_args,
         data_args=data_args,