From e7bbeb956924fa82149079c8fc7903c4c4a489c9 Mon Sep 17 00:00:00 2001
From: Eugene <etesting007@gmail.com>
Date: Thu, 20 Jul 2023 10:07:18 -0700
Subject: [PATCH 01/15] length and time based rewards

---
 openvalidators/config.py          | 12 ++++
 openvalidators/neuron.py          |  8 +++
 openvalidators/reward/__init__.py |  2 +
 openvalidators/reward/length.py   | 88 ++++++++++++++++++++++++++++
 openvalidators/reward/timing.py   | 95 +++++++++++++++++++++++++++++++
 5 files changed, 205 insertions(+)
 create mode 100644 openvalidators/reward/length.py
 create mode 100644 openvalidators/reward/timing.py

diff --git a/openvalidators/config.py b/openvalidators/config.py
index 6565cef..e635466 100644
--- a/openvalidators/config.py
+++ b/openvalidators/config.py
@@ -230,6 +230,18 @@ def add_args(cls, parser):
         help="Dont apply the nsfw reward model",
         default=False,
     )
+    parser.add_argument(
+        "--neuron.length_off",
+        action="store_true",
+        help="Dont apply the length reward model",
+        default=False,
+    )
+    parser.add_argument(
+        "--neuron.time_off",
+        action="store_true",
+        help="Dont apply the time reward model",
+        default=False,
+    )
     parser.add_argument(
         "--neuron.relevance_off",
         action="store_true",
diff --git a/openvalidators/neuron.py b/openvalidators/neuron.py
index f1a2405..9c269f6 100644
--- a/openvalidators/neuron.py
+++ b/openvalidators/neuron.py
@@ -43,6 +43,8 @@
     DiversityRewardModel,
     PromptRewardModel,
     RewardModelType,
+    LengthAwareRewardModel,
+    TimeAwareRewardModel,
 )
 
 
@@ -196,6 +198,12 @@ def __init__(self):
                 NSFWRewardModel(device=self.device)
                 if not self.config.neuron.nsfw_off
                 else MockRewardModel(RewardModelType.nsfw.value),
+                LengthAwareRewardModel(device=self.device)
+                if not self.config.neuron.length_off
+                else MockRewardModel(RewardModelType.length.value),
+                TimeAwareRewardModel(device=self.device)
+                if not self.config.neuron.time_off
+                else MockRewardModel(RewardModelType.time.value),
             ]
             bt.logging.debug(str(self.reward_functions))
 
diff --git a/openvalidators/reward/__init__.py b/openvalidators/reward/__init__.py
index 6a94469..5ae8d5b 100644
--- a/openvalidators/reward/__init__.py
+++ b/openvalidators/reward/__init__.py
@@ -9,3 +9,5 @@
 from .diversity import DiversityRewardModel
 from .prompt import PromptRewardModel
 from .config import RewardModelType, DefaultRewardFrameworkConfig
+from .length import LengthAwareRewardModel
+from .timing import TimeAwareRewardModel
\ No newline at end of file
diff --git a/openvalidators/reward/length.py b/openvalidators/reward/length.py
new file mode 100644
index 0000000..8f5a60f
--- /dev/null
+++ b/openvalidators/reward/length.py
@@ -0,0 +1,88 @@
+
+# The MIT License (MIT)
+# Copyright © 2021 Yuma Rao
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the “Software”), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+# and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of
+# the Software.
+
+# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+# THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+import torch
+from typing import List
+from .config import RewardModelType
+from .reward import BaseRewardModel
+from transformers import  AutoTokenizer, AutoModel
+
+class LengthAwareRewardModel(BaseRewardModel):
+    """
+    A model that calculates rewards based on the input prompt, completion, and the length of the completion.
+
+    Attributes:
+        reward_model_path (str): The path to the pretrained reward model.
+        revision (str): The revision version of the pretrained model.
+    """
+    
+    @property
+    def name(self) -> str: 
+        """The name of the reward model."""
+        return RewardModelType.length_aware.value
+
+    def __init__(self, device: str):
+        """
+        The constructor for LengthAwareRewardModel class.
+
+        Parameters:
+           device (str): The device to which the model will be sent.
+        """
+        super().__init__()
+        self.device = device
+        # Load the tokenizer from the pretrained model
+        self.tokenizer = AutoTokenizer.from_pretrained(LengthAwareRewardModel.reward_model_path, 
+                                                       revision = LengthAwareRewardModel.revision)
+
+    def reward(self, prompt: str, completion: str, name: str) -> float:
+        """
+        Calculate the reward for a single prompt and completion. The length of the completion is used as an additional factor.
+
+        Parameters:
+           prompt (str): The prompt.
+           completion (str): The completion.
+           name (str): The name.
+
+        Returns:
+           float: The calculated reward.
+        """
+        with torch.no_grad():
+            # Tokenize the message
+            inputs = self.tokenizer(completion,
+                                    return_tensors="pt",
+                                    truncation=True,
+                                    ).to(self.device)
+            # Multiply the reward by the length of the completion
+            return len(inputs)
+
+    def get_rewards(self, prompt: str, completions: List[str], name: str) -> torch.FloatTensor:
+        """
+        Calculate the rewards for multiple completions. The length of each completion is used as an additional factor.
+
+        Parameters:
+           prompt (str): The prompt.
+           completions (List[str]): The list of completions.
+           name (str): The name.
+
+        Returns:
+           torch.FloatTensor: The calculated rewards.
+        """
+        # Return the calculated rewards for all completions
+        return torch.tensor([self.reward(prompt, completion, name) for completion in completions], dtype=torch.float32).to(self.device)
+
+    
\ No newline at end of file
diff --git a/openvalidators/reward/timing.py b/openvalidators/reward/timing.py
new file mode 100644
index 0000000..fc525a9
--- /dev/null
+++ b/openvalidators/reward/timing.py
@@ -0,0 +1,95 @@
+
+# The MIT License (MIT)
+# Copyright © 2021 Yuma Rao
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the “Software”), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+# and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of
+# the Software.
+
+# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+# THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+import torch
+from typing import List
+from .config import RewardModelType
+from .reward import BaseRewardModel
+from transformers import  AutoTokenizer, AutoModel
+
+class TimeAwareRewardModel(BaseRewardModel):
+    """
+    A model that calculates rewards based on the input prompt, completion, and time.
+    """
+    @property
+    def name(self) -> str: 
+        """The name of the reward model."""
+        return RewardModelType.time_aware.value
+
+    def __init__(self, device: str):
+        """
+        The constructor for TimeAwareRewardModel class.
+
+        Parameters:
+           device (str): The device to which the model will be sent.
+        """
+        super().__init__()
+        self.device = device
+
+    def get_rewards(self, prompt: str, completions: List[str], name: str, timing: List[int]) -> torch.FloatTensor:
+        """
+        Calculate the rewards for multiple completions.
+
+        Parameters:
+           prompt (str): The prompt.
+           completions (List[str]): The list of completions.
+           name (str): The name.
+           timing (List[int]): The elapsed time of each completion.
+
+        Returns:
+           torch.FloatTensor: The calculated rewards.
+        """
+        # Return the calculated rewards for all completions
+        return torch.tensor([-time for time in timing], dtype=torch.float32).to(self.device)
+    
+        
+    def apply(self, prompt: str, responses: List[bt.DendriteCall], name: str, test=False) -> torch.FloatTensor:
+        """
+        Applies the reward model across each call. Unsuccessful responses are assigned a reward of zero.
+
+        Parameters:
+            prompt (str): The prompt.
+            responses (List[bt.DendriteCall]): The list of dendrite calls.
+            name (str): The name.
+            test (bool): A boolean indicating whether or not this is a test run. Default is False.
+
+        Returns:
+            torch.FloatTensor: The calculated rewards.
+        """
+        # Get indices of successful responses
+        successful_completions_indices: List[int] = [idx for idx, resp in enumerate(responses) if resp.is_success]
+
+        # Extract the completion text from successful responses
+        successful_completions: List[str] = [responses[idx].completion.strip() for idx in successful_completions_indices]
+        successful_timings: List[str] = [responses[idx].elapsed_time for idx in successful_completions_indices]
+
+        # Calculate rewards for each successful completion
+        successful_rewards = self.get_rewards(prompt, successful_completions, name, successful_timings)
+
+        # Apply softmax normalization to the rewards
+        successful_rewards = self.normalize_rewards(successful_rewards, test)
+
+        # Initialize a tensor of zeros to hold the rewards for all responses
+        filled_rewards = torch.zeros(len(responses), dtype=torch.float32)
+
+        # Fill in the rewards for successful responses
+        for idx, reward in zip(successful_completions_indices, successful_rewards):
+            filled_rewards[idx] = reward
+
+        # Return the tensor of rewards
+        return filled_rewards

From ddd1a6749ab5225230b9fb1a7123994523a5a6c1 Mon Sep 17 00:00:00 2001
From: Eugene <etesting007@gmail.com>
Date: Thu, 20 Jul 2023 10:14:09 -0700
Subject: [PATCH 02/15] bt

---
 openvalidators/reward/timing.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/openvalidators/reward/timing.py b/openvalidators/reward/timing.py
index fc525a9..b3f96e9 100644
--- a/openvalidators/reward/timing.py
+++ b/openvalidators/reward/timing.py
@@ -21,6 +21,7 @@
 from .config import RewardModelType
 from .reward import BaseRewardModel
 from transformers import  AutoTokenizer, AutoModel
+import bittensor as bt
 
 class TimeAwareRewardModel(BaseRewardModel):
     """

From 886e804e0abf7ad04b97c0fc7bf4651bde5f9f17 Mon Sep 17 00:00:00 2001
From: Eugene <etesting007@gmail.com>
Date: Thu, 20 Jul 2023 10:18:43 -0700
Subject: [PATCH 03/15] gpt j tokenizer

---
 openvalidators/reward/length.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/openvalidators/reward/length.py b/openvalidators/reward/length.py
index 8f5a60f..0fc7e81 100644
--- a/openvalidators/reward/length.py
+++ b/openvalidators/reward/length.py
@@ -30,7 +30,8 @@ class LengthAwareRewardModel(BaseRewardModel):
         reward_model_path (str): The path to the pretrained reward model.
         revision (str): The revision version of the pretrained model.
     """
-    
+    reward_model_path: str = "EleutherAI/gpt-j-6b"
+
     @property
     def name(self) -> str: 
         """The name of the reward model."""
@@ -46,8 +47,7 @@ def __init__(self, device: str):
         super().__init__()
         self.device = device
         # Load the tokenizer from the pretrained model
-        self.tokenizer = AutoTokenizer.from_pretrained(LengthAwareRewardModel.reward_model_path, 
-                                                       revision = LengthAwareRewardModel.revision)
+        self.tokenizer = AutoTokenizer.from_pretrained(LengthAwareRewardModel.reward_model_path)
 
     def reward(self, prompt: str, completion: str, name: str) -> float:
         """

From 82433365f4e3a236019a5d29aaa67675a99c7810 Mon Sep 17 00:00:00 2001
From: Eugene <etesting007@gmail.com>
Date: Thu, 20 Jul 2023 10:30:10 -0700
Subject: [PATCH 04/15] length and time aware

---
 openvalidators/reward/config.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/openvalidators/reward/config.py b/openvalidators/reward/config.py
index 2f4b63b..3fe2152 100644
--- a/openvalidators/reward/config.py
+++ b/openvalidators/reward/config.py
@@ -28,7 +28,8 @@ class RewardModelType(Enum):
     blacklist = 'blacklist_filter'
     nsfw = 'nsfw_filter'
     relevance = 'relevance_filter'
-
+    time_aware = 'time_aware'
+    length_aware = 'length_aware'
 
 @dataclass(frozen=True)
 class DefaultRewardFrameworkConfig:

From ae1740e9c794bda3dc9688df2cb29af6dee00cee Mon Sep 17 00:00:00 2001
From: Eugene <etesting007@gmail.com>
Date: Thu, 20 Jul 2023 10:39:30 -0700
Subject: [PATCH 05/15] remove test

---
 openvalidators/reward/timing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openvalidators/reward/timing.py b/openvalidators/reward/timing.py
index b3f96e9..3e0e8cd 100644
--- a/openvalidators/reward/timing.py
+++ b/openvalidators/reward/timing.py
@@ -83,7 +83,7 @@ def apply(self, prompt: str, responses: List[bt.DendriteCall], name: str, test=F
         successful_rewards = self.get_rewards(prompt, successful_completions, name, successful_timings)
 
         # Apply softmax normalization to the rewards
-        successful_rewards = self.normalize_rewards(successful_rewards, test)
+        successful_rewards = self.normalize_rewards(successful_rewards)
 
         # Initialize a tensor of zeros to hold the rewards for all responses
         filled_rewards = torch.zeros(len(responses), dtype=torch.float32)

From 1e7c1903b5db378997a7252e64c26a5deebbb028 Mon Sep 17 00:00:00 2001
From: Eugene <etesting007@gmail.com>
Date: Thu, 20 Jul 2023 12:22:20 -0700
Subject: [PATCH 06/15] event update

---
 openvalidators/event.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/openvalidators/event.py b/openvalidators/event.py
index d790318..abdbaf1 100644
--- a/openvalidators/event.py
+++ b/openvalidators/event.py
@@ -44,6 +44,8 @@ class EventSchema:
     rlhf_reward_model: Optional[List[float]]  # Output vector of the rlhf reward model
     prompt_reward_model: Optional[List[float]]  # Output vector of the prompt reward model
     relevance_filter: Optional[List[float]]  # Output vector of the relevance scoring reward model
+    time_aware_reward_model: Optional[List[float]]  # Output vector of the time scoring reward model
+    length_aware_reward_model: Optional[List[float]]  # Output vector of the length scoring reward model
 
     # Weights data
     set_weights: Optional[List[List[float]]]
@@ -60,6 +62,8 @@ def from_dict(event_dict: dict, disable_log_rewards: bool) -> 'EventSchema':
             'diversity_reward_model': event_dict.get(RewardModelType.diversity.value),
             'rlhf_reward_model': event_dict.get(RewardModelType.rlhf.value),
             'prompt_reward_model': event_dict.get(RewardModelType.prompt.value),
+            'time_aware_reward_model' : event_dict.get(RewardModelType.time_aware.value),
+            'length_aware_reward_model' : event_dict.get(RewardModelType.length_aware.value),
         }
 
         # Logs warning that expected data was not set properly

From c45d19971ce6d03288d47b51cb226b10b3ed06c3 Mon Sep 17 00:00:00 2001
From: Eugene <etesting007@gmail.com>
Date: Thu, 20 Jul 2023 15:17:31 -0700
Subject: [PATCH 07/15] remove truncation

---
 openvalidators/reward/length.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/openvalidators/reward/length.py b/openvalidators/reward/length.py
index 0fc7e81..e58cfe5 100644
--- a/openvalidators/reward/length.py
+++ b/openvalidators/reward/length.py
@@ -65,7 +65,6 @@ def reward(self, prompt: str, completion: str, name: str) -> float:
             # Tokenize the message
             inputs = self.tokenizer(completion,
                                     return_tensors="pt",
-                                    truncation=True,
                                     ).to(self.device)
             # Multiply the reward by the length of the completion
             return len(inputs)

From 69b92e8f68050e73a69b34ae5d2d30303deaf870 Mon Sep 17 00:00:00 2001
From: Eugene <etesting007@gmail.com>
Date: Thu, 20 Jul 2023 15:23:49 -0700
Subject: [PATCH 08/15] inputs size

---
 openvalidators/reward/length.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/openvalidators/reward/length.py b/openvalidators/reward/length.py
index e58cfe5..239476b 100644
--- a/openvalidators/reward/length.py
+++ b/openvalidators/reward/length.py
@@ -67,7 +67,8 @@ def reward(self, prompt: str, completion: str, name: str) -> float:
                                     return_tensors="pt",
                                     ).to(self.device)
             # Multiply the reward by the length of the completion
-            return len(inputs)
+            print(inputs.size())
+            return inputs.size()[1]
 
     def get_rewards(self, prompt: str, completions: List[str], name: str) -> torch.FloatTensor:
         """

From 1d32cf435b235fbf8d0c887cbf641089ba304e18 Mon Sep 17 00:00:00 2001
From: Eugene <etesting007@gmail.com>
Date: Thu, 20 Jul 2023 15:30:56 -0700
Subject: [PATCH 09/15] inputs size

---
 openvalidators/reward/length.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/openvalidators/reward/length.py b/openvalidators/reward/length.py
index 239476b..e99313b 100644
--- a/openvalidators/reward/length.py
+++ b/openvalidators/reward/length.py
@@ -67,8 +67,7 @@ def reward(self, prompt: str, completion: str, name: str) -> float:
                                     return_tensors="pt",
                                     ).to(self.device)
             # Multiply the reward by the length of the completion
-            print(inputs.size())
-            return inputs.size()[1]
+            return inputs['input_ids'].size()[1]
 
     def get_rewards(self, prompt: str, completions: List[str], name: str) -> torch.FloatTensor:
         """

From adfc3869be99b12b8aeb4c002134599fc3bc0014 Mon Sep 17 00:00:00 2001
From: Eugene <etesting007@gmail.com>
Date: Mon, 24 Jul 2023 12:04:55 -0700
Subject: [PATCH 10/15] rewards for additional length + timing

---
 openvalidators/reward/length.py | 104 +++++++++++++++++++++++++++++++-
 openvalidators/reward/timing.py |  86 ++++++++++++++++++++++++--
 2 files changed, 185 insertions(+), 5 deletions(-)

diff --git a/openvalidators/reward/length.py b/openvalidators/reward/length.py
index e99313b..ae62eb9 100644
--- a/openvalidators/reward/length.py
+++ b/openvalidators/reward/length.py
@@ -48,6 +48,7 @@ def __init__(self, device: str):
         self.device = device
         # Load the tokenizer from the pretrained model
         self.tokenizer = AutoTokenizer.from_pretrained(LengthAwareRewardModel.reward_model_path)
+        self.cutoff = 300
 
     def reward(self, prompt: str, completion: str, name: str) -> float:
         """
@@ -67,7 +68,12 @@ def reward(self, prompt: str, completion: str, name: str) -> float:
                                     return_tensors="pt",
                                     ).to(self.device)
             # Multiply the reward by the length of the completion
-            return inputs['input_ids'].size()[1]
+            token_len = inputs['input_ids'].size()[1]
+
+            if token_len > self.cutoff:
+                return torch.tensor([self.cutoff]).to(self.device)
+            else:
+                return token_len
 
     def get_rewards(self, prompt: str, completions: List[str], name: str) -> torch.FloatTensor:
         """
@@ -84,4 +90,100 @@ def get_rewards(self, prompt: str, completions: List[str], name: str) -> torch.F
         # Return the calculated rewards for all completions
         return torch.tensor([self.reward(prompt, completion, name) for completion in completions], dtype=torch.float32).to(self.device)
 
+    def apply(self, prompt: str, responses: List[bt.DendriteCall], name: str) -> torch.FloatTensor:
+        """
+        Applies the reward model across each call. Unsuccessful responses are assigned a reward of zero.
+
+        Parameters:
+            prompt (str): The prompt.
+            responses (List[bt.DendriteCall]): The list of dendrite calls.
+            name (str): The name.
+
+        Returns:
+            torch.FloatTensor: The calculated rewards.
+        """
+        # Get indices of successful responses
+        successful_completions_indices: List[int] = [idx for idx, resp in enumerate(responses) if resp.is_success]
+
+        # Extract the completion text from successful responses
+        successful_completions: List[str] = [responses[idx].completion.strip() for idx in successful_completions_indices]
+
+        # Calculate rewards for each successful completion
+        successful_rewards = self.get_rewards(prompt, successful_completions, name)
+
+        # Normalize the rewards to the 0-1 range (standardize, then erf-based CDF)
+        successful_rewards = self.normalize_rewards(successful_rewards, name)
+
+        # Initialize a tensor of zeros to hold the rewards for all responses
+        filled_rewards = torch.zeros(len(responses), dtype=torch.float32)
+
+        # Fill in the rewards for successful responses
+        for idx, reward in zip(successful_completions_indices, successful_rewards):
+            filled_rewards[idx] = reward
+
+        # Return the tensor of rewards
+        return filled_rewards
+
+    def normalize_rewards(self, rewards: torch.FloatTensor, name: str) -> torch.FloatTensor:
+        """
+        This method normalizes the given rewards by updating the moving mean and
+        variance statistics. The rewards are first standardized, and then scaled to
+        the 0-1 range using a cumulative distribution function (CDF) to ensure they're
+        in a comparable range across different environments.
+
+        Args:
+            rewards (torch.FloatTensor): The reward values to be normalized.
+            name (str): The name to be used as a key for indexing the mean, variance, and count.
+
+        Returns:
+            torch.FloatTensor: The normalized reward values.
+
+        Note:
+            - This function uses Welford's online algorithm to update the mean and variance.
+            - It standardizes the reward values using the updated mean and variance.
+            - It then scales the standardized values to the 0-1 range using the error
+            function (erf) as a CDF.
+        """        
+        # Initialize the mean, variance, and count for a new name
+        if name not in self.mean:
+            self.mean[name] = 0.0
+            self.var[name] = 0.0
+            self.count[name] = 0
+
+        # Get the number of rewards (successful responses).
+        new_count = rewards.numel()
+
+        # Update stats only if there are new rewards.
+        if 0 < new_count and 0 < self.count[name] + new_count:
+            # Calculate the mean and variance of the new rewards.
+            new_mean = rewards.mean()
+            new_var = rewards.var(dim=0)
+
+            # Compute the weights for the new and old rewards.
+            new_weight = new_count / (self.count[name] + new_count)
+            old_weight = self.count[name] / (self.count[name] + new_count)
+
+            # Save the difference in means before updating the old mean.
+            diff = new_mean - self.mean[name]
+
+            # Update the old mean with the new mean and weights.
+            self.mean[name] = new_weight * new_mean + old_weight * self.mean[name]
+            # Update the old variance with the new variance and weights, and adjusting for the difference in means.
+            self.var[name] = (
+                new_weight * new_var
+                + old_weight * self.var[name]
+                + new_weight * old_weight * diff * diff
+            )
+            # Update the old count with the new count, but don't exceed the limit.
+            self.count[name] = min(self.count_limit, self.count[name] + new_count)
+
+        # Standardize the rewards using the updated mean and variance.
+        rewards = rewards - self.mean[name]
+        if self.var[name] > 0:
+            rewards /= torch.sqrt(self.var[name])
+        # Scale the standardized rewards to the range [0, 1] using the error function as a cumulative distribution function (CDF).
+        rewards = 0.5 * (1 + torch.erf(rewards / torch.sqrt(torch.tensor([2.0])).to(rewards.device)))
+
+        return rewards
+
     
\ No newline at end of file
diff --git a/openvalidators/reward/timing.py b/openvalidators/reward/timing.py
index 3e0e8cd..07b23ca 100644
--- a/openvalidators/reward/timing.py
+++ b/openvalidators/reward/timing.py
@@ -41,6 +41,7 @@ def __init__(self, device: str):
         """
         super().__init__()
         self.device = device
+        self.cutoff = 1
 
     def get_rewards(self, prompt: str, completions: List[str], name: str, timing: List[int]) -> torch.FloatTensor:
         """
@@ -56,10 +57,26 @@ def get_rewards(self, prompt: str, completions: List[str], name: str, timing: Li
            torch.FloatTensor: The calculated rewards.
         """
         # Return the calculated rewards for all completions
-        return torch.tensor([-time for time in timing], dtype=torch.float32).to(self.device)
+        return torch.tensor([self.reward(time) for time in timing], dtype=torch.float32).to(self.device)
     
+    def reward(self, time:float) -> float:
+        """
+        Calculate the reward for a single completion from its elapsed time.
+        Times below self.cutoff receive a fixed reward of -20; otherwise the
+        reward is the negated time.
+
+        Parameters:
+           time (float): The elapsed time of the completion.
+
+        Returns:
+           float: The calculated reward.
+        """
+        if time < self.cutoff:
+            return -20
+        else:
+            return -time
         
-    def apply(self, prompt: str, responses: List[bt.DendriteCall], name: str, test=False) -> torch.FloatTensor:
+    def apply(self, prompt: str, responses: List[bt.DendriteCall], name: str) -> torch.FloatTensor:
         """
         Applies the reward model across each call. Unsuccessful responses are assigned a reward of zero.
 
@@ -67,7 +84,6 @@ def apply(self, prompt: str, responses: List[bt.DendriteCall], name: str, test=F
             prompt (str): The prompt.
             responses (List[bt.DendriteCall]): The list of dendrite calls.
             name (str): The name.
-            test (bool): A boolean indicating whether or not this is a test run. Default is False.
 
         Returns:
             torch.FloatTensor: The calculated rewards.
@@ -83,7 +99,7 @@ def apply(self, prompt: str, responses: List[bt.DendriteCall], name: str, test=F
         successful_rewards = self.get_rewards(prompt, successful_completions, name, successful_timings)
 
         # Apply softmax normalization to the rewards
-        successful_rewards = self.normalize_rewards(successful_rewards)
+        successful_rewards = self.normalize_rewards(successful_rewards, name)
 
         # Initialize a tensor of zeros to hold the rewards for all responses
         filled_rewards = torch.zeros(len(responses), dtype=torch.float32)
@@ -94,3 +110,65 @@ def apply(self, prompt: str, responses: List[bt.DendriteCall], name: str, test=F
 
         # Return the tensor of rewards
         return filled_rewards
+
+    def normalize_rewards(self, rewards: torch.FloatTensor, name: str) -> torch.FloatTensor:
+        """
+        This method normalizes the given rewards by updating the moving mean and
+        variance statistics. The rewards are first standardized, and then scaled to
+        the 0-1 range using a cumulative distribution function (CDF) to ensure they're
+        in a comparable range across different environments.
+
+        Args:
+            rewards (torch.FloatTensor): The reward values to be normalized.
+            name (str): The name to be used as a key for indexing the mean, variance, and count.
+
+        Returns:
+            torch.FloatTensor: The normalized reward values.
+
+        Note:
+            - This function uses Welford's online algorithm to update the mean and variance.
+            - It standardizes the reward values using the updated mean and variance.
+            - It then scales the standardized values to the 0-1 range using the error
+            function (erf) as a CDF.
+        """        
+        # Initialize the mean, variance, and count for a new name
+        if name not in self.mean:
+            self.mean[name] = 0.0
+            self.var[name] = 0.0
+            self.count[name] = 0
+
+        # Get the number of rewards (successful responses).
+        new_count = rewards.numel()
+
+        # Update stats only if there are new rewards.
+        if 0 < new_count and 0 < self.count[name] + new_count:
+            # Calculate the mean and variance of the new rewards.
+            new_mean = rewards.mean()
+            new_var = rewards.var(dim=0)
+
+            # Compute the weights for the new and old rewards.
+            new_weight = new_count / (self.count[name] + new_count)
+            old_weight = self.count[name] / (self.count[name] + new_count)
+
+            # Save the difference in means before updating the old mean.
+            diff = new_mean - self.mean[name]
+
+            # Update the old mean with the new mean and weights.
+            self.mean[name] = new_weight * new_mean + old_weight * self.mean[name]
+            # Update the old variance with the new variance and weights, and adjusting for the difference in means.
+            self.var[name] = (
+                new_weight * new_var
+                + old_weight * self.var[name]
+                + new_weight * old_weight * diff * diff
+            )
+            # Update the old count with the new count, but don't exceed the limit.
+            self.count[name] = min(self.count_limit, self.count[name] + new_count)
+
+        # Standardize the rewards using the updated mean and variance.
+        rewards = rewards - self.mean[name]
+        if self.var[name] > 0:
+            rewards /= torch.sqrt(self.var[name])
+        # Scale the standardized rewards to the range [0, 1] using the error function as a cumulative distribution function (CDF).
+        rewards = 0.5 * (1 + torch.erf(rewards / torch.sqrt(torch.tensor([2.0])).to(rewards.device)))
+
+        return rewards

From 2602e177899d951510faab7de99a5a204ac76414 Mon Sep 17 00:00:00 2001
From: Eugene <etesting007@gmail.com>
Date: Mon, 24 Jul 2023 12:55:51 -0700
Subject: [PATCH 11/15] dictionaries

---
 openvalidators/reward/length.py | 5 ++++-
 openvalidators/reward/timing.py | 3 +++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/openvalidators/reward/length.py b/openvalidators/reward/length.py
index ae62eb9..8ae4643 100644
--- a/openvalidators/reward/length.py
+++ b/openvalidators/reward/length.py
@@ -49,7 +49,10 @@ def __init__(self, device: str):
         # Load the tokenizer from the pretrained model
         self.tokenizer = AutoTokenizer.from_pretrained(LengthAwareRewardModel.reward_model_path)
         self.cutoff = 300
-
+        self.count = {}
+        self.mean = {}
+        self.var = {}
+        
     def reward(self, prompt: str, completion: str, name: str) -> float:
         """
         Calculate the reward for a single prompt and completion. The length of the completion is used as an additional factor.
diff --git a/openvalidators/reward/timing.py b/openvalidators/reward/timing.py
index 07b23ca..6e573a9 100644
--- a/openvalidators/reward/timing.py
+++ b/openvalidators/reward/timing.py
@@ -42,6 +42,9 @@ def __init__(self, device: str):
         super().__init__()
         self.device = device
         self.cutoff = 1
+        self.count = {}
+        self.mean = {}
+        self.var = {}
 
     def get_rewards(self, prompt: str, completions: List[str], name: str, timing: List[int]) -> torch.FloatTensor:
         """

From 5abc4e1a0a3aae588b6dac0914355a09897e2b19 Mon Sep 17 00:00:00 2001
From: Eugene <etesting007@gmail.com>
Date: Mon, 24 Jul 2023 12:56:35 -0700
Subject: [PATCH 12/15] import error

---
 openvalidators/reward/length.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/openvalidators/reward/length.py b/openvalidators/reward/length.py
index 8ae4643..bd837ef 100644
--- a/openvalidators/reward/length.py
+++ b/openvalidators/reward/length.py
@@ -21,6 +21,7 @@
 from .config import RewardModelType
 from .reward import BaseRewardModel
 from transformers import  AutoTokenizer, AutoModel
+import bittensor as bt
 
 class LengthAwareRewardModel(BaseRewardModel):
     """
@@ -52,7 +53,7 @@ def __init__(self, device: str):
         self.count = {}
         self.mean = {}
         self.var = {}
-        
+
     def reward(self, prompt: str, completion: str, name: str) -> float:
         """
         Calculate the reward for a single prompt and completion. The length of the completion is used as an additional factor.

From 2a3ba0816081b6e73dc8ca2211461792bfc7c660 Mon Sep 17 00:00:00 2001
From: Eugene <etesting007@gmail.com>
Date: Mon, 24 Jul 2023 15:22:46 -0700
Subject: [PATCH 13/15] test fixes

---
 openvalidators/reward/config.py | 4 ++--
 tests/test_event.py             | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/openvalidators/reward/config.py b/openvalidators/reward/config.py
index 3fe2152..addea83 100644
--- a/openvalidators/reward/config.py
+++ b/openvalidators/reward/config.py
@@ -28,8 +28,8 @@ class RewardModelType(Enum):
     blacklist = 'blacklist_filter'
     nsfw = 'nsfw_filter'
     relevance = 'relevance_filter'
-    time_aware = 'time_aware'
-    length_aware = 'length_aware'
+    time_aware = 'time_aware_reward_model'
+    length_aware = 'length_aware_reward_model'
 
 @dataclass(frozen=True)
 class DefaultRewardFrameworkConfig:
diff --git a/tests/test_event.py b/tests/test_event.py
index 3ad566f..8c1310d 100644
--- a/tests/test_event.py
+++ b/tests/test_event.py
@@ -45,6 +45,8 @@ def test_event_from_dict_all_forward_columns_match(self):
             RewardModelType.rlhf.value: [1.0],
             RewardModelType.prompt.value: [1.0],
             RewardModelType.relevance.value: [1.0],
+            RewardModelType.time_aware.value: [1.0],
+            RewardModelType.length_aware.value: [1.0],
         }
 
         # Act

From 06698bbb8bf4a3995358e81443686b1a476f3951 Mon Sep 17 00:00:00 2001
From: Eugene <etesting007@gmail.com>
Date: Tue, 25 Jul 2023 11:52:35 -0700
Subject: [PATCH 14/15] moving cutoff

---
 openvalidators/reward/length.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/openvalidators/reward/length.py b/openvalidators/reward/length.py
index bd837ef..86911d6 100644
--- a/openvalidators/reward/length.py
+++ b/openvalidators/reward/length.py
@@ -49,7 +49,6 @@ def __init__(self, device: str):
         self.device = device
         # Load the tokenizer from the pretrained model
         self.tokenizer = AutoTokenizer.from_pretrained(LengthAwareRewardModel.reward_model_path)
-        self.cutoff = 300
         self.count = {}
         self.mean = {}
         self.var = {}
@@ -73,9 +72,10 @@ def reward(self, prompt: str, completion: str, name: str) -> float:
                                     ).to(self.device)
             # Multiply the reward by the length of the completion
             token_len = inputs['input_ids'].size()[1]
+            cutoff = self.mean[name] + 2*torch.sqrt(self.var[name])
 
-            if token_len > self.cutoff:
-                return torch.tensor([self.cutoff]).to(self.device)
+            if token_len > cutoff:
+                return torch.tensor([cutoff]).to(self.device)
             else:
                 return token_len
 
@@ -115,7 +115,7 @@ def apply(self, prompt: str, responses: List[bt.DendriteCall], name: str) -> tor
         # Calculate rewards for each successful completion
         successful_rewards = self.get_rewards(prompt, successful_completions, name)
 
-        # Apply softmax normalization to the rewards
+        # Apply normalization to the rewards
         successful_rewards = self.normalize_rewards(successful_rewards, name)
 
         # Initialize a tensor of zeros to hold the rewards for all responses

From 655bc13accb9db8b6484b4caf4d55451fe77b59a Mon Sep 17 00:00:00 2001
From: Eugene <etesting007@gmail.com>
Date: Tue, 25 Jul 2023 12:00:33 -0700
Subject: [PATCH 15/15] adds check

---
 openvalidators/reward/length.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/openvalidators/reward/length.py b/openvalidators/reward/length.py
index 86911d6..a2ed636 100644
--- a/openvalidators/reward/length.py
+++ b/openvalidators/reward/length.py
@@ -72,7 +72,10 @@ def reward(self, prompt: str, completion: str, name: str) -> float:
                                     ).to(self.device)
             # Multiply the reward by the length of the completion
             token_len = inputs['input_ids'].size()[1]
-            cutoff = self.mean[name] + 2*torch.sqrt(self.var[name])
+            if name not in self.mean:
+                cutoff = 300
+            else:
+                cutoff = self.mean[name] + 2*torch.sqrt(self.var[name])
 
             if token_len > cutoff:
                 return torch.tensor([cutoff]).to(self.device)