From e7bbeb956924fa82149079c8fc7903c4c4a489c9 Mon Sep 17 00:00:00 2001 From: Eugene Date: Thu, 20 Jul 2023 10:07:18 -0700 Subject: [PATCH 01/15] length and time based rewards --- openvalidators/config.py | 12 ++++ openvalidators/neuron.py | 8 +++ openvalidators/reward/__init__.py | 2 + openvalidators/reward/length.py | 88 ++++++++++++++++++++++++++++ openvalidators/reward/timing.py | 95 +++++++++++++++++++++++++++++++ 5 files changed, 205 insertions(+) create mode 100644 openvalidators/reward/length.py create mode 100644 openvalidators/reward/timing.py diff --git a/openvalidators/config.py b/openvalidators/config.py index 6565cef..e635466 100644 --- a/openvalidators/config.py +++ b/openvalidators/config.py @@ -230,6 +230,18 @@ def add_args(cls, parser): help="Dont apply the nsfw reward model", default=False, ) + parser.add_argument( + "--neuron.length_off", + action="store_true", + help="Dont apply the length reward model", + default=False, + ) + parser.add_argument( + "--neuron.time_off", + action="store_true", + help="Dont apply the time reward model", + default=False, + ) parser.add_argument( "--neuron.relevance_off", action="store_true", diff --git a/openvalidators/neuron.py b/openvalidators/neuron.py index f1a2405..9c269f6 100644 --- a/openvalidators/neuron.py +++ b/openvalidators/neuron.py @@ -43,6 +43,8 @@ DiversityRewardModel, PromptRewardModel, RewardModelType, + LengthAwareRewardModel, + TimeAwareRewardModel, ) @@ -196,6 +198,12 @@ def __init__(self): NSFWRewardModel(device=self.device) if not self.config.neuron.nsfw_off else MockRewardModel(RewardModelType.nsfw.value), + LengthAwareRewardModel(device=self.device) + if not self.config.neuron.length_off + else MockRewardModel(RewardModelType.length.value), + TimeAwareRewardModel(device=self.device) + if not self.config.neuron.time_off + else MockRewardModel(RewardModelType.time.value), ] bt.logging.debug(str(self.reward_functions)) diff --git a/openvalidators/reward/__init__.py b/openvalidators/reward/__init__.py index 6a94469..5ae8d5b 100644 --- a/openvalidators/reward/__init__.py +++ b/openvalidators/reward/__init__.py @@ -9,3 +9,5 @@ from .diversity import DiversityRewardModel from .prompt import PromptRewardModel from .config import RewardModelType, DefaultRewardFrameworkConfig +from .length import LengthAwareRewardModel +from .timing import TimeAwareRewardModel \ No newline at end of file diff --git a/openvalidators/reward/length.py b/openvalidators/reward/length.py new file mode 100644 index 0000000..8f5a60f --- /dev/null +++ b/openvalidators/reward/length.py @@ -0,0 +1,88 @@ + +# The MIT License (MIT) +# Copyright © 2021 Yuma Rao + +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the “Software”), to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, +# and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of +# the Software. + +# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +# THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +import torch +from typing import List +from .config import RewardModelType +from .reward import BaseRewardModel +from transformers import AutoTokenizer, AutoModel + +class LengthAwareRewardModel(BaseRewardModel): + """ + A model that calculates rewards based on the input prompt, completion, and the length of the completion. + + Attributes: + reward_model_path (str): The path to the pretrained reward model. + revision (str): The revision version of the pretrained model. + """ + + @property + def name(self) -> str: + """The name of the reward model.""" + return RewardModelType.length_aware.value + + def __init__(self, device: str): + """ + The constructor for LengthAwareRewardModel class. + + Parameters: + device (str): The device to which the model will be sent. + """ + super().__init__() + self.device = device + # Load the tokenizer from the pretrained model + self.tokenizer = AutoTokenizer.from_pretrained(LengthAwareRewardModel.reward_model_path, + revision = LengthAwareRewardModel.revision) + + def reward(self, prompt: str, completion: str, name: str) -> float: + """ + Calculate the reward for a single prompt and completion. The length of the completion is used as an additional factor. + + Parameters: + prompt (str): The prompt. + completion (str): The completion. + name (str): The name. + + Returns: + float: The calculated reward. + """ + with torch.no_grad(): + # Tokenize the message + inputs = self.tokenizer(completion, + return_tensors="pt", + truncation=True, + ).to(self.device) + # Multiply the reward by the length of the completion + return len(inputs) + + def get_rewards(self, prompt: str, completions: List[str], name: str) -> torch.FloatTensor: + """ + Calculate the rewards for multiple completions. The length of each completion is used as an additional factor. + + Parameters: + prompt (str): The prompt. + completions (List[str]): The list of completions. + name (str): The name. + + Returns: + torch.FloatTensor: The calculated rewards. + """ + # Return the calculated rewards for all completions + return torch.tensor([self.reward(prompt, completion, name) for completion in completions], dtype=torch.float32).to(self.device) + + \ No newline at end of file diff --git a/openvalidators/reward/timing.py b/openvalidators/reward/timing.py new file mode 100644 index 0000000..fc525a9 --- /dev/null +++ b/openvalidators/reward/timing.py @@ -0,0 +1,95 @@ + +# The MIT License (MIT) +# Copyright © 2021 Yuma Rao + +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the “Software”), to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, +# and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of +# the Software. + +# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +# THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +import torch +from typing import List +from .config import RewardModelType +from .reward import BaseRewardModel +from transformers import AutoTokenizer, AutoModel + +class TimeAwareRewardModel(BaseRewardModel): + """ + A model that calculates rewards based on the input prompt, completion, and time. + """ + @property + def name(self) -> str: + """The name of the reward model.""" + return RewardModelType.time_aware.value + + def __init__(self, device: str): + """ + The constructor for TimeAwareRewardModel class. + + Parameters: + device (str): The device to which the model will be sent. + """ + super().__init__() + self.device = device + + def get_rewards(self, prompt: str, completions: List[str], name: str, timing: List[int]) -> torch.FloatTensor: + """ + Calculate the rewards for multiple completions. + + Parameters: + prompt (str): The prompt. + completions (List[str]): The list of completions. + name (str): The name. + time (str): The time. + + Returns: + torch.FloatTensor: The calculated rewards. + """ + # Return the calculated rewards for all completions + return torch.tensor([-time for time in timing], dtype=torch.float32).to(self.device) + + + def apply(self, prompt: str, responses: List[bt.DendriteCall], name: str, test=False) -> torch.FloatTensor: + """ + Applies the reward model across each call. Unsuccessful responses are assigned a reward of zero. + + Parameters: + prompt (str): The prompt. + responses (List[bt.DendriteCall]): The list of dendrite calls. + name (str): The name. + test (bool): A boolean indicating whether or not this is a test run. Default is False. + + Returns: + torch.FloatTensor: The calculated rewards. + """ + # Get indices of successful responses + successful_completions_indices: List[int] = [idx for idx, resp in enumerate(responses) if resp.is_success] + + # Extract the completion text from successful responses + successful_completions: List[str] = [responses[idx].completion.strip() for idx in successful_completions_indices] + successful_timings: List[str] = [responses[idx].elapsed_time for idx in successful_completions_indices] + + # Calculate rewards for each successful completion + successful_rewards = self.get_rewards(prompt, successful_completions, name, successful_timings) + + # Apply softmax normalization to the rewards + successful_rewards = self.normalize_rewards(successful_rewards, test) + + # Initialize a tensor of zeros to hold the rewards for all responses + filled_rewards = torch.zeros(len(responses), dtype=torch.float32) + + # Fill in the rewards for successful responses + for idx, reward in zip(successful_completions_indices, successful_rewards): + filled_rewards[idx] = reward + + # Return the tensor of rewards + return filled_rewards From ddd1a6749ab5225230b9fb1a7123994523a5a6c1 Mon Sep 17 00:00:00 2001 From: Eugene Date: Thu, 20 Jul 2023 10:14:09 -0700 Subject: [PATCH 02/15] bt --- openvalidators/reward/timing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/openvalidators/reward/timing.py b/openvalidators/reward/timing.py index fc525a9..b3f96e9 100644 --- a/openvalidators/reward/timing.py +++ b/openvalidators/reward/timing.py @@ -21,6 +21,7 @@ from .config import RewardModelType from .reward import BaseRewardModel from transformers import AutoTokenizer, AutoModel +import bittensor as bt class TimeAwareRewardModel(BaseRewardModel): """ From 886e804e0abf7ad04b97c0fc7bf4651bde5f9f17 Mon Sep 17 00:00:00 2001 From: Eugene Date: Thu, 20 Jul 2023 10:18:43 -0700 Subject: [PATCH 03/15] gpt j tokenizer --- openvalidators/reward/length.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/openvalidators/reward/length.py b/openvalidators/reward/length.py index 8f5a60f..0fc7e81 100644 --- a/openvalidators/reward/length.py +++ b/openvalidators/reward/length.py @@ -30,7 +30,8 @@ class LengthAwareRewardModel(BaseRewardModel): reward_model_path (str): The path to the pretrained reward model. revision (str): The revision version of the pretrained model. """ - + reward_model_path: str = "EleutherAI/gpt-j-6b" + @property def name(self) -> str: """The name of the reward model.""" @@ -46,8 +47,7 @@ def __init__(self, device: str): super().__init__() self.device = device # Load the tokenizer from the pretrained model - self.tokenizer = AutoTokenizer.from_pretrained(LengthAwareRewardModel.reward_model_path, - revision = LengthAwareRewardModel.revision) + self.tokenizer = AutoTokenizer.from_pretrained(LengthAwareRewardModel.reward_model_path) def reward(self, prompt: str, completion: str, name: str) -> float: """ From 82433365f4e3a236019a5d29aaa67675a99c7810 Mon Sep 17 00:00:00 2001 From: Eugene Date: Thu, 20 Jul 2023 10:30:10 -0700 Subject: [PATCH 04/15] length and time aware --- openvalidators/reward/config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openvalidators/reward/config.py b/openvalidators/reward/config.py index 2f4b63b..3fe2152 100644 --- a/openvalidators/reward/config.py +++ b/openvalidators/reward/config.py @@ -28,7 +28,8 @@ class RewardModelType(Enum): blacklist = 'blacklist_filter' nsfw = 'nsfw_filter' relevance = 'relevance_filter' - + time_aware = 'time_aware' + length_aware = 'length_aware' @dataclass(frozen=True) class DefaultRewardFrameworkConfig: From ae1740e9c794bda3dc9688df2cb29af6dee00cee Mon Sep 17 00:00:00 2001 From: Eugene Date: Thu, 20 Jul 2023 10:39:30 -0700 Subject: [PATCH 05/15] remove test --- openvalidators/reward/timing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openvalidators/reward/timing.py b/openvalidators/reward/timing.py index b3f96e9..3e0e8cd 100644 --- a/openvalidators/reward/timing.py +++ b/openvalidators/reward/timing.py @@ -83,7 +83,7 @@ def apply(self, prompt: str, responses: List[bt.DendriteCall], name: str, test=F successful_rewards = self.get_rewards(prompt, successful_completions, name, successful_timings) # Apply softmax normalization to the rewards - successful_rewards = self.normalize_rewards(successful_rewards, test) + successful_rewards = self.normalize_rewards(successful_rewards) # Initialize a tensor of zeros to hold the rewards for all responses filled_rewards = torch.zeros(len(responses), dtype=torch.float32) From 1e7c1903b5db378997a7252e64c26a5deebbb028 Mon Sep 17 00:00:00 2001 From: Eugene Date: Thu, 20 Jul 2023 12:22:20 -0700 Subject: [PATCH 06/15] event update --- openvalidators/event.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/openvalidators/event.py b/openvalidators/event.py index d790318..abdbaf1 100644 --- a/openvalidators/event.py +++ b/openvalidators/event.py @@ -44,6 +44,8 @@ class EventSchema: rlhf_reward_model: Optional[List[float]] # Output vector of the rlhf reward model prompt_reward_model: Optional[List[float]] # Output vector of the prompt reward model relevance_filter: Optional[List[float]] # Output vector of the relevance scoring reward model + time_aware_reward_model: Optional[List[float]] # Output vector of the time scoring reward model + length_aware_reward_model: Optional[List[float]] # Output vector of the length scoring reward model # Weights data set_weights: Optional[List[List[float]]] @@ -60,6 +62,8 @@ def from_dict(event_dict: dict, disable_log_rewards: bool) -> 'EventSchema': 'diversity_reward_model': event_dict.get(RewardModelType.diversity.value), 'rlhf_reward_model': event_dict.get(RewardModelType.rlhf.value), 'prompt_reward_model': event_dict.get(RewardModelType.prompt.value), + 'time_aware_reward_model' : event_dict.get(RewardModelType.time_aware.value), + 'length_aware_reward_model' : event_dict.get(RewardModelType.length_aware.value), } # Logs warning that expected data was not set properly From c45d19971ce6d03288d47b51cb226b10b3ed06c3 Mon Sep 17 00:00:00 2001 From: Eugene Date: Thu, 20 Jul 2023 15:17:31 -0700 Subject: [PATCH 07/15] remove truncation --- openvalidators/reward/length.py | 1 - 1 file changed, 1 deletion(-) diff --git a/openvalidators/reward/length.py b/openvalidators/reward/length.py index 0fc7e81..e58cfe5 100644 --- a/openvalidators/reward/length.py +++ b/openvalidators/reward/length.py @@ -65,7 +65,6 @@ def reward(self, prompt: str, completion: str, name: str) -> float: # Tokenize the message inputs = self.tokenizer(completion, return_tensors="pt", - truncation=True, ).to(self.device) # Multiply the reward by the length of the completion return len(inputs) From 69b92e8f68050e73a69b34ae5d2d30303deaf870 Mon Sep 17 00:00:00 2001 From: Eugene Date: Thu, 20 Jul 2023 15:23:49 -0700 Subject: [PATCH 08/15] inputs size --- openvalidators/reward/length.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openvalidators/reward/length.py b/openvalidators/reward/length.py index e58cfe5..239476b 100644 --- a/openvalidators/reward/length.py +++ b/openvalidators/reward/length.py @@ -67,7 +67,8 @@ def reward(self, prompt: str, completion: str, name: str) -> float: return_tensors="pt", ).to(self.device) # Multiply the reward by the length of the completion - return len(inputs) + print(inputs.size()) + return inputs.size()[1] def get_rewards(self, prompt: str, completions: List[str], name: str) -> torch.FloatTensor: """ From 1d32cf435b235fbf8d0c887cbf641089ba304e18 Mon Sep 17 00:00:00 2001 From: Eugene Date: Thu, 20 Jul 2023 15:30:56 -0700 Subject: [PATCH 09/15] inputs size --- openvalidators/reward/length.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/openvalidators/reward/length.py b/openvalidators/reward/length.py index 239476b..e99313b 100644 --- a/openvalidators/reward/length.py +++ b/openvalidators/reward/length.py @@ -67,8 +67,7 @@ def reward(self, prompt: str, completion: str, name: str) -> float: return_tensors="pt", ).to(self.device) # Multiply the reward by the length of the completion - print(inputs.size()) - return inputs.size()[1] + return inputs['input_ids'].size()[1] def get_rewards(self, prompt: str, completions: List[str], name: str) -> torch.FloatTensor: """ From adfc3869be99b12b8aeb4c002134599fc3bc0014 Mon Sep 17 00:00:00 2001 From: Eugene Date: Mon, 24 Jul 2023 12:04:55 -0700 Subject: [PATCH 10/15] rewards for additional length + timing --- openvalidators/reward/length.py | 104 +++++++++++++++++++++++++++++++- openvalidators/reward/timing.py | 86 ++++++++++++++++++++++++-- 2 files changed, 185 insertions(+), 5 deletions(-) diff --git a/openvalidators/reward/length.py b/openvalidators/reward/length.py index e99313b..ae62eb9 100644 --- a/openvalidators/reward/length.py +++ b/openvalidators/reward/length.py @@ -48,6 +48,7 @@ def __init__(self, device: str): self.device = device # Load the tokenizer from the pretrained model self.tokenizer = AutoTokenizer.from_pretrained(LengthAwareRewardModel.reward_model_path) + self.cutoff = 300 def reward(self, prompt: str, completion: str, name: str) -> float: """ @@ -67,7 +68,12 @@ def reward(self, prompt: str, completion: str, name: str) -> float: return_tensors="pt", ).to(self.device) # Multiply the reward by the length of the completion - return inputs['input_ids'].size()[1] + token_len = inputs['input_ids'].size()[1] + + if token_len > self.cutoff: + return torch.tensor([self.cutoff]).to(self.device) + else: + return token_len def get_rewards(self, prompt: str, completions: List[str], name: str) -> torch.FloatTensor: """ @@ -84,4 +90,100 @@ def get_rewards(self, prompt: str, completions: List[str], name: str) -> torch.F # Return the calculated rewards for all completions return torch.tensor([self.reward(prompt, completion, name) for completion in completions], dtype=torch.float32).to(self.device) + def apply(self, prompt: str, responses: List[bt.DendriteCall], name: str) -> torch.FloatTensor: + """ + Applies the reward model across each call. Unsuccessful responses are assigned a reward of zero. + + Parameters: + prompt (str): The prompt. + responses (List[bt.DendriteCall]): The list of dendrite calls. + name (str): The name. + + Returns: + torch.FloatTensor: The calculated rewards. + """ + # Get indices of successful responses + successful_completions_indices: List[int] = [idx for idx, resp in enumerate(responses) if resp.is_success] + + # Extract the completion text from successful responses + successful_completions: List[str] = [responses[idx].completion.strip() for idx in successful_completions_indices] + + # Calculate rewards for each successful completion + successful_rewards = self.get_rewards(prompt, successful_completions, name) + + # Apply softmax normalization to the rewards + successful_rewards = self.normalize_rewards(successful_rewards, name) + + # Initialize a tensor of zeros to hold the rewards for all responses + filled_rewards = torch.zeros(len(responses), dtype=torch.float32) + + # Fill in the rewards for successful responses + for idx, reward in zip(successful_completions_indices, successful_rewards): + filled_rewards[idx] = reward + + # Return the tensor of rewards + return filled_rewards + + def normalize_rewards(self, rewards: torch.FloatTensor, name: str) -> torch.FloatTensor: + """ + This method normalizes the given rewards by updating the moving mean and + variance statistics. The rewards are first standardized, and then scaled to + the 0-1 range using a cumulative distribution function (CDF) to ensure they're + in a comparable range across different environments. + + Args: + rewards (torch.FloatTensor): The reward values to be normalized. + name (str): The name to be used as a key for indexing the mean, variance, and count. + + Returns: + torch.FloatTensor: The normalized reward values. + + Note: + - This function uses Welford's online algorithm to update the mean and variance. + - It standardizes the reward values using the updated mean and variance. + - It then scales the standardized values to the 0-1 range using the error + function (erf) as a CDF. + """ + # Initialize the mean, variance, and count for a new name + if name not in self.mean: + self.mean[name] = 0.0 + self.var[name] = 0.0 + self.count[name] = 0 + + # Get the number of rewards (successful responses). + new_count = rewards.numel() + + # Update stats only if there are new rewards. + if 0 < new_count and 0 < self.count[name] + new_count: + # Calculate the mean and standard deviation of the new rewards. + new_mean = rewards.mean() + new_var = rewards.var(dim=0) + + # Compute the weights for the new and old rewards. + new_weight = new_count / (self.count[name] + new_count) + old_weight = self.count[name] / (self.count[name] + new_count) + + # Save the difference in means before updating the old mean. + diff = new_mean - self.mean[name] + + # Update the old mean with the new mean and weights. + self.mean[name] = new_weight * new_mean + old_weight * self.mean[name] + # Update the old variance with the new variance and weights, and adjusting for the difference in means. + self.var[name] = ( + new_weight * new_var + + old_weight * self.var[name] + + new_weight * old_weight * diff * diff + ) + # Update the old count with the new count, but don't exceed the limit. + self.count[name] = min(self.count_limit, self.count[name] + new_count) + + # Standardize the rewards using the updated mean and variance. + rewards = rewards - self.mean[name] + if self.var[name] > 0: + rewards /= torch.sqrt(self.var[name]) + # Scale the standardized rewards to the range [0, 1] using the error function as a cumulative distribution function (CDF). + rewards = 0.5 * (1 + torch.erf(rewards / torch.sqrt(torch.tensor([2.0])).to(rewards.device))) + + return rewards + \ No newline at end of file diff --git a/openvalidators/reward/timing.py b/openvalidators/reward/timing.py index 3e0e8cd..07b23ca 100644 --- a/openvalidators/reward/timing.py +++ b/openvalidators/reward/timing.py @@ -41,6 +41,7 @@ def __init__(self, device: str): """ super().__init__() self.device = device + self.cutoff = 1 def get_rewards(self, prompt: str, completions: List[str], name: str, timing: List[int]) -> torch.FloatTensor: """ @@ -56,10 +57,26 @@ def get_rewards(self, prompt: str, completions: List[str], name: str, timing: Li torch.FloatTensor: The calculated rewards. """ # Return the calculated rewards for all completions - return torch.tensor([-time for time in timing], dtype=torch.float32).to(self.device) + return torch.tensor([self.reward(time) for time in timing], dtype=torch.float32).to(self.device) + def reward(self, time:float) -> float: + """ + Calculate the reward for a single completion, checks the time and + + Parameters: + time (float): The prompt. + completion (str): The completion. + name (str): The name. + + Returns: + float: The calculated reward. + """ + if time < self.cutoff: + return -20 + else: + return -time - def apply(self, prompt: str, responses: List[bt.DendriteCall], name: str, test=False) -> torch.FloatTensor: + def apply(self, prompt: str, responses: List[bt.DendriteCall], name: str) -> torch.FloatTensor: """ Applies the reward model across each call. Unsuccessful responses are assigned a reward of zero. @@ -67,7 +84,6 @@ def apply(self, prompt: str, responses: List[bt.DendriteCall], name: str, test=F prompt (str): The prompt. responses (List[bt.DendriteCall]): The list of dendrite calls. name (str): The name. - test (bool): A boolean indicating whether or not this is a test run. Default is False. Returns: torch.FloatTensor: The calculated rewards. @@ -83,7 +99,7 @@ def apply(self, prompt: str, responses: List[bt.DendriteCall], name: str, test=F successful_rewards = self.get_rewards(prompt, successful_completions, name, successful_timings) # Apply softmax normalization to the rewards - successful_rewards = self.normalize_rewards(successful_rewards) + successful_rewards = self.normalize_rewards(successful_rewards, name) # Initialize a tensor of zeros to hold the rewards for all responses filled_rewards = torch.zeros(len(responses), dtype=torch.float32) @@ -94,3 +110,65 @@ def apply(self, prompt: str, responses: List[bt.DendriteCall], name: str, test=F # Return the tensor of rewards return filled_rewards + + def normalize_rewards(self, rewards: torch.FloatTensor, name: str) -> torch.FloatTensor: + """ + This method normalizes the given rewards by updating the moving mean and + variance statistics. The rewards are first standardized, and then scaled to + the 0-1 range using a cumulative distribution function (CDF) to ensure they're + in a comparable range across different environments. + + Args: + rewards (torch.FloatTensor): The reward values to be normalized. + name (str): The name to be used as a key for indexing the mean, variance, and count. + + Returns: + torch.FloatTensor: The normalized reward values. + + Note: + - This function uses Welford's online algorithm to update the mean and variance. + - It standardizes the reward values using the updated mean and variance. + - It then scales the standardized values to the 0-1 range using the error + function (erf) as a CDF. + """ + # Initialize the mean, variance, and count for a new name + if name not in self.mean: + self.mean[name] = 0.0 + self.var[name] = 0.0 + self.count[name] = 0 + + # Get the number of rewards (successful responses). + new_count = rewards.numel() + + # Update stats only if there are new rewards. + if 0 < new_count and 0 < self.count[name] + new_count: + # Calculate the mean and standard deviation of the new rewards. + new_mean = rewards.mean() + new_var = rewards.var(dim=0) + + # Compute the weights for the new and old rewards. + new_weight = new_count / (self.count[name] + new_count) + old_weight = self.count[name] / (self.count[name] + new_count) + + # Save the difference in means before updating the old mean. + diff = new_mean - self.mean[name] + + # Update the old mean with the new mean and weights. + self.mean[name] = new_weight * new_mean + old_weight * self.mean[name] + # Update the old variance with the new variance and weights, and adjusting for the difference in means. + self.var[name] = ( + new_weight * new_var + + old_weight * self.var[name] + + new_weight * old_weight * diff * diff + ) + # Update the old count with the new count, but don't exceed the limit. + self.count[name] = min(self.count_limit, self.count[name] + new_count) + + # Standardize the rewards using the updated mean and variance. + rewards = rewards - self.mean[name] + if self.var[name] > 0: + rewards /= torch.sqrt(self.var[name]) + # Scale the standardized rewards to the range [0, 1] using the error function as a cumulative distribution function (CDF). + rewards = 0.5 * (1 + torch.erf(rewards / torch.sqrt(torch.tensor([2.0])).to(rewards.device))) + + return rewards From 2602e177899d951510faab7de99a5a204ac76414 Mon Sep 17 00:00:00 2001 From: Eugene Date: Mon, 24 Jul 2023 12:55:51 -0700 Subject: [PATCH 11/15] dictionaries --- openvalidators/reward/length.py | 5 ++++- openvalidators/reward/timing.py | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/openvalidators/reward/length.py b/openvalidators/reward/length.py index ae62eb9..8ae4643 100644 --- a/openvalidators/reward/length.py +++ b/openvalidators/reward/length.py @@ -49,7 +49,10 @@ def __init__(self, device: str): # Load the tokenizer from the pretrained model self.tokenizer = AutoTokenizer.from_pretrained(LengthAwareRewardModel.reward_model_path) self.cutoff = 300 - + self.count = {} + self.mean = {} + self.var = {} + def reward(self, prompt: str, completion: str, name: str) -> float: """ Calculate the reward for a single prompt and completion. The length of the completion is used as an additional factor. diff --git a/openvalidators/reward/timing.py b/openvalidators/reward/timing.py index 07b23ca..6e573a9 100644 --- a/openvalidators/reward/timing.py +++ b/openvalidators/reward/timing.py @@ -42,6 +42,9 @@ def __init__(self, device: str): super().__init__() self.device = device self.cutoff = 1 + self.count = {} + self.mean = {} + self.var = {} def get_rewards(self, prompt: str, completions: List[str], name: str, timing: List[int]) -> torch.FloatTensor: """ From 5abc4e1a0a3aae588b6dac0914355a09897e2b19 Mon Sep 17 00:00:00 2001 From: Eugene Date: Mon, 24 Jul 2023 12:56:35 -0700 Subject: [PATCH 12/15] import error --- openvalidators/reward/length.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openvalidators/reward/length.py b/openvalidators/reward/length.py index 8ae4643..bd837ef 100644 --- a/openvalidators/reward/length.py +++ b/openvalidators/reward/length.py @@ -21,6 +21,7 @@ from .config import RewardModelType from .reward import BaseRewardModel from transformers import AutoTokenizer, AutoModel +import bittensor as bt class LengthAwareRewardModel(BaseRewardModel): """ @@ -52,7 +53,7 @@ def __init__(self, device: str): self.count = {} self.mean = {} self.var = {} - + def reward(self, prompt: str, completion: str, name: str) -> float: """ Calculate the reward for a single prompt and completion. The length of the completion is used as an additional factor. From 2a3ba0816081b6e73dc8ca2211461792bfc7c660 Mon Sep 17 00:00:00 2001 From: Eugene Date: Mon, 24 Jul 2023 15:22:46 -0700 Subject: [PATCH 13/15] test fixes --- openvalidators/reward/config.py | 4 ++-- tests/test_event.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/openvalidators/reward/config.py b/openvalidators/reward/config.py index 3fe2152..addea83 100644 --- a/openvalidators/reward/config.py +++ b/openvalidators/reward/config.py @@ -28,8 +28,8 @@ class RewardModelType(Enum): blacklist = 'blacklist_filter' nsfw = 'nsfw_filter' relevance = 'relevance_filter' - time_aware = 'time_aware' - length_aware = 'length_aware' + time_aware = 'time_aware_reward_model' + length_aware = 'length_aware_reward_model' @dataclass(frozen=True) class DefaultRewardFrameworkConfig: diff --git a/tests/test_event.py b/tests/test_event.py index 3ad566f..8c1310d 100644 --- a/tests/test_event.py +++ b/tests/test_event.py @@ -45,6 +45,8 @@ def test_event_from_dict_all_forward_columns_match(self): RewardModelType.rlhf.value: [1.0], RewardModelType.prompt.value: [1.0], RewardModelType.relevance.value: [1.0], + RewardModelType.time_aware.value: [1.0], + RewardModelType.length_aware.value: [1.0], } # Act From 06698bbb8bf4a3995358e81443686b1a476f3951 Mon Sep 17 00:00:00 2001 From: Eugene Date: Tue, 25 Jul 2023 11:52:35 -0700 Subject: [PATCH 14/15] moving cutoff --- openvalidators/reward/length.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/openvalidators/reward/length.py b/openvalidators/reward/length.py index bd837ef..86911d6 100644 --- a/openvalidators/reward/length.py +++ b/openvalidators/reward/length.py @@ -49,7 +49,6 @@ def __init__(self, device: str): self.device = device # Load the tokenizer from the pretrained model self.tokenizer = AutoTokenizer.from_pretrained(LengthAwareRewardModel.reward_model_path) - self.cutoff = 300 self.count = {} self.mean = {} self.var = {} @@ -73,9 +72,10 @@ def reward(self, prompt: str, completion: str, name: str) -> float: ).to(self.device) # Multiply the reward by the length of the completion token_len = inputs['input_ids'].size()[1] + cutoff = self.mean[name] + 2*torch.sqrt(self.var[name]) - if token_len > self.cutoff: - return torch.tensor([self.cutoff]).to(self.device) + if token_len > cutoff: + return torch.tensor([cutoff]).to(self.device) else: return token_len @@ -115,7 +115,7 @@ def apply(self, prompt: str, responses: List[bt.DendriteCall], name: str) -> tor # Calculate rewards for each successful completion successful_rewards = self.get_rewards(prompt, successful_completions, name) - # Apply softmax normalization to the rewards + # Apply normalization to the rewards successful_rewards = self.normalize_rewards(successful_rewards, name) # Initialize a tensor of zeros to hold the rewards for all responses From 655bc13accb9db8b6484b4caf4d55451fe77b59a Mon Sep 17 00:00:00 2001 From: Eugene Date: Tue, 25 Jul 2023 12:00:33 -0700 Subject: [PATCH 15/15] adds check --- openvalidators/reward/length.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/openvalidators/reward/length.py b/openvalidators/reward/length.py index 86911d6..a2ed636 100644 --- a/openvalidators/reward/length.py +++ b/openvalidators/reward/length.py @@ -72,7 +72,10 @@ def reward(self, prompt: str, completion: str, name: str) -> float: ).to(self.device) # Multiply the reward by the length of the completion token_len = inputs['input_ids'].size()[1] - cutoff = self.mean[name] + 2*torch.sqrt(self.var[name]) + if name not in self.mean: + cutoff = 300 + else: + cutoff = self.mean[name] + 2*torch.sqrt(self.var[name]) if token_len > cutoff: return torch.tensor([cutoff]).to(self.device)