diff --git a/openvalidators/config.py b/openvalidators/config.py index 6565cef..e635466 100644 --- a/openvalidators/config.py +++ b/openvalidators/config.py @@ -230,6 +230,18 @@ def add_args(cls, parser): help="Dont apply the nsfw reward model", default=False, ) + parser.add_argument( + "--neuron.length_off", + action="store_true", + help="Dont apply the length reward model", + default=False, + ) + parser.add_argument( + "--neuron.time_off", + action="store_true", + help="Dont apply the time reward model", + default=False, + ) parser.add_argument( "--neuron.relevance_off", action="store_true", diff --git a/openvalidators/event.py b/openvalidators/event.py index d790318..abdbaf1 100644 --- a/openvalidators/event.py +++ b/openvalidators/event.py @@ -44,6 +44,8 @@ class EventSchema: rlhf_reward_model: Optional[List[float]] # Output vector of the rlhf reward model prompt_reward_model: Optional[List[float]] # Output vector of the prompt reward model relevance_filter: Optional[List[float]] # Output vector of the relevance scoring reward model + time_aware_reward_model: Optional[List[float]] # Output vector of the time scoring reward model + length_aware_reward_model: Optional[List[float]] # Output vector of the length scoring reward model # Weights data set_weights: Optional[List[List[float]]] @@ -60,6 +62,8 @@ def from_dict(event_dict: dict, disable_log_rewards: bool) -> 'EventSchema': 'diversity_reward_model': event_dict.get(RewardModelType.diversity.value), 'rlhf_reward_model': event_dict.get(RewardModelType.rlhf.value), 'prompt_reward_model': event_dict.get(RewardModelType.prompt.value), + 'time_aware_reward_model' : event_dict.get(RewardModelType.time_aware.value), + 'length_aware_reward_model' : event_dict.get(RewardModelType.length_aware.value), } # Logs warning that expected data was not set properly diff --git a/openvalidators/neuron.py b/openvalidators/neuron.py index f1a2405..9c269f6 100644 --- a/openvalidators/neuron.py +++ b/openvalidators/neuron.py @@ -43,6 +43,8 @@ DiversityRewardModel, PromptRewardModel, RewardModelType, + LengthAwareRewardModel, + TimeAwareRewardModel, ) @@ -196,6 +198,12 @@ def __init__(self): NSFWRewardModel(device=self.device) if not self.config.neuron.nsfw_off else MockRewardModel(RewardModelType.nsfw.value), + LengthAwareRewardModel(device=self.device) + if not self.config.neuron.length_off + else MockRewardModel(RewardModelType.length.value), + TimeAwareRewardModel(device=self.device) + if not self.config.neuron.time_off + else MockRewardModel(RewardModelType.time.value), ] bt.logging.debug(str(self.reward_functions)) diff --git a/openvalidators/reward/__init__.py b/openvalidators/reward/__init__.py index 6a94469..5ae8d5b 100644 --- a/openvalidators/reward/__init__.py +++ b/openvalidators/reward/__init__.py @@ -9,3 +9,5 @@ from .diversity import DiversityRewardModel from .prompt import PromptRewardModel from .config import RewardModelType, DefaultRewardFrameworkConfig +from .length import LengthAwareRewardModel +from .timing import TimeAwareRewardModel \ No newline at end of file diff --git a/openvalidators/reward/config.py b/openvalidators/reward/config.py index 2f4b63b..addea83 100644 --- a/openvalidators/reward/config.py +++ b/openvalidators/reward/config.py @@ -28,7 +28,8 @@ class RewardModelType(Enum): blacklist = 'blacklist_filter' nsfw = 'nsfw_filter' relevance = 'relevance_filter' - + time_aware = 'time_aware_reward_model' + length_aware = 'length_aware_reward_model' @dataclass(frozen=True) class DefaultRewardFrameworkConfig: diff --git a/openvalidators/reward/length.py b/openvalidators/reward/length.py new file mode 100644 index 0000000..a2ed636 --- /dev/null +++ b/openvalidators/reward/length.py @@ -0,0 +1,196 @@ + +# The MIT License (MIT) +# Copyright © 2021 Yuma Rao + +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the “Software”), to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, +# and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of +# the Software. + +# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +# THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +import torch +from typing import List +from .config import RewardModelType +from .reward import BaseRewardModel +from transformers import AutoTokenizer, AutoModel +import bittensor as bt + +class LengthAwareRewardModel(BaseRewardModel): + """ + A model that calculates rewards based on the input prompt, completion, and the length of the completion. + + Attributes: + reward_model_path (str): The path to the pretrained reward model. + revision (str): The revision version of the pretrained model. + """ + reward_model_path: str = "EleutherAI/gpt-j-6b" + + @property + def name(self) -> str: + """The name of the reward model.""" + return RewardModelType.length_aware.value + + def __init__(self, device: str): + """ + The constructor for LengthAwareRewardModel class. + + Parameters: + device (str): The device to which the model will be sent. + """ + super().__init__() + self.device = device + # Load the tokenizer from the pretrained model + self.tokenizer = AutoTokenizer.from_pretrained(LengthAwareRewardModel.reward_model_path) + self.count = {} + self.mean = {} + self.var = {} + + def reward(self, prompt: str, completion: str, name: str) -> float: + """ + Calculate the reward for a single prompt and completion. The length of the completion is used as an additional factor. + + Parameters: + prompt (str): The prompt. + completion (str): The completion. + name (str): The name. + + Returns: + float: The calculated reward. + """ + with torch.no_grad(): + # Tokenize the message + inputs = self.tokenizer(completion, + return_tensors="pt", + ).to(self.device) + # Multiply the reward by the length of the completion + token_len = inputs['input_ids'].size()[1] + if name not in self.mean: + cutoff = 300 + else: + cutoff = self.mean[name] + 2*torch.sqrt(self.var[name]) + + if token_len > cutoff: + return torch.tensor([cutoff]).to(self.device) + else: + return token_len + + def get_rewards(self, prompt: str, completions: List[str], name: str) -> torch.FloatTensor: + """ + Calculate the rewards for multiple completions. The length of each completion is used as an additional factor. + + Parameters: + prompt (str): The prompt. + completions (List[str]): The list of completions. + name (str): The name. + + Returns: + torch.FloatTensor: The calculated rewards. + """ + # Return the calculated rewards for all completions + return torch.tensor([self.reward(prompt, completion, name) for completion in completions], dtype=torch.float32).to(self.device) + + def apply(self, prompt: str, responses: List[bt.DendriteCall], name: str) -> torch.FloatTensor: + """ + Applies the reward model across each call. Unsuccessful responses are assigned a reward of zero. + + Parameters: + prompt (str): The prompt. + responses (List[bt.DendriteCall]): The list of dendrite calls. + name (str): The name. + + Returns: + torch.FloatTensor: The calculated rewards. + """ + # Get indices of successful responses + successful_completions_indices: List[int] = [idx for idx, resp in enumerate(responses) if resp.is_success] + + # Extract the completion text from successful responses + successful_completions: List[str] = [responses[idx].completion.strip() for idx in successful_completions_indices] + + # Calculate rewards for each successful completion + successful_rewards = self.get_rewards(prompt, successful_completions, name) + + # Apply normalization to the rewards + successful_rewards = self.normalize_rewards(successful_rewards, name) + + # Initialize a tensor of zeros to hold the rewards for all responses + filled_rewards = torch.zeros(len(responses), dtype=torch.float32) + + # Fill in the rewards for successful responses + for idx, reward in zip(successful_completions_indices, successful_rewards): + filled_rewards[idx] = reward + + # Return the tensor of rewards + return filled_rewards + + def normalize_rewards(self, rewards: torch.FloatTensor, name: str) -> torch.FloatTensor: + """ + This method normalizes the given rewards by updating the moving mean and + variance statistics. The rewards are first standardized, and then scaled to + the 0-1 range using a cumulative distribution function (CDF) to ensure they're + in a comparable range across different environments. + + Args: + rewards (torch.FloatTensor): The reward values to be normalized. + name (str): The name to be used as a key for indexing the mean, variance, and count. + + Returns: + torch.FloatTensor: The normalized reward values. + + Note: + - This function uses Welford's online algorithm to update the mean and variance. + - It standardizes the reward values using the updated mean and variance. + - It then scales the standardized values to the 0-1 range using the error + function (erf) as a CDF. + """ + # Initialize the mean, variance, and count for a new name + if name not in self.mean: + self.mean[name] = 0.0 + self.var[name] = 0.0 + self.count[name] = 0 + + # Get the number of rewards (successful responses). + new_count = rewards.numel() + + # Update stats only if there are new rewards. + if 0 < new_count and 0 < self.count[name] + new_count: + # Calculate the mean and standard deviation of the new rewards. + new_mean = rewards.mean() + new_var = rewards.var(dim=0) + + # Compute the weights for the new and old rewards. + new_weight = new_count / (self.count[name] + new_count) + old_weight = self.count[name] / (self.count[name] + new_count) + + # Save the difference in means before updating the old mean. + diff = new_mean - self.mean[name] + + # Update the old mean with the new mean and weights. + self.mean[name] = new_weight * new_mean + old_weight * self.mean[name] + # Update the old variance with the new variance and weights, and adjusting for the difference in means. + self.var[name] = ( + new_weight * new_var + + old_weight * self.var[name] + + new_weight * old_weight * diff * diff + ) + # Update the old count with the new count, but don't exceed the limit. + self.count[name] = min(self.count_limit, self.count[name] + new_count) + + # Standardize the rewards using the updated mean and variance. + rewards = rewards - self.mean[name] + if self.var[name] > 0: + rewards /= torch.sqrt(self.var[name]) + # Scale the standardized rewards to the range [0, 1] using the error function as a cumulative distribution function (CDF). + rewards = 0.5 * (1 + torch.erf(rewards / torch.sqrt(torch.tensor([2.0])).to(rewards.device))) + + return rewards + + \ No newline at end of file diff --git a/openvalidators/reward/timing.py b/openvalidators/reward/timing.py new file mode 100644 index 0000000..6e573a9 --- /dev/null +++ b/openvalidators/reward/timing.py @@ -0,0 +1,177 @@ + +# The MIT License (MIT) +# Copyright © 2021 Yuma Rao + +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the “Software”), to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, +# and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of +# the Software. + +# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO +# THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +import torch +from typing import List +from .config import RewardModelType +from .reward import BaseRewardModel +from transformers import AutoTokenizer, AutoModel +import bittensor as bt + +class TimeAwareRewardModel(BaseRewardModel): + """ + A model that calculates rewards based on the input prompt, completion, and time. + """ + @property + def name(self) -> str: + """The name of the reward model.""" + return RewardModelType.time_aware.value + + def __init__(self, device: str): + """ + The constructor for TimeAwareRewardModel class. + + Parameters: + device (str): The device to which the model will be sent. + """ + super().__init__() + self.device = device + self.cutoff = 1 + self.count = {} + self.mean = {} + self.var = {} + + def get_rewards(self, prompt: str, completions: List[str], name: str, timing: List[int]) -> torch.FloatTensor: + """ + Calculate the rewards for multiple completions. + + Parameters: + prompt (str): The prompt. + completions (List[str]): The list of completions. + name (str): The name. + time (str): The time. + + Returns: + torch.FloatTensor: The calculated rewards. + """ + # Return the calculated rewards for all completions + return torch.tensor([self.reward(time) for time in timing], dtype=torch.float32).to(self.device) + + def reward(self, time:float) -> float: + """ + Calculate the reward for a single completion, checks the time and + + Parameters: + time (float): The prompt. + completion (str): The completion. + name (str): The name. + + Returns: + float: The calculated reward. + """ + if time < self.cutoff: + return -20 + else: + return -time + + def apply(self, prompt: str, responses: List[bt.DendriteCall], name: str) -> torch.FloatTensor: + """ + Applies the reward model across each call. Unsuccessful responses are assigned a reward of zero. + + Parameters: + prompt (str): The prompt. + responses (List[bt.DendriteCall]): The list of dendrite calls. + name (str): The name. + + Returns: + torch.FloatTensor: The calculated rewards. + """ + # Get indices of successful responses + successful_completions_indices: List[int] = [idx for idx, resp in enumerate(responses) if resp.is_success] + + # Extract the completion text from successful responses + successful_completions: List[str] = [responses[idx].completion.strip() for idx in successful_completions_indices] + successful_timings: List[str] = [responses[idx].elapsed_time for idx in successful_completions_indices] + + # Calculate rewards for each successful completion + successful_rewards = self.get_rewards(prompt, successful_completions, name, successful_timings) + + # Apply softmax normalization to the rewards + successful_rewards = self.normalize_rewards(successful_rewards, name) + + # Initialize a tensor of zeros to hold the rewards for all responses + filled_rewards = torch.zeros(len(responses), dtype=torch.float32) + + # Fill in the rewards for successful responses + for idx, reward in zip(successful_completions_indices, successful_rewards): + filled_rewards[idx] = reward + + # Return the tensor of rewards + return filled_rewards + + def normalize_rewards(self, rewards: torch.FloatTensor, name: str) -> torch.FloatTensor: + """ + This method normalizes the given rewards by updating the moving mean and + variance statistics. The rewards are first standardized, and then scaled to + the 0-1 range using a cumulative distribution function (CDF) to ensure they're + in a comparable range across different environments. + + Args: + rewards (torch.FloatTensor): The reward values to be normalized. + name (str): The name to be used as a key for indexing the mean, variance, and count. + + Returns: + torch.FloatTensor: The normalized reward values. + + Note: + - This function uses Welford's online algorithm to update the mean and variance. + - It standardizes the reward values using the updated mean and variance. + - It then scales the standardized values to the 0-1 range using the error + function (erf) as a CDF. + """ + # Initialize the mean, variance, and count for a new name + if name not in self.mean: + self.mean[name] = 0.0 + self.var[name] = 0.0 + self.count[name] = 0 + + # Get the number of rewards (successful responses). + new_count = rewards.numel() + + # Update stats only if there are new rewards. + if 0 < new_count and 0 < self.count[name] + new_count: + # Calculate the mean and standard deviation of the new rewards. + new_mean = rewards.mean() + new_var = rewards.var(dim=0) + + # Compute the weights for the new and old rewards. + new_weight = new_count / (self.count[name] + new_count) + old_weight = self.count[name] / (self.count[name] + new_count) + + # Save the difference in means before updating the old mean. + diff = new_mean - self.mean[name] + + # Update the old mean with the new mean and weights. + self.mean[name] = new_weight * new_mean + old_weight * self.mean[name] + # Update the old variance with the new variance and weights, and adjusting for the difference in means. + self.var[name] = ( + new_weight * new_var + + old_weight * self.var[name] + + new_weight * old_weight * diff * diff + ) + # Update the old count with the new count, but don't exceed the limit. + self.count[name] = min(self.count_limit, self.count[name] + new_count) + + # Standardize the rewards using the updated mean and variance. + rewards = rewards - self.mean[name] + if self.var[name] > 0: + rewards /= torch.sqrt(self.var[name]) + # Scale the standardized rewards to the range [0, 1] using the error function as a cumulative distribution function (CDF). + rewards = 0.5 * (1 + torch.erf(rewards / torch.sqrt(torch.tensor([2.0])).to(rewards.device))) + + return rewards diff --git a/tests/test_event.py b/tests/test_event.py index 3ad566f..8c1310d 100644 --- a/tests/test_event.py +++ b/tests/test_event.py @@ -45,6 +45,8 @@ def test_event_from_dict_all_forward_columns_match(self): RewardModelType.rlhf.value: [1.0], RewardModelType.prompt.value: [1.0], RewardModelType.relevance.value: [1.0], + RewardModelType.time_aware.value: [1.0], + RewardModelType.length_aware.value: [1.0], } # Act