12 changes: 12 additions & 0 deletions openvalidators/config.py
@@ -230,6 +230,18 @@ def add_args(cls, parser):
help="Dont apply the nsfw reward model",
default=False,
)
parser.add_argument(
"--neuron.length_off",
action="store_true",
help="Dont apply the length reward model",
default=False,
)
parser.add_argument(
"--neuron.time_off",
action="store_true",
help="Dont apply the time reward model",
default=False,
)
parser.add_argument(
"--neuron.relevance_off",
action="store_true",
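For reference, a minimal standalone sketch of how the two new flags behave (plain argparse only; bittensor's bt.config machinery, which nests these under config.neuron, is not shown):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--neuron.length_off", action="store_true", default=False,
                    help="Don't apply the length reward model")
parser.add_argument("--neuron.time_off", action="store_true", default=False,
                    help="Don't apply the time reward model")

# Both flags default to False, so the new reward models stay enabled unless disabled explicitly.
config = parser.parse_args(["--neuron.length_off"])
print(getattr(config, "neuron.length_off"))  # True
print(getattr(config, "neuron.time_off"))    # False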
4 changes: 4 additions & 0 deletions openvalidators/event.py
@@ -44,6 +44,8 @@ class EventSchema:
rlhf_reward_model: Optional[List[float]] # Output vector of the rlhf reward model
prompt_reward_model: Optional[List[float]] # Output vector of the prompt reward model
relevance_filter: Optional[List[float]] # Output vector of the relevance scoring reward model
time_aware_reward_model: Optional[List[float]] # Output vector of the time scoring reward model
length_aware_reward_model: Optional[List[float]] # Output vector of the length scoring reward model

# Weights data
set_weights: Optional[List[List[float]]]
@@ -60,6 +62,8 @@ def from_dict(event_dict: dict, disable_log_rewards: bool) -> 'EventSchema':
'diversity_reward_model': event_dict.get(RewardModelType.diversity.value),
'rlhf_reward_model': event_dict.get(RewardModelType.rlhf.value),
'prompt_reward_model': event_dict.get(RewardModelType.prompt.value),
'time_aware_reward_model': event_dict.get(RewardModelType.time_aware.value),
'length_aware_reward_model': event_dict.get(RewardModelType.length_aware.value),
}

# Logs warning that expected data was not set properly
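A rough sketch of what the two new columns look like when an event is deserialized; this uses plain dicts keyed by the enum string values rather than the repo's EventSchema, and the scores are only illustrative:

# Keys mirror the enum values added in reward/config.py further down.
event_dict = {
    "length_aware_reward_model": [0.42, 0.77, 0.13],  # one score per completion
    # "time_aware_reward_model" left out to show the fallback
}

row = {
    "time_aware_reward_model": event_dict.get("time_aware_reward_model"),      # None when missing
    "length_aware_reward_model": event_dict.get("length_aware_reward_model"),  # [0.42, 0.77, 0.13]
}
print(row)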
8 changes: 8 additions & 0 deletions openvalidators/neuron.py
@@ -43,6 +43,8 @@
DiversityRewardModel,
PromptRewardModel,
RewardModelType,
LengthAwareRewardModel,
TimeAwareRewardModel,
)


@@ -196,6 +198,12 @@ def __init__(self):
NSFWRewardModel(device=self.device)
if not self.config.neuron.nsfw_off
else MockRewardModel(RewardModelType.nsfw.value),
LengthAwareRewardModel(device=self.device)
if not self.config.neuron.length_off
else MockRewardModel(RewardModelType.length_aware.value),
TimeAwareRewardModel(device=self.device)
if not self.config.neuron.time_off
else MockRewardModel(RewardModelType.time_aware.value),
]
bt.logging.debug(str(self.reward_functions))

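Each scorer keeps its slot in reward_functions even when disabled: the conditional expression swaps the real model for a MockRewardModel carrying the same enum value. A minimal standalone sketch of that toggle (the classes below are placeholders, not the repo's implementations):

class MockRewardModel:
    """Placeholder used when a reward model is switched off."""
    def __init__(self, name: str):
        self.name = name

class LengthAwareRewardModel:
    name = "length_aware_reward_model"
    def __init__(self, device: str = "cpu"):
        self.device = device

length_off = False  # would come from --neuron.length_off

reward_functions = [
    LengthAwareRewardModel(device="cpu")
    if not length_off
    else MockRewardModel("length_aware_reward_model"),
]
print([type(fn).__name__ for fn in reward_functions])  # ['LengthAwareRewardModel']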
2 changes: 2 additions & 0 deletions openvalidators/reward/__init__.py
@@ -9,3 +9,5 @@
from .diversity import DiversityRewardModel
from .prompt import PromptRewardModel
from .config import RewardModelType, DefaultRewardFrameworkConfig
from .length import LengthAwareRewardModel
from .timing import TimeAwareRewardModel
3 changes: 2 additions & 1 deletion openvalidators/reward/config.py
@@ -28,7 +28,8 @@ class RewardModelType(Enum):
blacklist = 'blacklist_filter'
nsfw = 'nsfw_filter'
relevance = 'relevance_filter'

time_aware = 'time_aware_reward_model'
length_aware = 'length_aware_reward_model'

@dataclass(frozen=True)
class DefaultRewardFrameworkConfig:
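Note that the new members are length_aware and time_aware, not length or time, which is why the MockRewardModel fallbacks in neuron.py above use RewardModelType.length_aware / RewardModelType.time_aware. A tiny standalone check with a look-alike enum (the repo module itself isn't imported here):

from enum import Enum

class RewardModelType(Enum):
    time_aware = 'time_aware_reward_model'
    length_aware = 'length_aware_reward_model'

print(RewardModelType.length_aware.value)  # 'length_aware_reward_model'
# RewardModelType.length would raise AttributeError: no such member exists.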
196 changes: 196 additions & 0 deletions openvalidators/reward/length.py
@@ -0,0 +1,196 @@

# The MIT License (MIT)
# Copyright © 2021 Yuma Rao

# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the “Software”), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all copies or substantial portions of
# the Software.

# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
# THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

import torch
from typing import List
from .config import RewardModelType
from .reward import BaseRewardModel
from transformers import AutoTokenizer, AutoModel
import bittensor as bt

class LengthAwareRewardModel(BaseRewardModel):
"""
A reward model that scores each completion by its length in tokens, capped at a running per-name cutoff.

Attributes:
reward_model_path (str): The pretrained model whose tokenizer is used to count completion tokens.
"""
reward_model_path: str = "EleutherAI/gpt-j-6b"

@property
def name(self) -> str:
"""The name of the reward model."""
return RewardModelType.length_aware.value

def __init__(self, device: str):
"""
The constructor for LengthAwareRewardModel class.

Parameters:
device (str): The device to which the model will be sent.
"""
super().__init__()
self.device = device
# Load the tokenizer from the pretrained model
self.tokenizer = AutoTokenizer.from_pretrained(LengthAwareRewardModel.reward_model_path)
self.count = {}
self.mean = {}
self.var = {}

def reward(self, prompt: str, completion: str, name: str) -> float:
"""
Calculate the reward for a single prompt and completion. The reward is the completion's length in tokens, capped at a per-name cutoff.

Parameters:
prompt (str): The prompt.
completion (str): The completion.
name (str): The name.

Returns:
float: The calculated reward.
"""
with torch.no_grad():
# Tokenize the message
inputs = self.tokenizer(completion,
return_tensors="pt",
).to(self.device)
# The reward is simply the completion's length in tokens
token_len = inputs['input_ids'].size()[1]
# Cap at a fixed cutoff until statistics exist for this name, then at
# mean + 2 standard deviations of previously observed lengths
if name not in self.mean:
cutoff = 300.0
else:
cutoff = float(self.mean[name]) + 2.0 * float(self.var[name]) ** 0.5

# Return a plain float so get_rewards can build a single tensor from these values
return min(float(token_len), cutoff)

def get_rewards(self, prompt: str, completions: List[str], name: str) -> torch.FloatTensor:
"""
Calculate the rewards for multiple completions. Each completion is scored by its (capped) length in tokens.

Parameters:
prompt (str): The prompt.
completions (List[str]): The list of completions.
name (str): The name.

Returns:
torch.FloatTensor: The calculated rewards.
"""
# Return the calculated rewards for all completions
return torch.tensor([self.reward(prompt, completion, name) for completion in completions], dtype=torch.float32).to(self.device)

def apply(self, prompt: str, responses: List[bt.DendriteCall], name: str) -> torch.FloatTensor:
"""
Applies the reward model across each call. Unsuccessful responses are assigned a reward of zero.

Parameters:
prompt (str): The prompt.
responses (List[bt.DendriteCall]): The list of dendrite calls.
name (str): The name.

Returns:
torch.FloatTensor: The calculated rewards.
"""
# Get indices of successful responses
successful_completions_indices: List[int] = [idx for idx, resp in enumerate(responses) if resp.is_success]

# Extract the completion text from successful responses
successful_completions: List[str] = [responses[idx].completion.strip() for idx in successful_completions_indices]

# Calculate rewards for each successful completion
successful_rewards = self.get_rewards(prompt, successful_completions, name)

# Apply normalization to the rewards
successful_rewards = self.normalize_rewards(successful_rewards, name)

# Initialize a tensor of zeros to hold the rewards for all responses
filled_rewards = torch.zeros(len(responses), dtype=torch.float32)

# Fill in the rewards for successful responses
for idx, reward in zip(successful_completions_indices, successful_rewards):
filled_rewards[idx] = reward

# Return the tensor of rewards
return filled_rewards

def normalize_rewards(self, rewards: torch.FloatTensor, name: str) -> torch.FloatTensor:
"""
This method normalizes the given rewards by updating the moving mean and
variance statistics. The rewards are first standardized, and then scaled to
the 0-1 range using a cumulative distribution function (CDF) to ensure they're
in a comparable range across different environments.

Args:
rewards (torch.FloatTensor): The reward values to be normalized.
name (str): The name to be used as a key for indexing the mean, variance, and count.

Returns:
torch.FloatTensor: The normalized reward values.

Note:
- This function uses Welford's online algorithm to update the mean and variance.
- It standardizes the reward values using the updated mean and variance.
- It then scales the standardized values to the 0-1 range using the error
function (erf) as a CDF.
"""
# Initialize the mean, variance, and count for a new name
if name not in self.mean:
self.mean[name] = 0.0
self.var[name] = 0.0
self.count[name] = 0

# Get the number of rewards (successful responses).
new_count = rewards.numel()

# Update stats only if there are new rewards.
if 0 < new_count and 0 < self.count[name] + new_count:
# Calculate the mean and standard deviation of the new rewards.
new_mean = rewards.mean()
new_var = rewards.var(dim=0)

# Compute the weights for the new and old rewards.
new_weight = new_count / (self.count[name] + new_count)
old_weight = self.count[name] / (self.count[name] + new_count)

# Save the difference in means before updating the old mean.
diff = new_mean - self.mean[name]

# Update the old mean with the new mean and weights.
self.mean[name] = new_weight * new_mean + old_weight * self.mean[name]
# Update the old variance with the new variance and weights, and adjusting for the difference in means.
self.var[name] = (
new_weight * new_var
+ old_weight * self.var[name]
+ new_weight * old_weight * diff * diff
)
# Update the old count with the new count, but don't exceed the limit.
self.count[name] = min(self.count_limit, self.count[name] + new_count)

# Standardize the rewards using the updated mean and variance.
rewards = rewards - self.mean[name]
if self.var[name] > 0:
rewards /= torch.sqrt(self.var[name])
# Scale the standardized rewards to the range [0, 1] using the error function as a cumulative distribution function (CDF).
rewards = 0.5 * (1 + torch.erf(rewards / torch.sqrt(torch.tensor([2.0])).to(rewards.device)))

return rewards
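To make the scoring above concrete, here is a small self-contained walkthrough of the same capping and erf-based normalization, reimplemented with plain torch; the cutoff matches the default in reward(), but the token counts are only illustrative:

import torch

lengths = torch.tensor([120.0, 45.0, 300.0, 512.0])  # token counts of four completions
cutoff = 300.0                                        # default cutoff before per-name stats exist
rewards = lengths.clamp(max=cutoff)

# Standardize with the batch mean/variance, then squash to [0, 1] via the normal CDF,
# the same erf form used in normalize_rewards().
mean, var = rewards.mean(), rewards.var(dim=0)
z = (rewards - mean) / torch.sqrt(var)
normalized = 0.5 * (1 + torch.erf(z / torch.sqrt(torch.tensor(2.0))))
print(normalized)  # longer (capped) completions land closer to 1.0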


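And a compact sketch of the running-statistics update that normalize_rewards() maintains per name; this is a simplified standalone version, and count_limit here is an illustrative value (the real limit comes from the base reward model, which is not part of this diff):

import torch

mean, var, count = 0.0, 0.0, 0
count_limit = 3000  # illustrative cap, not the repo's actual value

def update(rewards: torch.Tensor) -> None:
    """Blend batch statistics into the running mean/var, Welford-style."""
    global mean, var, count
    n = rewards.numel()
    if n == 0:
        return
    new_mean, new_var = rewards.mean(), rewards.var(dim=0)
    w_new = n / (count + n)
    w_old = count / (count + n)
    diff = new_mean - mean
    mean = w_new * new_mean + w_old * mean
    var = w_new * new_var + w_old * var + w_new * w_old * diff * diff
    count = min(count_limit, count + n)

update(torch.tensor([100.0, 150.0, 200.0]))
update(torch.tensor([300.0, 250.0]))
print(float(mean), float(var), count)  # 200.0 5750.0 5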