Merged
37 commits
baf42de
Add DirectPreferenceRewardModel
opentaco Jul 20, 2023
8fe9acc
Maint comment update
opentaco Jul 20, 2023
01f0c3f
Support Llama-2
opentaco Jul 20, 2023
4b2d7fe
Update token handling
opentaco Jul 20, 2023
59b7ec0
Update requirements
opentaco Jul 20, 2023
d1e1f9f
Update requirements
opentaco Jul 20, 2023
e8f7559
Add trace
opentaco Jul 21, 2023
7e8a0d0
Handle nan/inf
opentaco Jul 21, 2023
2996a22
Use BTLM and check max seq length
opentaco Jul 27, 2023
4d9ef17
Merge branch 'staging' into feature/dpo-rewards
opentaco Jul 27, 2023
247785c
Update README.md
steffencruz Aug 14, 2023
44fb6c0
Need to use print exception instead
camfairchild Aug 21, 2023
e5bab03
update wandb config
p-ferreira Aug 23, 2023
baf3f63
adds deep copy
p-ferreira Aug 23, 2023
c1f0cc7
Merge branch 'staging' into feature/dpo-rewards
opentaco Aug 24, 2023
e1519d1
Add DPO to test
opentaco Aug 24, 2023
23d62ea
added original reward to wandb
isabella618033 Aug 24, 2023
21dbce9
naming convention
isabella618033 Aug 24, 2023
6286e9c
Merge pull request #99 from opentensor/feature/dpo-rewards
opentaco Aug 24, 2023
a497d3d
Merge remote-tracking branch 'origin/staging' into add_original_rewar…
isabella618033 Aug 24, 2023
ea3f44d
big fix
isabella618033 Aug 25, 2023
2c10303
penalty update
Eugene-hu Aug 25, 2023
e8a65f2
dpo 1.2
Eugene-hu Aug 25, 2023
49da134
logging fix
isabella618033 Aug 25, 2023
39e5b75
added normalized values to event
isabella618033 Aug 25, 2023
9a2d704
Merge pull request #139 from opentensor/add_original_reward_to_wandb
isabella618033 Aug 25, 2023
b156a6d
comment
Eugene-hu Aug 25, 2023
bd315ec
Merge pull request #138 from opentensor/dpo_penalty_update
Eugene-hu Aug 25, 2023
c9dbf4d
reweighting
Eugene-hu Aug 25, 2023
e513a97
small reweight
Eugene-hu Aug 28, 2023
dc65c72
Merge pull request #140 from opentensor/dpo-reweighting
Eugene-hu Aug 28, 2023
3f87902
Merge pull request #129 from opentensor/features/readme-remove-stale-…
Eugene-hu Aug 28, 2023
79ace50
Merge pull request #137 from opentensor/hotfix/wandb-config
p-ferreira Aug 28, 2023
8fa0b21
Merge pull request #135 from opentensor/use-print-exception-instead
Eugene-hu Aug 28, 2023
dfc2174
version update
Eugene-hu Aug 28, 2023
b231246
Merge branch 'staging' of https://github.com/opentensor/validators in…
Eugene-hu Aug 28, 2023
be21a17
updates changelog
p-ferreira Aug 28, 2023
19 changes: 15 additions & 4 deletions CHANGELOG.md
@@ -1,13 +1,24 @@
# Changelog

## 1.1.8 / 2023-08-12
## 1.2.0 / 2023-08-28
### What's Changed
- Adds Direct Preference Optimization (DPO) style rewards by @opentaco on #99
- Changes print format on exception catch by @camfairchild on #135
- Brings back netuid and wandb to logged config by @p-ferreira on #137
- Adds DPO penalty update by @Eugene-hu on #138
- Adds original reward output to wandb logs by @isabella618033 on #139
- Reweights reward models by @Eugene-hu on #140
- Updates stale documentation by @steffencruz on #129

## What's Changed
- Make sure to serve axon first by @camfairchild in 14921d35c


**Full Changelog**: https://github.com/opentensor/validators/compare/v1.1.7...v1.1.8
**Full Changelog**: https://github.com/opentensor/validators/compare/v1.1.7...v1.2.0

## 1.1.8 / 2023-08-12
### What's Changed
- Make sure to serve axon first by @camfairchild in 14921d35c
- Adds scripts for releases on github by @camfairchild in #128
- Wandb config log changes by @isabella618033 in #132

## 1.1.7 / 2023-08-11
### What’s Changed
7 changes: 0 additions & 7 deletions README.md
@@ -118,13 +118,6 @@ Check the [README of the data collector](./scripts/README.md) for more informati

----
## Experimental Features
### Prompt-Based Scoring
The reward mechanism for miner completions plays a crucial role in the overall quality of the network. As such, we are constantly developing and testing new methods that make the reward process **open** and **robust**. This benefits everyone. Presently, miners' weights are set based on evaluations of their completions that are carried out by a reward model. This presents two major challenges:

1. Reward model evaluations are a bottleneck, owing to the large model size
2. Reward models are vulnerable to attacks, which reduces the network quality for everyone

Consequently, validators also perform *shadow scoring*, which outsources the reward mechanism to the network. This feature is currently under development, and so the prompt-based scores are only used for research purposes.

## Sentence Embedding Gating Model
Another cornerstone of the validator functionality is the use of a mixture of experts (MoE) model, which we call the gating model, to enable queries to be efficiently routed to the best-suited miners. **This incentivizes miners to become specialists, which in turn improves response quality**. It also reduces latency and addresses bandwidth issues in the network.
2 changes: 1 addition & 1 deletion openvalidators/__init__.py
@@ -28,6 +28,6 @@
from . import weights
from . import event

__version__ = "1.1.8"
__version__ = "1.2.0"
version_split = __version__.split(".")
__spec_version__ = (1000 * int(version_split[0])) + (10 * int(version_split[1])) + (1 * int(version_split[2]))
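With __version__ = "1.2.0", this expression evaluates to 1000*1 + 10*2 + 1*0 = 1020, so __spec_version__ moves from 1018 (for 1.1.8) to 1020.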
6 changes: 6 additions & 0 deletions openvalidators/config.py
@@ -265,6 +265,12 @@ def add_args(cls, parser):
help="Weight for the reciprocate reward model",
default=DefaultRewardFrameworkConfig.reciprocate_model_weight,
)
parser.add_argument(
"--reward.dpo_weight",
type=float,
help="Weight for the dpo reward model",
default=DefaultRewardFrameworkConfig.dpo_model_weight,
)
parser.add_argument(
"--reward.rlhf_weight",
type=float,
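Since the validator only constructs the real DPO model when its weight is positive (see the neuron.py change below), passing --reward.dpo_weight 0 falls back to a MockRewardModel and effectively disables DPO scoring.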
24 changes: 23 additions & 1 deletion openvalidators/event.py
@@ -41,27 +41,49 @@ class EventSchema:
nsfw_filter: Optional[List[float]] # Output vector of the nsfw filter
reciprocate_reward_model: Optional[List[float]] # Output vector of the reciprocate reward model
diversity_reward_model: Optional[List[float]] # Output vector of the diversity reward model
dpo_reward_model: Optional[List[float]] # Output vector of the dpo reward model
rlhf_reward_model: Optional[List[float]] # Output vector of the rlhf reward model
prompt_reward_model: Optional[List[float]] # Output vector of the prompt reward model
relevance_filter: Optional[List[float]] # Output vector of the relevance scoring reward model
task_validator_filter: Optional[List[float]]

dahoas_reward_model_normalized: Optional[List[float]] # Normalized output vector of the dahoas reward model
nsfw_filter_normalized: Optional[List[float]] # Normalized output vector of the nsfw filter
reciprocate_reward_model_normalized: Optional[List[float]] # Normalized output vector of the reciprocate reward model
diversity_reward_model_normalized: Optional[List[float]] # Normalized output vector of the diversity reward model
dpo_reward_model_normalized: Optional[List[float]] # Normalized output vector of the dpo reward model
rlhf_reward_model_normalized: Optional[List[float]] # Normalized output vector of the rlhf reward model
prompt_reward_model_normalized: Optional[List[float]] # Normalized output vector of the prompt reward model
relevance_filter_normalized: Optional[List[float]] # Normalized output vector of the relevance scoring reward model
task_validator_filter_normalized: Optional[List[float]]

# Weights data
set_weights: Optional[List[List[float]]]

@staticmethod
def from_dict(event_dict: dict, disable_log_rewards: bool) -> 'EventSchema':
"""Converts a dictionary to an EventSchema object."""
rewards = {
'dahoas_reward_model': event_dict.get(RewardModelType.dahoas.value),
'blacklist_filter': event_dict.get(RewardModelType.blacklist.value),
'dahoas_reward_model': event_dict.get(RewardModelType.dahoas.value),
'task_validator_filter': event_dict.get(RewardModelType.task_validator.value),
'nsfw_filter': event_dict.get(RewardModelType.nsfw.value),
'relevance_filter': event_dict.get(RewardModelType.relevance.value),
'reciprocate_reward_model': event_dict.get(RewardModelType.reciprocate.value),
'diversity_reward_model': event_dict.get(RewardModelType.diversity.value),
'dpo_reward_model': event_dict.get(RewardModelType.dpo.value),
'rlhf_reward_model': event_dict.get(RewardModelType.rlhf.value),
'prompt_reward_model': event_dict.get(RewardModelType.prompt.value),

'dahoas_reward_model_normalized': event_dict.get(RewardModelType.dahoas.value + '_normalized'),
'task_validator_filter_normalized': event_dict.get(RewardModelType.task_validator.value + '_normalized'),
'nsfw_filter_normalized': event_dict.get(RewardModelType.nsfw.value + '_normalized'),
'relevance_filter_normalized': event_dict.get(RewardModelType.relevance.value + '_normalized'),
'reciprocate_reward_model_normalized': event_dict.get(RewardModelType.reciprocate.value + '_normalized'),
'diversity_reward_model_normalized': event_dict.get(RewardModelType.diversity.value + '_normalized'),
'dpo_reward_model_normalized': event_dict.get(RewardModelType.dpo.value + '_normalized'),
'rlhf_reward_model_normalized': event_dict.get(RewardModelType.rlhf.value + '_normalized'),
'prompt_reward_model_normalized': event_dict.get(RewardModelType.prompt.value + '_normalized'),
}

# Logs warning that expected data was not set properly
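For illustration, a hypothetical event payload that from_dict can now consume pairs each raw vector with a '_normalized' counterpart; the numbers below are made up, not from a real run:

# Hypothetical event dict; keys follow RewardModelType.<name>.value plus a "_normalized" suffix.
event = {
    'dpo_reward_model': [-2.31, -1.05, -11.0],          # raw per-completion scores
    'dpo_reward_model_normalized': [0.45, 0.52, 0.03],  # normalized scores used for weighting
    'rlhf_reward_model': [0.80, 1.30, -0.20],
    'rlhf_reward_model_normalized': [0.30, 0.49, 0.21],
}
# EventSchema.from_dict(event, disable_log_rewards=False) maps these onto the matching
# fields; keys left unset would presumably trigger the warning mentioned just above.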
14 changes: 8 additions & 6 deletions openvalidators/forward.py
@@ -87,18 +87,20 @@ async def run_step(self, prompt: str, k: int, timeout: float, name: str, exclude
# Compute the rewards for the responses given the prompt.
rewards: torch.FloatTensor = torch.zeros(len(responses), dtype=torch.float32).to(self.device)
for weight_i, reward_fn_i in zip(self.reward_weights, self.reward_functions):
reward_i = reward_fn_i.apply(prompt, responses, name).to(self.device)
rewards += weight_i * reward_i
reward_i, reward_i_normalized = reward_fn_i.apply(prompt, responses, name)
rewards += weight_i * reward_i_normalized.to(self.device)
if not self.config.neuron.disable_log_rewards:
event[reward_fn_i.name] = reward_i.tolist()
bt.logging.trace(str(reward_fn_i.name), reward_i.tolist())
event[reward_fn_i.name + '_normalized'] = reward_i_normalized.tolist()
bt.logging.trace(str(reward_fn_i.name), reward_i_normalized.tolist())

for masking_fn_i in self.masking_functions:
mask_i = masking_fn_i.apply(base_prompt, responses, name).to(self.device)
rewards *= mask_i # includes diversity
mask_i, mask_i_normalized = masking_fn_i.apply(base_prompt, responses, name)
rewards *= mask_i_normalized.to(self.device) # includes diversity
if not self.config.neuron.disable_log_rewards:
event[masking_fn_i.name] = mask_i.tolist()
bt.logging.trace(str(masking_fn_i.name), mask_i.tolist())
event[masking_fn_i.name + '_normalized'] = mask_i_normalized.tolist()
bt.logging.trace(str(masking_fn_i.name), mask_i_normalized.tolist())

# Train the gating model based on the predicted scores and the actual rewards.
gating_scores: torch.FloatTensor = self.gating_model(prompt).to(self.device)
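A minimal sketch of the new contract in run_step, using dummy weights and tensors rather than the validator's real reward functions: each apply call now returns both a raw and a normalized vector, the weighted sum and the masking use only the normalized values, and both vectors are written to the event.

import torch

# Stand-ins for (name, weight, raw, normalized); the real loop iterates
# zip(self.reward_weights, self.reward_functions) and calls reward_fn_i.apply(...).
reward_outputs = [
    ("dpo_reward_model",  0.3, torch.tensor([-2.0, -1.0, -3.0]), torch.tensor([0.4, 0.5, 0.1])),
    ("rlhf_reward_model", 0.4, torch.tensor([ 0.8,  1.2,  0.1]), torch.tensor([0.3, 0.6, 0.1])),
]
mask_normalized = torch.tensor([1.0, 1.0, 0.0])  # e.g. a filter zeroing out the last response

rewards = torch.zeros(3)
event = {}
for name, weight, raw, normalized in reward_outputs:
    rewards += weight * normalized                     # weighted sum uses the normalized scores
    event[name] = raw.tolist()                         # raw scores still logged
    event[name + '_normalized'] = normalized.tolist()  # new normalized entries
rewards *= mask_normalized                             # masking functions multiply in afterwards

print(rewards)  # tensor([0.2400, 0.3900, 0.0000])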
5 changes: 5 additions & 0 deletions openvalidators/neuron.py
@@ -36,6 +36,7 @@
Blacklist,
TaskValidator,
NSFWRewardModel,
DirectPreferenceRewardModel,
OpenAssistantRewardModel,
ReciprocateRewardModel,
RelevanceRewardModel,
@@ -174,6 +175,7 @@ def __init__(self):
else:
self.reward_weights = torch.tensor(
[
self.config.reward.dpo_weight,
self.config.reward.rlhf_weight,
self.config.reward.reciprocate_weight,
self.config.reward.dahoas_weight,
@@ -192,6 +194,9 @@ def __init__(self):
raise Exception(message)

self.reward_functions = [
DirectPreferenceRewardModel(device=self.device)
if self.config.reward.dpo_weight > 0
else MockRewardModel(RewardModelType.dpo.value),
OpenAssistantRewardModel(device=self.device)
if self.config.reward.rlhf_weight > 0
else MockRewardModel(RewardModelType.rlhf.value),
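Note that the two lists must stay in the same order: dpo_weight is inserted first in self.reward_weights to match DirectPreferenceRewardModel being first in self.reward_functions, since forward.py pairs them element-wise via zip(self.reward_weights, self.reward_functions).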
1 change: 1 addition & 0 deletions openvalidators/reward/__init__.py
@@ -1,6 +1,7 @@
from .blacklist import Blacklist
from .task_validator import TaskValidator
from .nsfw import NSFWRewardModel
from .dpo import DirectPreferenceRewardModel
from .open_assistant import OpenAssistantRewardModel
from .reciprocate import ReciprocateRewardModel
from .relevance import RelevanceRewardModel
6 changes: 4 additions & 2 deletions openvalidators/reward/config.py
@@ -18,6 +18,7 @@


class RewardModelType(Enum):
dpo = 'dpo_reward_model'
rlhf = 'rlhf_reward_model'
reciprocate = 'reciprocate_reward_model'
dahoas = 'dahoas_reward_model'
@@ -34,7 +35,8 @@ class DefaultRewardFrameworkConfig:
"""Reward framework default configuration.
Note: All the weights should add up to 1.0.
"""
rlhf_model_weight: float = 0.6
reciprocate_model_weight: float = 0.4
dpo_model_weight: float = 0.3
rlhf_model_weight: float = 0.4
reciprocate_model_weight: float = 0.3
dahoas_model_weight: float = 0
prompt_model_weight: float = 0
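As the docstring above requires, the reshuffled defaults still sum to 1.0: 0.3 (dpo) + 0.4 (rlhf) + 0.3 (reciprocate) + 0 (dahoas) + 0 (prompt) = 1.0.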
117 changes: 117 additions & 0 deletions openvalidators/reward/dpo.py
@@ -0,0 +1,117 @@
# The MIT License (MIT)
# Copyright © 2021 Yuma Rao

# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the “Software”), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all copies or substantial portions of
# the Software.

# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
# THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

import torch
import bittensor as bt
from typing import List
from .config import RewardModelType
from .reward import BaseRewardModel
from transformers import AutoTokenizer, AutoModelForCausalLM


class DirectPreferenceRewardModel(BaseRewardModel):

reward_model_name: str = "cerebras/btlm-3b-8k-base"

@property
def name(self) -> str: return RewardModelType.dpo.value

def __init__(self, device: str):
super().__init__()
self.device = device
self.penalty = 1.2 # Same penalty as the original [paper](https://arxiv.org/pdf/1909.05858.pdf).
self.tokenizer = AutoTokenizer.from_pretrained(DirectPreferenceRewardModel.reward_model_name)
self.model = AutoModelForCausalLM.from_pretrained(DirectPreferenceRewardModel.reward_model_name,
trust_remote_code=True,
torch_dtype=torch.float16).to(self.device)

def reward_single(self, prompt: str, completion: str, name: str, with_penalty=True) -> float:
r""" Calculates a direct preference optimization (DPO) style reward for a completion,
which is a reference model's average log-probability for completion tokens given a prompt.
Uses guidance from https://github.com/eric-mitchell/direct-preference-optimization/blob/main/trainers.py.
"""
with torch.no_grad():

# Check if completion is empty or too short.
if completion.strip() == '' or len(completion) <= 5:
return -11 # exp(-11)=1.67e-5 < 2e-5=1/50257 (typical vocab size)

# Tokenize the combined prompt + completion.
combined = self.tokenizer(prompt + completion, return_tensors="pt").input_ids[0].to(self.device) # [seq_len]
# Tokenize only the prompt, to help determine prompt token length.
prompt_part = self.tokenizer(prompt, return_tensors="pt").input_ids[0].to(self.device) # [prompt_len]

# Completion doesn't fit into model sequence, so return lowest reward.
if self.tokenizer.model_max_length <= len(prompt_part):
return -11. # exp(-11)=1.67e-5 < 2e-5=1/50257 (typical vocab size)

# Truncate combined to fit into model max sequence length.
if self.tokenizer.model_max_length < len(combined):
combined = combined[:self.tokenizer.model_max_length]

labels = combined.clone() # [seq_len]
# Ignore prompt part for calculating reward.
labels[:len(prompt_part)] = -100
# Label only each next token prediction ground-truth.
labels = labels[1:] # [seq_len-1]
loss_mask = (labels != -100) # [seq_len-1]
# Dummy token to allow for indexing, but loss will be ignored.
labels[labels == -100] = 0
# Reshape for gather operation.
labels = labels.unsqueeze(0).unsqueeze(2) # [batch_size=1, seq_len-1, 1]

# Forward pass to calculate logit predictions for each sequence position.
logits = self.model(combined.unsqueeze(0)).logits # [batch_size=1, seq_len, vocab_len]
# Predict only where labels are available.
logits = logits[:, :-1, :] # [batch_size=1, seq_len-1, vocab_len]

if with_penalty:
# Apply penalty for repeated generation
for i in range(len(prompt_part)+1, len(combined)-1):
logit = logits[:,i,:].clone()
inputs = combined[len(prompt_part):i].clone()
logits[:,i,:] = self.logit_penalty(input_ids=inputs, logit=logit)

# Rescale via log(softmax(logits)).
logits = logits.log_softmax(-1)
# Calculate the model's log-probability for each actual completion token.
per_token_logps = torch.gather(logits, dim=2, index=labels).squeeze(2) # [batch_size=1, seq_len-1]
# Average log-probability over completion sequence.
reward = (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1) # [batch_size=1]
reward = reward[0].cpu().detach()

# NaNs can possibly arise through log(0)=-inf, replace with suitably small logits.
if torch.isnan(reward) or torch.isinf(reward):
return -11. # exp(-11)=1.67e-5 < 2e-5=1/50257 (typical vocab size)
return reward.item()

def get_rewards(self, prompt: str, completions: List[str], name: str) -> torch.FloatTensor:
rewards = torch.tensor([self.reward_single(prompt, completion, name) for completion in completions],
dtype=torch.float32).to(self.device)
bt.logging.trace(f"DirectPreferenceRewardModel | rewards: {rewards.tolist()}")
return rewards

def logit_penalty(self, input_ids: torch.LongTensor, logit: torch.FloatTensor) -> torch.FloatTensor:
# Counts the unique tokens within each generation
uniques, counts = input_ids.unique(return_counts=True)
score = torch.gather(logit, 1, uniques.unsqueeze(0))

# if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability
score = torch.where(score < 0, score * (self.penalty**counts), score / (self.penalty**counts))

logit.scatter_(1, uniques.unsqueeze(0), score.to(logit.dtype))
return logit
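In short, the reward for a completion is the BTLM reference model's mean log-probability over the completion tokens, with the repetition penalty applied to the logits first and a fixed fallback of -11 for empty, over-length, or NaN/inf cases. A rough usage sketch follows, assuming a machine that can load the roughly 3B-parameter cerebras/btlm-3b-8k-base weights; the prompt, completions, and name tag below are purely illustrative:

import torch
from openvalidators.reward import DirectPreferenceRewardModel

device = "cuda" if torch.cuda.is_available() else "cpu"  # float16 weights are really meant for a GPU
dpo = DirectPreferenceRewardModel(device=device)

prompt = "Summarize the water cycle in one sentence.\n"
completions = [
    "Water evaporates, condenses into clouds, and falls back as precipitation.",
    "",  # empty completion short-circuits to the -11 fallback
]

raw = dpo.get_rewards(prompt, completions, name="augment")  # name is passed through but unused here
normalized = dpo.normalize_rewards(raw)                     # same normalization BaseRewardModel.apply uses
print(raw.tolist(), normalized.tolist())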
14 changes: 8 additions & 6 deletions openvalidators/reward/reward.py
@@ -98,17 +98,19 @@ def apply( self, prompt: str, responses: List[ bt.DendriteCall ], name: str) ->
successful_rewards = self.get_rewards( prompt, successful_completions, name )

# Softmax rewards across samples.
successful_rewards = self.normalize_rewards( successful_rewards )
successful_rewards_normalized = self.normalize_rewards( successful_rewards )

# Init zero rewards for all calls.
filled_rewards = torch.zeros( len( responses ), dtype=torch.float32)
filled_rewards = torch.ones( len( responses ), dtype=torch.float32) * torch.nan
filled_rewards_normalized = torch.zeros( len( responses ), dtype=torch.float32)

# Fill reward tensor.
for idx, reward in zip(successful_completions_indices, successful_rewards):
for idx, reward, reward_normalized in zip(successful_completions_indices, successful_rewards, successful_rewards_normalized):
filled_rewards[idx] = reward
filled_rewards_normalized[idx] = reward_normalized

# Return the filled rewards.
return filled_rewards
return filled_rewards, filled_rewards_normalized
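Presumably the raw tensor is initialized to NaN (rather than zeros) so that logged raw rewards can distinguish "no successful completion" from a genuine zero score, while the normalized tensor stays zero-filled so the weighted sum in forward.py remains finite.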


class MockRewardModel( BaseRewardModel ):
Expand All @@ -121,7 +123,7 @@ def __init__(self, mock_name: str = 'MockReward'):
self.mock_name = mock_name

def apply( self, prompt: str, completion: List[str], name: str ) -> torch.FloatTensor:
return torch.tensor( [0 for _ in completion], dtype=torch.float32 )

mock_reward = torch.tensor( [0 for _ in completion], dtype=torch.float32 )
return mock_reward, mock_reward


4 changes: 2 additions & 2 deletions openvalidators/run.py
@@ -17,7 +17,7 @@

import asyncio
import bittensor as bt
from traceback import print_exc
from traceback import print_exception

from openvalidators.forward import forward
from openvalidators.utils import should_checkpoint, checkpoint, should_reinit_wandb, reinit_wandb, load_state, save_state
@@ -58,4 +58,4 @@ async def run_forward():

except Exception as e:
bt.logging.error("Error in training loop", str(e))
bt.logging.debug(print_exc(e))
bt.logging.debug(print_exception(value=e))
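One caveat worth flagging: the standard-library traceback.print_exception still expects the exception (or its type) as the first positional argument, so print_exception(e) (Python 3.10+) or print_exception(type(e), e, e.__traceback__) is the safer call; it also prints to stderr and returns None, so the surrounding bt.logging.debug(...) ends up logging None rather than the traceback text.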
5 changes: 4 additions & 1 deletion openvalidators/utils.py
@@ -49,12 +49,15 @@ def init_wandb(self, reinit=False):
if self.config.neuron.disable_log_rewards:
tags.append("disable_log_rewards")

wandb_config = {key: copy.deepcopy(self.config.get(key, None)) for key in ('neuron', 'reward', 'netuid', 'wandb')}
wandb_config['neuron'].pop('full_path', None)

self.wandb = wandb.init(
anonymous="allow",
reinit=reinit,
project=self.config.wandb.project_name,
entity=self.config.wandb.entity,
config={key: self.config.get(key, None) for key in ('neuron', 'reward')},
config=wandb_config,
mode="offline" if self.config.wandb.offline else "online",
dir=self.config.neuron.full_path,
tags=tags,
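The deep copy presumably matters because wandb_config['neuron'].pop('full_path', None) would otherwise mutate the live self.config; copying first lets the machine-specific checkpoint path be dropped from the logged config, while 'netuid' and 'wandb' are added back to the logged keys (matching changelog entry #137).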