From ac623e9aa65be4dd693acff20ad4d935833a2c18 Mon Sep 17 00:00:00 2001 From: opentaco Date: Mon, 7 Aug 2023 18:17:12 +0200 Subject: [PATCH 1/8] Sync diversity model to disk --- openvalidators/utils.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/openvalidators/utils.py b/openvalidators/utils.py index 340a020..b1ae8ea 100644 --- a/openvalidators/utils.py +++ b/openvalidators/utils.py @@ -201,6 +201,15 @@ def save_state(self): gating_model_file_path = f"{self.config.neuron.full_path}/{gating_model_name}_gating_linear_layer.pth" torch.save(gating_model_linear_layer_dict, gating_model_file_path) + # Save diversity model. + diversity_model_dict = {"historic_embeddings": self.diversity_model.historic_embeddings} + diversity_model_file_path = f"{self.config.neuron.full_path}/diversity_model.pth" + torch.save(diversity_model_dict, diversity_model_file_path) + bt.logging.success( + prefix="Saved diversity model", + sufix=f"{diversity_model_file_path} [{self.diversity_model.historic_embeddings.shape}]", + ) + if not self.config.wandb.off: wandb.log({ "step": self.step, @@ -234,5 +243,15 @@ def load_state(self): prefix="Reloaded model", sufix=f"{ self.config.neuron.full_path }/model.torch", ) + + # Load diversity model. + diversity_model_file_path = f"{self.config.neuron.full_path}/diversity_model.pth" + diversity_model_dict = torch.load(diversity_model_file_path) + self.diversity_model.historic_embeddings = diversity_model_dict["historic_embeddings"] + bt.logging.success( + prefix="Reloaded diversity model", + sufix=f"{diversity_model_file_path} [{self.diversity_model.historic_embeddings.shape}]", + ) + except Exception as e: bt.logging.warning(f"Failed to load model with error: {e}") From 74749519bba43e35f0c026e4e8578e359f1ddf35 Mon Sep 17 00:00:00 2001 From: opentaco Date: Tue, 8 Aug 2023 17:14:20 +0200 Subject: [PATCH 2/8] Self diversity model --- openvalidators/neuron.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openvalidators/neuron.py b/openvalidators/neuron.py index 6ba439b..29e0a1c 100644 --- a/openvalidators/neuron.py +++ b/openvalidators/neuron.py @@ -201,7 +201,7 @@ def __init__(self): RelevanceRewardModel(device=self.device) if not self.config.neuron.relevance_off else MockRewardModel(RewardModelType.relevance.value) ) - diversity_model = ( + self.diversity_model = ( DiversityRewardModel(device=self.device) if not self.config.neuron.diversity_off else MockRewardModel(RewardModelType.diversity.value) ) @@ -210,7 +210,7 @@ def __init__(self): else MockRewardModel(RewardModelType.nsfw.value) ) - self.masking_functions = [self.blacklist, task_validator, relevance_model, diversity_model, nsfw_model] + self.masking_functions = [self.blacklist, task_validator, relevance_model, self.diversity_model, nsfw_model] bt.logging.debug(str(self.reward_functions)) bt.logging.debug(str(self.masking_functions)) From 9fc75d9036e8a650e4c72b67f8e3df94539e7b44 Mon Sep 17 00:00:00 2001 From: opentaco Date: Tue, 8 Aug 2023 17:19:57 +0200 Subject: [PATCH 3/8] Separate exceptions --- openvalidators/utils.py | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/openvalidators/utils.py b/openvalidators/utils.py index b1ae8ea..398984b 100644 --- a/openvalidators/utils.py +++ b/openvalidators/utils.py @@ -194,27 +194,21 @@ def save_state(self): prefix="Saved model", sufix=f"{ self.config.neuron.full_path }/model.torch", ) + except Exception as e: + bt.logging.warning(f"Failed to save model with error: {e}") + try: # Save the gating model. gating_model_linear_layer_dict = self.gating_model.linear.state_dict() gating_model_name = self.config.gating.model_name.replace("/", "_") gating_model_file_path = f"{self.config.neuron.full_path}/{gating_model_name}_gating_linear_layer.pth" torch.save(gating_model_linear_layer_dict, gating_model_file_path) - # Save diversity model. - diversity_model_dict = {"historic_embeddings": self.diversity_model.historic_embeddings} - diversity_model_file_path = f"{self.config.neuron.full_path}/diversity_model.pth" - torch.save(diversity_model_dict, diversity_model_file_path) - bt.logging.success( - prefix="Saved diversity model", - sufix=f"{diversity_model_file_path} [{self.diversity_model.historic_embeddings.shape}]", - ) - if not self.config.wandb.off: wandb.log({ "step": self.step, "block": ttl_get_block(self), - **neuron_state_dict + **neuron_state_dict }) if not self.config.wandb.off and self.config.wandb.track_gating_model: model_artifact = wandb.Artifact(f"{gating_model_name}_gating_linear_layer", type="model") @@ -222,12 +216,23 @@ def save_state(self): self.wandb.log_artifact(model_artifact) bt.logging.success(prefix="Saved gating model", sufix=f"{gating_model_file_path}") + except Exception as e: + bt.logging.warning(f"Failed to save gating model with error: {e}") - #empty cache - torch.cuda.empty_cache() - + try: + # Save diversity model. + diversity_model_dict = {"historic_embeddings": self.diversity_model.historic_embeddings} + diversity_model_file_path = f"{self.config.neuron.full_path}/diversity_model.pth" + torch.save(diversity_model_dict, diversity_model_file_path) + bt.logging.success( + prefix="Saved diversity model", + sufix=f"{diversity_model_file_path} [{self.diversity_model.historic_embeddings.shape}]", + ) except Exception as e: - bt.logging.warning(f"Failed to save model with error: {e}") + bt.logging.warning(f"Failed to save diversity model with error: {e}") + + # empty cache + torch.cuda.empty_cache() def load_state(self): @@ -243,7 +248,10 @@ def load_state(self): prefix="Reloaded model", sufix=f"{ self.config.neuron.full_path }/model.torch", ) + except Exception as e: + bt.logging.warning(f"Failed to load model with error: {e}") + try: # Load diversity model. diversity_model_file_path = f"{self.config.neuron.full_path}/diversity_model.pth" diversity_model_dict = torch.load(diversity_model_file_path) @@ -252,6 +260,5 @@ def load_state(self): prefix="Reloaded diversity model", sufix=f"{diversity_model_file_path} [{self.diversity_model.historic_embeddings.shape}]", ) - except Exception as e: - bt.logging.warning(f"Failed to load model with error: {e}") + bt.logging.warning(f"Failed to load diversity model with error: {e}") From aa2e2b55b466641bb7d7bcf3b10f20bc11b5aacc Mon Sep 17 00:00:00 2001 From: opentaco Date: Tue, 8 Aug 2023 17:29:09 +0200 Subject: [PATCH 4/8] Print historic_embeddings shape --- openvalidators/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openvalidators/utils.py b/openvalidators/utils.py index 398984b..8bcdf1a 100644 --- a/openvalidators/utils.py +++ b/openvalidators/utils.py @@ -226,7 +226,7 @@ def save_state(self): torch.save(diversity_model_dict, diversity_model_file_path) bt.logging.success( prefix="Saved diversity model", - sufix=f"{diversity_model_file_path} [{self.diversity_model.historic_embeddings.shape}]", + sufix=f"{diversity_model_file_path} [{list(self.diversity_model.historic_embeddings.shape)}]", ) except Exception as e: bt.logging.warning(f"Failed to save diversity model with error: {e}") @@ -258,7 +258,7 @@ def load_state(self): self.diversity_model.historic_embeddings = diversity_model_dict["historic_embeddings"] bt.logging.success( prefix="Reloaded diversity model", - sufix=f"{diversity_model_file_path} [{self.diversity_model.historic_embeddings.shape}]", + sufix=f"{diversity_model_file_path} [{list(self.diversity_model.historic_embeddings.shape)}]", ) except Exception as e: bt.logging.warning(f"Failed to load diversity model with error: {e}") From 2ceb5d0097bb8182560d8952f3d8b9b57a7785ea Mon Sep 17 00:00:00 2001 From: opentaco Date: Tue, 8 Aug 2023 17:55:02 +0200 Subject: [PATCH 5/8] Print historic_embeddings shape --- openvalidators/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openvalidators/utils.py b/openvalidators/utils.py index 8bcdf1a..29cc89a 100644 --- a/openvalidators/utils.py +++ b/openvalidators/utils.py @@ -226,7 +226,7 @@ def save_state(self): torch.save(diversity_model_dict, diversity_model_file_path) bt.logging.success( prefix="Saved diversity model", - sufix=f"{diversity_model_file_path} [{list(self.diversity_model.historic_embeddings.shape)}]", + sufix=f"{diversity_model_file_path} {list(self.diversity_model.historic_embeddings.shape)}", ) except Exception as e: bt.logging.warning(f"Failed to save diversity model with error: {e}") @@ -258,7 +258,7 @@ def load_state(self): self.diversity_model.historic_embeddings = diversity_model_dict["historic_embeddings"] bt.logging.success( prefix="Reloaded diversity model", - sufix=f"{diversity_model_file_path} [{list(self.diversity_model.historic_embeddings.shape)}]", + sufix=f"{diversity_model_file_path} {list(self.diversity_model.historic_embeddings.shape)}", ) except Exception as e: bt.logging.warning(f"Failed to load diversity model with error: {e}") From 69aca46d4d68e4fdb67f521a002f300b07e23c4f Mon Sep 17 00:00:00 2001 From: opentaco Date: Tue, 8 Aug 2023 18:08:42 +0200 Subject: [PATCH 6/8] Tensorize neuron_weights --- openvalidators/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/openvalidators/utils.py b/openvalidators/utils.py index 29cc89a..14b8398 100644 --- a/openvalidators/utils.py +++ b/openvalidators/utils.py @@ -241,8 +241,9 @@ def load_state(self): try: state_dict = torch.load(f"{self.config.neuron.full_path}/model.torch") # Check for nans in saved state dict - if not torch.isnan(state_dict["neuron_weights"]).any(): - self.moving_averaged_scores = state_dict["neuron_weights"].clone().detach() + neuron_weights = torch.tensor(state_dict["neuron_weights"]) + if not torch.isnan(neuron_weights).any(): + self.moving_averaged_scores = neuron_weights self.hotkeys = state_dict["neuron_hotkeys"] bt.logging.success( prefix="Reloaded model", From 2ad3604a6ed4e1adbed557522c9bcf7edc3738f3 Mon Sep 17 00:00:00 2001 From: opentaco Date: Tue, 8 Aug 2023 18:20:59 +0200 Subject: [PATCH 7/8] Device neuron_weights --- openvalidators/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openvalidators/utils.py b/openvalidators/utils.py index 14b8398..7e72972 100644 --- a/openvalidators/utils.py +++ b/openvalidators/utils.py @@ -243,7 +243,7 @@ def load_state(self): # Check for nans in saved state dict neuron_weights = torch.tensor(state_dict["neuron_weights"]) if not torch.isnan(neuron_weights).any(): - self.moving_averaged_scores = neuron_weights + self.moving_averaged_scores = neuron_weights.to(self.device) self.hotkeys = state_dict["neuron_hotkeys"] bt.logging.success( prefix="Reloaded model", From 7432eb8b7ae9621c6d8d6850a5b204f5863f6d37 Mon Sep 17 00:00:00 2001 From: opentaco Date: Tue, 8 Aug 2023 18:47:24 +0200 Subject: [PATCH 8/8] Device agnostic save --- openvalidators/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openvalidators/utils.py b/openvalidators/utils.py index 7e72972..d008f55 100644 --- a/openvalidators/utils.py +++ b/openvalidators/utils.py @@ -221,7 +221,7 @@ def save_state(self): try: # Save diversity model. - diversity_model_dict = {"historic_embeddings": self.diversity_model.historic_embeddings} + diversity_model_dict = {"historic_embeddings": self.diversity_model.historic_embeddings.to('cpu')} diversity_model_file_path = f"{self.config.neuron.full_path}/diversity_model.pth" torch.save(diversity_model_dict, diversity_model_file_path) bt.logging.success( @@ -256,7 +256,7 @@ def load_state(self): # Load diversity model. diversity_model_file_path = f"{self.config.neuron.full_path}/diversity_model.pth" diversity_model_dict = torch.load(diversity_model_file_path) - self.diversity_model.historic_embeddings = diversity_model_dict["historic_embeddings"] + self.diversity_model.historic_embeddings = diversity_model_dict["historic_embeddings"].to(self.device) bt.logging.success( prefix="Reloaded diversity model", sufix=f"{diversity_model_file_path} {list(self.diversity_model.historic_embeddings.shape)}",