From 2386528a71faf9655b58d931a3ddc75964cb224c Mon Sep 17 00:00:00 2001 From: Cameron Fairchild Date: Tue, 30 Aug 2022 14:30:07 -0400 Subject: [PATCH 01/11] change diff display out --- bittensor/utils/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bittensor/utils/__init__.py b/bittensor/utils/__init__.py index f2aa24bd8f..b703b46c6b 100644 --- a/bittensor/utils/__init__.py +++ b/bittensor/utils/__init__.py @@ -454,12 +454,12 @@ def get_human_readable(num, suffix="H"): return f"{num:.1f}Y{suffix}" def millify(n: int): - millnames = ['',' K',' M',' B',' T'] + millnames = ['',' K',' M',' B',' T', 'q', 'Q'] n = float(n) millidx = max(0,min(len(millnames)-1, int(math.floor(0 if n == 0 else math.log10(abs(n))/3)))) - return '{:.0f}{}'.format(n / 10**(3 * millidx), millnames[millidx]) + return '{:.4f}{}'.format(n / 10**(3 * millidx), millnames[millidx]) @backoff.on_exception(backoff.constant, Exception, From 54e4a97d03fbd637cc85f1d2ae1129074b374af8 Mon Sep 17 00:00:00 2001 From: Cameron Fairchild Date: Tue, 30 Aug 2022 14:34:00 -0400 Subject: [PATCH 02/11] remove logging --- bittensor/utils/register_cuda.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bittensor/utils/register_cuda.py b/bittensor/utils/register_cuda.py index 033f6d7612..086f1f3637 100644 --- a/bittensor/utils/register_cuda.py +++ b/bittensor/utils/register_cuda.py @@ -69,7 +69,6 @@ def create_seal_hash( block_bytes:bytes, nonce:int ) -> bytes: solution = cubit.solve_cuda(TPB, nonce_start, update_interval, upper_bytes, block_bytes, dev_id) # 0 is first GPU seal = None if solution != -1: - print(f"Checking solution: {solution} for bn: {bn}") seal = create_seal_hash(block_bytes, solution) if seal_meets_difficulty(seal, difficulty): return solution, seal From cc485b0be35bc0ca0048ad4b0ee7d357599c6ab8 Mon Sep 17 00:00:00 2001 From: Cameron Fairchild Date: Tue, 30 Aug 2022 16:19:58 -0400 Subject: [PATCH 03/11] check cubit support in the check config --- 
bittensor/_subtensor/__init__.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/bittensor/_subtensor/__init__.py b/bittensor/_subtensor/__init__.py index a994c1e79a..9656c28f5d 100644 --- a/bittensor/_subtensor/__init__.py +++ b/bittensor/_subtensor/__init__.py @@ -15,22 +15,16 @@ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. import argparse +import copy import os -import random -import time -import psutil -import subprocess -from sys import platform - import bittensor -import copy +from loguru import logger from substrateinterface import SubstrateInterface +from torch.cuda import is_available as is_cuda_available -from . import subtensor_impl -from . import subtensor_mock +from . import subtensor_impl, subtensor_mock -from loguru import logger logger = logger.opt(colors=True) __type_registery__ = { @@ -218,6 +212,17 @@ def add_defaults(cls, defaults ): def check_config( config: 'bittensor.Config' ): assert config.subtensor #assert config.subtensor.network != None + assert all((isinstance(x, int) or isinstance(x, str) and x.isnumeric() ) for x in config.subtensor.register.cuda.get('dev_id', [])) + + if config.subtensor.register.cuda.get('use_cuda', False): + try: + import cubit + except ImportError: + raise ImportError('CUDA registration is enabled but cubit is not installed. 
Please install cubit.') + + if not is_cuda_available(): + raise RuntimeError('CUDA registration is enabled but no CUDA devices are detected.') + @staticmethod def determine_chain_endpoint(network: str): From e39e82d2c443337f2c0db6b8254e3b0a5ff49f27 Mon Sep 17 00:00:00 2001 From: Cameron Fairchild Date: Tue, 30 Aug 2022 16:26:30 -0400 Subject: [PATCH 04/11] allow 1 or more devices in flag --- bittensor/_cli/__init__.py | 17 ++++++++++++++--- bittensor/_subtensor/__init__.py | 4 ++-- bittensor/_subtensor/subtensor_impl.py | 10 +++++----- bittensor/utils/__init__.py | 8 +++++--- 4 files changed, 26 insertions(+), 13 deletions(-) diff --git a/bittensor/_cli/__init__.py b/bittensor/_cli/__init__.py index c9e5ecedfe..2814253169 100644 --- a/bittensor/_cli/__init__.py +++ b/bittensor/_cli/__init__.py @@ -25,7 +25,7 @@ import bittensor import torch -from rich.prompt import Confirm, Prompt +from rich.prompt import Confirm, Prompt, PromptBase from substrateinterface.utils.ss58 import ss58_decode, ss58_encode from . import cli_impl @@ -843,9 +843,11 @@ def check_register_config( config: 'bittensor.Config' ): for i, device in enumerate(devices): choices_str += (" {}: {}\n".format(device, device_names[i])) console.print(choices_str) - dev_id = Prompt.ask("Which GPU would you like to use?", choices=devices, default=str(bittensor.defaults.subtensor.register.cuda.dev_id)) + dev_id = IntListPrompt.ask("Which GPU(s) would you like to use?", choices=devices, default=str(bittensor.defaults.subtensor.register.cuda.dev_id)) try: - dev_id = int(dev_id) + # replace the commas with spaces then split over whitespace., + # then strip the whitespace and convert to ints. 
+ dev_id = [int(dev_id.strip()) for dev_id in dev_id.replace(',', ' ').split()] except ValueError: console.error(":cross_mark:[red]Invalid GPU device[/red] [bold white]{}[/bold white]\nAvailable CUDA devices:{}".format(dev_id, choices_str)) sys.exit(1) @@ -935,3 +937,12 @@ def check_update_config( config: 'bittensor.Config'): if not config.no_prompt: answer = Prompt.ask('This will update the local bittensor package', choices = ['Y','N'], default = 'Y') config.answer = answer + +class IntListPrompt(PromptBase): + """ Prompt for a list of integers. """ + + def check_choice( self, value: str ) -> bool: + assert self.choices is not None + # check if value is a valid choice or all the values in a list of ints are valid choices + return value in self.choices or \ + all( val.strip() in self.choices for val in value.replace(',', ' ').split( )) diff --git a/bittensor/_subtensor/__init__.py b/bittensor/_subtensor/__init__.py index 9656c28f5d..dacf932617 100644 --- a/bittensor/_subtensor/__init__.py +++ b/bittensor/_subtensor/__init__.py @@ -183,7 +183,7 @@ def add_args(cls, parser: argparse.ArgumentParser, prefix: str = None ): parser.add_argument('--' + prefix_str + 'subtensor.register.update_interval', '--' + prefix_str + 'subtensor.register.cuda.update_interval', '--' + prefix_str + 'cuda.update_interval', '-u', help="The number of nonces to process before checking for next block during registration", type=int, default=bittensor.defaults.subtensor.register.update_interval) # registration args. Used for register and re-register and anything that calls register. 
parser.add_argument( '--' + prefix_str + 'subtensor.register.cuda.use_cuda', '--' + prefix_str + 'cuda', '--' + prefix_str + 'cuda.use_cuda', default=bittensor.defaults.subtensor.register.cuda.use_cuda, help='''Set true to use CUDA.''', action='store_true', required=False ) - parser.add_argument( '--' + prefix_str + 'subtensor.register.cuda.dev_id', '--' + prefix_str + 'cuda.dev_id', type=int, default=argparse.SUPPRESS, help='''Set the CUDA device id. Goes by the order of speed. (i.e. 0 is the fastest).''', required=False ) + parser.add_argument( '--' + prefix_str + 'subtensor.register.cuda.dev_id', '--' + prefix_str + 'cuda.dev_id', type=int, nargs='+', default=argparse.SUPPRESS, help='''Set the CUDA device id(s). Goes by the order of speed. (i.e. 0 is the fastest).''', required=False ) parser.add_argument( '--' + prefix_str + 'subtensor.register.cuda.TPB', '--' + prefix_str + 'cuda.TPB', type=int, default=bittensor.defaults.subtensor.register.cuda.TPB, help='''Set the number of Threads Per Block for CUDA.''', required=False ) except argparse.ArgumentError: @@ -204,7 +204,7 @@ def add_defaults(cls, defaults ): defaults.subtensor.register.update_interval = os.getenv('BT_SUBTENSOR_REGISTER_UPDATE_INTERVAL') if os.getenv('BT_SUBTENSOR_REGISTER_UPDATE_INTERVAL') != None else 50_000 defaults.subtensor.register.cuda = bittensor.Config() - defaults.subtensor.register.cuda.dev_id = 0 + defaults.subtensor.register.cuda.dev_id = [0] defaults.subtensor.register.cuda.use_cuda = False defaults.subtensor.register.cuda.TPB = 256 diff --git a/bittensor/_subtensor/subtensor_impl.py b/bittensor/_subtensor/subtensor_impl.py index c05f3a68da..dc78a1cd37 100644 --- a/bittensor/_subtensor/subtensor_impl.py +++ b/bittensor/_subtensor/subtensor_impl.py @@ -442,7 +442,7 @@ def register ( prompt: bool = False, max_allowed_attempts: int = 3, cuda: bool = False, - dev_id: int = 0, + dev_id: Union[List[int], int] = 0, TPB: int = 256, num_processes: Optional[int] = None, update_interval: 
Optional[int] = None, @@ -462,11 +462,11 @@ def register ( max_allowed_attempts (int): Maximum number of attempts to register the wallet. cuda (bool): - If true, the wallet should be registered on the cuda device. - dev_id (int): - The cuda device id. + If true, the wallet should be registered using CUDA device(s). + dev_id (Union[List[int], int]): + The CUDA device id to use, or a list of device ids. TPB (int): - The number of threads per block (cuda). + The number of threads per block (CUDA). num_processes (int): The number of processes to use to register. update_interval (int): diff --git a/bittensor/utils/__init__.py b/bittensor/utils/__init__.py index b703b46c6b..9d8445f36a 100644 --- a/bittensor/utils/__init__.py +++ b/bittensor/utils/__init__.py @@ -9,7 +9,7 @@ import time from dataclasses import dataclass from queue import Empty -from typing import Any, Dict, Optional, Tuple, Union +from typing import Any, Dict, Optional, Tuple, Union, List import backoff import bittensor @@ -473,7 +473,7 @@ def get_block_with_retry(subtensor: 'bittensor.Subtensor') -> Tuple[int, int, by raise Exception("Network error. 
Could not connect to substrate to get block hash") return block_number, difficulty, block_hash -def solve_for_difficulty_fast_cuda( subtensor: 'bittensor.Subtensor', wallet: 'bittensor.Wallet', update_interval: int = 50_000, TPB: int = 512, dev_id: int = 0 ) -> Optional[POWSolution]: +def solve_for_difficulty_fast_cuda( subtensor: 'bittensor.Subtensor', wallet: 'bittensor.Wallet', update_interval: int = 50_000, TPB: int = 512, dev_id: List[int] = [0] ) -> Optional[POWSolution]: """ Solves the registration fast using CUDA Args: @@ -493,6 +493,8 @@ def solve_for_difficulty_fast_cuda( subtensor: 'bittensor.Subtensor', wallet: 'b if update_interval is None: update_interval = 50_000 + + dev_id = dev_id[0] block_number, difficulty, block_hash = get_block_with_retry(subtensor) block_bytes = block_hash.encode('utf-8')[2:] @@ -570,7 +572,7 @@ def solve_for_difficulty_fast_cuda( subtensor: 'bittensor.Subtensor', wallet: 'b status.stop() return None -def create_pow( subtensor, wallet, cuda: bool = False, dev_id: int = 0, tpb: int = 256, num_processes: int = None, update_interval: int = None ) -> Optional[Dict[str, Any]]: +def create_pow( subtensor, wallet, cuda: bool = False, dev_id: Union[List[int], int] = 0, tpb: int = 256, num_processes: int = None, update_interval: int = None) -> Optional[Dict[str, Any]]: if cuda: solution: POWSolution = solve_for_difficulty_fast_cuda( subtensor, wallet, dev_id=dev_id, TPB=tpb, update_interval=update_interval ) else: From eebf6a94fe4374759e1e7e0613a5ced370ed0ccc Mon Sep 17 00:00:00 2001 From: Cameron Fairchild Date: Tue, 30 Aug 2022 16:27:04 -0400 Subject: [PATCH 05/11] cuda flag should be suppress --- bittensor/_cli/__init__.py | 14 ++++++++------ bittensor/_subtensor/__init__.py | 2 +- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/bittensor/_cli/__init__.py b/bittensor/_cli/__init__.py index 2814253169..51922bfbdb 100644 --- a/bittensor/_cli/__init__.py +++ b/bittensor/_cli/__init__.py @@ -828,14 +828,16 @@ def 
check_register_config( config: 'bittensor.Config' ): if config.wallet.get('hotkey') == bittensor.defaults.wallet.hotkey and not config.no_prompt: hotkey = Prompt.ask("Enter hotkey name", default = bittensor.defaults.wallet.hotkey) config.wallet.hotkey = str(hotkey) - - if not config.no_prompt and config.subtensor.register.cuda.use_cuda == bittensor.defaults.subtensor.register.cuda.use_cuda: - # Ask about cuda registration only if a CUDA device is available. + + if not config.no_prompt: if torch.cuda.is_available(): - cuda = Confirm.ask("Detected CUDA device, use CUDA for registration?\n") - config.subtensor.register.cuda.use_cuda = cuda + if config.subtensor.register.cuda.get('use_cuda') is None: + # Ask about cuda registration only if a CUDA device is available. + cuda = Confirm.ask("Detected CUDA device, use CUDA for registration?\n") + config.subtensor.register.cuda.use_cuda = cuda + # Only ask about which CUDA device if the user has more than one CUDA device. - if cuda and config.subtensor.register.cuda.get('dev_id') is None and torch.cuda.device_count() > 0: + if config.subtensor.register.cuda.use_cuda and config.subtensor.register.cuda.get('dev_id') is None and torch.cuda.device_count() > 0: devices: List[str] = [str(x) for x in range(torch.cuda.device_count())] device_names: List[str] = [torch.cuda.get_device_name(x) for x in range(torch.cuda.device_count())] console.print("Available CUDA devices:") diff --git a/bittensor/_subtensor/__init__.py b/bittensor/_subtensor/__init__.py index dacf932617..513e33c0df 100644 --- a/bittensor/_subtensor/__init__.py +++ b/bittensor/_subtensor/__init__.py @@ -182,7 +182,7 @@ def add_args(cls, parser: argparse.ArgumentParser, prefix: str = None ): parser.add_argument('--' + prefix_str + 'subtensor.register.num_processes', '-n', dest='subtensor.register.num_processes', help="Number of processors to use for registration", type=int, default=bittensor.defaults.subtensor.register.num_processes) parser.add_argument('--' + 
prefix_str + 'subtensor.register.update_interval', '--' + prefix_str + 'subtensor.register.cuda.update_interval', '--' + prefix_str + 'cuda.update_interval', '-u', help="The number of nonces to process before checking for next block during registration", type=int, default=bittensor.defaults.subtensor.register.update_interval) # registration args. Used for register and re-register and anything that calls register. - parser.add_argument( '--' + prefix_str + 'subtensor.register.cuda.use_cuda', '--' + prefix_str + 'cuda', '--' + prefix_str + 'cuda.use_cuda', default=bittensor.defaults.subtensor.register.cuda.use_cuda, help='''Set true to use CUDA.''', action='store_true', required=False ) + parser.add_argument( '--' + prefix_str + 'subtensor.register.cuda.use_cuda', '--' + prefix_str + 'cuda', '--' + prefix_str + 'cuda.use_cuda', default=argparse.SUPPRESS, help='''Set true to use CUDA.''', action='store_true', required=False ) parser.add_argument( '--' + prefix_str + 'subtensor.register.cuda.dev_id', '--' + prefix_str + 'cuda.dev_id', type=int, nargs='+', default=argparse.SUPPRESS, help='''Set the CUDA device id(s). Goes by the order of speed. (i.e. 
0 is the fastest).''', required=False ) parser.add_argument( '--' + prefix_str + 'subtensor.register.cuda.TPB', '--' + prefix_str + 'cuda.TPB', type=int, default=bittensor.defaults.subtensor.register.cuda.TPB, help='''Set the number of Threads Per Block for CUDA.''', required=False ) From 19a2ff1e52540c45e86377eca55d024d6eaab609 Mon Sep 17 00:00:00 2001 From: Cameron Fairchild Date: Tue, 30 Aug 2022 16:28:15 -0400 Subject: [PATCH 06/11] modify how cpu count is found --- bittensor/utils/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bittensor/utils/__init__.py b/bittensor/utils/__init__.py index 9d8445f36a..2b1f1b5093 100644 --- a/bittensor/utils/__init__.py +++ b/bittensor/utils/__init__.py @@ -302,6 +302,8 @@ def update_curr_block(curr_diff: multiprocessing.Array, curr_block: multiprocess curr_block[i] = block_bytes[i] registration_diff_pack(diff, curr_diff) +def get_cpu_count(): + return len(os.sched_getaffinity(0)) def solve_for_difficulty_fast( subtensor, wallet, num_processes: Optional[int] = None, update_interval: Optional[int] = None ) -> Optional[POWSolution]: """ @@ -322,7 +324,7 @@ def solve_for_difficulty_fast( subtensor, wallet, num_processes: Optional[int] = """ if num_processes == None: # get the number of allowed processes for this process - num_processes = len(os.sched_getaffinity(0)) + num_processes = max(1, get_cpu_count()) if update_interval is None: update_interval = 50_000 From bb9d19af7a67a9fa99a409c4e10c95ca89889810 Mon Sep 17 00:00:00 2001 From: Cameron Fairchild Date: Tue, 30 Aug 2022 16:28:44 -0400 Subject: [PATCH 07/11] make a solver base class --- bittensor/utils/__init__.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/bittensor/utils/__init__.py b/bittensor/utils/__init__.py index 2b1f1b5093..b9c798f2c2 100644 --- a/bittensor/utils/__init__.py +++ b/bittensor/utils/__init__.py @@ -145,7 +145,7 @@ class POWSolution: difficulty: int seal: bytes -class 
Solver(multiprocessing.Process): +class SolverBase(multiprocessing.Process): """ A process that solves the registration PoW problem. @@ -193,7 +193,7 @@ class Solver(multiprocessing.Process): proc_num: int num_proc: int update_interval: int - best_queue: multiprocessing.Queue + best_queue: Optional[multiprocessing.Queue] time_queue: multiprocessing.Queue solution_queue: multiprocessing.Queue newBlockEvent: multiprocessing.Event @@ -221,6 +221,10 @@ def __init__(self, proc_num, num_proc, update_interval, best_queue, time_queue, self.stopEvent = stopEvent self.limit = limit + def run(self): + raise NotImplementedError("SolverBase is an abstract class") + +class Solver(SolverBase): def run(self): block_number: int block_bytes: bytes From 5b1fcda5fe20c3e70d879e47369f33966de43d9e Mon Sep 17 00:00:00 2001 From: Cameron Fairchild Date: Tue, 30 Aug 2022 16:28:58 -0400 Subject: [PATCH 08/11] add a solverbase for CUDA --- bittensor/utils/__init__.py | 66 +++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/bittensor/utils/__init__.py b/bittensor/utils/__init__.py index b9c798f2c2..302845ae2f 100644 --- a/bittensor/utils/__init__.py +++ b/bittensor/utils/__init__.py @@ -259,6 +259,72 @@ def run(self): nonce_start += self.update_interval * self.num_proc nonce_end += self.update_interval * self.num_proc +class CUDASolver(SolverBase): + dev_id: int + TPB: int + + def __init__(self, proc_num, num_proc, update_interval, time_queue, solution_queue, stopEvent, curr_block, curr_block_num, curr_diff, check_block, limit, dev_id: int, TPB: int): + super().__init__(proc_num, num_proc, update_interval, None, time_queue, solution_queue, stopEvent, curr_block, curr_block_num, curr_diff, check_block, limit) + self.dev_id = dev_id + self.TPB = TPB + + def run(self): + block_number: int + block_bytes: bytes + block_difficulty: int + nonce_limit = int(math.pow(2,64)) - 1 + + # Start at random nonce + nonce_start = self.TPB * self.update_interval * self.proc_num + 
random.randint( 0, nonce_limit ) + nonce_end = nonce_start + self.update_interval * self.TPB + while not self.stopEvent.is_set(): + if self.newBlockEvent.is_set(): + with self.check_block: + block_number = self.curr_block_num.value + block_bytes = bytes(self.curr_block) + block_difficulty = registration_diff_unpack(self.curr_diff) + + self.newBlockEvent.clear() + # reset nonces to start from random point + nonce_start = self.update_interval * self.proc_num + random.randint( 0, nonce_limit ) + nonce_end = nonce_start + self.update_interval + + # Do a block of nonces + solution, time = solve_for_nonce_block_cuda(self, nonce_start, self.update_interval, block_bytes, block_difficulty, self.limit, block_number, self.dev_id, self.TPB) + if solution is not None: + self.solution_queue.put(solution) + + # Send time + self.time_queue.put_nowait(time) + + nonce_start += self.update_interval * self.num_proc + nonce_start = nonce_start % nonce_limit + nonce_end += self.update_interval * self.num_proc + + +def solve_for_nonce_block_cuda(solver: CUDASolver, nonce_start: int, update_interval: int, block_bytes: bytes, difficulty: int, limit: int, block_number: int, dev_id: int, TPB: int) -> Tuple[Optional[POWSolution], int]: + start = time.time() + + solution, seal = solve_cuda(nonce_start, + update_interval, + TPB, + block_bytes, + block_number, + difficulty, + limit, + dev_id) + + if (solution != -1): + # Check if solution is valid + # Attempt to reset CUDA device + #reset_cuda() + + #print(f"{solver.proc_num} on cuda:{solver.dev_id} found a solution: {solution}, {block_number}, {str(block_bytes)}, {str(seal)}, {difficulty}") + # Found a solution, save it. 
+ return POWSolution(solution, block_number, difficulty, seal), time.time() - start + + return None, time.time() - start + def solve_for_nonce_block(solver: Solver, nonce_start: int, nonce_end: int, block_bytes: bytes, difficulty: int, limit: int, block_number: int) -> Tuple[Optional[POWSolution], int]: best_local = float('inf') From 864bb0fa214130e35ead4778bb2326d61fbb3d90 Mon Sep 17 00:00:00 2001 From: Cameron Fairchild Date: Tue, 30 Aug 2022 16:30:55 -0400 Subject: [PATCH 09/11] use mutli process kernel launching, one per GPU --- bittensor/utils/__init__.py | 179 +++++++++++++++++++++++------------- 1 file changed, 113 insertions(+), 66 deletions(-) diff --git a/bittensor/utils/__init__.py b/bittensor/utils/__init__.py index 302845ae2f..3b753e618e 100644 --- a/bittensor/utils/__init__.py +++ b/bittensor/utils/__init__.py @@ -545,7 +545,7 @@ def get_block_with_retry(subtensor: 'bittensor.Subtensor') -> Tuple[int, int, by raise Exception("Network error. Could not connect to substrate to get block hash") return block_number, difficulty, block_hash -def solve_for_difficulty_fast_cuda( subtensor: 'bittensor.Subtensor', wallet: 'bittensor.Wallet', update_interval: int = 50_000, TPB: int = 512, dev_id: List[int] = [0] ) -> Optional[POWSolution]: +def solve_for_difficulty_fast_cuda( subtensor: 'bittensor.Subtensor', wallet: 'bittensor.Wallet', update_interval: int = 50_000, TPB: int = 512, dev_id: Union[List[int], int] = 0, use_kernel_launch_optimization: bool = False ) -> Optional[POWSolution]: """ Solves the registration fast using CUDA Args: @@ -557,92 +557,139 @@ def solve_for_difficulty_fast_cuda( subtensor: 'bittensor.Subtensor', wallet: 'b The number of nonces to try before checking for more blocks TPB: int The number of threads per block. 
CUDA param that should match the GPU capability - dev_id: int - The CUDA device ID to execute the registration on + dev_id: Union[List[int], int] + The CUDA device IDs to execute the registration on, either a single device or a list of devices """ - if not torch.cuda.is_available(): - raise Exception("CUDA not available") + if isinstance(dev_id, int): + dev_id = [dev_id] + elif dev_id is None: + dev_id = [0] if update_interval is None: update_interval = 50_000 - dev_id = dev_id[0] - - block_number, difficulty, block_hash = get_block_with_retry(subtensor) - block_bytes = block_hash.encode('utf-8')[2:] - - nonce = 0 + if not torch.cuda.is_available(): + raise Exception("CUDA not available") + limit = int(math.pow(2,256)) - 1 - start_time = time.time() console = bittensor.__console__ status = console.status("Solving") + + # Set mp start to use spawn so CUDA doesn't complain + multiprocessing.set_start_method('spawn') + + curr_block = multiprocessing.Array('h', 64, lock=True) # byte array + curr_block_num = multiprocessing.Value('i', 0, lock=True) # int + curr_diff = multiprocessing.Array('Q', [0, 0], lock=True) # [high, low] + + def update_curr_block(block_number: int, block_bytes: bytes, diff: int, lock: multiprocessing.Lock): + with lock: + curr_block_num.value = block_number + for i in range(64): + curr_block[i] = block_bytes[i] + registration_diff_pack(diff, curr_diff) + + status.start() + + # Establish communication queues + stopEvent = multiprocessing.Event() + stopEvent.clear() + solution_queue = multiprocessing.Queue() + time_queue = multiprocessing.Queue() + check_block = multiprocessing.Lock() + + # Start consumers + num_processes = len(dev_id) + ## Create one consumer per GPU + solvers = [ CUDASolver(i, num_processes, update_interval, time_queue, solution_queue, stopEvent, curr_block, curr_block_num, curr_diff, check_block, limit, dev_id[i], TPB) + for i in range(num_processes) ] + + # Get first block + block_number = subtensor.get_current_block() + 
difficulty = subtensor.difficulty + block_hash = subtensor.substrate.get_block_hash( block_number ) + while block_hash == None: + block_hash = subtensor.substrate.get_block_hash( block_number ) + block_bytes = block_hash.encode('utf-8')[2:] + old_block_number = block_number + # Set to current block + update_curr_block(block_number, block_bytes, difficulty, check_block) + + # Set new block events for each solver to start + for w in solvers: + w.newBlockEvent.set() + + for w in solvers: + w.start() # start the solver processes - solution = -1 start_time = time.time() - interval_time = start_time + time_since = 0.0 + solution = None + itrs_per_sec = 0 + while not wallet.is_registered(subtensor): + # Wait until a solver finds a solution + try: + solution = solution_queue.get(block=True, timeout=0.15) + if solution is not None: + break + except Empty: + # No solution found, try again + pass - status.start() - while solution == -1 and not wallet.is_registered(subtensor): - solution, seal = solve_cuda(nonce, - update_interval, - TPB, - block_bytes, - block_number, - difficulty, - limit, - dev_id) - - if (solution != -1): - # Attempt to reset CUDA device - # Check for any errors - err = log_cuda_errors() - if err: print(err) - reset_cuda() - status.stop() - new_bn = subtensor.get_current_block() - print(f"Found solution for bn: {block_number}; Newest: {new_bn}") - return POWSolution(solution, block_number, difficulty, seal) - - nonce += (TPB * update_interval) - if (nonce >= int(math.pow(2,63))): - nonce = 0 - itrs_per_sec = (TPB * update_interval) / (time.time() - interval_time) - interval_time = time.time() - - block_number, difficulty, block_hash = get_block_with_retry(subtensor) - block_bytes = block_hash.encode('utf-8')[2:] + # check for new block + block_number = subtensor.get_current_block() + if block_number != old_block_number: + old_block_number = block_number + # update block information + block_hash = subtensor.substrate.get_block_hash( block_number) + while 
block_hash == None: + block_hash = subtensor.substrate.get_block_hash( block_number) + block_bytes = block_hash.encode('utf-8')[2:] + difficulty = subtensor.difficulty + + update_curr_block(block_number, block_bytes, difficulty, check_block) + # Set new block events for each solver + for w in solvers: + w.newBlockEvent.set() + + # Get times for each solver + time_total = 0 + num_time = 0 + for _ in range(time_queue.qsize()): + try: + time_ = time_queue.get_nowait() + time_total += time_ + num_time += 1 + except Empty: + break + + if num_time > 0: + time_avg = time_total / num_time + itrs_per_sec = TPB*update_interval*num_processes / time_avg + time_since = time.time() - start_time + + #times = [ time_queue.get() for _ in solvers ] + #time_avg = average(times) + message = f"""Solving - time spent: {datetime.timedelta(seconds=time.time() - start_time)} - Nonce: [bold white]{nonce}[/bold white] + time spent: {time_since} Difficulty: [bold white]{millify(difficulty)}[/bold white] - Iters: [bold white]{get_human_readable(int(itrs_per_sec), "H")}/s[/bold white] + Iters: [bold white]{get_human_readable(int(itrs_per_sec), 'H')}/s[/bold white] Block: [bold white]{block_number}[/bold white] Block_hash: [bold white]{block_hash.encode('utf-8')}[/bold white]""" status.update(message.replace(" ", "")) - + # exited while, found_solution contains the nonce or wallet is registered - if solution == -1: # didn't find solution - # Check for any errors - err = log_cuda_errors() - if err: - raise CUDAException(err) - - reset_cuda() + if solution is not None: + stopEvent.set() # stop all other processes status.stop() - return None - - else: - # Check for any errors - err = log_cuda_errors() - if err: - raise CUDAException(err) - reset_cuda() - # Shouldn't get here - status.stop() - return None + return solution + + status.stop() + return None def create_pow( subtensor, wallet, cuda: bool = False, dev_id: Union[List[int], int] = 0, tpb: int = 256, num_processes: int = None, 
update_interval: int = None) -> Optional[Dict[str, Any]]: if cuda: From 7bc18a46ab0acbb4379be359e67e29fbd202d15b Mon Sep 17 00:00:00 2001 From: Cameron Fairchild Date: Tue, 30 Aug 2022 16:46:53 -0400 Subject: [PATCH 10/11] move check under dot get accessor --- bittensor/_subtensor/__init__.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/bittensor/_subtensor/__init__.py b/bittensor/_subtensor/__init__.py index 513e33c0df..80f07bdf39 100644 --- a/bittensor/_subtensor/__init__.py +++ b/bittensor/_subtensor/__init__.py @@ -212,16 +212,17 @@ def add_defaults(cls, defaults ): def check_config( config: 'bittensor.Config' ): assert config.subtensor #assert config.subtensor.network != None - assert all((isinstance(x, int) or isinstance(x, str) and x.isnumeric() ) for x in config.subtensor.register.cuda.get('dev_id', [])) + if config.subtensor.get('register') and config.subtensor.register.get('cuda'): + assert all((isinstance(x, int) or isinstance(x, str) and x.isnumeric() ) for x in config.subtensor.register.cuda.get('dev_id', [])) - if config.subtensor.register.cuda.get('use_cuda', False): - try: - import cubit - except ImportError: - raise ImportError('CUDA registration is enabled but cubit is not installed. Please install cubit.') + if config.subtensor.register.cuda.get('use_cuda', False): + try: + import cubit + except ImportError: + raise ImportError('CUDA registration is enabled but cubit is not installed. 
Please install cubit.') - if not is_cuda_available(): - raise RuntimeError('CUDA registration is enabled but no CUDA devices are detected.') + if not is_cuda_available(): + raise RuntimeError('CUDA registration is enabled but no CUDA devices are detected.') @staticmethod From cbe804b6fd094bf757c34ef968cae19640465b10 Mon Sep 17 00:00:00 2001 From: Cameron Fairchild Date: Tue, 30 Aug 2022 21:33:05 -0400 Subject: [PATCH 11/11] add All gpus specification --- bittensor/_cli/__init__.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/bittensor/_cli/__init__.py b/bittensor/_cli/__init__.py index d8c64be787..870b442e6a 100644 --- a/bittensor/_cli/__init__.py +++ b/bittensor/_cli/__init__.py @@ -834,14 +834,17 @@ def _check_for_cuda_reg_config( config: 'bittensor.Config' ) -> None: for i, device in enumerate(devices): choices_str += (" {}: {}\n".format(device, device_names[i])) console.print(choices_str) - dev_id = IntListPrompt.ask("Which GPU(s) would you like to use?", choices=devices, default=str(bittensor.defaults.subtensor.register.cuda.dev_id)) - try: - # replace the commas with spaces then split over whitespace., - # then strip the whitespace and convert to ints. - dev_id = [int(dev_id.strip()) for dev_id in dev_id.replace(',', ' ').split()] - except ValueError: - console.error(":cross_mark:[red]Invalid GPU device[/red] [bold white]{}[/bold white]\nAvailable CUDA devices:{}".format(dev_id, choices_str)) - sys.exit(1) + dev_id = IntListPrompt.ask("Which GPU(s) would you like to use? Please list one, or comma-separated", choices=devices, default='All') + if dev_id == 'All': + dev_id = list(range(torch.cuda.device_count())) + else: + try: + # replace the commas with spaces then split over whitespace., + # then strip the whitespace and convert to ints. 
+ dev_id = [int(dev_id.strip()) for dev_id in dev_id.replace(',', ' ').split()] + except ValueError: + console.error(":cross_mark:[red]Invalid GPU device[/red] [bold white]{}[/bold white]\nAvailable CUDA devices:{}".format(dev_id, choices_str)) + sys.exit(1) config.subtensor.register.cuda.dev_id = dev_id def check_register_config( config: 'bittensor.Config' ): @@ -954,5 +957,6 @@ class IntListPrompt(PromptBase): def check_choice( self, value: str ) -> bool: assert self.choices is not None # check if value is a valid choice or all the values in a list of ints are valid choices - return value in self.choices or \ + return value == "All" or \ + value in self.choices or \ all( val.strip() in self.choices for val in value.replace(',', ' ').split( ))