From 2386528a71faf9655b58d931a3ddc75964cb224c Mon Sep 17 00:00:00 2001 From: Cameron Fairchild Date: Tue, 30 Aug 2022 14:30:07 -0400 Subject: [PATCH 01/11] change diff display out --- bittensor/utils/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bittensor/utils/__init__.py b/bittensor/utils/__init__.py index f2aa24bd8f..b703b46c6b 100644 --- a/bittensor/utils/__init__.py +++ b/bittensor/utils/__init__.py @@ -454,12 +454,12 @@ def get_human_readable(num, suffix="H"): return f"{num:.1f}Y{suffix}" def millify(n: int): - millnames = ['',' K',' M',' B',' T'] + millnames = ['',' K',' M',' B',' T', 'q', 'Q'] n = float(n) millidx = max(0,min(len(millnames)-1, int(math.floor(0 if n == 0 else math.log10(abs(n))/3)))) - return '{:.0f}{}'.format(n / 10**(3 * millidx), millnames[millidx]) + return '{:.4f}{}'.format(n / 10**(3 * millidx), millnames[millidx]) @backoff.on_exception(backoff.constant, Exception, From 54e4a97d03fbd637cc85f1d2ae1129074b374af8 Mon Sep 17 00:00:00 2001 From: Cameron Fairchild Date: Tue, 30 Aug 2022 14:34:00 -0400 Subject: [PATCH 02/11] remove logging --- bittensor/utils/register_cuda.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bittensor/utils/register_cuda.py b/bittensor/utils/register_cuda.py index 033f6d7612..086f1f3637 100644 --- a/bittensor/utils/register_cuda.py +++ b/bittensor/utils/register_cuda.py @@ -69,7 +69,6 @@ def create_seal_hash( block_bytes:bytes, nonce:int ) -> bytes: solution = cubit.solve_cuda(TPB, nonce_start, update_interval, upper_bytes, block_bytes, dev_id) # 0 is first GPU seal = None if solution != -1: - print(f"Checking solution: {solution} for bn: {bn}") seal = create_seal_hash(block_bytes, solution) if seal_meets_difficulty(seal, difficulty): return solution, seal From cc485b0be35bc0ca0048ad4b0ee7d357599c6ab8 Mon Sep 17 00:00:00 2001 From: Cameron Fairchild Date: Tue, 30 Aug 2022 16:19:58 -0400 Subject: [PATCH 03/11] check cubit support in the check config --- 
bittensor/_subtensor/__init__.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/bittensor/_subtensor/__init__.py b/bittensor/_subtensor/__init__.py index a994c1e79a..9656c28f5d 100644 --- a/bittensor/_subtensor/__init__.py +++ b/bittensor/_subtensor/__init__.py @@ -15,22 +15,16 @@ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. import argparse +import copy import os -import random -import time -import psutil -import subprocess -from sys import platform - import bittensor -import copy +from loguru import logger from substrateinterface import SubstrateInterface +from torch.cuda import is_available as is_cuda_available -from . import subtensor_impl -from . import subtensor_mock +from . import subtensor_impl, subtensor_mock -from loguru import logger logger = logger.opt(colors=True) __type_registery__ = { @@ -218,6 +212,17 @@ def add_defaults(cls, defaults ): def check_config( config: 'bittensor.Config' ): assert config.subtensor #assert config.subtensor.network != None + assert all((isinstance(x, int) or isinstance(x, str) and x.isnumeric() ) for x in config.subtensor.register.cuda.get('dev_id', [])) + + if config.subtensor.register.cuda.get('use_cuda', False): + try: + import cubit + except ImportError: + raise ImportError('CUDA registration is enabled but cubit is not installed. 
Please install cubit.') + + if not is_cuda_available(): + raise RuntimeError('CUDA registration is enabled but no CUDA devices are detected.') + @staticmethod def determine_chain_endpoint(network: str): From e39e82d2c443337f2c0db6b8254e3b0a5ff49f27 Mon Sep 17 00:00:00 2001 From: Cameron Fairchild Date: Tue, 30 Aug 2022 16:26:30 -0400 Subject: [PATCH 04/11] allow 1 or more devices in flag --- bittensor/_cli/__init__.py | 17 ++++++++++++++--- bittensor/_subtensor/__init__.py | 4 ++-- bittensor/_subtensor/subtensor_impl.py | 10 +++++----- bittensor/utils/__init__.py | 8 +++++--- 4 files changed, 26 insertions(+), 13 deletions(-) diff --git a/bittensor/_cli/__init__.py b/bittensor/_cli/__init__.py index c9e5ecedfe..2814253169 100644 --- a/bittensor/_cli/__init__.py +++ b/bittensor/_cli/__init__.py @@ -25,7 +25,7 @@ import bittensor import torch -from rich.prompt import Confirm, Prompt +from rich.prompt import Confirm, Prompt, PromptBase from substrateinterface.utils.ss58 import ss58_decode, ss58_encode from . import cli_impl @@ -843,9 +843,11 @@ def check_register_config( config: 'bittensor.Config' ): for i, device in enumerate(devices): choices_str += (" {}: {}\n".format(device, device_names[i])) console.print(choices_str) - dev_id = Prompt.ask("Which GPU would you like to use?", choices=devices, default=str(bittensor.defaults.subtensor.register.cuda.dev_id)) + dev_id = IntListPrompt.ask("Which GPU(s) would you like to use?", choices=devices, default=str(bittensor.defaults.subtensor.register.cuda.dev_id)) try: - dev_id = int(dev_id) + # replace the commas with spaces then split over whitespace., + # then strip the whitespace and convert to ints. 
+ dev_id = [int(dev_id.strip()) for dev_id in dev_id.replace(',', ' ').split()] except ValueError: console.error(":cross_mark:[red]Invalid GPU device[/red] [bold white]{}[/bold white]\nAvailable CUDA devices:{}".format(dev_id, choices_str)) sys.exit(1) @@ -935,3 +937,12 @@ def check_update_config( config: 'bittensor.Config'): if not config.no_prompt: answer = Prompt.ask('This will update the local bittensor package', choices = ['Y','N'], default = 'Y') config.answer = answer + +class IntListPrompt(PromptBase): + """ Prompt for a list of integers. """ + + def check_choice( self, value: str ) -> bool: + assert self.choices is not None + # check if value is a valid choice or all the values in a list of ints are valid choices + return value in self.choices or \ + all( val.strip() in self.choices for val in value.replace(',', ' ').split( )) diff --git a/bittensor/_subtensor/__init__.py b/bittensor/_subtensor/__init__.py index 9656c28f5d..dacf932617 100644 --- a/bittensor/_subtensor/__init__.py +++ b/bittensor/_subtensor/__init__.py @@ -183,7 +183,7 @@ def add_args(cls, parser: argparse.ArgumentParser, prefix: str = None ): parser.add_argument('--' + prefix_str + 'subtensor.register.update_interval', '--' + prefix_str + 'subtensor.register.cuda.update_interval', '--' + prefix_str + 'cuda.update_interval', '-u', help="The number of nonces to process before checking for next block during registration", type=int, default=bittensor.defaults.subtensor.register.update_interval) # registration args. Used for register and re-register and anything that calls register. 
parser.add_argument( '--' + prefix_str + 'subtensor.register.cuda.use_cuda', '--' + prefix_str + 'cuda', '--' + prefix_str + 'cuda.use_cuda', default=bittensor.defaults.subtensor.register.cuda.use_cuda, help='''Set true to use CUDA.''', action='store_true', required=False ) - parser.add_argument( '--' + prefix_str + 'subtensor.register.cuda.dev_id', '--' + prefix_str + 'cuda.dev_id', type=int, default=argparse.SUPPRESS, help='''Set the CUDA device id. Goes by the order of speed. (i.e. 0 is the fastest).''', required=False ) + parser.add_argument( '--' + prefix_str + 'subtensor.register.cuda.dev_id', '--' + prefix_str + 'cuda.dev_id', type=int, nargs='+', default=argparse.SUPPRESS, help='''Set the CUDA device id(s). Goes by the order of speed. (i.e. 0 is the fastest).''', required=False ) parser.add_argument( '--' + prefix_str + 'subtensor.register.cuda.TPB', '--' + prefix_str + 'cuda.TPB', type=int, default=bittensor.defaults.subtensor.register.cuda.TPB, help='''Set the number of Threads Per Block for CUDA.''', required=False ) except argparse.ArgumentError: @@ -204,7 +204,7 @@ def add_defaults(cls, defaults ): defaults.subtensor.register.update_interval = os.getenv('BT_SUBTENSOR_REGISTER_UPDATE_INTERVAL') if os.getenv('BT_SUBTENSOR_REGISTER_UPDATE_INTERVAL') != None else 50_000 defaults.subtensor.register.cuda = bittensor.Config() - defaults.subtensor.register.cuda.dev_id = 0 + defaults.subtensor.register.cuda.dev_id = [0] defaults.subtensor.register.cuda.use_cuda = False defaults.subtensor.register.cuda.TPB = 256 diff --git a/bittensor/_subtensor/subtensor_impl.py b/bittensor/_subtensor/subtensor_impl.py index c05f3a68da..dc78a1cd37 100644 --- a/bittensor/_subtensor/subtensor_impl.py +++ b/bittensor/_subtensor/subtensor_impl.py @@ -442,7 +442,7 @@ def register ( prompt: bool = False, max_allowed_attempts: int = 3, cuda: bool = False, - dev_id: int = 0, + dev_id: Union[List[int], int] = 0, TPB: int = 256, num_processes: Optional[int] = None, update_interval: 
Optional[int] = None, @@ -462,11 +462,11 @@ def register ( max_allowed_attempts (int): Maximum number of attempts to register the wallet. cuda (bool): - If true, the wallet should be registered on the cuda device. - dev_id (int): - The cuda device id. + If true, the wallet should be registered using CUDA device(s). + dev_id (Union[List[int], int]): + The CUDA device id to use, or a list of device ids. TPB (int): - The number of threads per block (cuda). + The number of threads per block (CUDA). num_processes (int): The number of processes to use to register. update_interval (int): diff --git a/bittensor/utils/__init__.py b/bittensor/utils/__init__.py index b703b46c6b..9d8445f36a 100644 --- a/bittensor/utils/__init__.py +++ b/bittensor/utils/__init__.py @@ -9,7 +9,7 @@ import time from dataclasses import dataclass from queue import Empty -from typing import Any, Dict, Optional, Tuple, Union +from typing import Any, Dict, Optional, Tuple, Union, List import backoff import bittensor @@ -473,7 +473,7 @@ def get_block_with_retry(subtensor: 'bittensor.Subtensor') -> Tuple[int, int, by raise Exception("Network error. 
Could not connect to substrate to get block hash") return block_number, difficulty, block_hash -def solve_for_difficulty_fast_cuda( subtensor: 'bittensor.Subtensor', wallet: 'bittensor.Wallet', update_interval: int = 50_000, TPB: int = 512, dev_id: int = 0 ) -> Optional[POWSolution]: +def solve_for_difficulty_fast_cuda( subtensor: 'bittensor.Subtensor', wallet: 'bittensor.Wallet', update_interval: int = 50_000, TPB: int = 512, dev_id: List[int] = [0] ) -> Optional[POWSolution]: """ Solves the registration fast using CUDA Args: @@ -493,6 +493,8 @@ def solve_for_difficulty_fast_cuda( subtensor: 'bittensor.Subtensor', wallet: 'b if update_interval is None: update_interval = 50_000 + + dev_id = dev_id[0] block_number, difficulty, block_hash = get_block_with_retry(subtensor) block_bytes = block_hash.encode('utf-8')[2:] @@ -570,7 +572,7 @@ def solve_for_difficulty_fast_cuda( subtensor: 'bittensor.Subtensor', wallet: 'b status.stop() return None -def create_pow( subtensor, wallet, cuda: bool = False, dev_id: int = 0, tpb: int = 256, num_processes: int = None, update_interval: int = None ) -> Optional[Dict[str, Any]]: +def create_pow( subtensor, wallet, cuda: bool = False, dev_id: Union[List[int], int] = 0, tpb: int = 256, num_processes: int = None, update_interval: int = None) -> Optional[Dict[str, Any]]: if cuda: solution: POWSolution = solve_for_difficulty_fast_cuda( subtensor, wallet, dev_id=dev_id, TPB=tpb, update_interval=update_interval ) else: From eebf6a94fe4374759e1e7e0613a5ced370ed0ccc Mon Sep 17 00:00:00 2001 From: Cameron Fairchild Date: Tue, 30 Aug 2022 16:27:04 -0400 Subject: [PATCH 05/11] cuda flag should be suppress --- bittensor/_cli/__init__.py | 14 ++++++++------ bittensor/_subtensor/__init__.py | 2 +- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/bittensor/_cli/__init__.py b/bittensor/_cli/__init__.py index 2814253169..51922bfbdb 100644 --- a/bittensor/_cli/__init__.py +++ b/bittensor/_cli/__init__.py @@ -828,14 +828,16 @@ def 
check_register_config( config: 'bittensor.Config' ): if config.wallet.get('hotkey') == bittensor.defaults.wallet.hotkey and not config.no_prompt: hotkey = Prompt.ask("Enter hotkey name", default = bittensor.defaults.wallet.hotkey) config.wallet.hotkey = str(hotkey) - - if not config.no_prompt and config.subtensor.register.cuda.use_cuda == bittensor.defaults.subtensor.register.cuda.use_cuda: - # Ask about cuda registration only if a CUDA device is available. + + if not config.no_prompt: if torch.cuda.is_available(): - cuda = Confirm.ask("Detected CUDA device, use CUDA for registration?\n") - config.subtensor.register.cuda.use_cuda = cuda + if config.subtensor.register.cuda.get('use_cuda') is None: + # Ask about cuda registration only if a CUDA device is available. + cuda = Confirm.ask("Detected CUDA device, use CUDA for registration?\n") + config.subtensor.register.cuda.use_cuda = cuda + # Only ask about which CUDA device if the user has more than one CUDA device. - if cuda and config.subtensor.register.cuda.get('dev_id') is None and torch.cuda.device_count() > 0: + if config.subtensor.register.cuda.use_cuda and config.subtensor.register.cuda.get('dev_id') is None and torch.cuda.device_count() > 0: devices: List[str] = [str(x) for x in range(torch.cuda.device_count())] device_names: List[str] = [torch.cuda.get_device_name(x) for x in range(torch.cuda.device_count())] console.print("Available CUDA devices:") diff --git a/bittensor/_subtensor/__init__.py b/bittensor/_subtensor/__init__.py index dacf932617..513e33c0df 100644 --- a/bittensor/_subtensor/__init__.py +++ b/bittensor/_subtensor/__init__.py @@ -182,7 +182,7 @@ def add_args(cls, parser: argparse.ArgumentParser, prefix: str = None ): parser.add_argument('--' + prefix_str + 'subtensor.register.num_processes', '-n', dest='subtensor.register.num_processes', help="Number of processors to use for registration", type=int, default=bittensor.defaults.subtensor.register.num_processes) parser.add_argument('--' + 
prefix_str + 'subtensor.register.update_interval', '--' + prefix_str + 'subtensor.register.cuda.update_interval', '--' + prefix_str + 'cuda.update_interval', '-u', help="The number of nonces to process before checking for next block during registration", type=int, default=bittensor.defaults.subtensor.register.update_interval) # registration args. Used for register and re-register and anything that calls register. - parser.add_argument( '--' + prefix_str + 'subtensor.register.cuda.use_cuda', '--' + prefix_str + 'cuda', '--' + prefix_str + 'cuda.use_cuda', default=bittensor.defaults.subtensor.register.cuda.use_cuda, help='''Set true to use CUDA.''', action='store_true', required=False ) + parser.add_argument( '--' + prefix_str + 'subtensor.register.cuda.use_cuda', '--' + prefix_str + 'cuda', '--' + prefix_str + 'cuda.use_cuda', default=argparse.SUPPRESS, help='''Set true to use CUDA.''', action='store_true', required=False ) parser.add_argument( '--' + prefix_str + 'subtensor.register.cuda.dev_id', '--' + prefix_str + 'cuda.dev_id', type=int, nargs='+', default=argparse.SUPPRESS, help='''Set the CUDA device id(s). Goes by the order of speed. (i.e. 
0 is the fastest).''', required=False ) parser.add_argument( '--' + prefix_str + 'subtensor.register.cuda.TPB', '--' + prefix_str + 'cuda.TPB', type=int, default=bittensor.defaults.subtensor.register.cuda.TPB, help='''Set the number of Threads Per Block for CUDA.''', required=False ) From 19a2ff1e52540c45e86377eca55d024d6eaab609 Mon Sep 17 00:00:00 2001 From: Cameron Fairchild Date: Tue, 30 Aug 2022 16:28:15 -0400 Subject: [PATCH 06/11] modify how cpu count is found --- bittensor/utils/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bittensor/utils/__init__.py b/bittensor/utils/__init__.py index 9d8445f36a..2b1f1b5093 100644 --- a/bittensor/utils/__init__.py +++ b/bittensor/utils/__init__.py @@ -302,6 +302,8 @@ def update_curr_block(curr_diff: multiprocessing.Array, curr_block: multiprocess curr_block[i] = block_bytes[i] registration_diff_pack(diff, curr_diff) +def get_cpu_count(): + return len(os.sched_getaffinity(0)) def solve_for_difficulty_fast( subtensor, wallet, num_processes: Optional[int] = None, update_interval: Optional[int] = None ) -> Optional[POWSolution]: """ @@ -322,7 +324,7 @@ def solve_for_difficulty_fast( subtensor, wallet, num_processes: Optional[int] = """ if num_processes == None: # get the number of allowed processes for this process - num_processes = len(os.sched_getaffinity(0)) + num_processes = max(1, get_cpu_count()) if update_interval is None: update_interval = 50_000 From bb9d19af7a67a9fa99a409c4e10c95ca89889810 Mon Sep 17 00:00:00 2001 From: Cameron Fairchild Date: Tue, 30 Aug 2022 16:28:44 -0400 Subject: [PATCH 07/11] make a solver base class --- bittensor/utils/__init__.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/bittensor/utils/__init__.py b/bittensor/utils/__init__.py index 2b1f1b5093..b9c798f2c2 100644 --- a/bittensor/utils/__init__.py +++ b/bittensor/utils/__init__.py @@ -145,7 +145,7 @@ class POWSolution: difficulty: int seal: bytes -class 
Solver(multiprocessing.Process): +class SolverBase(multiprocessing.Process): """ A process that solves the registration PoW problem. @@ -193,7 +193,7 @@ class Solver(multiprocessing.Process): proc_num: int num_proc: int update_interval: int - best_queue: multiprocessing.Queue + best_queue: Optional[multiprocessing.Queue] time_queue: multiprocessing.Queue solution_queue: multiprocessing.Queue newBlockEvent: multiprocessing.Event @@ -221,6 +221,10 @@ def __init__(self, proc_num, num_proc, update_interval, best_queue, time_queue, self.stopEvent = stopEvent self.limit = limit + def run(self): + raise NotImplementedError("SolverBase is an abstract class") + +class Solver(SolverBase): def run(self): block_number: int block_bytes: bytes From 5b1fcda5fe20c3e70d879e47369f33966de43d9e Mon Sep 17 00:00:00 2001 From: Cameron Fairchild Date: Tue, 30 Aug 2022 16:28:58 -0400 Subject: [PATCH 08/11] add a solverbase for CUDA --- bittensor/utils/__init__.py | 66 +++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/bittensor/utils/__init__.py b/bittensor/utils/__init__.py index b9c798f2c2..302845ae2f 100644 --- a/bittensor/utils/__init__.py +++ b/bittensor/utils/__init__.py @@ -259,6 +259,72 @@ def run(self): nonce_start += self.update_interval * self.num_proc nonce_end += self.update_interval * self.num_proc +class CUDASolver(SolverBase): + dev_id: int + TPB: int + + def __init__(self, proc_num, num_proc, update_interval, time_queue, solution_queue, stopEvent, curr_block, curr_block_num, curr_diff, check_block, limit, dev_id: int, TPB: int): + super().__init__(proc_num, num_proc, update_interval, None, time_queue, solution_queue, stopEvent, curr_block, curr_block_num, curr_diff, check_block, limit) + self.dev_id = dev_id + self.TPB = TPB + + def run(self): + block_number: int + block_bytes: bytes + block_difficulty: int + nonce_limit = int(math.pow(2,64)) - 1 + + # Start at random nonce + nonce_start = self.TPB * self.update_interval * self.proc_num + 
random.randint( 0, nonce_limit ) + nonce_end = nonce_start + self.update_interval * self.TPB + while not self.stopEvent.is_set(): + if self.newBlockEvent.is_set(): + with self.check_block: + block_number = self.curr_block_num.value + block_bytes = bytes(self.curr_block) + block_difficulty = registration_diff_unpack(self.curr_diff) + + self.newBlockEvent.clear() + # reset nonces to start from random point + nonce_start = self.update_interval * self.proc_num + random.randint( 0, nonce_limit ) + nonce_end = nonce_start + self.update_interval + + # Do a block of nonces + solution, time = solve_for_nonce_block_cuda(self, nonce_start, self.update_interval, block_bytes, block_difficulty, self.limit, block_number, self.dev_id, self.TPB) + if solution is not None: + self.solution_queue.put(solution) + + # Send time + self.time_queue.put_nowait(time) + + nonce_start += self.update_interval * self.num_proc + nonce_start = nonce_start % nonce_limit + nonce_end += self.update_interval * self.num_proc + + +def solve_for_nonce_block_cuda(solver: CUDASolver, nonce_start: int, update_interval: int, block_bytes: bytes, difficulty: int, limit: int, block_number: int, dev_id: int, TPB: int) -> Tuple[Optional[POWSolution], int]: + start = time.time() + + solution, seal = solve_cuda(nonce_start, + update_interval, + TPB, + block_bytes, + block_number, + difficulty, + limit, + dev_id) + + if (solution != -1): + # Check if solution is valid + # Attempt to reset CUDA device + #reset_cuda() + + #print(f"{solver.proc_num} on cuda:{solver.dev_id} found a solution: {solution}, {block_number}, {str(block_bytes)}, {str(seal)}, {difficulty}") + # Found a solution, save it. 
+ return POWSolution(solution, block_number, difficulty, seal), time.time() - start + + return None, time.time() - start + def solve_for_nonce_block(solver: Solver, nonce_start: int, nonce_end: int, block_bytes: bytes, difficulty: int, limit: int, block_number: int) -> Tuple[Optional[POWSolution], int]: best_local = float('inf') From 864bb0fa214130e35ead4778bb2326d61fbb3d90 Mon Sep 17 00:00:00 2001 From: Cameron Fairchild Date: Tue, 30 Aug 2022 16:30:55 -0400 Subject: [PATCH 09/11] use mutli process kernel launching, one per GPU --- bittensor/utils/__init__.py | 179 +++++++++++++++++++++++------------- 1 file changed, 113 insertions(+), 66 deletions(-) diff --git a/bittensor/utils/__init__.py b/bittensor/utils/__init__.py index 302845ae2f..3b753e618e 100644 --- a/bittensor/utils/__init__.py +++ b/bittensor/utils/__init__.py @@ -545,7 +545,7 @@ def get_block_with_retry(subtensor: 'bittensor.Subtensor') -> Tuple[int, int, by raise Exception("Network error. Could not connect to substrate to get block hash") return block_number, difficulty, block_hash -def solve_for_difficulty_fast_cuda( subtensor: 'bittensor.Subtensor', wallet: 'bittensor.Wallet', update_interval: int = 50_000, TPB: int = 512, dev_id: List[int] = [0] ) -> Optional[POWSolution]: +def solve_for_difficulty_fast_cuda( subtensor: 'bittensor.Subtensor', wallet: 'bittensor.Wallet', update_interval: int = 50_000, TPB: int = 512, dev_id: Union[List[int], int] = 0, use_kernel_launch_optimization: bool = False ) -> Optional[POWSolution]: """ Solves the registration fast using CUDA Args: @@ -557,92 +557,139 @@ def solve_for_difficulty_fast_cuda( subtensor: 'bittensor.Subtensor', wallet: 'b The number of nonces to try before checking for more blocks TPB: int The number of threads per block. 
CUDA param that should match the GPU capability - dev_id: int - The CUDA device ID to execute the registration on + dev_id: Union[List[int], int] + The CUDA device IDs to execute the registration on, either a single device or a list of devices """ - if not torch.cuda.is_available(): - raise Exception("CUDA not available") + if isinstance(dev_id, int): + dev_id = [dev_id] + elif dev_id is None: + dev_id = [0] if update_interval is None: update_interval = 50_000 - dev_id = dev_id[0] - - block_number, difficulty, block_hash = get_block_with_retry(subtensor) - block_bytes = block_hash.encode('utf-8')[2:] - - nonce = 0 + if not torch.cuda.is_available(): + raise Exception("CUDA not available") + limit = int(math.pow(2,256)) - 1 - start_time = time.time() console = bittensor.__console__ status = console.status("Solving") + + # Set mp start to use spawn so CUDA doesn't complain + multiprocessing.set_start_method('spawn') + + curr_block = multiprocessing.Array('h', 64, lock=True) # byte array + curr_block_num = multiprocessing.Value('i', 0, lock=True) # int + curr_diff = multiprocessing.Array('Q', [0, 0], lock=True) # [high, low] + + def update_curr_block(block_number: int, block_bytes: bytes, diff: int, lock: multiprocessing.Lock): + with lock: + curr_block_num.value = block_number + for i in range(64): + curr_block[i] = block_bytes[i] + registration_diff_pack(diff, curr_diff) + + status.start() + + # Establish communication queues + stopEvent = multiprocessing.Event() + stopEvent.clear() + solution_queue = multiprocessing.Queue() + time_queue = multiprocessing.Queue() + check_block = multiprocessing.Lock() + + # Start consumers + num_processes = len(dev_id) + ## Create one consumer per GPU + solvers = [ CUDASolver(i, num_processes, update_interval, time_queue, solution_queue, stopEvent, curr_block, curr_block_num, curr_diff, check_block, limit, dev_id[i], TPB) + for i in range(num_processes) ] + + # Get first block + block_number = subtensor.get_current_block() + 
difficulty = subtensor.difficulty + block_hash = subtensor.substrate.get_block_hash( block_number ) + while block_hash == None: + block_hash = subtensor.substrate.get_block_hash( block_number ) + block_bytes = block_hash.encode('utf-8')[2:] + old_block_number = block_number + # Set to current block + update_curr_block(block_number, block_bytes, difficulty, check_block) + + # Set new block events for each solver to start + for w in solvers: + w.newBlockEvent.set() + + for w in solvers: + w.start() # start the solver processes - solution = -1 start_time = time.time() - interval_time = start_time + time_since = 0.0 + solution = None + itrs_per_sec = 0 + while not wallet.is_registered(subtensor): + # Wait until a solver finds a solution + try: + solution = solution_queue.get(block=True, timeout=0.15) + if solution is not None: + break + except Empty: + # No solution found, try again + pass - status.start() - while solution == -1 and not wallet.is_registered(subtensor): - solution, seal = solve_cuda(nonce, - update_interval, - TPB, - block_bytes, - block_number, - difficulty, - limit, - dev_id) - - if (solution != -1): - # Attempt to reset CUDA device - # Check for any errors - err = log_cuda_errors() - if err: print(err) - reset_cuda() - status.stop() - new_bn = subtensor.get_current_block() - print(f"Found solution for bn: {block_number}; Newest: {new_bn}") - return POWSolution(solution, block_number, difficulty, seal) - - nonce += (TPB * update_interval) - if (nonce >= int(math.pow(2,63))): - nonce = 0 - itrs_per_sec = (TPB * update_interval) / (time.time() - interval_time) - interval_time = time.time() - - block_number, difficulty, block_hash = get_block_with_retry(subtensor) - block_bytes = block_hash.encode('utf-8')[2:] + # check for new block + block_number = subtensor.get_current_block() + if block_number != old_block_number: + old_block_number = block_number + # update block information + block_hash = subtensor.substrate.get_block_hash( block_number) + while 
block_hash == None: + block_hash = subtensor.substrate.get_block_hash( block_number) + block_bytes = block_hash.encode('utf-8')[2:] + difficulty = subtensor.difficulty + + update_curr_block(block_number, block_bytes, difficulty, check_block) + # Set new block events for each solver + for w in solvers: + w.newBlockEvent.set() + + # Get times for each solver + time_total = 0 + num_time = 0 + for _ in range(time_queue.qsize()): + try: + time_ = time_queue.get_nowait() + time_total += time_ + num_time += 1 + except Empty: + break + + if num_time > 0: + time_avg = time_total / num_time + itrs_per_sec = TPB*update_interval*num_processes / time_avg + time_since = time.time() - start_time + + #times = [ time_queue.get() for _ in solvers ] + #time_avg = average(times) + message = f"""Solving - time spent: {datetime.timedelta(seconds=time.time() - start_time)} - Nonce: [bold white]{nonce}[/bold white] + time spent: {time_since} Difficulty: [bold white]{millify(difficulty)}[/bold white] - Iters: [bold white]{get_human_readable(int(itrs_per_sec), "H")}/s[/bold white] + Iters: [bold white]{get_human_readable(int(itrs_per_sec), 'H')}/s[/bold white] Block: [bold white]{block_number}[/bold white] Block_hash: [bold white]{block_hash.encode('utf-8')}[/bold white]""" status.update(message.replace(" ", "")) - + # exited while, found_solution contains the nonce or wallet is registered - if solution == -1: # didn't find solution - # Check for any errors - err = log_cuda_errors() - if err: - raise CUDAException(err) - - reset_cuda() + if solution is not None: + stopEvent.set() # stop all other processes status.stop() - return None - - else: - # Check for any errors - err = log_cuda_errors() - if err: - raise CUDAException(err) - reset_cuda() - # Shouldn't get here - status.stop() - return None + return solution + + status.stop() + return None def create_pow( subtensor, wallet, cuda: bool = False, dev_id: Union[List[int], int] = 0, tpb: int = 256, num_processes: int = None, 
update_interval: int = None) -> Optional[Dict[str, Any]]: if cuda: From 7bc18a46ab0acbb4379be359e67e29fbd202d15b Mon Sep 17 00:00:00 2001 From: Cameron Fairchild Date: Tue, 30 Aug 2022 16:46:53 -0400 Subject: [PATCH 10/11] move check under dot get accessor --- bittensor/_subtensor/__init__.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/bittensor/_subtensor/__init__.py b/bittensor/_subtensor/__init__.py index 513e33c0df..80f07bdf39 100644 --- a/bittensor/_subtensor/__init__.py +++ b/bittensor/_subtensor/__init__.py @@ -212,16 +212,17 @@ def add_defaults(cls, defaults ): def check_config( config: 'bittensor.Config' ): assert config.subtensor #assert config.subtensor.network != None - assert all((isinstance(x, int) or isinstance(x, str) and x.isnumeric() ) for x in config.subtensor.register.cuda.get('dev_id', [])) + if config.subtensor.get('register') and config.subtensor.register.get('cuda'): + assert all((isinstance(x, int) or isinstance(x, str) and x.isnumeric() ) for x in config.subtensor.register.cuda.get('dev_id', [])) - if config.subtensor.register.cuda.get('use_cuda', False): - try: - import cubit - except ImportError: - raise ImportError('CUDA registration is enabled but cubit is not installed. Please install cubit.') + if config.subtensor.register.cuda.get('use_cuda', False): + try: + import cubit + except ImportError: + raise ImportError('CUDA registration is enabled but cubit is not installed. 
Please install cubit.') - if not is_cuda_available(): - raise RuntimeError('CUDA registration is enabled but no CUDA devices are detected.') + if not is_cuda_available(): + raise RuntimeError('CUDA registration is enabled but no CUDA devices are detected.') @staticmethod From cbe804b6fd094bf757c34ef968cae19640465b10 Mon Sep 17 00:00:00 2001 From: Cameron Fairchild Date: Tue, 30 Aug 2022 21:33:05 -0400 Subject: [PATCH 11/11] add All gpus specification --- bittensor/_cli/__init__.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/bittensor/_cli/__init__.py b/bittensor/_cli/__init__.py index d8c64be787..870b442e6a 100644 --- a/bittensor/_cli/__init__.py +++ b/bittensor/_cli/__init__.py @@ -834,14 +834,17 @@ def _check_for_cuda_reg_config( config: 'bittensor.Config' ) -> None: for i, device in enumerate(devices): choices_str += (" {}: {}\n".format(device, device_names[i])) console.print(choices_str) - dev_id = IntListPrompt.ask("Which GPU(s) would you like to use?", choices=devices, default=str(bittensor.defaults.subtensor.register.cuda.dev_id)) - try: - # replace the commas with spaces then split over whitespace., - # then strip the whitespace and convert to ints. - dev_id = [int(dev_id.strip()) for dev_id in dev_id.replace(',', ' ').split()] - except ValueError: - console.error(":cross_mark:[red]Invalid GPU device[/red] [bold white]{}[/bold white]\nAvailable CUDA devices:{}".format(dev_id, choices_str)) - sys.exit(1) + dev_id = IntListPrompt.ask("Which GPU(s) would you like to use? Please list one, or comma-separated", choices=devices, default='All') + if dev_id == 'All': + dev_id = list(range(torch.cuda.device_count())) + else: + try: + # replace the commas with spaces then split over whitespace., + # then strip the whitespace and convert to ints. 
+ dev_id = [int(dev_id.strip()) for dev_id in dev_id.replace(',', ' ').split()] + except ValueError: + console.error(":cross_mark:[red]Invalid GPU device[/red] [bold white]{}[/bold white]\nAvailable CUDA devices:{}".format(dev_id, choices_str)) + sys.exit(1) config.subtensor.register.cuda.dev_id = dev_id def check_register_config( config: 'bittensor.Config' ): @@ -954,5 +957,6 @@ class IntListPrompt(PromptBase): def check_choice( self, value: str ) -> bool: assert self.choices is not None # check if value is a valid choice or all the values in a list of ints are valid choices - return value in self.choices or \ + return value == "All" or \ + value in self.choices or \ all( val.strip() in self.choices for val in value.replace(',', ' ').split( ))