opentensor · Eugene-hu · Nov 9, 2022 · Sep 5, 2022 · Sep 6, 2022 · Sep 6, 2022
diff --git a/Dockerfile b/Dockerfile
@@ -1,4 +1,5 @@
-FROM nvidia/cuda:11.2.1-base
+# syntax=docker/dockerfile:1
+FROM pytorch/pytorch:1.12.0-cuda11.3-cudnn8-devel
 
 LABEL bittensor.image.authors="bittensor.com" \
 	bittensor.image.vendor="Bittensor" \
@@ -14,22 +15,33 @@ ARG DEBIAN_FRONTEND=noninteractive
 RUN apt-key del 7fa2af80
 RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub
 RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64/7fa2af80.pub
+# Update the base image and install bittensor's system dependencies.
+# update, upgrade and install share one layer so a cached package index is
+# never reused by a later install; the index is deleted in that same layer.
+RUN apt-get update && apt-get upgrade -y && \
+    apt-get install -y apt-utils build-essential cmake curl git htop nano netcat python3-dev python3-pip sudo tmux unzip wget && \
+    rm -rf /var/lib/apt/lists/*
+## Upgrade pip
+RUN pip3 install --no-cache-dir --upgrade pip
 
-RUN apt-get update && apt-get install --no-install-recommends --no-install-suggests -y apt-utils curl git cmake build-essential unzip python3-pip  wget iproute2 software-properties-common
+# Install nvm and pm2
+RUN curl -o install_nvm.sh https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.1/install.sh && \
+	echo 'fabc489b39a5e9c999c7cab4d281cdbbcbad10ec2f8b9a7f7144ad701b6bfdc7 install_nvm.sh' | sha256sum --check && \
+	bash install_nvm.sh && \
+	rm install_nvm.sh
 
-RUN add-apt-repository ppa:deadsnakes/ppa
-RUN apt-get update
-RUN apt-get install python3 python3-dev -y
-RUN python3 -m pip install --upgrade pip
+RUN bash -c "source $HOME/.nvm/nvm.sh && \
+    # use node 16
+    nvm install 16 && \
+    # install pm2
+    npm install --location=global pm2"
 
-# add Bittensor code to docker image
-RUN mkdir /bittensor
-RUN mkdir /home/.bittensor
-COPY . /bittensor
+RUN mkdir -p /root/.bittensor/bittensor
+RUN cd ~/.bittensor/bittensor && \
+    python3 -m pip install --no-cache-dir bittensor
 
-WORKDIR /bittensor
-RUN pip install --upgrade numpy pandas setuptools "tqdm>=4.27,<4.50.0" wheel
-RUN pip install -r requirements.txt
-RUN pip install .
+# Open-file limits are per-process and are not stored in the image, so they
+# cannot be raised at build time. Start the container with
+# `--ulimit nofile=1000000:1000000` to get the 1,000,000 descriptor limit.
 
 EXPOSE 8091
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-3.4.1
+3.4.2
diff --git a/bittensor/__init__.py b/bittensor/__init__.py
@@ -16,16 +16,25 @@
 # DEALINGS IN THE SOFTWARE.
 
 from rich.console import Console
+from rich.traceback import install
 from prometheus_client import Info
 
+import nest_asyncio
+nest_asyncio.apply()
+
 # Bittensor code and protocol version.
-__version__ = '3.4.1'
+__version__ = '3.4.2'
 version_split = __version__.split(".")
 __version_as_int__ = (100 * int(version_split[0])) + (10 * int(version_split[1])) + (1 * int(version_split[2]))
 
+
+# Install rich's traceback handler once, with local-variable dumps disabled.
+install(show_locals=False)
+
 # Rich console.
 __console__ = Console()
 __use_console__ = True
+
 def turn_console_off():
     from io import StringIO
     __use_console__ = False
@@ -62,8 +71,8 @@ def turn_console_off():
 
 __nobunaga_entrypoint__ = "staging.nobunaga.opentensor.ai:9944"
 
-
-__bellagene_entrypoint__ = "parachain.opentensor.ai:443"
+# Needs to use wss://
+__bellagene_entrypoint__ = "wss://parachain.opentensor.ai:443"
 
 
 __local_entrypoint__ = "127.0.0.1:9944"

diff --git a/bittensor/_axon/axon_impl.py b/bittensor/_axon/axon_impl.py
@@ -27,18 +27,33 @@
 import grpc
 import wandb
 import pandas
+import uuid
 from loguru import logger
 import torch.nn.functional as F
 import concurrent
 
-from prometheus_client import Counter, Histogram, Enum, CollectorRegistry
 
 import bittensor
 import bittensor.utils.stats as stat_utils
 from datetime import datetime
 
 logger = logger.opt(colors=True)
 
+from prometheus_client import Counter, Histogram, Enum, CollectorRegistry
+PROM_axon_is_started = Enum('axon_is_started', 'is_started', states=['stopped', 'started'])
+PROM_total_forward = Counter('axon_total_forward', 'total_forward', ['wallet', 'identifier'])
+PROM_total_backward = Counter('axon_total_backward', 'total_backward', ['wallet', 'identifier'])
+PROM_forward_latency = Histogram('axon_forward_latency', 'forward_latency', ['wallet', 'identifier'], buckets=list(range(0,bittensor.__blocktime__,1)))
+PROM_backward_latency = Histogram('axon_backward_latency', 'backward_latency', ['wallet', 'identifier'], buckets=list(range(0,bittensor.__blocktime__,1))) 
+PROM_forward_synapses = Counter('axon_forward_synapses', 'forward_synapses', ['wallet', 'identifier', "synapse"])
+PROM_backward_synapses = Counter('axon_backward_synapses', 'backward_synapses', ['wallet', 'identifier', "synapse"])
+PROM_forward_codes = Counter('axon_forward_codes', 'forward_codes', ['wallet', 'identifier', "code"])
+PROM_backward_codes = Counter('axon_backward_codes', 'backward_codes', ['wallet', 'identifier', "code"])
+PROM_forward_hotkeys = Counter('axon_forward_hotkeys', 'forward_hotkeys', ['wallet', 'identifier', "hotkey"])
+PROM_backward_hotkeys = Counter('axon_backward_hotkeys', 'backward_hotkeys', ['wallet', 'identifier', "hotkey"])
+PROM_forward_bytes = Counter('axon_forward_bytes', 'forward_bytes', ['wallet', 'identifier', "hotkey"])
+PROM_backward_bytes = Counter('axon_backward_bytes', 'backward_bytes', ['wallet', 'identifier', "hotkey"])
+
 class Axon( bittensor.grpc.BittensorServicer ):
     r""" Services Forward and Backward requests from other neurons.
     """
@@ -103,27 +118,8 @@ def __init__(
 
         # -- Priority 
         self.priority = priority 
-        self.priority_threadpool= priority_threadpool
-
-         # == Prometheus
-        # We are running over various suffix values in the event that there are multiple axons in the same process.
-        # The first axon is created with a null suffix and subsequent values are ordered like so: axon_is_started, axon_is_started_1, axon_is_started_2 etc...
-
-        if self.prometheus_level != bittensor.prometheus.level.OFF.name:
-            registry = CollectorRegistry()
-            self.is_started = Enum('axon_is_started', 'is_started', states=['stopped', 'started'], registry=registry)
-            self.total_forward = Counter('axon_total_forward', 'total_forward', registry=registry)
-            self.total_backward = Counter('axon_total_backward', 'total_backward', registry=registry)
-            self.forward_latency = Histogram('axon_forward_latency', 'forward_latency', buckets=list(range(0,bittensor.__blocktime__,1)), registry=registry)
-            self.backward_latency = Histogram('axon_backward_latency', 'backward_latency', buckets=list(range(0,bittensor.__blocktime__,1)), registry=registry) 
-            self.forward_synapses = Counter('axon_forward_synapses', 'forward_synapses', ["synapse"], registry=registry)
-            self.backward_synapses = Counter('axon_backward_synapses', 'backward_synapses', ["synapse"], registry=registry)
-            self.forward_codes = Counter('axon_forward_codes', 'forward_codes', ["code"], registry=registry)
-            self.backward_codes = Counter('axon_backward_codes', 'backward_codes', ["code"], registry=registry)
-            self.forward_hotkeys = Counter('axon_forward_hotkeys', 'forward_hotkeys', ["hotkey"], registry=registry)
-            self.backward_hotkeys = Counter('axon_backward_hotkeys', 'backward_hotkeys', ["hotkey"], registry=registry)
-            self.forward_bytes = Counter('axon_forward_bytes', 'forward_bytes', ["hotkey"], registry=registry)
-            self.backward_bytes = Counter('axon_backward_bytes', 'backward_bytes', ["hotkey"], registry=registry)
+        self.priority_threadpool = priority_threadpool
+        self._prometheus_uuid = uuid.uuid4() # Random per-axon metrics label; uuid1 would embed the host MAC address and a timestamp.
 
     def __str__(self) -> str:
         return "Axon({}, {}, {}, {})".format( self.ip, self.port, self.wallet.hotkey.ss58_address, "started" if self.started else "stopped")
@@ -239,17 +235,17 @@ def check_if_should_return() -> bool:
         def finalize_codes_stats_and_logs( message = None):
             # === Prometheus
             if self.prometheus_level != bittensor.prometheus.level.OFF.name:
-                self.total_forward.inc()
-                self.forward_latency.observe( clock.time() - start_time )
+                PROM_total_forward.labels( wallet = self.wallet.hotkey.ss58_address, identifier = self._prometheus_uuid ).inc()
+                PROM_forward_latency.labels( wallet = self.wallet.hotkey.ss58_address, identifier = self._prometheus_uuid ).observe( clock.time() - start_time )
                 if self.prometheus_level == bittensor.prometheus.level.DEBUG.name:
-                    self.forward_hotkeys.labels( request.hotkey ).inc()
-                    self.forward_bytes.labels( request.hotkey ).inc( sys.getsizeof( request ) )
+                    PROM_forward_hotkeys.labels( wallet = self.wallet.hotkey.ss58_address, identifier = self._prometheus_uuid, hotkey = request.hotkey ).inc()
+                    PROM_forward_bytes.labels( wallet = self.wallet.hotkey.ss58_address, identifier = self._prometheus_uuid, hotkey = request.hotkey ).inc( sys.getsizeof( request ) )
 
             for index, synapse in enumerate( synapses ):
                 # === Prometheus
                 if self.prometheus_level != bittensor.prometheus.level.OFF.name:
-                    self.forward_synapses.labels( str(synapse) ).inc()
-                    self.forward_codes.labels( str(synapse_codes[ index ]) ).inc()
+                    PROM_forward_synapses.labels( wallet = self.wallet.hotkey.ss58_address, identifier = self._prometheus_uuid, synapse = str(synapse) ).inc()
+                    PROM_forward_codes.labels( wallet = self.wallet.hotkey.ss58_address, identifier = self._prometheus_uuid, code = str(synapse_codes[ index ]) ).inc()
 
                 # === Logging
                 request.synapses [ index ].return_code = synapse_codes[ index ] # Set synapse wire proto codes.
@@ -261,7 +257,7 @@ def finalize_codes_stats_and_logs( message = None):
                     code = synapse_codes[ index ], 
                     call_time = synapse_call_times[ index ], 
                     pubkey = request.hotkey, 
-                    inputs = synapse_inputs [index] , 
+                    inputs = list( deserialized_forward_tensors [index].shape ) if deserialized_forward_tensors [index] is not None else None , 
                     outputs = None if synapse_responses[index] == None else list( synapse_responses[index].shape ), 
                     message = synapse_messages[ index ] if message == None else message,
                     synapse = synapse.synapse_type
@@ -471,17 +467,17 @@ def check_if_should_return() -> bool:
         def finalize_codes_stats_and_logs():
             # === Prometheus
             if self.prometheus_level != bittensor.prometheus.level.OFF.name:
-                self.total_backward.inc()
-                self.backward_latency.observe( clock.time() - start_time )
+                PROM_total_backward.labels( wallet = self.wallet.hotkey.ss58_address, identifier = self._prometheus_uuid ).inc()
+                PROM_backward_latency.labels( wallet = self.wallet.hotkey.ss58_address, identifier = self._prometheus_uuid ).observe( clock.time() - start_time )
                 if self.prometheus_level == bittensor.prometheus.level.DEBUG.name:
-                    self.backward_hotkeys.labels( request.hotkey ).inc()
-                    self.backward_bytes.labels( request.hotkey ).inc( sys.getsizeof( request ) )
+                    PROM_backward_hotkeys.labels( wallet = self.wallet.hotkey.ss58_address, identifier = self._prometheus_uuid, hotkey = request.hotkey ).inc()
+                    PROM_backward_bytes.labels( wallet = self.wallet.hotkey.ss58_address, identifier = self._prometheus_uuid, hotkey = request.hotkey ).inc( sys.getsizeof( request ) )
 
             for index, synapse in enumerate( synapses ):
                 # === Prometheus
                 if self.prometheus_level != bittensor.prometheus.level.OFF.name:
-                    self.backward_synapses.labels( str(synapse) ).inc()
-                    self.backward_codes.labels( str(synapse_codes[ index ]) ).inc()
+                    PROM_backward_synapses.labels( wallet = self.wallet.hotkey.ss58_address, identifier = self._prometheus_uuid, synapse = str(synapse) ).inc()
+                    PROM_backward_codes.labels( wallet = self.wallet.hotkey.ss58_address, identifier = self._prometheus_uuid, code = str(synapse_codes[ index ]) ).inc()
 
                 # === Logging
                 request.synapses [ index ].return_code = synapse_codes[ index ] # Set synapse wire proto codes.
@@ -818,7 +814,7 @@ def start(self) -> 'Axon':
 
         # Switch prometheus ENUM.
         if self.prometheus_level != bittensor.prometheus.level.OFF.name:
-            self.is_started.state('started')
+            PROM_axon_is_started.state('started')
 
         return self
 
@@ -832,7 +828,7 @@ def stop(self) -> 'Axon':
 
         # Switch prometheus ENUM.
         if self.prometheus_level != bittensor.prometheus.level.OFF.name:
-            self.is_started.state('stopped')
+            PROM_axon_is_started.state('stopped')
 
         return self