From e9f10abc955f24b32e0f39e075f0411c37b3cccd Mon Sep 17 00:00:00 2001
From: Dylan Uys <dylan.uys@gmail.com>
Date: Tue, 21 Jan 2025 21:08:28 -0800
Subject: [PATCH 1/7] hunyuan video initial commit

---
 .../synthetic_data_generator.py               | 19 +++++++---
 bitmind/validator/config.py                   | 37 +++++++++++++++++--
 bitmind/validator/model_utils.py              | 13 ++++++-
 requirements.txt                              |  2 +-
 setup_env.sh                                  |  1 -
 5 files changed, 60 insertions(+), 12 deletions(-)

diff --git a/bitmind/synthetic_data_generation/synthetic_data_generator.py b/bitmind/synthetic_data_generation/synthetic_data_generator.py
index 8899a5f5..5087e4e1 100644
--- a/bitmind/synthetic_data_generation/synthetic_data_generator.py
+++ b/bitmind/synthetic_data_generation/synthetic_data_generator.py
@@ -10,6 +10,7 @@
 
 import bittensor as bt
 import numpy as np
+import random
 import torch
 from diffusers.utils import export_to_video
 from PIL import Image
@@ -269,10 +270,11 @@ def _run_generation(
         # Process generation arguments
         for k, v in gen_args.items():
             if isinstance(v, dict):
-                gen_args[k] = np.random.randint(
-                    gen_args[k]['min'],
-                    gen_args[k]['max']
-                )
+                if "min" in v and "max" in v:
+                    gen_args[k] = np.random.randint(v['min'], v['max'])
+                if "options" in v:
+                    gen_args[k] = random.choice(v['options'])
+
             for dim in ('height', 'width'):
                 if isinstance(gen_args.get(dim), list):
                     gen_args[dim] = np.random.choice(gen_args[dim])
@@ -281,6 +283,9 @@ def _run_generation(
             if generate_at_target_size:
                 gen_args['height'] = TARGET_IMAGE_SIZE[0]
                 gen_args['width'] = TARGET_IMAGE_SIZE[1]
+            elif 'resolution' in gen_args:
+                gen_args['height'] = gen_args['resolution'][0]
+                gen_args['width'] = gen_args['resolution'][1]
 
             truncated_prompt = truncate_prompt_if_too_long(
                 prompt,
@@ -289,6 +294,7 @@ def _run_generation(
 
             bt.logging.info(f"Generating media from prompt: {truncated_prompt}")
             bt.logging.info(f"Generation args: {gen_args}")
+
             start_time = time.time()
             if model_config.get('use_autocast', True):
                 pretrained_args = model_config.get('from_pretrained_args', {})
@@ -347,9 +353,12 @@ def load_model(self, model_name: Optional[str] = None, modality: Optional[str] =
             self.model_name = model_name
 
         bt.logging.info(f"Loading {self.model_name}")
-        
+
         pipeline_cls = MODELS[model_name]['pipeline_cls']
         pipeline_args = MODELS[model_name]['from_pretrained_args']
+        for k, v in pipeline_args.items():
+            if isinstance(v, tuple) and callable(v[0]):
+                pipeline_args[k] = v[0](**v[1])
 
         self.model = pipeline_cls.from_pretrained(
             pipeline_args.get('base', model_name),
diff --git a/bitmind/validator/config.py b/bitmind/validator/config.py
index 91b0d574..7b97aa62 100644
--- a/bitmind/validator/config.py
+++ b/bitmind/validator/config.py
@@ -9,12 +9,13 @@
     FluxPipeline,
     CogVideoXPipeline,
     MochiPipeline,
+    HunyuanVideoPipeline,
     AnimateDiffPipeline,
     EulerDiscreteScheduler,
-    AutoPipelineForInpainting
+    AutoPipelineForInpainting,
 )
 
-from .model_utils import load_annimatediff_motion_adapter
+from .model_utils import load_annimatediff_motion_adapter, load_hunyuanvideo_transformer
 
 
 TARGET_IMAGE_SIZE: tuple[int, int] = (256, 256)
@@ -171,6 +172,32 @@
 
 # Text-to-video model configurations
 T2V_MODELS: Dict[str, Dict[str, Any]] = {
+    "tencent/HunyuanVideo": {
+        "pipeline_cls": HunyuanVideoPipeline,
+        "from_pretrained_args": {
+            # custom functions supplied as tuple of (fn, args)
+            "transformer": (
+                load_hunyuanvideo_transformer,
+                { 
+                    "model_id": "tencent/HunyuanVideo",
+                    "subfolder": "transformer",
+                    "torch_dtype": torch.bfloat16,
+                    "revision": 'refs/pr/18'
+                }
+            ),
+            "revision": 'refs/pr/18',
+            "torch_dtype": torch.float16
+        },
+        "generate_args": {
+            "num_frames": {"min": 61, "max": 129},
+            "resolution": {"options": [
+                [720, 1280], [1280, 720], [1104, 832], [832,1104], [960,960],
+                [544, 960], [960, 544],	[624, 832], [832, 624],	[720, 720]
+            ]},
+            "num_inference_steps":  {"min": 30, "max": 50},
+        },
+        "vae_enable_tiling": True
+    },
     "genmo/mochi-1-preview": {
         "pipeline_cls": MochiPipeline,
         "from_pretrained_args": {
@@ -180,7 +207,6 @@
         "generate_args": {
             "num_frames": 84
         },
-        #"enable_model_cpu_offload": True,
         "vae_enable_tiling": True
     },
     'THUDM/CogVideoX-5b': {
@@ -205,7 +231,10 @@
         "from_pretrained_args": {
             "base": "emilianJR/epiCRealism",
             "torch_dtype": torch.bfloat16,
-            "motion_adapter": load_annimatediff_motion_adapter()
+            "motion_adapter": (
+                load_annimatediff_motion_adapter,
+                {"step": 4}
+            )
         },
         "generate_args": {
             "guidance_scale": 2,
diff --git a/bitmind/validator/model_utils.py b/bitmind/validator/model_utils.py
index 36b90ad0..b0c414b0 100644
--- a/bitmind/validator/model_utils.py
+++ b/bitmind/validator/model_utils.py
@@ -1,9 +1,20 @@
 import torch
-from diffusers import MotionAdapter
+from diffusers import MotionAdapter, HunyuanVideoTransformer3DModel
 from huggingface_hub import hf_hub_download
 from safetensors.torch import load_file
 
 
+def load_hunyuanvideo_transformer(
+      model_id: str = "tencent/HunyuanVideo",
+      subfolder: str = "transformer",
+      torch_dtype: torch.dtype = torch.bfloat16, 
+      revision: str = 'refs/pr/18'
+):
+    return HunyuanVideoTransformer3DModel.from_pretrained(
+        model_id, subfolder=subfolder, torch_dtype=torch_dtype, revision=revision
+    )
+
+
 def load_annimatediff_motion_adapter(
     step: int = 4
 ) -> MotionAdapter:
diff --git a/requirements.txt b/requirements.txt
index 80ca086d..7c4725f5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,7 +9,7 @@ scikit-learn==1.5.2
 
 # Deep learning tools
 transformers==4.47.0
-#git+https://github.com/huggingface/diffusers.git@6a51427b6a226591ccc40249721c486855f53e1c#egg=diffusers
+diffusers==0.32.2
 accelerate==1.2.0
 bitsandbytes==0.45.0
 sentencepiece==0.2.0
diff --git a/setup_env.sh b/setup_env.sh
index ebd3be0b..0934c966 100755
--- a/setup_env.sh
+++ b/setup_env.sh
@@ -32,7 +32,6 @@ sudo npm install -g pm2@latest
 # Python Package Installation
 ############################
 
-pip install git+https://github.com/huggingface/diffusers.git@6a51427b6a226591ccc40249721c486855f53e1c
 pip install -e .
 
 ############################

From 21aa0537ff2a61adb9910bc6dbce18ba5b174594 Mon Sep 17 00:00:00 2001
From: Dylan Uys <dylan.uys@gmail.com>
Date: Wed, 22 Jan 2025 07:50:36 +0000
Subject: [PATCH 2/7] delete resolution from gen_args after
 extracting h,w

---
 bitmind/synthetic_data_generation/synthetic_data_generator.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bitmind/synthetic_data_generation/synthetic_data_generator.py b/bitmind/synthetic_data_generation/synthetic_data_generator.py
index 5087e4e1..54f20aab 100644
--- a/bitmind/synthetic_data_generation/synthetic_data_generator.py
+++ b/bitmind/synthetic_data_generation/synthetic_data_generator.py
@@ -286,6 +286,7 @@ def _run_generation(
             elif 'resolution' in gen_args:
                 gen_args['height'] = gen_args['resolution'][0]
                 gen_args['width'] = gen_args['resolution'][1]
+                del gen_args['resolution']
 
             truncated_prompt = truncate_prompt_if_too_long(
                 prompt,

From 1ee6b04bdc58749368aeead521373711a591a0fe Mon Sep 17 00:00:00 2001
From: Dylan Uys <dylan.uys@gmail.com>
Date: Tue, 21 Jan 2025 23:58:51 -0800
Subject: [PATCH 3/7] model_id arg for from_pretrained

---
 bitmind/validator/config.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bitmind/validator/config.py b/bitmind/validator/config.py
index 7b97aa62..0a11c261 100644
--- a/bitmind/validator/config.py
+++ b/bitmind/validator/config.py
@@ -175,8 +175,8 @@
     "tencent/HunyuanVideo": {
         "pipeline_cls": HunyuanVideoPipeline,
         "from_pretrained_args": {
-            # custom functions supplied as tuple of (fn, args)
-            "transformer": (
+            "model_id": "tencent/HunyuanVideo",
+            "transformer": (  # custom functions supplied as tuple of (fn, args)
                 load_hunyuanvideo_transformer,
                 { 
                     "model_id": "tencent/HunyuanVideo",

From 44356706460595e3266054bf4f56be96e7fa3c24 Mon Sep 17 00:00:00 2001
From: Dylan Uys <dylan.uys@gmail.com>
Date: Wed, 22 Jan 2025 23:26:48 +0000
Subject: [PATCH 4/7] standardizing model_id usage

---
 .../synthetic_data_generation/synthetic_data_generator.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/bitmind/synthetic_data_generation/synthetic_data_generator.py b/bitmind/synthetic_data_generation/synthetic_data_generator.py
index 54f20aab..deaa36d1 100644
--- a/bitmind/synthetic_data_generation/synthetic_data_generator.py
+++ b/bitmind/synthetic_data_generation/synthetic_data_generator.py
@@ -361,8 +361,14 @@ def load_model(self, model_name: Optional[str] = None, modality: Optional[str] =
             if isinstance(v, tuple) and callable(v[0]):
                 pipeline_args[k] = v[0](**v[1])
 
+        if 'model_id' in pipeline_args:
+            model_id = pipeline_args['model_id']
+            del pipeline_args['model_id']
+        else:
+            model_id = model_name
+
         self.model = pipeline_cls.from_pretrained(
-            pipeline_args.get('base', model_name),
+            model_id,
             cache_dir=HUGGINGFACE_CACHE_DIR,
             **pipeline_args,
             add_watermarker=False

From e2c039ffa50ca6f336559c1c87e43c6e0d340ef7 Mon Sep 17 00:00:00 2001
From: Dylan Uys <dylan.uys@gmail.com>
Date: Wed, 22 Jan 2025 23:27:14 +0000
Subject: [PATCH 5/7] fixing autocast and torch_dtype for hunyuan

---
 bitmind/validator/config.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/bitmind/validator/config.py b/bitmind/validator/config.py
index 0a11c261..03d08918 100644
--- a/bitmind/validator/config.py
+++ b/bitmind/validator/config.py
@@ -186,7 +186,7 @@
                 }
             ),
             "revision": 'refs/pr/18',
-            "torch_dtype": torch.float16
+            "torch_dtype": torch.bfloat16
         },
         "generate_args": {
             "num_frames": {"min": 61, "max": 129},
@@ -196,6 +196,7 @@
             ]},
             "num_inference_steps":  {"min": 30, "max": 50},
         },
+        "use_autocast": False,
         "vae_enable_tiling": True
     },
     "genmo/mochi-1-preview": {
@@ -229,7 +230,7 @@
     'ByteDance/AnimateDiff-Lightning': {
         "pipeline_cls": AnimateDiffPipeline,
         "from_pretrained_args": {
-            "base": "emilianJR/epiCRealism",
+            "model_id": "emilianJR/epiCRealism",
             "torch_dtype": torch.bfloat16,
             "motion_adapter": (
                 load_annimatediff_motion_adapter,

From 6305e2c7c1be96e395514755dbd9f1faf685d4c7 Mon Sep 17 00:00:00 2001
From: Dylan Uys <dylan.uys@gmail.com>
Date: Thu, 23 Jan 2025 02:47:06 +0000
Subject: [PATCH 6/7] adding resolution options and save options for all t2v
 models

---
 .../synthetic_data_generator.py                  |  6 +-----
 bitmind/validator/config.py                      | 16 ++++++++++++++--
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/bitmind/synthetic_data_generation/synthetic_data_generator.py b/bitmind/synthetic_data_generation/synthetic_data_generator.py
index deaa36d1..7bce9d7a 100644
--- a/bitmind/synthetic_data_generation/synthetic_data_generator.py
+++ b/bitmind/synthetic_data_generation/synthetic_data_generator.py
@@ -267,7 +267,7 @@ def _run_generation(
             gen_args['mask_image'] = create_random_mask(image.size)
             gen_args['image'] = image
 
-        # Process generation arguments
+        # Prepare generation arguments
         for k, v in gen_args.items():
             if isinstance(v, dict):
                 if "min" in v and "max" in v:
@@ -275,10 +275,6 @@ def _run_generation(
                 if "options" in v:
                     gen_args[k] = random.choice(v['options'])
 
-            for dim in ('height', 'width'):
-                if isinstance(gen_args.get(dim), list):
-                    gen_args[dim] = np.random.choice(gen_args[dim])
-
         try:
             if generate_at_target_size:
                 gen_args['height'] = TARGET_IMAGE_SIZE[0]
diff --git a/bitmind/validator/config.py b/bitmind/validator/config.py
index 03d08918..b4ecf3b1 100644
--- a/bitmind/validator/config.py
+++ b/bitmind/validator/config.py
@@ -194,8 +194,9 @@
                 [720, 1280], [1280, 720], [1104, 832], [832,1104], [960,960],
                 [544, 960], [960, 544],	[624, 832], [832, 624],	[720, 720]
             ]},
-            "num_inference_steps":  {"min": 30, "max": 50},
+            "num_inference_steps": {"min": 30, "max": 50},
         },
+        "save_args": {"fps": 30},
         "use_autocast": False,
         "vae_enable_tiling": True
     },
@@ -206,8 +207,11 @@
             "torch_dtype": torch.bfloat16
         },
         "generate_args": {
-            "num_frames": 84
+            "num_frames": 84,
+            "num_inference_steps": {"min": 30, "max": 65},
+            "resolution": [480, 848]
         },
+        "save_args": {"fps": 30}
         "vae_enable_tiling": True
     },
     'THUDM/CogVideoX-5b': {
@@ -221,7 +225,9 @@
             "num_videos_per_prompt": 1,
             "num_inference_steps": {"min": 50, "max": 125},
             "num_frames": 48,
+            "resolution": [720, 480]
         },
+        "save_args": {"fps": 8},
         "enable_model_cpu_offload": True,
         #"enable_sequential_cpu_offload": True,
         "vae_enable_slicing": True,
@@ -240,7 +246,13 @@
         "generate_args": {
             "guidance_scale": 2,
             "num_inference_steps": {"min": 50, "max": 125},
+            "resolution": {"options": [
+                [512, 512], [512, 768], [512, 1024],
+                [768, 512], [768, 768], [768, 1024],
+                [1024, 512], [1024, 768], [1024, 1024]
+            ]}
         },
+        "save_args": {"fps": 15},
         "scheduler": {
             "cls": EulerDiscreteScheduler,
             "from_config_args": {

From a966d62ab625f4f2739dd8fbf88f1d9d765ef8b6 Mon Sep 17 00:00:00 2001
From: Dylan Uys <dylan.uys@gmail.com>
Date: Thu, 23 Jan 2025 03:52:01 +0000
Subject: [PATCH 7/7] missing comma in config

---
 bitmind/validator/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bitmind/validator/config.py b/bitmind/validator/config.py
index b4ecf3b1..e77deed0 100644
--- a/bitmind/validator/config.py
+++ b/bitmind/validator/config.py
@@ -211,7 +211,7 @@
             "num_inference_steps": {"min": 30, "max": 65},
             "resolution": [480, 848]
         },
-        "save_args": {"fps": 30}
+        "save_args": {"fps": 30},
         "vae_enable_tiling": True
     },
     'THUDM/CogVideoX-5b': {