BitMind-AI · dylanuys · Jan 23, 2025 · Jan 22, 2025 · Jan 22, 2025 · Jan 22, 2025
diff --git a/bitmind/synthetic_data_generation/synthetic_data_generator.py b/bitmind/synthetic_data_generation/synthetic_data_generator.py
@@ -10,6 +10,7 @@
 
 import bittensor as bt
 import numpy as np
+import random
 import torch
 from diffusers.utils import export_to_video
 from PIL import Image
@@ -266,21 +267,22 @@ def _run_generation(
             gen_args['mask_image'] = create_random_mask(image.size)
             gen_args['image'] = image
 
-        # Process generation arguments
+        # Sample concrete values for {'min','max'} ranges and {'options'} choices
         for k, v in gen_args.items():
             if isinstance(v, dict):
-                gen_args[k] = np.random.randint(
-                    gen_args[k]['min'],
-                    gen_args[k]['max']
-                )
-            for dim in ('height', 'width'):
-                if isinstance(gen_args.get(dim), list):
-                    gen_args[dim] = np.random.choice(gen_args[dim])
+                if "min" in v and "max" in v:
+                    gen_args[k] = np.random.randint(v['min'], v['max'])  # high is exclusive
+                if "options" in v:
+                    gen_args[k] = random.choice(v['options'])
 
         try:
+            # Always drop config-only 'resolution', even if target size overrides it
+            resolution = gen_args.pop('resolution', None)
             if generate_at_target_size:
                 gen_args['height'] = TARGET_IMAGE_SIZE[0]
                 gen_args['width'] = TARGET_IMAGE_SIZE[1]
+            elif resolution is not None:
+                gen_args['height'], gen_args['width'] = resolution[0], resolution[1]
 
             truncated_prompt = truncate_prompt_if_too_long(
                 prompt,
@@ -289,6 +291,7 @@ def _run_generation(
 
             bt.logging.info(f"Generating media from prompt: {truncated_prompt}")
             bt.logging.info(f"Generation args: {gen_args}")
+
             start_time = time.time()
             if model_config.get('use_autocast', True):
                 pretrained_args = model_config.get('from_pretrained_args', {})
@@ -347,12 +350,21 @@ def load_model(self, model_name: Optional[str] = None, modality: Optional[str] =
             self.model_name = model_name
 
         bt.logging.info(f"Loading {self.model_name}")
-        
+
         pipeline_cls = MODELS[model_name]['pipeline_cls']
-        pipeline_args = MODELS[model_name]['from_pretrained_args']
+        # Work on a shallow copy: resolving loaders and popping 'model_id' on
+        # the shared MODELS entry would break the next load of this model
+        pipeline_args = dict(MODELS[model_name]['from_pretrained_args'])
+        for k, v in pipeline_args.items():
+            # (fn, kwargs) tuples are deferred loaders; call them now
+            if isinstance(v, tuple) and callable(v[0]):
+                pipeline_args[k] = v[0](**v[1])
+
+        # Optional base-checkpoint override; defaults to the model name
+        model_id = pipeline_args.pop('model_id', model_name)
 
         self.model = pipeline_cls.from_pretrained(
-            pipeline_args.get('base', model_name),
+            model_id,
             cache_dir=HUGGINGFACE_CACHE_DIR,
             **pipeline_args,
             add_watermarker=False

diff --git a/bitmind/validator/config.py b/bitmind/validator/config.py
@@ -9,12 +9,13 @@
     FluxPipeline,
     CogVideoXPipeline,
     MochiPipeline,
+    HunyuanVideoPipeline,
     AnimateDiffPipeline,
     EulerDiscreteScheduler,
-    AutoPipelineForInpainting
+    AutoPipelineForInpainting,
 )
 
-from .model_utils import load_annimatediff_motion_adapter
+from .model_utils import load_annimatediff_motion_adapter, load_hunyuanvideo_transformer
 
 
 TARGET_IMAGE_SIZE: tuple[int, int] = (256, 256)
@@ -171,16 +172,46 @@
 
 # Text-to-video model configurations
 T2V_MODELS: Dict[str, Dict[str, Any]] = {
+    "tencent/HunyuanVideo": {
+        "pipeline_cls": HunyuanVideoPipeline,
+        "from_pretrained_args": {
+            "model_id": "tencent/HunyuanVideo",
+            "transformer": (  # custom functions supplied as tuple of (fn, args)
+                load_hunyuanvideo_transformer,
+                { 
+                    "model_id": "tencent/HunyuanVideo",
+                    "subfolder": "transformer",
+                    "torch_dtype": torch.bfloat16,
+                    "revision": 'refs/pr/18'
+                }
+            ),
+            "revision": 'refs/pr/18',
+            "torch_dtype": torch.bfloat16
+        },
+        "generate_args": {
+            "num_frames": {"min": 61, "max": 129},
+            "resolution": {"options": [
+                [720, 1280], [1280, 720], [1104, 832], [832,1104], [960,960],
+                [544, 960], [960, 544],	[624, 832], [832, 624],	[720, 720]
+            ]},
+            "num_inference_steps": {"min": 30, "max": 50},
+        },
+        "save_args": {"fps": 30},
+        "use_autocast": False,
+        "vae_enable_tiling": True
+    },
     "genmo/mochi-1-preview": {
         "pipeline_cls": MochiPipeline,
         "from_pretrained_args": {
             "variant": "bf16", 
             "torch_dtype": torch.bfloat16
         },
         "generate_args": {
-            "num_frames": 84
+            "num_frames": 84,
+            "num_inference_steps": {"min": 30, "max": 65},
+            "resolution": [480, 848]
         },
-        #"enable_model_cpu_offload": True,
+        "save_args": {"fps": 30},
         "vae_enable_tiling": True
     },
     'THUDM/CogVideoX-5b': {
@@ -194,7 +225,9 @@
             "num_videos_per_prompt": 1,
             "num_inference_steps": {"min": 50, "max": 125},
             "num_frames": 48,
+            "resolution": [720, 480]
         },
+        "save_args": {"fps": 8},
         "enable_model_cpu_offload": True,
         #"enable_sequential_cpu_offload": True,
         "vae_enable_slicing": True,
@@ -203,14 +236,23 @@
     'ByteDance/AnimateDiff-Lightning': {
         "pipeline_cls": AnimateDiffPipeline,
         "from_pretrained_args": {
-            "base": "emilianJR/epiCRealism",
+            "model_id": "emilianJR/epiCRealism",
             "torch_dtype": torch.bfloat16,
-            "motion_adapter": load_annimatediff_motion_adapter()
+            "motion_adapter": (
+                load_annimatediff_motion_adapter,
+                {"step": 4}
+            )
         },
         "generate_args": {
             "guidance_scale": 2,
             "num_inference_steps": {"min": 50, "max": 125},
+            "resolution": {"options": [
+                [512, 512], [512, 768], [512, 1024],
+                [768, 512], [768, 768], [768, 1024],
+                [1024, 512], [1024, 768], [1024, 1024]
+            ]}
         },
+        "save_args": {"fps": 15},
         "scheduler": {
             "cls": EulerDiscreteScheduler,
             "from_config_args": {

diff --git a/bitmind/validator/model_utils.py b/bitmind/validator/model_utils.py
@@ -1,9 +1,31 @@
 import torch
-from diffusers import MotionAdapter
+from diffusers import MotionAdapter, HunyuanVideoTransformer3DModel
 from huggingface_hub import hf_hub_download
 from safetensors.torch import load_file
 
 
+def load_hunyuanvideo_transformer(
+    model_id: str = "tencent/HunyuanVideo",
+    subfolder: str = "transformer",
+    torch_dtype: torch.dtype = torch.bfloat16,
+    revision: str = 'refs/pr/18'
+) -> HunyuanVideoTransformer3DModel:
+    """Load the HunyuanVideo transformer via ``from_pretrained``.
+
+    Args:
+        model_id: Repository id or path handed to ``from_pretrained``.
+        subfolder: Subfolder of the repository holding the transformer.
+        torch_dtype: dtype forwarded to ``from_pretrained``.
+        revision: Repository revision to load.
+
+    Returns:
+        The loaded ``HunyuanVideoTransformer3DModel``.
+    """
+    return HunyuanVideoTransformer3DModel.from_pretrained(
+        model_id, subfolder=subfolder, torch_dtype=torch_dtype, revision=revision
+    )
+
+
 def load_annimatediff_motion_adapter(
     step: int = 4
 ) -> MotionAdapter:

diff --git a/requirements.txt b/requirements.txt
@@ -9,7 +9,7 @@ scikit-learn==1.5.2
 
 # Deep learning tools
 transformers==4.47.0
-#git+https://github.com/huggingface/diffusers.git@6a51427b6a226591ccc40249721c486855f53e1c#egg=diffusers
+diffusers==0.32.2
 accelerate==1.2.0
 bitsandbytes==0.45.0
 sentencepiece==0.2.0

diff --git a/setup_env.sh b/setup_env.sh
@@ -32,7 +32,6 @@ sudo npm install -g pm2@latest
 # Python Package Installation
 ############################
 
-pip install git+https://github.com/huggingface/diffusers.git@6a51427b6a226591ccc40249721c486855f53e1c
 pip install -e .
 
 ############################