# Review commentary (placed ahead of the first `diff --git` header).
# NOTE(review): the patch text below has had its line breaks collapsed; it is
# reproduced unchanged and has to be re-split into lines before it can be applied.
#
# What the patch changes, per file:
#   bitmind/synthetic_data_generation/synthetic_data_generator.py
#     - _run_generation: dict-valued generate args are resolved by sampling:
#       {"min", "max"} through np.random.randint and {"options"} through
#       random.choice (replacing the old height/width list handling). When
#       generate_at_target_size is false, a 'resolution' entry is unpacked as
#       [0] -> height, [1] -> width and then deleted. start_time is recorded
#       just before the autocast check.
#     - load_model: values in from_pretrained_args that are (callable, kwargs)
#       tuples are replaced by callable(**kwargs); 'model_id' (previously 'base')
#       is removed from the args and passed as the first from_pretrained
#       argument, defaulting to model_name.
#   bitmind/validator/config.py
#     - imports HunyuanVideoPipeline and load_hunyuanvideo_transformer, adds a
#       "tencent/HunyuanVideo" entry to T2V_MODELS, adds "resolution",
#       "num_inference_steps" and "save_args" values to existing entries, and
#       supplies the AnimateDiff motion adapter as (fn, kwargs) instead of
#       calling the loader inside the config literal.
#   bitmind/validator/model_utils.py
#     - adds load_hunyuanvideo_transformer, a wrapper around
#       HunyuanVideoTransformer3DModel.from_pretrained.
#   requirements.txt / setup_env.sh
#     - adds diffusers==0.32.2 in place of the commented-out git requirement and
#       drops the `pip install git+...diffusers...` step.
#
# NOTE(review): load_model binds pipeline_args to
# MODELS[model_name]['from_pretrained_args'] and then assigns into it and deletes
# 'model_id'. No copy is visible in the hunk, so the shared config appears to be
# mutated: on a second load_model call for the same model the 'model_id' key is
# gone and model_id falls back to model_name (for 'ByteDance/AnimateDiff-Lightning'
# that is not the configured "emilianJR/epiCRealism"), and the tuple entries have
# already been replaced by loaded objects. Consider working on a copy - confirm.
# NOTE(review): when generate_at_target_size is true the 'resolution' key is not
# deleted from gen_args. If gen_args is later expanded into the pipeline call
# (not visible here) that key would be passed through - confirm.
# NOTE(review): np.random.randint excludes its upper bound, so the configured
# "max" values are never drawn - confirm that is intended.
# NOTE(review): `start_time = time.time()` is added but no `import time` appears
# in this patch - confirm the module is already imported in the file.
# NOTE(review): 'resolution' is read as [height, width]; CogVideoX is given
# [720, 480], i.e. height 720 x width 480 - confirm the orientation.
diff --git a/bitmind/synthetic_data_generation/synthetic_data_generator.py b/bitmind/synthetic_data_generation/synthetic_data_generator.py index 8899a5f5..7bce9d7a 100644 --- a/bitmind/synthetic_data_generation/synthetic_data_generator.py +++ b/bitmind/synthetic_data_generation/synthetic_data_generator.py @@ -10,6 +10,7 @@ import bittensor as bt import numpy as np +import random import torch from diffusers.utils import export_to_video from PIL import Image @@ -266,21 +267,22 @@ def _run_generation( gen_args['mask_image'] = create_random_mask(image.size) gen_args['image'] = image - # Process generation arguments + # Prepare generation arguments for k, v in gen_args.items(): if isinstance(v, dict): - gen_args[k] = np.random.randint( - gen_args[k]['min'], - gen_args[k]['max'] - ) - for dim in ('height', 'width'): - if isinstance(gen_args.get(dim), list): - gen_args[dim] = np.random.choice(gen_args[dim]) + if "min" in v and "max" in v: + gen_args[k] = np.random.randint(v['min'], v['max']) + if "options" in v: + gen_args[k] = random.choice(v['options']) try: if generate_at_target_size: gen_args['height'] = TARGET_IMAGE_SIZE[0] gen_args['width'] = TARGET_IMAGE_SIZE[1] + elif 'resolution' in gen_args: + gen_args['height'] = gen_args['resolution'][0] + gen_args['width'] = gen_args['resolution'][1] + del gen_args['resolution'] truncated_prompt = truncate_prompt_if_too_long( prompt, @@ -289,6 +291,7 @@ def _run_generation( bt.logging.info(f"Generating media from prompt: {truncated_prompt}") bt.logging.info(f"Generation args: {gen_args}") + start_time = time.time() if model_config.get('use_autocast', True): pretrained_args = model_config.get('from_pretrained_args', {}) @@ -347,12 +350,21 @@ def load_model(self, model_name: Optional[str] = None, modality: Optional[str] = self.model_name = model_name bt.logging.info(f"Loading {self.model_name}") - + pipeline_cls = MODELS[model_name]['pipeline_cls'] pipeline_args = MODELS[model_name]['from_pretrained_args'] + for k, v in 
pipeline_args.items(): + if isinstance(v, tuple) and callable(v[0]): + pipeline_args[k] = v[0](**v[1]) + + if 'model_id' in pipeline_args: + model_id = pipeline_args['model_id'] + del pipeline_args['model_id'] + else: + model_id = model_name self.model = pipeline_cls.from_pretrained( - pipeline_args.get('base', model_name), + model_id, cache_dir=HUGGINGFACE_CACHE_DIR, **pipeline_args, add_watermarker=False diff --git a/bitmind/validator/config.py b/bitmind/validator/config.py index 91b0d574..e77deed0 100644 --- a/bitmind/validator/config.py +++ b/bitmind/validator/config.py @@ -9,12 +9,13 @@ FluxPipeline, CogVideoXPipeline, MochiPipeline, + HunyuanVideoPipeline, AnimateDiffPipeline, EulerDiscreteScheduler, - AutoPipelineForInpainting + AutoPipelineForInpainting, ) -from .model_utils import load_annimatediff_motion_adapter +from .model_utils import load_annimatediff_motion_adapter, load_hunyuanvideo_transformer TARGET_IMAGE_SIZE: tuple[int, int] = (256, 256) @@ -171,6 +172,34 @@ # Text-to-video model configurations T2V_MODELS: Dict[str, Dict[str, Any]] = { + "tencent/HunyuanVideo": { + "pipeline_cls": HunyuanVideoPipeline, + "from_pretrained_args": { + "model_id": "tencent/HunyuanVideo", + "transformer": ( # custom functions supplied as tuple of (fn, args) + load_hunyuanvideo_transformer, + { + "model_id": "tencent/HunyuanVideo", + "subfolder": "transformer", + "torch_dtype": torch.bfloat16, + "revision": 'refs/pr/18' + } + ), + "revision": 'refs/pr/18', + "torch_dtype": torch.bfloat16 + }, + "generate_args": { + "num_frames": {"min": 61, "max": 129}, + "resolution": {"options": [ + [720, 1280], [1280, 720], [1104, 832], [832,1104], [960,960], + [544, 960], [960, 544], [624, 832], [832, 624], [720, 720] + ]}, + "num_inference_steps": {"min": 30, "max": 50}, + }, + "save_args": {"fps": 30}, + "use_autocast": False, + "vae_enable_tiling": True + }, "genmo/mochi-1-preview": { "pipeline_cls": MochiPipeline, "from_pretrained_args": { @@ -178,9 +207,11 @@ "torch_dtype": 
torch.bfloat16 }, "generate_args": { - "num_frames": 84 + "num_frames": 84, + "num_inference_steps": {"min": 30, "max": 65}, + "resolution": [480, 848] }, - #"enable_model_cpu_offload": True, + "save_args": {"fps": 30}, "vae_enable_tiling": True }, 'THUDM/CogVideoX-5b': { @@ -194,7 +225,9 @@ "num_videos_per_prompt": 1, "num_inference_steps": {"min": 50, "max": 125}, "num_frames": 48, + "resolution": [720, 480] }, + "save_args": {"fps": 8}, "enable_model_cpu_offload": True, #"enable_sequential_cpu_offload": True, "vae_enable_slicing": True, @@ -203,14 +236,23 @@ 'ByteDance/AnimateDiff-Lightning': { "pipeline_cls": AnimateDiffPipeline, "from_pretrained_args": { - "base": "emilianJR/epiCRealism", + "model_id": "emilianJR/epiCRealism", "torch_dtype": torch.bfloat16, - "motion_adapter": load_annimatediff_motion_adapter() + "motion_adapter": ( + load_annimatediff_motion_adapter, + {"step": 4} + ) }, "generate_args": { "guidance_scale": 2, "num_inference_steps": {"min": 50, "max": 125}, + "resolution": {"options": [ + [512, 512], [512, 768], [512, 1024], + [768, 512], [768, 768], [768, 1024], + [1024, 512], [1024, 768], [1024, 1024] + ]} }, + "save_args": {"fps": 15}, "scheduler": { "cls": EulerDiscreteScheduler, "from_config_args": { diff --git a/bitmind/validator/model_utils.py b/bitmind/validator/model_utils.py index 36b90ad0..b0c414b0 100644 --- a/bitmind/validator/model_utils.py +++ b/bitmind/validator/model_utils.py @@ -1,9 +1,20 @@ import torch -from diffusers import MotionAdapter +from diffusers import MotionAdapter, HunyuanVideoTransformer3DModel from huggingface_hub import hf_hub_download from safetensors.torch import load_file +def load_hunyuanvideo_transformer( + model_id: str = "tencent/HunyuanVideo", + subfolder: str = "transformer", + torch_dtype: torch.dtype = torch.bfloat16, + revision: str = 'refs/pr/18' +): + return HunyuanVideoTransformer3DModel.from_pretrained( + model_id, subfolder=subfolder, torch_dtype=torch_dtype, revision=revision + ) + + def 
load_annimatediff_motion_adapter( step: int = 4 ) -> MotionAdapter: diff --git a/requirements.txt b/requirements.txt index 80ca086d..7c4725f5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ scikit-learn==1.5.2 # Deep learning tools transformers==4.47.0 -#git+https://github.com/huggingface/diffusers.git@6a51427b6a226591ccc40249721c486855f53e1c#egg=diffusers +diffusers==0.32.2 accelerate==1.2.0 bitsandbytes==0.45.0 sentencepiece==0.2.0 diff --git a/setup_env.sh b/setup_env.sh index ebd3be0b..0934c966 100755 --- a/setup_env.sh +++ b/setup_env.sh @@ -32,7 +32,6 @@ sudo npm install -g pm2@latest # Python Package Installation ############################ -pip install git+https://github.com/huggingface/diffusers.git@6a51427b6a226591ccc40249721c486855f53e1c pip install -e . ############################