From e9f10abc955f24b32e0f39e075f0411c37b3cccd Mon Sep 17 00:00:00 2001 From: Dylan Uys Date: Tue, 21 Jan 2025 21:08:28 -0800 Subject: [PATCH 1/7] hunyuan video initial commit --- .../synthetic_data_generator.py | 19 +++++++--- bitmind/validator/config.py | 37 +++++++++++++++++-- bitmind/validator/model_utils.py | 13 ++++++- requirements.txt | 2 +- setup_env.sh | 1 - 5 files changed, 60 insertions(+), 12 deletions(-) diff --git a/bitmind/synthetic_data_generation/synthetic_data_generator.py b/bitmind/synthetic_data_generation/synthetic_data_generator.py index 8899a5f5..5087e4e1 100644 --- a/bitmind/synthetic_data_generation/synthetic_data_generator.py +++ b/bitmind/synthetic_data_generation/synthetic_data_generator.py @@ -10,6 +10,7 @@ import bittensor as bt import numpy as np +import random import torch from diffusers.utils import export_to_video from PIL import Image @@ -269,10 +270,11 @@ def _run_generation( # Process generation arguments for k, v in gen_args.items(): if isinstance(v, dict): - gen_args[k] = np.random.randint( - gen_args[k]['min'], - gen_args[k]['max'] - ) + if "min" in v and "max" in v: + gen_args[k] = np.random.randint(v['min'], v['max']) + if "options" in v: + gen_args[k] = random.choice(v['options']) + for dim in ('height', 'width'): if isinstance(gen_args.get(dim), list): gen_args[dim] = np.random.choice(gen_args[dim]) @@ -281,6 +283,9 @@ def _run_generation( if generate_at_target_size: gen_args['height'] = TARGET_IMAGE_SIZE[0] gen_args['width'] = TARGET_IMAGE_SIZE[1] + elif 'resolution' in gen_args: + gen_args['height'] = gen_args['resolution'][0] + gen_args['width'] = gen_args['resolution'][1] truncated_prompt = truncate_prompt_if_too_long( prompt, @@ -289,6 +294,7 @@ def _run_generation( bt.logging.info(f"Generating media from prompt: {truncated_prompt}") bt.logging.info(f"Generation args: {gen_args}") + start_time = time.time() if model_config.get('use_autocast', True): pretrained_args = model_config.get('from_pretrained_args', {}) @@ 
-347,9 +353,12 @@ def load_model(self, model_name: Optional[str] = None, modality: Optional[str] = self.model_name = model_name bt.logging.info(f"Loading {self.model_name}") - + pipeline_cls = MODELS[model_name]['pipeline_cls'] pipeline_args = MODELS[model_name]['from_pretrained_args'] + for k, v in pipeline_args.items(): + if isinstance(v, tuple) and callable(v[0]): + pipeline_args[k] = v[0](**v[1]) self.model = pipeline_cls.from_pretrained( pipeline_args.get('base', model_name), diff --git a/bitmind/validator/config.py b/bitmind/validator/config.py index 91b0d574..7b97aa62 100644 --- a/bitmind/validator/config.py +++ b/bitmind/validator/config.py @@ -9,12 +9,13 @@ FluxPipeline, CogVideoXPipeline, MochiPipeline, + HunyuanVideoPipeline, AnimateDiffPipeline, EulerDiscreteScheduler, - AutoPipelineForInpainting + AutoPipelineForInpainting, ) -from .model_utils import load_annimatediff_motion_adapter +from .model_utils import load_annimatediff_motion_adapter, load_hunyuanvideo_transformer TARGET_IMAGE_SIZE: tuple[int, int] = (256, 256) @@ -171,6 +172,32 @@ # Text-to-video model configurations T2V_MODELS: Dict[str, Dict[str, Any]] = { + "tencent/HunyuanVideo": { + "pipeline_cls": HunyuanVideoPipeline, + "from_pretrained_args": { + # custom functions supplied as tuple of (fn, args) + "transformer": ( + load_hunyuanvideo_transformer, + { + "model_id": "tencent/HunyuanVideo", + "subfolder": "transformer", + "torch_dtype": torch.bfloat16, + "revision": 'refs/pr/18' + } + ), + "revision": 'refs/pr/18', + "torch_dtype": torch.float16 + }, + "generate_args": { + "num_frames": {"min": 61, "max": 129}, + "resolution": {"options": [ + [720, 1280], [1280, 720], [1104, 832], [832,1104], [960,960], + [544, 960], [960, 544], [624, 832], [832, 624], [720, 720] + ]}, + "num_inference_steps": {"min": 30, "max": 50}, + }, + "vae_enable_tiling": True + }, "genmo/mochi-1-preview": { "pipeline_cls": MochiPipeline, "from_pretrained_args": { @@ -180,7 +207,6 @@ "generate_args": { 
"num_frames": 84 }, - #"enable_model_cpu_offload": True, "vae_enable_tiling": True }, 'THUDM/CogVideoX-5b': { @@ -205,7 +231,10 @@ "from_pretrained_args": { "base": "emilianJR/epiCRealism", "torch_dtype": torch.bfloat16, - "motion_adapter": load_annimatediff_motion_adapter() + "motion_adapter": ( + load_annimatediff_motion_adapter, + {"step": 4} + ) }, "generate_args": { "guidance_scale": 2, diff --git a/bitmind/validator/model_utils.py b/bitmind/validator/model_utils.py index 36b90ad0..b0c414b0 100644 --- a/bitmind/validator/model_utils.py +++ b/bitmind/validator/model_utils.py @@ -1,9 +1,20 @@ import torch -from diffusers import MotionAdapter +from diffusers import MotionAdapter, HunyuanVideoTransformer3DModel from huggingface_hub import hf_hub_download from safetensors.torch import load_file +def load_hunyuanvideo_transformer( + model_id: str = "tencent/HunyuanVideo", + subfolder: str = "transformer", + torch_dtype: torch.dtype = torch.bfloat16, + revision: str = 'refs/pr/18' +): + return HunyuanVideoTransformer3DModel.from_pretrained( + model_id, subfolder=subfolder, torch_dtype=torch_dtype, revision=revision + ) + + def load_annimatediff_motion_adapter( step: int = 4 ) -> MotionAdapter: diff --git a/requirements.txt b/requirements.txt index 80ca086d..7c4725f5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ scikit-learn==1.5.2 # Deep learning tools transformers==4.47.0 -#git+https://github.com/huggingface/diffusers.git@6a51427b6a226591ccc40249721c486855f53e1c#egg=diffusers +diffusers==0.32.2 accelerate==1.2.0 bitsandbytes==0.45.0 sentencepiece==0.2.0 diff --git a/setup_env.sh b/setup_env.sh index ebd3be0b..0934c966 100755 --- a/setup_env.sh +++ b/setup_env.sh @@ -32,7 +32,6 @@ sudo npm install -g pm2@latest # Python Package Installation ############################ -pip install git+https://github.com/huggingface/diffusers.git@6a51427b6a226591ccc40249721c486855f53e1c pip install -e . 
############################ From 21aa0537ff2a61adb9910bc6dbce18ba5b174594 Mon Sep 17 00:00:00 2001 From: Dylan Uys Date: Wed, 22 Jan 2025 07:50:36 +0000 Subject: [PATCH 2/7] delete resolution from gen_args after extracting h,w --- bitmind/synthetic_data_generation/synthetic_data_generator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bitmind/synthetic_data_generation/synthetic_data_generator.py b/bitmind/synthetic_data_generation/synthetic_data_generator.py index 5087e4e1..54f20aab 100644 --- a/bitmind/synthetic_data_generation/synthetic_data_generator.py +++ b/bitmind/synthetic_data_generation/synthetic_data_generator.py @@ -286,6 +286,7 @@ def _run_generation( elif 'resolution' in gen_args: gen_args['height'] = gen_args['resolution'][0] gen_args['width'] = gen_args['resolution'][1] + del gen_args['resolution'] truncated_prompt = truncate_prompt_if_too_long( prompt, From 1ee6b04bdc58749368aeead521373711a591a0fe Mon Sep 17 00:00:00 2001 From: Dylan Uys Date: Tue, 21 Jan 2025 23:58:51 -0800 Subject: [PATCH 3/7] model_id arg for from_pretrained --- bitmind/validator/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bitmind/validator/config.py b/bitmind/validator/config.py index 7b97aa62..0a11c261 100644 --- a/bitmind/validator/config.py +++ b/bitmind/validator/config.py @@ -175,8 +175,8 @@ "tencent/HunyuanVideo": { "pipeline_cls": HunyuanVideoPipeline, "from_pretrained_args": { - # custom functions supplied as tuple of (fn, args) - "transformer": ( + "model_id": "tencent/HunyuanVideo", + "transformer": ( # custom functions supplied as tuple of (fn, args) load_hunyuanvideo_transformer, { "model_id": "tencent/HunyuanVideo", From 44356706460595e3266054bf4f56be96e7fa3c24 Mon Sep 17 00:00:00 2001 From: Dylan Uys Date: Wed, 22 Jan 2025 23:26:48 +0000 Subject: [PATCH 4/7] standardizing model_id usage --- .../synthetic_data_generation/synthetic_data_generator.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff
--git a/bitmind/synthetic_data_generation/synthetic_data_generator.py b/bitmind/synthetic_data_generation/synthetic_data_generator.py index 54f20aab..deaa36d1 100644 --- a/bitmind/synthetic_data_generation/synthetic_data_generator.py +++ b/bitmind/synthetic_data_generation/synthetic_data_generator.py @@ -361,8 +361,14 @@ def load_model(self, model_name: Optional[str] = None, modality: Optional[str] = if isinstance(v, tuple) and callable(v[0]): pipeline_args[k] = v[0](**v[1]) + if 'model_id' in pipeline_args: + model_id = pipeline_args['model_id'] + del pipeline_args['model_id'] + else: + model_id = model_name + self.model = pipeline_cls.from_pretrained( - pipeline_args.get('base', model_name), + model_id, cache_dir=HUGGINGFACE_CACHE_DIR, **pipeline_args, add_watermarker=False From e2c039ffa50ca6f336559c1c87e43c6e0d340ef7 Mon Sep 17 00:00:00 2001 From: Dylan Uys Date: Wed, 22 Jan 2025 23:27:14 +0000 Subject: [PATCH 5/7] fixing autocast and torch_dtype for hunyuan --- bitmind/validator/config.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bitmind/validator/config.py b/bitmind/validator/config.py index 0a11c261..03d08918 100644 --- a/bitmind/validator/config.py +++ b/bitmind/validator/config.py @@ -186,7 +186,7 @@ } ), "revision": 'refs/pr/18', - "torch_dtype": torch.float16 + "torch_dtype": torch.bfloat16 }, "generate_args": { "num_frames": {"min": 61, "max": 129}, @@ -196,6 +196,7 @@ ]}, "num_inference_steps": {"min": 30, "max": 50}, }, + "use_autocast": False, "vae_enable_tiling": True }, "genmo/mochi-1-preview": { @@ -229,7 +230,7 @@ 'ByteDance/AnimateDiff-Lightning': { "pipeline_cls": AnimateDiffPipeline, "from_pretrained_args": { - "base": "emilianJR/epiCRealism", + "model_id": "emilianJR/epiCRealism", "torch_dtype": torch.bfloat16, "motion_adapter": ( load_annimatediff_motion_adapter, From 6305e2c7c1be96e395514755dbd9f1faf685d4c7 Mon Sep 17 00:00:00 2001 From: Dylan Uys Date: Thu, 23 Jan 2025 02:47:06 +0000 Subject: [PATCH 6/7] adding 
resolution options and save options for all t2v models --- .../synthetic_data_generator.py | 6 +----- bitmind/validator/config.py | 16 ++++++++++++++-- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/bitmind/synthetic_data_generation/synthetic_data_generator.py b/bitmind/synthetic_data_generation/synthetic_data_generator.py index deaa36d1..7bce9d7a 100644 --- a/bitmind/synthetic_data_generation/synthetic_data_generator.py +++ b/bitmind/synthetic_data_generation/synthetic_data_generator.py @@ -267,7 +267,7 @@ def _run_generation( gen_args['mask_image'] = create_random_mask(image.size) gen_args['image'] = image - # Process generation arguments + # Prepare generation arguments for k, v in gen_args.items(): if isinstance(v, dict): if "min" in v and "max" in v: @@ -275,10 +275,6 @@ def _run_generation( if "options" in v: gen_args[k] = random.choice(v['options']) - for dim in ('height', 'width'): - if isinstance(gen_args.get(dim), list): - gen_args[dim] = np.random.choice(gen_args[dim]) - try: if generate_at_target_size: gen_args['height'] = TARGET_IMAGE_SIZE[0] diff --git a/bitmind/validator/config.py b/bitmind/validator/config.py index 03d08918..b4ecf3b1 100644 --- a/bitmind/validator/config.py +++ b/bitmind/validator/config.py @@ -194,8 +194,9 @@ [720, 1280], [1280, 720], [1104, 832], [832,1104], [960,960], [544, 960], [960, 544], [624, 832], [832, 624], [720, 720] ]}, - "num_inference_steps": {"min": 30, "max": 50}, + "num_inference_steps": {"min": 30, "max": 50}, }, + "save_args": {"fps": 30}, "use_autocast": False, "vae_enable_tiling": True }, @@ -206,8 +207,11 @@ "torch_dtype": torch.bfloat16 }, "generate_args": { - "num_frames": 84 + "num_frames": 84, + "num_inference_steps": {"min": 30, "max": 65}, + "resolution": [480, 848] }, + "save_args": {"fps": 30} "vae_enable_tiling": True }, 'THUDM/CogVideoX-5b': { @@ -221,7 +225,9 @@ "num_videos_per_prompt": 1, "num_inference_steps": {"min": 50, "max": 125}, "num_frames": 48, + "resolution": [720, 480] }, 
+ "save_args": {"fps": 8}, "enable_model_cpu_offload": True, #"enable_sequential_cpu_offload": True, "vae_enable_slicing": True, @@ -240,7 +246,13 @@ "generate_args": { "guidance_scale": 2, "num_inference_steps": {"min": 50, "max": 125}, + "resolution": {"options": [ + [512, 512], [512, 768], [512, 1024], + [768, 512], [768, 768], [768, 1024], + [1024, 512], [1024, 768], [1024, 1024] + ]} }, + "save_args": {"fps": 15}, "scheduler": { "cls": EulerDiscreteScheduler, "from_config_args": { From a966d62ab625f4f2739dd8fbf88f1d9d765ef8b6 Mon Sep 17 00:00:00 2001 From: Dylan Uys Date: Thu, 23 Jan 2025 03:52:01 +0000 Subject: [PATCH 7/7] missing comma in config --- bitmind/validator/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bitmind/validator/config.py b/bitmind/validator/config.py index b4ecf3b1..e77deed0 100644 --- a/bitmind/validator/config.py +++ b/bitmind/validator/config.py @@ -211,7 +211,7 @@ "num_inference_steps": {"min": 30, "max": 65}, "resolution": [480, 848] }, - "save_args": {"fps": 30} + "save_args": {"fps": 30}, "vae_enable_tiling": True }, 'THUDM/CogVideoX-5b': {