From fbd852db2d86b5a007147d5bb2e011ed7b9489ad Mon Sep 17 00:00:00 2001 From: Dylan Uys Date: Wed, 18 Dec 2024 10:07:30 -0800 Subject: [PATCH 1/3] Release 2.0.3 (#134) Bittensor 8.5.1 --- bitmind/__init__.py | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bitmind/__init__.py b/bitmind/__init__.py index cacab596..0f888438 100644 --- a/bitmind/__init__.py +++ b/bitmind/__init__.py @@ -18,7 +18,7 @@ # DEALINGS IN THE SOFTWARE. -__version__ = "2.0.2" +__version__ = "2.0.3" version_split = __version__.split(".") __spec_version__ = ( (1000 * int(version_split[0])) diff --git a/requirements.txt b/requirements.txt index 0720de08..80ca086d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ # Core ML frameworks -bittensor==8.4.1 +bittensor==8.5.1 torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 From 850590de10409c7d9e4ddbfd9fcde623bc4c905b Mon Sep 17 00:00:00 2001 From: Dylan Uys Date: Thu, 19 Dec 2024 07:18:05 +0000 Subject: [PATCH 2/3] enhancing prompts by adding conveyed motion with llama --- bitmind/synthetic_data_generation/__init__.py | 1 + ...ation_generator.py => prompt_generator.py} | 257 ++++++++++-------- .../synthetic_data_generator.py | 27 +- 3 files changed, 163 insertions(+), 122 deletions(-) rename bitmind/synthetic_data_generation/{image_annotation_generator.py => prompt_generator.py} (53%) diff --git a/bitmind/synthetic_data_generation/__init__.py b/bitmind/synthetic_data_generation/__init__.py index 5c7fbce0..2b692016 100644 --- a/bitmind/synthetic_data_generation/__init__.py +++ b/bitmind/synthetic_data_generation/__init__.py @@ -1 +1,2 @@ from .synthetic_data_generator import SyntheticDataGenerator +from .prompt_generator import PromptGenerator \ No newline at end of file diff --git a/bitmind/synthetic_data_generation/image_annotation_generator.py b/bitmind/synthetic_data_generation/prompt_generator.py similarity index 53% rename from bitmind/synthetic_data_generation/image_annotation_generator.py rename to bitmind/synthetic_data_generation/prompt_generator.py index d2cbd11c..a8083b87 100644 --- a/bitmind/synthetic_data_generation/image_annotation_generator.py +++ b/bitmind/synthetic_data_generation/prompt_generator.py @@ -20,7 +20,7 @@ disable_progress_bar() -class ImageAnnotationGenerator: +class PromptGenerator: """ A class for generating and moderating image annotations using transformer models. @@ -31,10 +31,9 @@ class ImageAnnotationGenerator: def __init__( self, - model_name: str, - text_moderation_model_name: str, + vlm_name: str, + llm_name: str, device: str = 'cuda', - apply_moderation: bool = True ) -> None: """ Initialize the ImageAnnotationGenerator with specific models and device settings. @@ -47,126 +46,74 @@ def __init__( apply_moderation: Flag to determine whether text moderation should be applied to captions. """ - self.model_name = model_name - self.processor = Blip2Processor.from_pretrained( - self.model_name, - cache_dir=HUGGINGFACE_CACHE_DIR - ) - - self.apply_moderation = apply_moderation - self.text_moderation_model_name = text_moderation_model_name - self.text_moderation_pipeline = None - self.model = None + self.vlm_name = vlm_name + self.llm_name = llm_name + self.vlm_processor = None + self.vlm = None + self.llm_pipeline = None self.device = device - def is_model_loaded(self) -> bool: - return self.model is not None + def are_models_loaded(self) -> bool: + return (self.vlm is not None) and (self.llm_pipeline is not None) def load_models(self) -> None: """ Load the necessary models for image annotation and text moderation onto the specified device. """ - if self.is_model_loaded(): - bt.logging.warning( - f"Image annotation model {self.model_name} is already loaded" - ) + if self.are_models_loaded(): + bt.logging.warning(f"Models already loaded") return - bt.logging.info(f"Loading image annotation model {self.model_name}") - self.model = Blip2ForConditionalGeneration.from_pretrained( - self.model_name, - torch_dtype=torch.float16, + bt.logging.info(f"Loading caption generation model {self.vlm_name}") + self.vlm_processor = Blip2Processor.from_pretrained( + self.vlm_name, cache_dir=HUGGINGFACE_CACHE_DIR ) - self.model.to(self.device) - bt.logging.info(f"Loaded image annotation model {self.model_name}") - bt.logging.info( - f"Loading annotation moderation model {self.text_moderation_model_name}..." + self.vlm = Blip2ForConditionalGeneration.from_pretrained( + self.vlm_name, + torch_dtype=torch.float16, + cache_dir=HUGGINGFACE_CACHE_DIR ) - if self.apply_moderation: - model = AutoModelForCausalLM.from_pretrained( - self.text_moderation_model_name, - torch_dtype=torch.bfloat16, - cache_dir=HUGGINGFACE_CACHE_DIR - ) + self.vlm.to(self.device) + bt.logging.info(f"Loaded image annotation model {self.vlm_name}") - tokenizer = AutoTokenizer.from_pretrained( - self.text_moderation_model_name, - cache_dir=HUGGINGFACE_CACHE_DIR - ) - model = model.to(self.device) - self.text_moderation_pipeline = pipeline( - "text-generation", - model=model, - tokenizer=tokenizer - ) - bt.logging.info( - f"Loaded annotation moderation model {self.text_moderation_model_name}." + bt.logging.info(f"Loading caption moderation model {self.llm_name}") + llm = AutoModelForCausalLM.from_pretrained( + self.llm_name, + torch_dtype=torch.bfloat16, + cache_dir=HUGGINGFACE_CACHE_DIR + ) + tokenizer = AutoTokenizer.from_pretrained( + self.llm_name, + cache_dir=HUGGINGFACE_CACHE_DIR + ) + llm = llm.to(self.device) + self.llm_pipeline = pipeline( + "text-generation", + model=llm, + tokenizer=tokenizer ) + bt.logging.info(f"Loaded caption moderation model {self.llm_name}") def clear_gpu(self) -> None: """ Clear GPU memory by moving models back to CPU and deleting them, followed by collecting garbage. """ - bt.logging.info("Clearing GPU memory after generating image annotation") - self.model.to('cpu') - del self.model - self.model = None - if self.text_moderation_pipeline: - self.text_moderation_pipeline.model.to('cpu') - del self.text_moderation_pipeline - self.text_moderation_pipeline = None - gc.collect() - torch.cuda.empty_cache() - - def moderate(self, description: str, max_new_tokens: int = 80) -> str: - """ - Use the text moderation pipeline to make the description more concise - and neutral. - - Args: - description: The text description to be moderated. - max_new_tokens: Maximum number of new tokens to generate in the - moderated text. - - Returns: - The moderated description text, or the original description if - moderation fails. - """ - messages = [ - { - "role": "system", - "content": ( - "[INST]You always concisely rephrase given descriptions, " - "eliminate redundancy, and remove all specific references to " - "individuals by name. You do not respond with anything other " - "than the revised description.[/INST]" - ) - }, - { - "role": "user", - "content": description - } - ] - try: - moderated_text = self.text_moderation_pipeline( - messages, - max_new_tokens=max_new_tokens, - pad_token_id=self.text_moderation_pipeline.tokenizer.eos_token_id, - return_full_text=False - ) + bt.logging.info("Clearing GPU memory after prompt generation") + if self.vlm: + self.vlm.to('cpu') + del self.vlm + self.vlm = None - if isinstance(moderated_text, list): - return moderated_text[0]['generated_text'] + if self.llm_pipeline: + self.llm_pipeline.model.to('cpu') + del self.llm_pipeline + self.llm_pipeline = None - bt.logging.error("Moderated text did not return a list.") - return description - - except Exception as e: - bt.logging.error(f"An error occurred during moderation: {e}", exc_info=True) - return description + gc.collect() + torch.cuda.empty_cache() def generate( self, @@ -200,17 +147,17 @@ def generate( for i, prompt in enumerate(prompts): description += prompt + ' ' - inputs = self.processor( + inputs = self.vlm_processor( image, text=description, return_tensors="pt" ).to(self.device, torch.float16) - generated_ids = self.model.generate( + generated_ids = self.vlm.generate( **inputs, max_new_tokens=max_new_tokens ) - answer = self.processor.batch_decode( + answer = self.vlm_processor.batch_decode( generated_ids, skip_special_tokens=True )[0].strip() @@ -237,8 +184,102 @@ def generate( if not description.endswith('.'): description += '.' - if self.apply_moderation: - moderated_description = self.moderate(description) - return moderated_description + moderated_description = self.moderate(description) + enhanced_description = self.enhance(description) + return enhanced_description - return description + def moderate(self, description: str, max_new_tokens: int = 80) -> str: + """ + Use the text moderation pipeline to make the description more concise + and neutral. + + Args: + description: The text description to be moderated. + max_new_tokens: Maximum number of new tokens to generate in the + moderated text. + + Returns: + The moderated description text, or the original description if + moderation fails. + """ + messages = [ + { + "role": "system", + "content": ( + "[INST]You always concisely rephrase given descriptions, " + "eliminate redundancy, and remove all specific references to " + "individuals by name. You do not respond with anything other " + "than the revised description.[/INST]" + ) + }, + { + "role": "user", + "content": description + } + ] + try: + moderated_text = self.llm_pipeline( + messages, + max_new_tokens=max_new_tokens, + pad_token_id=self.llm_pipeline.tokenizer.eos_token_id, + return_full_text=False + ) + return moderated_text[0]['generated_text'] + + except Exception as e: + bt.logging.error(f"An error occurred during moderation: {e}", exc_info=True) + return description + + def enhance(self, description: str, max_new_tokens: int = 80) -> str: + """ + Enhance a static image description to make it suitable for video generation + by adding dynamic elements and motion. + + Args: + description: The static image description to enhance. + max_new_tokens: Maximum number of new tokens to generate in the enhanced text. + + Returns: + An enhanced description suitable for video generation, or the original + description if enhancement fails. + """ + messages = [ + { + "role": "system", + "content": ( + "[INST]You are an expert at converting static image descriptions " + "into dynamic video prompts. Enhance the given description by " + "adding natural motion and temporal elements while preserving the " + "core scene. Follow these rules:\n" + "1. Maintain the essential elements of the original description\n" + "2. Add smooth, continuous motions that work well in video\n" + "3. For portraits: Add natural facial movements or expressions\n" + "4. For non-portrait images with people: Add contextually appropriate " + "actions (e.g., for a beach scene, people might be walking along " + "the shoreline or playing in the waves; for a cafe scene, people " + "might be sipping drinks or engaging in conversation)\n" + "5. For landscapes: Add environmental motion like wind or water\n" + "6. For urban scenes: Add dynamic elements like people or traffic\n" + "7. Keep the description concise but descriptive\n" + "8. Focus on gradual, natural transitions\n" + "Only respond with the enhanced description.[/INST]" + ) + }, + { + "role": "user", + "content": description + } + ] + + try: + enhanced_text = self.llm_pipeline( + messages, + max_new_tokens=max_new_tokens, + pad_token_id=self.llm_pipeline.tokenizer.eos_token_id, + return_full_text=False + ) + return enhanced_text[0]['generated_text'] + + except Exception as e: + print(f"An error occurred during motion enhancement: {e}") + return description diff --git a/bitmind/synthetic_data_generation/synthetic_data_generator.py b/bitmind/synthetic_data_generation/synthetic_data_generator.py index d140541e..ce38a5c3 100644 --- a/bitmind/synthetic_data_generation/synthetic_data_generator.py +++ b/bitmind/synthetic_data_generation/synthetic_data_generator.py @@ -26,7 +26,7 @@ get_modality ) from bitmind.synthetic_data_generation.prompt_utils import truncate_prompt_if_too_long -from bitmind.synthetic_data_generation.image_annotation_generator import ImageAnnotationGenerator +from bitmind.synthetic_data_generation.prompt_generator import PromptGenerator from bitmind.validator.cache import ImageCache @@ -59,7 +59,7 @@ class SyntheticDataGenerator: prompt_type: The type of prompt generation strategy ('random', 'annotation'). prompt_generator_name: Name of the prompt generation model. t2vis_model_name: Name of the t2v or t2i model. - image_annotation_generator: The generator object for annotating images if required. + prompt_generator: The vlm/llm pipeline for generating input prompts for t2i/t2v models output_dir: Directory to write generated data. """ @@ -106,20 +106,20 @@ def __init__( self.t2vis_model_name = None self.prompt_type = prompt_type - if self.prompt_type == 'annotation': - self.image_annotation_generator = ImageAnnotationGenerator( - model_name=IMAGE_ANNOTATION_MODEL, - text_moderation_model_name=TEXT_MODERATION_MODEL - ) - else: - raise NotImplementedError(f"Unsupported prompt type: {self.prompt_type}") + self.image_cache = image_cache + if self.prompt_type == 'annotation' and self.image_cache is None: + raise ValueError(f"image_cache cannot be None if prompt_type == 'annotation'") + + self.prompt_generator = PromptGenerator( + vlm_name=IMAGE_ANNOTATION_MODEL, + llm_name=TEXT_MODERATION_MODEL + ) self.output_dir = Path(output_dir) if output_dir else None if self.output_dir: (self.output_dir / "video").mkdir(parents=True, exist_ok=True) (self.output_dir / "image").mkdir(parents=True, exist_ok=True) - self.image_cache = image_cache def batch_generate(self, batch_size: int = 5) -> None: """ @@ -136,7 +136,6 @@ def batch_generate(self, batch_size: int = 5) -> None: prompts.append(self.generate_prompt(image=image_sample['image'], clear_gpu=i==batch_size-1)) bt.logging.info(f"Caption {i+1}/{batch_size} generated: {prompts[-1]}") - # shuffle and interleave models t2i_model_names = random.sample(T2I_MODEL_NAMES, len(T2I_MODEL_NAMES)) t2v_model_names = random.sample(T2V_MODEL_NAMES, len(T2V_MODEL_NAMES)) @@ -206,10 +205,10 @@ def generate_prompt( raise ValueError( "image can't be None if self.prompt_type is 'annotation'" ) - self.image_annotation_generator.load_models() - prompt = self.image_annotation_generator.generate(image) + self.prompt_generator.load_models() + prompt = self.prompt_generator.generate(image) if clear_gpu: - self.image_annotation_generator.clear_gpu() + self.prompt_generator.clear_gpu() else: raise NotImplementedError(f"Unsupported prompt type: {self.prompt_type}") return prompt From d0360af0f43af2aecd0c4daa8653328bf11325c9 Mon Sep 17 00:00:00 2001 From: Dylan Uys Date: Thu, 19 Dec 2024 22:02:47 +0000 Subject: [PATCH 3/3] Mining docs fix setup_miner_env.sh -> setup_env.sh --- docs/Mining.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/Mining.md b/docs/Mining.md index 652d45f9..89c8ad82 100644 --- a/docs/Mining.md +++ b/docs/Mining.md @@ -37,8 +37,8 @@ Install the remaining necessary requirements with the following chained command. ```bash conda activate bitmind export PIP_NO_CACHE_DIR=1 -chmod +x setup_miner_env.sh -./setup_miner_env.sh +chmod +x setup_env.sh +./setup_env.sh ``` ### Data