From 1c2ae4bba88bb78afa4d649c5bb50398b9aedd3a Mon Sep 17 00:00:00 2001
From: "huangjintao.hjt"
Date: Wed, 25 Sep 2024 16:30:35 +0800
Subject: [PATCH 1/2] support got-ocr2

---
 ...14\346\225\260\346\215\256\351\233\206.md"    |  1 +
 docs/source/Multi-Modal/index.md                 |  1 +
 ...00\344\275\263\345\256\236\350\267\265.md"    |  1 +
 .../Instruction/Supported-models-datasets.md     |  1 +
 docs/source_en/Multi-Modal/index.md              |  1 +
 .../Multi-Modal/qwen2-vl-best-practice.md        |  1 +
 swift/llm/utils/model.py                         | 19 ++++++
 swift/llm/utils/template.py                      | 58 ++++++++++++++++++-
 swift/utils/module_mapping.py                    |  7 +++
 9 files changed, 89 insertions(+), 1 deletion(-)

diff --git "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
index 1e1c045f40..2c18f56782 100644
--- "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
+++ "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
@@ -509,6 +509,7 @@
 |florence-2-base-ft|[AI-ModelScope/Florence-2-base-ft](https://modelscope.cn/models/AI-ModelScope/Florence-2-base-ft/summary)|^(language_model\|image_projection)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|florence|✔|✘|✘|✘||vision|[microsoft/Florence-2-base-ft](https://huggingface.co/microsoft/Florence-2-base-ft)|
 |florence-2-large|[AI-ModelScope/Florence-2-large](https://modelscope.cn/models/AI-ModelScope/Florence-2-large/summary)|^(language_model\|image_projection)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|florence|✔|✘|✘|✘||vision|[microsoft/Florence-2-large](https://huggingface.co/microsoft/Florence-2-large)|
 |florence-2-large-ft|[AI-ModelScope/Florence-2-large-ft](https://modelscope.cn/models/AI-ModelScope/Florence-2-large-ft/summary)|^(language_model\|image_projection)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|florence|✔|✘|✘|✘||vision|[microsoft/Florence-2-large-ft](https://huggingface.co/microsoft/Florence-2-large-ft)|
+|got-ocr2|[stepfun-ai/GOT-OCR2_0](https://modelscope.cn/models/stepfun-ai/GOT-OCR2_0/summary)|^(model.layers\|model.mm_projector_vary)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|got_ocr2|✔|✘|✘|✘||vision|[stepfun-ai/GOT-OCR2_0](https://huggingface.co/stepfun-ai/GOT-OCR2_0)|
 
 
 ## 数据集
diff --git a/docs/source/Multi-Modal/index.md b/docs/source/Multi-Modal/index.md
index 2c6e1691c1..6a81f92acf 100644
--- a/docs/source/Multi-Modal/index.md
+++ b/docs/source/Multi-Modal/index.md
@@ -19,6 +19,7 @@
 7. [Internlm2-Xcomposers最佳实践](internlm-xcomposer2最佳实践.md)
 8. [Phi3-Vision最佳实践](phi3-vision最佳实践.md), [Phi3.5-Vision最佳实践](https://github.com/modelscope/ms-swift/issues/1809)
 9. [mPLUG-Owl2最佳实践](mplug-owl2最佳实践.md), [mPLUG-Owl3最佳实践](https://github.com/modelscope/ms-swift/issues/1969)
+10. [GOT-OCR2最佳实践](https://github.com/modelscope/ms-swift/issues/2122)
 
 一轮对话只能包含一张图片(可能可以不含图片):
 
diff --git "a/docs/source/Multi-Modal/qwen2-vl\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source/Multi-Modal/qwen2-vl\346\234\200\344\275\263\345\256\236\350\267\265.md"
index 46064f396b..c67c2e080b 100644
--- "a/docs/source/Multi-Modal/qwen2-vl\346\234\200\344\275\263\345\256\236\350\267\265.md"
+++ "b/docs/source/Multi-Modal/qwen2-vl\346\234\200\344\275\263\345\256\236\350\267\265.md"
@@ -1,5 +1,6 @@
 # Qwen2-VL 最佳实践
+qwen2-vl-72b-instruct的最佳实践可以查看[这里](https://github.com/modelscope/ms-swift/issues/2064).
 
 ## 目录
 - [环境准备](#环境准备)
diff --git a/docs/source_en/Instruction/Supported-models-datasets.md b/docs/source_en/Instruction/Supported-models-datasets.md
index f844d3637c..eaadd50ddc 100644
--- a/docs/source_en/Instruction/Supported-models-datasets.md
+++ b/docs/source_en/Instruction/Supported-models-datasets.md
@@ -509,6 +509,7 @@ The table below introcudes all models supported by SWIFT:
 |florence-2-base-ft|[AI-ModelScope/Florence-2-base-ft](https://modelscope.cn/models/AI-ModelScope/Florence-2-base-ft/summary)|^(language_model\|image_projection)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|florence|✔|✘|✘|✘||vision|[microsoft/Florence-2-base-ft](https://huggingface.co/microsoft/Florence-2-base-ft)|
 |florence-2-large|[AI-ModelScope/Florence-2-large](https://modelscope.cn/models/AI-ModelScope/Florence-2-large/summary)|^(language_model\|image_projection)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|florence|✔|✘|✘|✘||vision|[microsoft/Florence-2-large](https://huggingface.co/microsoft/Florence-2-large)|
 |florence-2-large-ft|[AI-ModelScope/Florence-2-large-ft](https://modelscope.cn/models/AI-ModelScope/Florence-2-large-ft/summary)|^(language_model\|image_projection)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|florence|✔|✘|✘|✘||vision|[microsoft/Florence-2-large-ft](https://huggingface.co/microsoft/Florence-2-large-ft)|
+|got-ocr2|[stepfun-ai/GOT-OCR2_0](https://modelscope.cn/models/stepfun-ai/GOT-OCR2_0/summary)|^(model.layers\|model.mm_projector_vary)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|got_ocr2|✔|✘|✘|✘||vision|[stepfun-ai/GOT-OCR2_0](https://huggingface.co/stepfun-ai/GOT-OCR2_0)|
 
 
 ## Datasets
diff --git a/docs/source_en/Multi-Modal/index.md b/docs/source_en/Multi-Modal/index.md
index f5893944fb..05c2cd2072 100644
--- a/docs/source_en/Multi-Modal/index.md
+++ b/docs/source_en/Multi-Modal/index.md
@@ -19,6 +19,7 @@ A single round of dialogue can contain multiple images (or no images):
 7. [Internlm2-Xcomposers Best Practice](internlm-xcomposer2-best-practice.md)
 8. [Phi3-Vision Best Practice](phi3-vision-best-practice.md), [Phi3.5-Vision Best Practice](https://github.com/modelscope/ms-swift/issues/1809).
 9. [mPLUG-Owl3 Best Practice](https://github.com/modelscope/ms-swift/issues/1969)
+10. [GOT-OCR2 Best Practice](https://github.com/modelscope/ms-swift/issues/2122)
 
 A single round of dialogue can only contain one image:
 1. [Yi-VL Best Practice.md](yi-vl-best-practice.md)
diff --git a/docs/source_en/Multi-Modal/qwen2-vl-best-practice.md b/docs/source_en/Multi-Modal/qwen2-vl-best-practice.md
index 83d5c08f34..f1cf56ae31 100644
--- a/docs/source_en/Multi-Modal/qwen2-vl-best-practice.md
+++ b/docs/source_en/Multi-Modal/qwen2-vl-best-practice.md
@@ -1,5 +1,6 @@
 # Qwen2-VL Best Practice
+The best practices for qwen2-vl-72b-instruct can be found [here](https://github.com/modelscope/ms-swift/issues/2064).
 
 ## Table of Contents
 - [Environment Setup](#environment-setup)
diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index 3c71a9a726..10f14481d0 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -606,6 +606,8 @@ class ModelType:
     florence_2_large = 'florence-2-large'
     florence_2_large_ft = 'florence-2-large-ft'
 
+    got_ocr2 = 'got-ocr2'
+
     @classmethod
     def get_model_name_list(cls) -> List[str]:
         res = []
@@ -636,6 +638,7 @@ class LoRATM(NamedTuple):
     idefics3 = 'idefics3'
     mplug_owl3 = 'mplug_owl3'
     llama3_1_omni = 'llama3_1_omni'
+    got_ocr2 = 'got_ocr2'
     # default lora target modules for nlp llms.
     minicpm3 = ['q_a_proj', 'q_b_proj', 'kv_a_proj_with_mqa', 'kv_b_proj']
     baichuan = ['W_pack']
@@ -6558,6 +6561,22 @@ def get_model_tokenizer_omnli(model_dir: str,
     return model, tokenizer
 
 
+@register_model(
+    ModelType.got_ocr2,
+    'stepfun-ai/GOT-OCR2_0',
+    LoRATM.got_ocr2,
+    TemplateType.got_ocr2,
+    support_flash_attn=True,
+    placeholder_tokens=['<imgpad>'],
+    eos_token='<|im_end|>',
+    tags=['multi-modal', 'vision'],
+    hf_model_id='stepfun-ai/GOT-OCR2_0')
+def get_model_tokenizer_got_ocr2(*args, **kwargs):
+    kwargs['automodel_class'] = AutoModel
+    model, tokenizer = get_model_tokenizer_with_flash_attn(*args, **kwargs)
+    return model, tokenizer
+
+
 def fix_transformers_upgrade(module: PreTrainedModel) -> None:
     # from 4.35, transformers changes its arguments of _set_gradient_checkpointing
     if version.parse(transformers.__version__) >= version.parse('4.35'):
diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
index 584edc437e..eef7dc35af 100644
--- a/swift/llm/utils/template.py
+++ b/swift/llm/utils/template.py
@@ -16,7 +16,7 @@
 from peft import PeftModel
 from torch.nn.utils.rnn import pad_sequence
 from transformers import PreTrainedTokenizerBase, StoppingCriteria
-from transformers.dynamic_module_utils import get_class_from_dynamic_module
+from transformers.dynamic_module_utils import get_class_from_dynamic_module, get_class_in_module
 from transformers.integrations import is_deepspeed_zero3_enabled
 from transformers.utils import strtobool
@@ -139,6 +139,7 @@ class TemplateType:
     mengzi = 'mengzi'
     c4ai = 'c4ai'
     chatml = 'chatml'
+    got_ocr2 = 'got_ocr2'
     # compatibility. (Deprecated)
     default_generation_bos = 'default-generation-bos'
     yi = 'yi'
@@ -1235,6 +1236,61 @@ class QwenTemplate(QwenTemplateMixin, Template):
     pass
 
 
+class GOTImageEvalProcessor:
+
+    def __init__(self, image_size=384, mean=None, std=None):
+        from torchvision import transforms
+        from torchvision.transforms.functional import InterpolationMode
+        if mean is None:
+            mean = (0.48145466, 0.4578275, 0.40821073)
+        if std is None:
+            std = (0.26862954, 0.26130258, 0.27577711)
+
+        self.normalize = transforms.Normalize(mean, std)
+
+        self.transform = transforms.Compose([
+            transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
+            transforms.ToTensor(),
+            self.normalize,
+        ])
+
+    def __call__(self, item):
+        return self.transform(item)
+
+
+class GOT_OCR2Template(QwenTemplate):
+    system = ' You should follow the instructions carefully and explain your answers in detail.'
+
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
+                    example: Dict[str, Any]) -> List[Context]:
+        # OCR:
+        # OCR with format:
+        assert media_type == 'image'
+        return ['<img>' + '<imgpad>' * 256 + '</img>\n']
+
+    def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        inputs, tokenizer_kwargs = super()._encode(example)
+        if len(inputs) == 0:
+            return inputs, {}
+        images = example['images']
+        image_processor_high = GOTImageEvalProcessor(image_size=1024)
+        for i, image in enumerate(images):
+            images[i] = image_processor_high(image)[None].to(self.model.dtype)
+        if images:
+            inputs['images'] = images
+        return inputs, {}
+
+    def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, Any]:
+        res = super().data_collator(batch, padding_to)
+        images = _gather_list(batch, 'images')
+        if images:
+            res['images'] = images
+        return res
+
+
+register_template(TemplateType.got_ocr2, GOT_OCR2Template(), lazy_tokenize=True, use_model=True)
+
+
 class _QwenVLTemplateMixin:
     load_medias = False
diff --git a/swift/utils/module_mapping.py b/swift/utils/module_mapping.py
index 2fd8298bb5..6e4b3aac93 100644
--- a/swift/utils/module_mapping.py
+++ b/swift/utils/module_mapping.py
@@ -290,6 +290,12 @@ def __post_init__(self):
     generator='speech_generator',
 )
 
+GOT_OCR2 = MultiModelKeys(
+    language_model='model.layers',
+    connector='model.mm_projector_vary',
+    vision_tower='model.vision_tower_high',
+)
+
 MODEL_KEYS_MAPPING = OrderedDict([
     # MLLM here
     ('qwen_audio', QWEN_AUDIO_KEYS),
@@ -310,6 +316,7 @@ def __post_init__(self):
     ('idefics3', IDEFICS3_KEYS),
     ('mplug_owl3', MPLUG_OWL3_KEYS),
     ('llama3_1_omni', LLAMA3_1_OMNI),
+    ('got_ocr2', GOT_OCR2),
     # LLM begins here
     ('llama', LLAMA_KEYS),
     ('mistral', LLAMA_KEYS),

From 6d32c0752c7b7b3ec8f6b8bd24c87bebaf95df78 Mon Sep 17 00:00:00 2001
From: "huangjintao.hjt"
Date: Wed, 25 Sep 2024 16:44:57 +0800
Subject: [PATCH 2/2] update

---
 swift/llm/utils/template.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
index eef7dc35af..b8a8a81da6 100644
--- a/swift/llm/utils/template.py
+++ b/swift/llm/utils/template.py
@@ -16,7 +16,7 @@
 from peft import PeftModel
 from torch.nn.utils.rnn import pad_sequence
 from transformers import PreTrainedTokenizerBase, StoppingCriteria
-from transformers.dynamic_module_utils import get_class_from_dynamic_module, get_class_in_module
+from transformers.dynamic_module_utils import get_class_from_dynamic_module
 from transformers.integrations import is_deepspeed_zero3_enabled
 from transformers.utils import strtobool
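
A minimal inference sketch for the model type registered above, assuming the two commits are applied. The swift.llm helpers below follow the pattern used by the repo's other multi-modal best-practice docs; the image path 'ocr.png' and the 'OCR: ' query are placeholder values.

import torch

from swift.llm import (ModelType, get_default_template_type, get_model_tokenizer, get_template, inference)
from swift.utils import seed_everything

model_type = ModelType.got_ocr2
template_type = get_default_template_type(model_type)  # resolves to TemplateType.got_ocr2 per the registration above

# get_model_tokenizer dispatches to get_model_tokenizer_got_ocr2, which loads the
# checkpoint through AutoModel as wired up in the first commit.
model, tokenizer = get_model_tokenizer(model_type, torch.bfloat16, model_kwargs={'device_map': 'auto'})
model.generation_config.max_new_tokens = 512
template = get_template(template_type, tokenizer)
seed_everything(42)

# GOT_OCR2Template expands each image into '<img>' + '<imgpad>' * 256 + '</img>',
# so the query itself only carries the OCR instruction.
query = 'OCR: '
images = ['ocr.png']  # placeholder path to a local document image
response, _ = inference(model, template, query, images=images)
print(response)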