Merged
1 change: 1 addition & 0 deletions docs/source/Instruction/支持的模型和数据集.md
@@ -509,6 +509,7 @@
|florence-2-base-ft|[AI-ModelScope/Florence-2-base-ft](https://modelscope.cn/models/AI-ModelScope/Florence-2-base-ft/summary)|^(language_model\|image_projection)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|florence|✔|✘|✘|✘||vision|[microsoft/Florence-2-base-ft](https://huggingface.co/microsoft/Florence-2-base-ft)|
|florence-2-large|[AI-ModelScope/Florence-2-large](https://modelscope.cn/models/AI-ModelScope/Florence-2-large/summary)|^(language_model\|image_projection)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|florence|✔|✘|✘|✘||vision|[microsoft/Florence-2-large](https://huggingface.co/microsoft/Florence-2-large)|
|florence-2-large-ft|[AI-ModelScope/Florence-2-large-ft](https://modelscope.cn/models/AI-ModelScope/Florence-2-large-ft/summary)|^(language_model\|image_projection)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|florence|✔|✘|✘|✘||vision|[microsoft/Florence-2-large-ft](https://huggingface.co/microsoft/Florence-2-large-ft)|
|got-ocr2|[stepfun-ai/GOT-OCR2_0](https://modelscope.cn/models/stepfun-ai/GOT-OCR2_0/summary)|^(model.layers\|model.mm_projector_vary)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|got_ocr2|✔|✘|✘|✘||vision|[stepfun-ai/GOT-OCR2_0](https://huggingface.co/stepfun-ai/GOT-OCR2_0)|


## Datasets
1 change: 1 addition & 0 deletions docs/source/Multi-Modal/index.md
@@ -19,6 +19,7 @@
7. [Internlm2-Xcomposer2 Best Practice](internlm-xcomposer2最佳实践.md)
8. [Phi3-Vision Best Practice](phi3-vision最佳实践.md), [Phi3.5-Vision Best Practice](https://github.com/modelscope/ms-swift/issues/1809)
9. [mPLUG-Owl2 Best Practice](mplug-owl2最佳实践.md), [mPLUG-Owl3 Best Practice](https://github.com/modelscope/ms-swift/issues/1969)
10. [GOT-OCR2 Best Practice](https://github.com/modelscope/ms-swift/issues/2122)


A single round of dialogue can only contain one image (it may also contain no image):
1 change: 1 addition & 0 deletions docs/source/Multi-Modal/qwen2-vl最佳实践.md
@@ -1,5 +1,6 @@

# Qwen2-VL Best Practice
The best practices for qwen2-vl-72b-instruct can be found [here](https://github.com/modelscope/ms-swift/issues/2064).

## Table of Contents
- [Environment Setup](#环境准备)
1 change: 1 addition & 0 deletions docs/source_en/Instruction/Supported-models-datasets.md
@@ -509,6 +509,7 @@ The table below introduces all models supported by SWIFT:
|florence-2-base-ft|[AI-ModelScope/Florence-2-base-ft](https://modelscope.cn/models/AI-ModelScope/Florence-2-base-ft/summary)|^(language_model\|image_projection)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|florence|✔|✘|✘|✘||vision|[microsoft/Florence-2-base-ft](https://huggingface.co/microsoft/Florence-2-base-ft)|
|florence-2-large|[AI-ModelScope/Florence-2-large](https://modelscope.cn/models/AI-ModelScope/Florence-2-large/summary)|^(language_model\|image_projection)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|florence|✔|✘|✘|✘||vision|[microsoft/Florence-2-large](https://huggingface.co/microsoft/Florence-2-large)|
|florence-2-large-ft|[AI-ModelScope/Florence-2-large-ft](https://modelscope.cn/models/AI-ModelScope/Florence-2-large-ft/summary)|^(language_model\|image_projection)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|florence|✔|✘|✘|✘||vision|[microsoft/Florence-2-large-ft](https://huggingface.co/microsoft/Florence-2-large-ft)|
|got-ocr2|[stepfun-ai/GOT-OCR2_0](https://modelscope.cn/models/stepfun-ai/GOT-OCR2_0/summary)|^(model.layers\|model.mm_projector_vary)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|got_ocr2|✔|✘|✘|✘||vision|[stepfun-ai/GOT-OCR2_0](https://huggingface.co/stepfun-ai/GOT-OCR2_0)|
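The `lora_target_modules` column above holds a regular expression (the `\|` and `.\*` escapes are markdown artifacts; the real pattern uses `|` and `.*`). A minimal sketch of how the got-ocr2 pattern selects modules — the module names below are hypothetical, chosen only for illustration:

```python
import re

# The got-ocr2 lora_target_modules pattern from the table above, unescaped:
# match the language model and projector, excluding heads/embeddings via a
# negative lookahead.
pattern = re.compile(r'^(model.layers|model.mm_projector_vary)'
                     r'(?!.*(lm_head|output|emb|wte|shared)).*')

# Hypothetical module names, for illustration only.
candidates = [
    'model.layers.0.self_attn.q_proj',    # language model -> targeted
    'model.mm_projector_vary.0',          # projector      -> targeted
    'model.vision_tower_high.blocks.0',   # vision tower   -> skipped
    'model.layers.31.embed_tokens',       # excluded by the lookahead ('emb')
]
for name in candidates:
    print(name, bool(pattern.match(name)))
```

Only the first two names match, so LoRA adapters would attach to the language model and projector while the vision tower stays untouched.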


## Datasets
1 change: 1 addition & 0 deletions docs/source_en/Multi-Modal/index.md
@@ -19,6 +19,7 @@ A single round of dialogue can contain multiple images (or no images):
7. [Internlm2-Xcomposers Best Practice](internlm-xcomposer2-best-practice.md)
8. [Phi3-Vision Best Practice](phi3-vision-best-practice.md), [Phi3.5-Vision Best Practice](https://github.com/modelscope/ms-swift/issues/1809)
9. [mPLUG-Owl3 Best Practice](https://github.com/modelscope/ms-swift/issues/1969)
10. [GOT-OCR2 Best Practice](https://github.com/modelscope/ms-swift/issues/2122)

A single round of dialogue can only contain one image:
1. [Yi-VL Best Practice.md](yi-vl-best-practice.md)
1 change: 1 addition & 0 deletions docs/source_en/Multi-Modal/qwen2-vl-best-practice.md
@@ -1,5 +1,6 @@

# Qwen2-VL Best Practice
The best practices for qwen2-vl-72b-instruct can be found [here](https://github.com/modelscope/ms-swift/issues/2064).

## Table of Contents
- [Environment Setup](#environment-setup)
19 changes: 19 additions & 0 deletions swift/llm/utils/model.py
@@ -606,6 +606,8 @@ class ModelType:
    florence_2_large = 'florence-2-large'
    florence_2_large_ft = 'florence-2-large-ft'

    got_ocr2 = 'got-ocr2'

    @classmethod
    def get_model_name_list(cls) -> List[str]:
        res = []
@@ -636,6 +638,7 @@ class LoRATM(NamedTuple):
    idefics3 = 'idefics3'
    mplug_owl3 = 'mplug_owl3'
    llama3_1_omni = 'llama3_1_omni'
    got_ocr2 = 'got_ocr2'
    # default lora target modules for nlp llms.
    minicpm3 = ['q_a_proj', 'q_b_proj', 'kv_a_proj_with_mqa', 'kv_b_proj']
    baichuan = ['W_pack']
@@ -6558,6 +6561,22 @@ def get_model_tokenizer_omnli(model_dir: str,
    return model, tokenizer


@register_model(
    ModelType.got_ocr2,
    'stepfun-ai/GOT-OCR2_0',
    LoRATM.got_ocr2,
    TemplateType.got_ocr2,
    support_flash_attn=True,
    placeholder_tokens=['<imgpad>'],
    eos_token='<|im_end|>',
    tags=['multi-modal', 'vision'],
    hf_model_id='stepfun-ai/GOT-OCR2_0')
def get_model_tokenizer_got_ocr2(*args, **kwargs):
    # GOT-OCR2's remote code registers its model class with AutoModel rather
    # than AutoModelForCausalLM, so override the default automodel_class.
    kwargs['automodel_class'] = AutoModel
    model, tokenizer = get_model_tokenizer_with_flash_attn(*args, **kwargs)
    return model, tokenizer


def fix_transformers_upgrade(module: PreTrainedModel) -> None:
    # from 4.35, transformers changes its arguments of _set_gradient_checkpointing
    if version.parse(transformers.__version__) >= version.parse('4.35'):
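The `register_model` call in the `model.py` diff above follows swift's decorator-registry pattern: decorating a loader function records it, together with its metadata, under a model-type key. A minimal self-contained sketch of that pattern — illustrative only, not swift's actual implementation; names such as `MODEL_REGISTRY` and `load_got_ocr2` are invented here:

```python
# Minimal decorator-based model registry, sketched after register_model.
MODEL_REGISTRY = {}

def register_model(model_type, model_id, **metadata):
    def wrapper(loader):
        # Record the loader plus its metadata under the model-type key.
        MODEL_REGISTRY[model_type] = {'model_id': model_id, 'loader': loader, **metadata}
        return loader
    return wrapper

@register_model('got-ocr2', 'stepfun-ai/GOT-OCR2_0',
                eos_token='<|im_end|>', placeholder_tokens=['<imgpad>'])
def load_got_ocr2(**kwargs):
    # A real loader would download and instantiate the model here.
    return f"would load {MODEL_REGISTRY['got-ocr2']['model_id']}"

print(load_got_ocr2())  # -> would load stepfun-ai/GOT-OCR2_0
```

The payoff of this design is that `sft`/`infer` entry points only need a `--model_type` string; everything else (template, LoRA targets, loader quirks) is looked up from the registry.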
56 changes: 56 additions & 0 deletions swift/llm/utils/template.py
@@ -139,6 +139,7 @@ class TemplateType:
    mengzi = 'mengzi'
    c4ai = 'c4ai'
    chatml = 'chatml'
    got_ocr2 = 'got_ocr2'
    # compatibility. (Deprecated)
    default_generation_bos = 'default-generation-bos'
    yi = 'yi'
@@ -1235,6 +1236,61 @@ class QwenTemplate(QwenTemplateMixin, Template):
    pass


class GOTImageEvalProcessor:

    def __init__(self, image_size=384, mean=None, std=None):
        from torchvision import transforms
        from torchvision.transforms.functional import InterpolationMode
        if mean is None:
            # CLIP's default image normalization statistics.
            mean = (0.48145466, 0.4578275, 0.40821073)
        if std is None:
            std = (0.26862954, 0.26130258, 0.27577711)

        self.normalize = transforms.Normalize(mean, std)

        self.transform = transforms.Compose([
            transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
            transforms.ToTensor(),
            self.normalize,
        ])

    def __call__(self, item):
        return self.transform(item)


class GOT_OCR2Template(QwenTemplate):
    system = ' You should follow the instructions carefully and explain your answers in detail.'

    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
                    example: Dict[str, Any]) -> List[Context]:
        # Typical GOT-OCR2 queries are 'OCR:' and 'OCR with format:'.
        assert media_type == 'image'
        return ['<img>' + '<imgpad>' * 256 + '</img>\n']

    def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
        inputs, tokenizer_kwargs = super()._encode(example)
        if len(inputs) == 0:
            return inputs, {}
        images = example['images']
        image_processor_high = GOTImageEvalProcessor(image_size=1024)
        for i, image in enumerate(images):
            images[i] = image_processor_high(image)[None].to(self.model.dtype)
        if images:
            inputs['images'] = images
        return inputs, {}

    def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, Any]:
        res = super().data_collator(batch, padding_to)
        images = _gather_list(batch, 'images')
        if images:
            res['images'] = images
        return res


register_template(TemplateType.got_ocr2, GOT_OCR2Template(), lazy_tokenize=True, use_model=True)


class _QwenVLTemplateMixin:
    load_medias = False

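The `replace_tag` method in the template diff above expands each image into a fixed-width placeholder span. A sketch of that expansion (the function name here is invented; the string layout mirrors the template code):

```python
# Each image placeholder in a GOT-OCR2 prompt becomes an <img> span holding
# exactly 256 <imgpad> tokens, matching GOT_OCR2Template.replace_tag above.
def expand_image_tag() -> str:
    return '<img>' + '<imgpad>' * 256 + '</img>\n'

span = expand_image_tag()
print(span.count('<imgpad>'))  # 256
```

At encoding time those 256 pad positions are what the model's high-resolution image features are written into, which is also why `<imgpad>` appears in the model registration's `placeholder_tokens`.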
7 changes: 7 additions & 0 deletions swift/utils/module_mapping.py
@@ -290,6 +290,12 @@ def __post_init__(self):
    generator='speech_generator',
)

GOT_OCR2 = MultiModelKeys(
    language_model='model.layers',
    connector='model.mm_projector_vary',
    vision_tower='model.vision_tower_high',
)

MODEL_KEYS_MAPPING = OrderedDict([
    # MLLM here
    ('qwen_audio', QWEN_AUDIO_KEYS),
@@ -310,6 +316,7 @@ def __post_init__(self):
    ('idefics3', IDEFICS3_KEYS),
    ('mplug_owl3', MPLUG_OWL3_KEYS),
    ('llama3_1_omni', LLAMA3_1_OMNI),
    ('got_ocr2', GOT_OCR2),
    # LLM begins here
    ('llama', LLAMA_KEYS),
    ('mistral', LLAMA_KEYS),