diff --git a/README.md b/README.md index 2eb0037..647354f 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,8 @@ A simple flask web app to evaluate small VLMs (Vision Language Models). Currentl - Qwen/Qwen2-VL-2B-Instruct - microsoft/Phi-3.5-vision-instruct - microsoft/Florence-2-large +- google/paligemma2-3b-pt-896 +- microsoft/kosmos-2.5 The application with VLM running loaded and inferencing locally was tested on a consumer grade desktop with following configuration. diff --git a/src/__pycache__/app.cpython-312.pyc b/src/__pycache__/app.cpython-312.pyc new file mode 100644 index 0000000..30b5070 Binary files /dev/null and b/src/__pycache__/app.cpython-312.pyc differ diff --git a/src/app.py b/src/app.py index 68451a2..b04b44f 100644 --- a/src/app.py +++ b/src/app.py @@ -4,6 +4,7 @@ import requests import configparser from transformers import AutoModelForCausalLM, AutoProcessor, AutoModelForVision2Seq, AutoConfig, Qwen2VLForConditionalGeneration +from transformers import Kosmos2ForConditionalGeneration, Kosmos2Processor # Added for KOSMOS-2.5 from PIL import Image import torch import logging @@ -36,7 +37,8 @@ "Qwen/Qwen2-VL-2B-Instruct": "main", "microsoft/Phi-3.5-vision-instruct": "main", "microsoft/Florence-2-large": "main", # Added Florence-2-large model - "google/paligemma2-3b-pt-896": "main" # Added paligemma2-3b-pt-896 model + "google/paligemma2-3b-pt-896": "main", # Added paligemma2-3b-pt-896 model + "microsoft/kosmos-2.5": "main" # Added KOSMOS-2.5 model } @@ -104,6 +106,15 @@ def load_model_and_processor(model_name, revision): ).to("cuda:0" if torch.cuda.is_available() else "cpu") processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True) + elif model_name == "microsoft/kosmos-2.5": + # Load KOSMOS-2.5 model and processor + model = Kosmos2ForConditionalGeneration.from_pretrained( + model_name, + torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32, + device_map="auto" if torch.cuda.is_available() else None, + 
+            use_auth_token=api_token