VLM Pipeline (Intern, LLaVA) #256
Conversation
Signed-off-by: Dipankar Sarkar <quic_dipankar@quicinc.com>
```python
    return generate_func(**kwargs)


def generate_inputs_intern(self, **kwargs):
    bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE
```
Please move this inside the modeling file.
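For reference, a minimal sketch of the suggested refactor, with the dummy-input builder living next to the model class in the modeling file; the class name, constant stand-in, and input shapes below are illustrative, not the actual QEff API:

```python
# Hypothetical sketch: the export-input builder moves into the modeling file
# (e.g. modeling_internvl.py) as a method on the model wrapper class.
import torch

ONNX_EXPORT_EXAMPLE_BATCH_SIZE = 1  # stand-in for constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE


class QEffInternVLModel:  # illustrative class name
    def generate_inputs(self, seq_len: int = 32, **kwargs):
        bs = ONNX_EXPORT_EXAMPLE_BATCH_SIZE
        inputs = {"input_ids": torch.zeros((bs, seq_len), dtype=torch.int64)}
        dynamic_axes = {"input_ids": {0: "batch_size", 1: "seq_len"}}
        output_names = ["logits"]
        return inputs, dynamic_axes, output_names
```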
```python
    return inputs, dynamic_axes, output_names


def generate_inputs_llava(self, **kwargs):
    bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE
```
Move this inside the modeling file `modelling_llava.py`.
```python
# )
# num_logits_to_keep = num_speculative_tokens + 1
# if prefill_seq_len < num_logits_to_keep:
#     raise ValueError(
```
Remove commented lines.
```python
generation_len = self.ctx_len - input_len.max()  # in standalone this is a tensor
assert generation_len > 0, "generation length should be greater than zero"
generated_ids = np.full((batch_size, generation_len + 1), self.processor.tokenizer.pad_token_id)
# inputs["input_ids"] = torch.nn.functional.pad(inputs["input_ids"], (0, self.seq_len_constant - inputs["input_ids"].size(1)), "constant", self.pad_token_id)
```
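For context, a sketch of how a preallocated buffer like `generated_ids` is typically consumed by the greedy decode loop; the accelerator call is replaced with a stand-in since the real QAIC session API is not shown in this diff:

```python
# Illustrative decode loop: fill the preallocated buffer one token per step.
import numpy as np

batch_size, ctx_len = 1, 16
input_len = np.array([6])  # prompt lengths per batch element
pad_token_id = 0
generation_len = int(ctx_len - input_len.max())
assert generation_len > 0, "generation length should be greater than zero"
generated_ids = np.full((batch_size, generation_len + 1), pad_token_id)

for step in range(generation_len):
    # Stand-in for the session call that returns the argmax next token.
    next_token = np.full((batch_size,), step)
    generated_ids[:, step] = next_token
```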
```python
MAX_RETRIES = 5  # maximum number of retry attempts when downloading a model via huggingface_hub snapshot_download
NUM_SPECULATIVE_TOKENS = 2
CTX_LEN_VLM_LLAVA = 1280
IMG_SIZE = 336
```
Are you using these at the time of export to define shapes?
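If they are, a sketch of how such constants would pin the static example shapes at export time; this is assumed usage that mirrors the identifiers above, not the actual export code:

```python
# Assumed usage: constants fix the static shapes of the ONNX example inputs.
import torch

CTX_LEN_VLM_LLAVA = 1280
IMG_SIZE = 336


def example_export_inputs(batch_size: int = 1):
    # Token ids padded out to the full compiled context length.
    input_ids = torch.zeros((batch_size, CTX_LEN_VLM_LLAVA), dtype=torch.int64)
    # One square RGB image per sample at the vision tower's input resolution.
    pixel_values = torch.zeros((batch_size, 3, IMG_SIZE, IMG_SIZE))
    return {"input_ids": input_ids, "pixel_values": pixel_values}
```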
```python
# if __name__ == "__main__":
#     # model_name = "OpenGVLab/InternVL2_5-1B"
```
Remove commented parts.
```python
if num_speculative_tokens:
    compile_hash.update(to_hashable({"num_speculative_tokens": num_speculative_tokens}))
```
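For context, a self-contained sketch of the compile-hash idea: every option that changes the compiled artifact is folded into the hash so different configurations land in different QPC directories. The `to_hashable` helper below is a minimal stand-in, not the real QEff implementation:

```python
import hashlib
import json


def to_hashable(obj) -> bytes:
    # Minimal stand-in: stable serialization so equal dicts hash equally.
    return json.dumps(obj, sort_keys=True).encode()


compile_hash = hashlib.sha256()
num_speculative_tokens = 2
if num_speculative_tokens:
    compile_hash.update(to_hashable({"num_speculative_tokens": num_speculative_tokens}))
print(compile_hash.hexdigest()[:16])  # e.g. used as a qpc-<hash> directory suffix
```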
```python
# import ipdb; ipdb.set_trace()
if hasattr(module, "__qeff_init__"):
    module.__qeff_init__()
    transformed = True
```
Can we combine both if conditions?
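One way to read the suggestion, as a hedged sketch: run the optional `__qeff_init__` hook inside the same branch that performs the module swap, instead of a second standalone `if`. The mapping-based first condition is assumed from context and may not match the actual transform code:

```python
# Hypothetical merge of the two guards into one branch.
class Old:
    pass


class New:
    def __qeff_init__(self):
        self.initialized = True


_module_mapping = {Old: New}  # assumed structure of the transform's mapping

module = Old()
transformed = False
if module.__class__ in _module_mapping:  # assumed first condition
    module.__class__ = _module_mapping[module.__class__]
    if hasattr(module, "__qeff_init__"):
        module.__qeff_init__()
    transformed = True
assert transformed and module.initialized
```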
```python
input_ids_size = input_ids.shape[1]
# attention_mask = inputs["attention_mask"]
inputs["input_ids"] = torch.nn.functional.pad(
    inputs["input_ids"], (0, 3072 - input_ids_size), "constant", self.processor.tokenizer.pad_token_id
)
```
Please avoid the hardcoded value.
Make this value generic, and fetch it from the QPC session as `prefill_seq_len`, i.e. whichever value the model was compiled for.
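A sketch of the suggested fix, assuming the session exposes the compiled shape of `input_ids`; the metadata lookup below is illustrative, not the actual QAIC session API:

```python
# Illustrative: derive the pad target from the compiled prefill shape
# instead of the hardcoded 3072.
import torch

compiled_input_shapes = {"input_ids": (1, 3072)}  # stand-in for qpc_session metadata
prefill_seq_len = compiled_input_shapes["input_ids"][1]

inputs = {"input_ids": torch.ones((1, 23), dtype=torch.int64)}
pad_token_id = 0
pad_len = prefill_seq_len - inputs["input_ids"].shape[1]
inputs["input_ids"] = torch.nn.functional.pad(
    inputs["input_ids"], (0, pad_len), "constant", pad_token_id
)
assert inputs["input_ids"].shape[1] == prefill_seq_len
```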
```python
breakpoint()
self.model.config.use_cache = True
self.processor = processor
self.num_layers = model.config.text_config.num_hidden_layers
```
Make fetching num_layers generic, and the padding shape as well. Please refer to the LLaVA PR and use a similar function that fetches these based on the model architecture.
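A sketch of such a helper, branching on where the architecture keeps its language-model config; the attribute names beyond `text_config` are assumptions, not confirmed by this diff:

```python
# Illustrative helper: resolve num_hidden_layers across architectures.
def get_num_layers(config) -> int:
    # Multimodal configs (e.g. LLaVA-style) nest the LM config under text_config.
    if hasattr(config, "text_config"):
        return config.text_config.num_hidden_layers
    # Some VLM configs (assumed, e.g. InternVL-style) use llm_config instead.
    if hasattr(config, "llm_config"):
        return config.llm_config.num_hidden_layers
    # Plain decoder-only configs keep it at the top level.
    return config.num_hidden_layers
```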
Already addressed in #267.
Added a generic framework to onboard and run VLMs in QEff.