diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py
index e13705b3a8f..a53f8e39071 100644
--- a/backends/qualcomm/utils/utils.py
+++ b/backends/qualcomm/utils/utils.py
@@ -823,6 +823,7 @@ def generate_multi_graph_program(
     )
     assert qnn_mgr.Init().value == 0, "failed to load processed bytes"
     binary_info = bytes(qnn_mgr.Compile())
+    print("Checking the size of QNN binary info: ", len(binary_info))
     assert len(binary_info) != 0, "failed to generate QNN context binary"
     graph_names = qnn_mgr.GetGraphNames()
     for graph_name in graph_names:
diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py
index 0af0f55b88f..cbcb3b0c04d 100755
--- a/examples/qualcomm/oss_scripts/llama/llama.py
+++ b/examples/qualcomm/oss_scripts/llama/llama.py
@@ -494,7 +494,8 @@ def compile(args, pte_filename, tokenizer):
             annotate_linear_16a8w_in_affine_layer,
         )
     if args.ptq != None:
-        kv_quant_attrs = {}
+        import hashlib
+        kv_quant_attrs, parameter_hash = {}, []
         for i, llama_instance in enumerate(llama_instance_list):
             llama_instance.quantize(
                 quant_dtype=quant_dtype,
@@ -517,6 +518,31 @@ def compile(args, pte_filename, tokenizer):
                         kv_quant_attrs=kv_quant_attrs,
                     ),
                 )
+
+            tensor_to_md5 = {}
+            for name, buffer in llama_instance.llama_model.named_buffers():
+                md5_buffer = hashlib.md5(buffer.numpy().tobytes()).hexdigest()
+                if md5_buffer in tensor_to_md5:
+                    tensor_to_md5[md5_buffer].append(name)
+                else:
+                    tensor_to_md5[md5_buffer] = [name]
+            parameter_hash.append(tensor_to_md5)
+
+        # check tensors in prefill & decode are exactly the same
+        assert len(parameter_hash[0]) == len(parameter_hash[1])
+        num_keys = len(parameter_hash[0])
+        # Remove common keys from both dictionaries
+        for key in set(parameter_hash[0]).intersection(set(parameter_hash[1])):
+            del parameter_hash[0][key]
+            del parameter_hash[1][key]
+        print(f"{num_keys - len(parameter_hash[0])} / {num_keys} tensors are matched")
+
+        for buf, name in parameter_hash[0].items():  # kv
+            print(f"KV buffers: {name} cannot find a match")
+        for buf, name in parameter_hash[1].items():  # prefill
+            print(f"Prefill buffers: {name} cannot find a match")
+
+
     end_quantize_ts = time.time()
     logging.info(f"Time for quantizing: {end_quantize_ts - start_quantize_ts}")
diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
index e06d52fbb37..99d6d715db7 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
+++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
@@ -404,10 +404,10 @@ Error Runner::generate(
         token_callback(piece_res.get().c_str());
       }
 
-      if (pos >= num_prompt_tokens && eos_id_.count(cur_token) > 0) {
-        ET_LOG(Info, "\nReached to the end of generation");
-        break;
-      }
+      // if (pos >= num_prompt_tokens && eos_id_.count(cur_token) > 0) {
+      //   ET_LOG(Info, "\nReached to the end of generation");
+      //   break;
+      // }
     }
   };
diff --git a/examples/qualcomm/utils.py b/examples/qualcomm/utils.py
index c2d2f002aa8..a1890022be4 100755
--- a/examples/qualcomm/utils.py
+++ b/examples/qualcomm/utils.py
@@ -167,6 +167,8 @@ def execute(self, custom_runner_cmd=None, method_index=0):
             )
         else:
             qnn_executor_runner_cmds = custom_runner_cmd
+
+        print("Execution command is: ", qnn_executor_runner_cmds)
         self._adb(["shell", f"{qnn_executor_runner_cmds}"])
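
The llama.py hunk fingerprints every registered buffer with an MD5 digest of its raw bytes, then intersects the digest sets of the two llama instances (kv/decode at index 0, prefill at index 1) to find buffers that are not bit-identical. Below is a minimal, self-contained sketch of the same idea outside the patch; `buffer_md5_map`, `report_buffer_mismatches`, and the `BatchNorm1d` stand-in modules are hypothetical names for illustration, not part of the ExecuTorch code:

```python
import hashlib

from torch import nn


def buffer_md5_map(module: nn.Module) -> dict[str, list[str]]:
    # Map each buffer's MD5 digest to the buffer names sharing that digest,
    # mirroring the per-instance tensor_to_md5 dict built in the patch.
    digest_to_names: dict[str, list[str]] = {}
    for name, buf in module.named_buffers():
        digest = hashlib.md5(buf.detach().cpu().numpy().tobytes()).hexdigest()
        digest_to_names.setdefault(digest, []).append(name)
    return digest_to_names


def report_buffer_mismatches(a: nn.Module, b: nn.Module) -> None:
    # A digest present in both maps means those buffer contents are
    # bit-identical across the two modules; the leftovers after the
    # intersection are the mismatches to report.
    map_a, map_b = buffer_md5_map(a), buffer_md5_map(b)
    common = set(map_a) & set(map_b)
    print(f"{len(common)} / {len(map_a)} buffer digests matched")
    for digest in set(map_a) - common:
        print(f"only in first module: {map_a[digest]}")
    for digest in set(map_b) - common:
        print(f"only in second module: {map_b[digest]}")


if __name__ == "__main__":
    # Two instances that should match, with one buffer deliberately perturbed.
    m1, m2 = nn.BatchNorm1d(4), nn.BatchNorm1d(4)
    m2.running_mean += 2.0
    report_buffer_mismatches(m1, m2)
```

Note that hashing raw bytes only detects exact bit-level equality: two buffers that differ by a single bit after quantization hash to different digests, which is exactly the property the patch relies on when checking that the prefill and decode instances carry identical parameters.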
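
The runner.cpp hunk comments out the early break on end-of-sequence, so `Runner::generate` keeps decoding until it hits the sequence-length bound regardless of which tokens the model emits. A small Python sketch of that control-flow change (all of the names here, `decode_loop`, `next_token`, `eos_ids`, are illustrative stand-ins, not the runner's API):

```python
def decode_loop(next_token, seq_len, num_prompt_tokens, eos_ids, stop_on_eos=True):
    # Illustrative decode loop mirroring the termination logic in
    # Runner::generate; the patch comments out the EOS branch below.
    tokens = []
    for pos in range(seq_len):
        cur = next_token(pos)
        tokens.append(cur)
        if stop_on_eos and pos >= num_prompt_tokens and cur in eos_ids:
            break  # this is the branch the patch disables
    return tokens


def always_eos(pos):
    return 2  # a model that emits the EOS token id 2 at every step


# With the EOS check active, generation stops at the first EOS past the
# prompt; with it disabled (as in the patch) it always runs to seq_len.
assert len(decode_loop(always_eos, 8, 3, {2})) == 4
assert len(decode_loop(always_eos, 8, 3, {2}, stop_on_eos=False)) == 8
```

Forcing full-length runs like this makes every invocation execute the same number of decode steps, which is useful when comparing timings or logged outputs across runs of a debugging patch like this one.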