diff --git a/src/runtime/contrib/clml/clml_runtime.cc b/src/runtime/contrib/clml/clml_runtime.cc index 6d79196a08e6..992665ee697c 100644 --- a/src/runtime/contrib/clml/clml_runtime.cc +++ b/src/runtime/contrib/clml/clml_runtime.cc @@ -140,15 +140,17 @@ class CLMLRuntime : public JSONRuntimeBase { void InitCLML() { // Setup CLML Context cl_int result = 0; - workspace = cl::OpenCLWorkspace::Global(); workspace->Init(); tentry = workspace->GetThreadEntry(); if (!ExtensionStringPresent()) { - LOG(WARNING) << "CLML Runtime Init: Qualcomm extn not present.\n"; + LOG(FATAL) << "CLML Runtime Init: Qualcomm extn not present.\n"; return; } + device_id = workspace->GetCLDeviceID(tentry->device.device_id); + platform_id = workspace->device_to_platform[device_id]; + // Query and Get CLML Interface static const cl_uint MAX_VERSIONS = 256; cl_int majorVersions[MAX_VERSIONS]; @@ -492,7 +494,7 @@ class CLMLRuntime : public JSONRuntimeBase { JSONGraphNode node = it->second.second; void* node_data = nullptr; - allocateTensorMemory(h_ClmlIntf, workspace->context, tensor_desc); + allocateTensorMemory(h_ClmlIntf, workspace->contexts[platform_id], tensor_desc); if (node.GetOpType() == "const") { node_data = data_entry_[EntryID(it->first, 0)]->data; @@ -581,11 +583,9 @@ class CLMLRuntime : public JSONRuntimeBase { bool ExtensionStringPresent(void) { cl_int result = 0; - if (workspace->platform_id == nullptr) { - return 0; - } size_t reqd_size = 0; - cl_device_id device_id = workspace->devices[workspace->GetThreadEntry()->device.device_id]; + cl_device_id device_id = + workspace->GetCLDeviceID(workspace->GetThreadEntry()->device.device_id); result = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, 0, NULL, &reqd_size); ICHECK(reqd_size > 0u && result == CL_SUCCESS) << "clGetDeviceInfo:" << result; @@ -607,7 +607,8 @@ class CLMLRuntime : public JSONRuntimeBase { cl_ml_tensor_desc_qcom desc = { dtype, layout, dims.n, dims.c, dims.h, dims.w, 0, CL_TENSOR_DIMENSIONS_4D_QCOM, { 0 }}; - result = h_ClmlIntf->clCreateMLTensorQCOM(workspace->context, NULL, &desc, &tensor); + result = + h_ClmlIntf->clCreateMLTensorQCOM(workspace->contexts[platform_id], NULL, &desc, &tensor); ICHECK(tensor && result == CL_SUCCESS) << "clCreateMLTensorQCOM:" << result; (void)result; return tensor; @@ -619,11 +620,12 @@ class CLMLRuntime : public JSONRuntimeBase { cl_int result = CL_OUT_OF_HOST_MEMORY; cl_mem buffer = NULL; - result = - h_ClmlIntf->clGetMLTensorMemorySizeQCOM(workspace->context, pTensorMemDesc->tensor, &size); + result = h_ClmlIntf->clGetMLTensorMemorySizeQCOM(workspace->contexts[platform_id], + pTensorMemDesc->tensor, &size); ICHECK(result == CL_SUCCESS) << "clGetMLTensorMemorySizeQCOM:" << result; - buffer = clCreateBuffer(workspace->context, CL_MEM_READ_WRITE, size, NULL, &result); + buffer = + clCreateBuffer(workspace->contexts[platform_id], CL_MEM_READ_WRITE, size, NULL, &result); ICHECK(result == CL_SUCCESS) << "clCreateBuffer:" << result; pTensorMemDesc->memory = buffer; @@ -686,7 +688,8 @@ class CLMLRuntime : public JSONRuntimeBase { cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype); auto tensor_dsc = std::make_shared(); - tensor_dsc->tensor = DeviceMakeCLMLTensor(workspace->context, dims, layout, cl_dtype); + tensor_dsc->tensor = + DeviceMakeCLMLTensor(workspace->contexts[platform_id], dims, layout, cl_dtype); return tensor_dsc; } @@ -800,8 +803,8 @@ class CLMLRuntime : public JSONRuntimeBase { } else { cl_ml_tensor_desc_qcom desc = {}; desc.num_dimensions = CL_TENSOR_UNUSED_QCOM; - result = - h_ClmlIntf->clCreateMLTensorQCOM(workspace->context, NULL, &desc, &layer_.unusedTensor); + result = h_ClmlIntf->clCreateMLTensorQCOM(workspace->contexts[platform_id], NULL, &desc, + &layer_.unusedTensor); ICHECK(layer_.unusedTensor && result == CL_SUCCESS) << "clCreateMLTensorQCOM:" << result; bias->tensor = layer_.unusedTensor; } @@ -821,13 +824,13 @@ class CLMLRuntime : public JSONRuntimeBase { if (!has_bn) { if (!has_act) { result = h_ClmlIntf->clCreateMLOpConvolutionForwardQCOM( - workspace->context, 0, &conv_desc, input->tensor, weight->tensor, bias->tensor, - output->tensor, &op, NULL); + workspace->contexts[platform_id], 0, &conv_desc, input->tensor, weight->tensor, + bias->tensor, output->tensor, &op, NULL); ICHECK(op && result == CL_SUCCESS) << "Convolution Error:" << result; } else { result = h_ClmlIntf->clCreateMLOpFusedConvolutionActivationForwardQCOM( - workspace->context, 0, &conv_desc, &act_desc, input->tensor, weight->tensor, - bias->tensor, NULL, output->tensor, &op, tuning_cache); + workspace->contexts[platform_id], 0, &conv_desc, &act_desc, input->tensor, + weight->tensor, bias->tensor, NULL, output->tensor, &op, tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Convolution Error:" << result; } layer_.func_ins.push_back(input); @@ -854,15 +857,15 @@ class CLMLRuntime : public JSONRuntimeBase { cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM, cl_arithmetic_mode}; if (!has_act) { result = h_ClmlIntf->clCreateMLOpFusedConvolutionBatchNormForwardQCOM( - workspace->context, 0, &conv_desc, &bn_desc, input->tensor, weight->tensor, - bias->tensor, output->tensor, bn_mean->tensor, bn_var->tensor, bn_scale->tensor, - bn_bias->tensor, &op, tuning_cache); + workspace->contexts[platform_id], 0, &conv_desc, &bn_desc, input->tensor, + weight->tensor, bias->tensor, output->tensor, bn_mean->tensor, bn_var->tensor, + bn_scale->tensor, bn_bias->tensor, &op, tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Convolution Error:" << result; } else { result = h_ClmlIntf->clCreateMLOpFusedConvolutionBatchNormActivationForwardQCOM( - workspace->context, 0, &conv_desc, &bn_desc, &act_desc, input->tensor, weight->tensor, - bias->tensor, output->tensor, NULL, bn_mean->tensor, bn_var->tensor, bn_scale->tensor, - bn_bias->tensor, &op, tuning_cache); + workspace->contexts[platform_id], 0, &conv_desc, &bn_desc, &act_desc, input->tensor, + weight->tensor, bias->tensor, output->tensor, NULL, bn_mean->tensor, bn_var->tensor, + bn_scale->tensor, bn_bias->tensor, &op, tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Convolution Error:" << result; } @@ -895,13 +898,13 @@ class CLMLRuntime : public JSONRuntimeBase { cl_ml_tensor_desc_qcom desc = {}; desc.num_dimensions = CL_TENSOR_UNUSED_QCOM; - result = - h_ClmlIntf->clCreateMLTensorQCOM(workspace->context, NULL, &desc, &layer_.unusedTensor); + result = h_ClmlIntf->clCreateMLTensorQCOM(workspace->contexts[platform_id], NULL, &desc, + &layer_.unusedTensor); ICHECK(layer_.unusedTensor && result == CL_SUCCESS) << ":" << result; - result = h_ClmlIntf->clCreateMLOpActivationForwardQCOM(workspace->context, 0, &act_desc, - input->tensor, layer_.unusedTensor, - output->tensor, &op, tuning_cache); + result = h_ClmlIntf->clCreateMLOpActivationForwardQCOM( + workspace->contexts[platform_id], 0, &act_desc, input->tensor, layer_.unusedTensor, + output->tensor, &op, tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Activation Error:" << result; layer_.func_ins.push_back(input); @@ -947,8 +950,8 @@ class CLMLRuntime : public JSONRuntimeBase { cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM, cl_arithmetic_mode}; result = h_ClmlIntf->clCreateMLOpBatchNormForwardQCOM( - workspace->context, 0, &bn_desc, input->tensor, bn_mean->tensor, bn_var->tensor, - bn_scale->tensor, bn_bias->tensor, output->tensor, &op, tuning_cache); + workspace->contexts[platform_id], 0, &bn_desc, input->tensor, bn_mean->tensor, + bn_var->tensor, bn_scale->tensor, bn_bias->tensor, output->tensor, &op, tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Batchnorm Error:" << result; layer->function.push_back(op); @@ -997,12 +1000,13 @@ class CLMLRuntime : public JSONRuntimeBase { cl_ml_tensor_desc_qcom desc = {}; cl_ml_tensor_qcom unusedTensor = NULL; desc.num_dimensions = CL_TENSOR_UNUSED_QCOM; - result = h_ClmlIntf->clCreateMLTensorQCOM(workspace->context, NULL, &desc, &unusedTensor); + result = h_ClmlIntf->clCreateMLTensorQCOM(workspace->contexts[platform_id], NULL, &desc, + &unusedTensor); ICHECK(unusedTensor && result == CL_SUCCESS) << ":" << result; - result = - h_ClmlIntf->clCreateMLOpPoolingForwardQCOM(workspace->context, 0, &pool_desc, input->tensor, - unusedTensor, output->tensor, &op, tuning_cache); + result = h_ClmlIntf->clCreateMLOpPoolingForwardQCOM(workspace->contexts[platform_id], 0, + &pool_desc, input->tensor, unusedTensor, + output->tensor, &op, tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Pooling Error:" << result; layer_.func_ins.push_back(input); @@ -1043,13 +1047,13 @@ class CLMLRuntime : public JSONRuntimeBase { cl_ml_tensor_desc_qcom desc = {}; desc.num_dimensions = CL_TENSOR_UNUSED_QCOM; - result = - h_ClmlIntf->clCreateMLTensorQCOM(workspace->context, NULL, &desc, &layer_.unusedTensor); + result = h_ClmlIntf->clCreateMLTensorQCOM(workspace->contexts[platform_id], NULL, &desc, + &layer_.unusedTensor); ICHECK(layer_.unusedTensor && result == CL_SUCCESS) << ":" << result; - result = h_ClmlIntf->clCreateMLOpPoolingForwardQCOM(workspace->context, 0, &pool_desc, - input->tensor, layer_.unusedTensor, - output->tensor, &op, tuning_cache); + result = h_ClmlIntf->clCreateMLOpPoolingForwardQCOM( + workspace->contexts[platform_id], 0, &pool_desc, input->tensor, layer_.unusedTensor, + output->tensor, &op, tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Pooling Error:" << result; layer_.func_ins.push_back(input); @@ -1079,7 +1083,7 @@ class CLMLRuntime : public JSONRuntimeBase { cl_ml_op_softmax_desc_qcom softmax_desc = {CL_SOFTMAX_ALGORITHM_ACCURATE_QCOM, CL_SOFTMAX_MODE_INSTANCE_QCOM, cl_arithmetic_mode}; - result = h_ClmlIntf->clCreateMLOpSoftmaxQCOM(workspace->context, 0, &softmax_desc, + result = h_ClmlIntf->clCreateMLOpSoftmaxQCOM(workspace->contexts[platform_id], 0, &softmax_desc, input->tensor, output->tensor, &op, tuning_cache); ICHECK(op && result == CL_SUCCESS) << "SoftMax Error:" << result; @@ -1125,8 +1129,8 @@ class CLMLRuntime : public JSONRuntimeBase { {clml_padding[0], clml_padding[1], clml_padding[2], clml_padding[3], 0, 0, 0, 0}, cl_arithmetic_mode}; - result = h_ClmlIntf->clCreateMLOpPadQCOM(workspace->context, 0, &pad_desc, input->tensor, - output->tensor, &op, tuning_cache); + result = h_ClmlIntf->clCreateMLOpPadQCOM(workspace->contexts[platform_id], 0, &pad_desc, + input->tensor, output->tensor, &op, tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Pad Error:" << result; layer_.func_ins.push_back(input); @@ -1150,7 +1154,7 @@ class CLMLRuntime : public JSONRuntimeBase { cl_dtype); auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); - result = h_ClmlIntf->clCreateMLOpReshapeQCOM(workspace->context, 0, input->tensor, + result = h_ClmlIntf->clCreateMLOpReshapeQCOM(workspace->contexts[platform_id], 0, input->tensor, output->tensor, &op, tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Reshape Error:" << result; @@ -1175,7 +1179,7 @@ class CLMLRuntime : public JSONRuntimeBase { cl_dtype); auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); - result = h_ClmlIntf->clCreateMLOpReshapeQCOM(workspace->context, 0, input->tensor, + result = h_ClmlIntf->clCreateMLOpReshapeQCOM(workspace->contexts[platform_id], 0, input->tensor, output->tensor, &op, tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Reshape Error:" << result; @@ -1210,8 +1214,8 @@ class CLMLRuntime : public JSONRuntimeBase { } cl_ml_op_concat_desc_qcom concatDesc = {axis, (cl_uint)inputSize, cl_arithmetic_mode}; - result = h_ClmlIntf->clCreateMLOpConcatQCOM(workspace->context, 0, &concatDesc, concatInputs, - output->tensor, &op, tuning_cache); + result = h_ClmlIntf->clCreateMLOpConcatQCOM(workspace->contexts[platform_id], 0, &concatDesc, + concatInputs, output->tensor, &op, tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Concat Error:" << result; layer->function.push_back(op); @@ -1250,8 +1254,8 @@ class CLMLRuntime : public JSONRuntimeBase { } else { cl_ml_tensor_desc_qcom desc = {}; desc.num_dimensions = CL_TENSOR_UNUSED_QCOM; - result = - h_ClmlIntf->clCreateMLTensorQCOM(workspace->context, NULL, &desc, &layer_.unusedTensor); + result = h_ClmlIntf->clCreateMLTensorQCOM(workspace->contexts[platform_id], NULL, &desc, + &layer_.unusedTensor); ICHECK(layer_.unusedTensor && result == CL_SUCCESS) << "clCreateMLTensorQCOM:" << result; bias->tensor = layer_.unusedTensor; } @@ -1269,8 +1273,8 @@ class CLMLRuntime : public JSONRuntimeBase { cl_arithmetic_mode}; result = h_ClmlIntf->clCreateMLOpConvolutionForwardQCOM( - workspace->context, 0, &conv_desc, input->tensor, weight->tensor, bias->tensor, - output->tensor, &op, NULL); + workspace->contexts[platform_id], 0, &conv_desc, input->tensor, weight->tensor, + bias->tensor, output->tensor, &op, NULL); ICHECK(op && result == CL_SUCCESS) << "Fully Connected Error:" << result; layer->function.push_back(op); @@ -1300,8 +1304,8 @@ class CLMLRuntime : public JSONRuntimeBase { cl_ml_op_clip_desc_qcom clip_desc = { CL_CLIP_BY_VALUE_QCOM, {{a_max}, CL_FLOAT}, {{a_min}, CL_FLOAT}, cl_arithmetic_mode}; - result = h_ClmlIntf->clCreateMLOpClipQCOM(workspace->context, 0, &clip_desc, input->tensor, - output->tensor, &op, tuning_cache); + result = h_ClmlIntf->clCreateMLOpClipQCOM(workspace->contexts[platform_id], 0, &clip_desc, + input->tensor, output->tensor, &op, tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Clip Error:" << result; layer_.func_ins.push_back(input); @@ -1342,8 +1346,9 @@ class CLMLRuntime : public JSONRuntimeBase { cl_ml_op_binary_desc_qcom add_desc = { binary_op, {{1.0}, CL_FLOAT}, {{1.0}, CL_FLOAT}, {{0.0}, CL_FLOAT}, cl_arithmetic_mode}; - result = h_ClmlIntf->clCreateMLOpBinaryQCOM(workspace->context, 0, &add_desc, input_a->tensor, - input_b->tensor, output->tensor, &op, tuning_cache); + result = h_ClmlIntf->clCreateMLOpBinaryQCOM(workspace->contexts[platform_id], 0, &add_desc, + input_a->tensor, input_b->tensor, output->tensor, + &op, tuning_cache); ICHECK(op && result == CL_SUCCESS) << op_name << " Node Error:" << result; layer_.func_ins.push_back(input_a); @@ -1371,8 +1376,9 @@ class CLMLRuntime : public JSONRuntimeBase { cl_uint block_size = std::stoi(node.GetAttr>("block_size")[0]); cl_ml_op_depthtospace_desc_qcom dtos_desc = {block_size, cl_arithmetic_mode}; - result = h_ClmlIntf->clCreateMLOpDepthToSpaceQCOM( - workspace->context, 0, &dtos_desc, input->tensor, output->tensor, &op, tuning_cache); + result = + h_ClmlIntf->clCreateMLOpDepthToSpaceQCOM(workspace->contexts[platform_id], 0, &dtos_desc, + input->tensor, output->tensor, &op, tuning_cache); ICHECK(op && result == CL_SUCCESS) << "DepthToSpace Layer Error:" << result; layer_.func_ins.push_back(input); @@ -1399,8 +1405,9 @@ class CLMLRuntime : public JSONRuntimeBase { cl_bool align_corners = std::stoi(node.GetAttr>("align_corners")[0]); cl_ml_op_resize_bilinear_desc_qcom resize_desc = {align_corners, false, cl_arithmetic_mode}; - result = h_ClmlIntf->clCreateMLOpResizeBilinearQCOM( - workspace->context, 0, &resize_desc, input->tensor, output->tensor, &op, tuning_cache); + result = h_ClmlIntf->clCreateMLOpResizeBilinearQCOM(workspace->contexts[platform_id], 0, + &resize_desc, input->tensor, output->tensor, + &op, tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Resize Layer Error:" << result; layer_.func_ins.push_back(input); @@ -1418,6 +1425,8 @@ class CLMLRuntime : public JSONRuntimeBase { GET_ML_API_INTERFACE* h_ClmlIntf = NULL; cl::OpenCLWorkspace* workspace = NULL; cl::OpenCLThreadEntry* tentry = NULL; + cl_device_id device_id; + cl_platform_id platform_id; cl_ml_tuningcache_qcom tuning_cache = NULL; bool is_tuning_run; char* tuning_file; diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index fbb4e13e0534..cef49fdd50ca 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -282,6 +282,8 @@ class OpenCLWorkspace : public DeviceAPI { return prop & CL_QUEUE_PROFILING_ENABLE; } + // Check if the device is present or not + bool IsDeviceExists(unsigned int device_id) { return device_id < devices.size(); } // Enable queue profiling, recreate if required void EnableQueueProfiling(Device dev, bool enable) { bool is_enabled = cl::OpenCLWorkspace::Global()->IsProfiling(dev);