diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h
index 2e7f05f91020..36224cb416c1 100644
--- a/src/runtime/opencl/opencl_common.h
+++ b/src/runtime/opencl/opencl_common.h
@@ -162,6 +162,29 @@ inline const char* CLGetErrorString(cl_int error) {
   }
 }
 
+/*!
+ * \brief Translate a DLDataType into the matching OpenCL image channel data type.
+ * \param data_type Element type to translate.
+ * \return The corresponding cl_channel_type. Only float16/32 and (u)int8/16/32 are
+ *         handled; any other type is reported through LOG(FATAL).
+ */
+inline cl_channel_type DTypeToOpenCLChannelType(DLDataType data_type) {
+  DataType elem_type(data_type);
+  // Floating point channels.
+  if (elem_type == DataType::Float(32)) return CL_FLOAT;
+  if (elem_type == DataType::Float(16)) return CL_HALF_FLOAT;
+  // Signed integer channels.
+  if (elem_type == DataType::Int(8)) return CL_SIGNED_INT8;
+  if (elem_type == DataType::Int(16)) return CL_SIGNED_INT16;
+  if (elem_type == DataType::Int(32)) return CL_SIGNED_INT32;
+  // Unsigned integer channels.
+  if (elem_type == DataType::UInt(8)) return CL_UNSIGNED_INT8;
+  if (elem_type == DataType::UInt(16)) return CL_UNSIGNED_INT16;
+  if (elem_type == DataType::UInt(32)) return CL_UNSIGNED_INT32;
+  LOG(FATAL) << "data type is not supported in OpenCL runtime yet: " << elem_type;
+  return CL_FLOAT;
+}
+
 /*!
  * \brief Protected OpenCL call
  * \param func Expression to call.
@@ -231,6 +254,8 @@ class OpenCLWorkspace : public DeviceAPI {
   void SetDevice(TVMContext ctx) final;
   void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final;
   void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment, DLDataType type_hint) final;
+  void* AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype,
+                       Optional<String> mem_scope = NullOpt) final;
   void FreeDataSpace(TVMContext ctx, void* ptr) final;
   void StreamSync(TVMContext ctx, TVMStreamHandle stream) final;
   void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final;
@@ -337,6 +362,14 @@ class OpenCLModuleNode : public ModuleNode {
   std::vector<cl_kernel> kernels_;
 };
 
+/*! \brief Query the OpenCL object type (CL_MEM_TYPE) of the cl_mem handle behind mem_ptr. */
+inline cl_mem_object_type GetMemObjectType(const void* mem_ptr) {
+  cl_mem_object_type kind;
+  cl_mem handle = static_cast<cl_mem>(const_cast<void*>(mem_ptr));
+  OPENCL_CALL(clGetMemObjectInfo(handle, CL_MEM_TYPE, sizeof(kind), &kind, nullptr));
+  return kind;
+}
+
 }  // namespace runtime
 }  // namespace tvm
 #endif  // TVM_RUNTIME_OPENCL_OPENCL_COMMON_H_
diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc
index a3ec21e28f1d..5db3d2e5cb20 100644
--- a/src/runtime/opencl/opencl_device_api.cc
+++ b/src/runtime/opencl/opencl_device_api.cc
@@ -126,6 +126,81 @@ void* OpenCLWorkspace::AllocDataSpace(TVMContext ctx, size_t size, size_t alignm
   return mptr;
 }
 
+// Alignment for `dtype`: the byte size of one (vector) element, raised to kAllocAlignment.
+static inline size_t GetDataAlignment(const DLDataType dtype) {
+  const size_t elem_bytes = (dtype.bits / 8) * dtype.lanes;
+  return elem_bytes < kAllocAlignment ? kAllocAlignment : elem_bytes;
+}
+
+// Collapse a (..., 4) tensor shape into 2D image extents. Shapes of rank < 3 get 1s inserted
+// just before the last axis; width is then the product of every axis except the final two,
+// and height is the second-to-last axis.
+static std::tuple<int64_t, int64_t> FlatShapeTo2D(std::vector<int64_t> shape) {
+  ICHECK(shape.size() >= 1 && shape.back() == 4);
+  while (shape.size() < 3) shape.insert(shape.end() - 1, 1);
+  const size_t rank = shape.size();
+  int64_t width = 1;
+  for (size_t axis = 0; axis + 2 < rank; ++axis) width *= shape[axis];
+  const int64_t height = shape[rank - 2];
+  return std::make_tuple(width, height);
+}
+
+void* OpenCLWorkspace::AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape,
+                                      DLDataType dtype, Optional<String> mem_scope) {
+  // No scope or "global": redirect to the flat buffer allocation overload.
+  if (!mem_scope.defined() || mem_scope.value() == "global") {
+    // Describe the tensor with a temporary DLTensor so GetDataSize can be applied to it.
+    DLTensor flat;
+    flat.data = nullptr;
+    flat.ctx = ctx;
+    flat.ndim = ndim;
+    flat.dtype = dtype;
+    flat.shape = const_cast<int64_t*>(shape);
+    flat.strides = nullptr;
+    flat.byte_offset = 0;
+    const size_t nbytes = GetDataSize(flat);
+    const size_t alignment = GetDataAlignment(flat.dtype);
+    return AllocDataSpace(ctx, nbytes, alignment, dtype);
+  }
+
+  // Every scope other than the texture one is rejected.
+  if (!(mem_scope.value() == "global:texture-act")) {
+    LOG(FATAL) << "Device does not support allocate data space with "
+               << "specified memory scope: " << mem_scope.value();
+    return nullptr;
+  }
+
+  // "global:texture-act": allocate a 2D RGBA image object instead of a buffer.
+  this->Init();
+  ICHECK(this->context != nullptr) << "No OpenCL device";
+  cl_image_format image_format;
+  image_format.image_channel_data_type = DTypeToOpenCLChannelType(dtype);
+  // shape must be (?, ..., ?, 4)
+  ICHECK_GT(ndim, 1);
+  ICHECK_EQ(shape[ndim - 1], 4);
+  image_format.image_channel_order = CL_RGBA;
+
+  // flatten the tensor shape into the (width, height) of the 2D image
+  const auto extent = FlatShapeTo2D(std::vector<int64_t>(shape, shape + ndim));
+  cl_image_desc image_desc;
+  image_desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  image_desc.image_width = std::get<0>(extent);
+  image_desc.image_height = std::get<1>(extent);
+  image_desc.image_depth = 1;
+  image_desc.image_array_size = 1;
+  image_desc.image_row_pitch = 0;
+  image_desc.image_slice_pitch = 0;
+  image_desc.num_mip_levels = 0;
+  image_desc.num_samples = 0;
+  image_desc.buffer = nullptr;
+
+  cl_int err_code;
+  cl_mem image = clCreateImage(this->context, CL_MEM_READ_WRITE, &image_format, &image_desc,
+                               nullptr, &err_code);
+  OPENCL_CHECK_ERROR(err_code);
+  return image;
+}
+
 void OpenCLWorkspace::FreeDataSpace(TVMContext ctx, void* ptr) {
   // We have to make sure that the memory object is not in the command queue
   // for some OpenCL platforms.
@@ -135,6 +210,17 @@ void OpenCLWorkspace::FreeDataSpace(TVMContext ctx, void* ptr) {
   OPENCL_CALL(clReleaseMemObject(mptr));
 }
 
+// Fill `region` with the {width, height, 1} extent of the image object behind `mem_ptr`.
+static inline void GetImageShape(const void* mem_ptr, size_t* region) {
+  cl_mem image = static_cast<cl_mem>(const_cast<void*>(mem_ptr));
+  size_t extent_x = 0, extent_y = 0;
+  OPENCL_CALL(clGetImageInfo(image, CL_IMAGE_WIDTH, sizeof(extent_x), &extent_x, nullptr));
+  OPENCL_CALL(clGetImageInfo(image, CL_IMAGE_HEIGHT, sizeof(extent_y), &extent_y, nullptr));
+  region[0] = extent_x;
+  region[1] = extent_y;
+  region[2] = 1;
+}
+
 void OpenCLWorkspace::CopyDataFromTo(const void* from, size_t from_offset, void* to,
                                      size_t to_offset, size_t size, TVMContext ctx_from,
                                      TVMContext ctx_to, DLDataType type_hint,
@@ -142,21 +228,74 @@ void OpenCLWorkspace::CopyDataFromTo(const void* from, size_t from_offset, void*
   this->Init();
   ICHECK(stream == nullptr);
   if (IsOpenCLDevice(ctx_from) && IsOpenCLDevice(ctx_to)) {
-    OPENCL_CALL(clEnqueueCopyBuffer(this->GetQueue(ctx_to),
-                                    static_cast<cl_mem>((void*)from),  // NOLINT(*)
-                                    static_cast<cl_mem>(to), from_offset, to_offset, size, 0,
-                                    nullptr, nullptr));
+    cl_mem_object_type from_type = GetMemObjectType(from);
+    cl_mem_object_type to_type = GetMemObjectType(to);
+    if (from_type == CL_MEM_OBJECT_BUFFER && to_type == CL_MEM_OBJECT_BUFFER) {
+      OPENCL_CALL(clEnqueueCopyBuffer(this->GetQueue(ctx_to),
+                                      static_cast<cl_mem>((void*)from),  // NOLINT(*)
+                                      static_cast<cl_mem>(to), from_offset, to_offset, size, 0,
+                                      nullptr, nullptr));
+    } else if (from_type == CL_MEM_OBJECT_IMAGE2D && to_type == CL_MEM_OBJECT_IMAGE2D) {
+      size_t from_origin[3] = {0, 0, 0};
+      size_t to_origin[3] = {0, 0, 0};
+      size_t region[3];
+      GetImageShape(from, region);
+      OPENCL_CALL(clEnqueueCopyImage(this->GetQueue(ctx_to),
+                                     static_cast<cl_mem>((void*)from),  // NOLINT(*)
+                                     static_cast<cl_mem>(to), from_origin, to_origin, region, 0,
+                                     nullptr, nullptr));
+    } else {
+      LOG(FATAL) << "OpenCL memory object type is wrong.";
+    }
   } else if (IsOpenCLDevice(ctx_from) && ctx_to.device_type == kDLCPU) {
-    OPENCL_CALL(clEnqueueReadBuffer(this->GetQueue(ctx_from),
-                                    static_cast<cl_mem>((void*)from),  // NOLINT(*)
-                                    CL_FALSE, from_offset, size, static_cast<char*>(to) + to_offset,
-                                    0, nullptr, nullptr));
-    OPENCL_CALL(clFinish(this->GetQueue(ctx_from)));
+    cl_mem_object_type from_type = GetMemObjectType(from);
+    switch (from_type) {
+      case CL_MEM_OBJECT_BUFFER:
+        OPENCL_CALL(clEnqueueReadBuffer(this->GetQueue(ctx_from),
+                                        static_cast<cl_mem>((void*)from),  // NOLINT(*)
+                                        CL_FALSE, from_offset, size,
+                                        static_cast<char*>(to) + to_offset, 0, nullptr, nullptr));
+        OPENCL_CALL(clFinish(this->GetQueue(ctx_from)));
+        break;
+      case CL_MEM_OBJECT_IMAGE2D: {
+        size_t origin[3] = {0, 0, 0};
+        size_t region[3];
+        GetImageShape(from, region);
+        OPENCL_CALL(clEnqueueReadImage(this->GetQueue(ctx_from),
+                                       static_cast<cl_mem>((void*)from),  // NOLINT(*)
+                                       CL_FALSE, origin, region, 0, 0,
+                                       static_cast<char*>(to) + to_offset, 0, nullptr, nullptr));
+        OPENCL_CALL(clFinish(this->GetQueue(ctx_from)));
+        break;
+      }
+      default:
+        LOG(FATAL) << "OpenCL memory object type is wrong.";
+    }
   } else if (ctx_from.device_type == kDLCPU && IsOpenCLDevice(ctx_to)) {
-    OPENCL_CALL(clEnqueueWriteBuffer(this->GetQueue(ctx_to), static_cast<cl_mem>(to), CL_FALSE,
-                                     to_offset, size, static_cast<const char*>(from) + from_offset,
-                                     0, nullptr, nullptr));
-    OPENCL_CALL(clFinish(this->GetQueue(ctx_to)));
+    cl_mem_object_type to_type = GetMemObjectType(to);
+    switch (to_type) {
+      case CL_MEM_OBJECT_BUFFER:
+        OPENCL_CALL(clEnqueueWriteBuffer(
+            this->GetQueue(ctx_to), static_cast<cl_mem>(to), CL_FALSE, to_offset, size,
+            static_cast<const char*>(from) + from_offset, 0, nullptr, nullptr));
+        OPENCL_CALL(clFinish(this->GetQueue(ctx_to)));
+        break;
+      case CL_MEM_OBJECT_IMAGE2D: {
+        size_t origin[3] = {0, 0, 0};
+        size_t region[3];
+        GetImageShape(to, region);
+        OPENCL_CALL(clEnqueueWriteImage(this->GetQueue(ctx_to),
+                                        static_cast<cl_mem>((void*)to),  // NOLINT(*)
+                                        CL_FALSE, origin, region, 0, 0,
+                                        static_cast<const char*>(from) + from_offset, 0, nullptr,
+                                        nullptr));
+        OPENCL_CALL(clFinish(this->GetQueue(ctx_to)));
+        break;
+      }
+      default:
+        LOG(FATAL) << "OpenCL memory type is wrong.";
+    }
+
   } else {
     LOG(FATAL) << "Expect copy from/to OpenCL or between OpenCL";
   }
diff --git a/tests/python/unittest/test_target_codegen_opencl.py b/tests/python/unittest/test_target_codegen_opencl.py
index 8a070da89641..48d84ca36618 100644
--- a/tests/python/unittest/test_target_codegen_opencl.py
+++ b/tests/python/unittest/test_target_codegen_opencl.py
@@ -15,8 +15,9 @@
 # specific language governing permissions and limitations
 # under the License.
 import tvm
-from tvm import te
+from tvm import te, nd
 import tvm.testing
+import numpy as np
 
 target = "opencl"
 
@@ -120,6 +121,23 @@ def check_max(ctx, n, dtype):
     check_max(ctx, 1, "float64")
 
 
+@tvm.testing.requires_gpu
+@tvm.testing.requires_opencl
+def test_opencl_texture_memory():
+    """Copy random data host -> texture -> texture -> host and check it comes back unchanged."""
+    def check_allocate_and_copy(shape):
+        host = nd.array(np.random.rand(*shape).astype("float32"), tvm.cpu(0))
+        first = nd.empty(host.shape, host.dtype, tvm.opencl(0), "global:texture-act")
+        second = nd.empty(host.shape, host.dtype, tvm.opencl(0), "global:texture-act")
+        host.copyto(first)
+        first.copyto(second)
+        np.testing.assert_equal(host.asnumpy(), second.asnumpy())
+
+    for shape in [(3, 4), (5, 6, 4), (8, 5, 6, 4)]:
+        check_allocate_and_copy(shape)
+
+
 if __name__ == "__main__":
     test_opencl_ternary_expression()
     test_opencl_inf_nan()
+    test_opencl_texture_memory()