diff --git a/.gitmodules b/.gitmodules
index ba725af1588a..4ee900b418d8 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -19,6 +19,9 @@
 [submodule "3rdparty/OpenCL-Headers"]
 	path = 3rdparty/OpenCL-Headers
 	url = https://github.com/KhronosGroup/OpenCL-Headers.git
+[submodule "3rdparty/gemmini"]
+	path = 3rdparty/gemmini
+	url = https://github.com/ucb-bar/gemmini
 [submodule "3rdparty/cnpy"]
 	path = 3rdparty/cnpy
 	url = https://github.com/rogersce/cnpy.git
diff --git a/3rdparty/gemmini b/3rdparty/gemmini
new file mode 160000
index 000000000000..b6bdad59cbd6
--- /dev/null
+++ b/3rdparty/gemmini
@@ -0,0 +1 @@
+Subproject commit b6bdad59cbd6313f1ea4c93d3493db3d59b9e418
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 67e87d907141..bc1feab7b472 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -509,6 +509,7 @@ include(cmake/modules/Micro.cmake)
 include(cmake/modules/contrib/EthosN.cmake)
 include(cmake/modules/contrib/CMSISNN.cmake)
 include(cmake/modules/contrib/EthosU.cmake)
+include(cmake/modules/contrib/Gemmini.cmake)
 include(cmake/modules/contrib/BLAS.cmake)
 include(cmake/modules/contrib/CODEGENC.cmake)
 include(cmake/modules/contrib/DNNL.cmake)
@@ -591,6 +592,9 @@ if(USE_MICRO)
   add_dependencies(tvm_runtime crt)
   add_dependencies(tvm_runtime host_standalone_crt)
   add_dependencies(tvm_runtime zephyr)
+  if(USE_GEMMINI)
+    add_dependencies(tvm_runtime gemmini)
+  endif()
 endif()
 
 if(USE_CPP_RPC)
diff --git a/apps/microtvm/gemmini/README.md b/apps/microtvm/gemmini/README.md
new file mode 100644
index 000000000000..2691844797f5
--- /dev/null
+++ b/apps/microtvm/gemmini/README.md
@@ -0,0 +1,20 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+This directory contains the code needed to generate tests for the Gemmini accelerator using microTVM. The generated tests are then executed on the Spike RISC-V ISA simulator.
+
+To use this integration, the Spike simulator must be installed first; this can be done by following the steps in the [Chipyard](https://chipyard.readthedocs.io/en/stable/) documentation. Instructions for installing the Spike patch that adds the Gemmini functional simulator can be found in the [Gemmini](https://github.com/ucb-bar/gemmini) repository.
diff --git a/apps/microtvm/gemmini/template_project/crt_config/crt_config.h b/apps/microtvm/gemmini/template_project/crt_config/crt_config.h
new file mode 100644
index 000000000000..b3126cfac920
--- /dev/null
+++ b/apps/microtvm/gemmini/template_project/crt_config/crt_config.h
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \brief CRT configuration for the host-linked CRT.
+ */
+#ifndef TVM_RUNTIME_MICRO_CRT_CONFIG_H_
+#define TVM_RUNTIME_MICRO_CRT_CONFIG_H_
+
+/*! Log level of the CRT runtime */
+#define TVM_CRT_LOG_LEVEL TVM_CRT_LOG_LEVEL_DEBUG
+
+/*!
Support low-level debugging in MISRA-C runtime */ +#define TVM_CRT_DEBUG 0 + +/*! Maximum supported dimension in NDArray */ +#define TVM_CRT_MAX_NDIM 6 +/*! Maximum supported arguments in generated functions */ +#define TVM_CRT_MAX_ARGS 10 +/*! Maximum supported string length in dltype, e.g. "int8", "int16", "float32" */ +#define TVM_CRT_MAX_STRLEN_DLTYPE 10 +/*! Maximum supported string length in function names */ +#define TVM_CRT_MAX_STRLEN_FUNCTION_NAME 120 +/*! Maximum supported string length in parameter names */ +#define TVM_CRT_MAX_STRLEN_PARAM_NAME 80 + +/*! Maximum number of registered modules. */ +#define TVM_CRT_MAX_REGISTERED_MODULES 2 + +/*! Size of the global function registry, in bytes. */ +#define TVM_CRT_GLOBAL_FUNC_REGISTRY_SIZE_BYTES 512 + +/*! Maximum packet size, in bytes, including the length header. */ +#define TVM_CRT_MAX_PACKET_SIZE_BYTES 8 * 1024 + +/*! \brief Maximum length of a PackedFunc function name. */ +#define TVM_CRT_MAX_FUNCTION_NAME_LENGTH_BYTES 30 + +// #define TVM_CRT_FRAMER_ENABLE_LOGS + +#endif // TVM_RUNTIME_MICRO_CRT_CONFIG_H_ diff --git a/apps/microtvm/gemmini/template_project/microtvm_api_server.py b/apps/microtvm/gemmini/template_project/microtvm_api_server.py new file mode 100644 index 000000000000..86661cb7320f --- /dev/null +++ b/apps/microtvm/gemmini/template_project/microtvm_api_server.py @@ -0,0 +1,312 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""
+MicroTVM API Server for Gemmini baremetal tests on the Spike simulator
+=====================
+"""
+
+import atexit
+import collections
+import functools
+import json
+import logging
+import os
+import os.path
+import pathlib
+import re
+import shlex
+import shutil
+import subprocess
+import sys
+import tarfile
+import tempfile
+import time
+from string import Template
+from distutils.dir_util import copy_tree
+import serial
+
+# import serial.tools.list_ports
+from tvm.micro.project_api import server
+
+from subprocess import PIPE
+
+_LOG = logging.getLogger(__name__)
+
+MODEL_LIBRARY_FORMAT_RELPATH = pathlib.Path("src") / "model" / "model.tar"
+API_SERVER_DIR = pathlib.Path(os.path.dirname(__file__) or os.getcwd())
+BUILD_DIR = API_SERVER_DIR / "build"
+MODEL_LIBRARY_FORMAT_PATH = API_SERVER_DIR / MODEL_LIBRARY_FORMAT_RELPATH
+
+IS_TEMPLATE = not (API_SERVER_DIR / MODEL_LIBRARY_FORMAT_RELPATH).exists()
+
+# PROJECT_TYPES = [
+#     "dense_example",
+#     "conv2d_example",
+#     "dwconv2d_example",
+#     "add_example",
+#     "maxpool2d_example",
+#     "mobilenet_example",
+# ]
+
+PROJECT_TYPES = []
+if IS_TEMPLATE:
+    for d in (API_SERVER_DIR / "src").iterdir():
+        if d.is_dir():
+            PROJECT_TYPES.append(d.name)
+
+PROJECT_OPTIONS = server.default_project_options(
+    project_type={"choices": tuple(PROJECT_TYPES)},
+    board={"choices": "", "optional": ["flash", "open_transport"]},
+    warning_as_error={"optional": ["build", "flash"]},
+)
+
+
+class Handler(server.ProjectAPIHandler):
+    def __init__(self):
+        super(Handler, self).__init__()
+        self._proc = None
+        self._port = None
+        self._transport = None
+        self._project_dir = None
+        self._qemu_instance = None
+
+    def server_info_query(self, tvm_version):
+        return server.ServerInfo(
+            platform_name="gemmini",
+            is_template=IS_TEMPLATE,
+            model_library_format_path="" if IS_TEMPLATE else MODEL_LIBRARY_FORMAT_PATH,
+            project_options=PROJECT_OPTIONS,
+        )
+
+    def _copy_project_files(self, api_server_dir, project_dir, project_type):
+        """Copies the files for project_type into project_dir.
+
+        Notes
+        -----
+        template_dir is NOT a project type, and that directory is never copied
+        in this function. template_dir only holds this file and its unit tests,
+        so this file is copied separately in generate_project.
+ + """ + for item in (API_SERVER_DIR / "src" / project_type).iterdir(): + dest = project_dir / "src" / item.name + if item.is_dir(): + shutil.copytree(item, dest) + else: + shutil.copy2(item, dest) + + shutil.copy2(project_dir / "src" / "Makefrag.mk", project_dir / "src" / "Makefrag") + + test_name = project_type.replace("_example", "") + new_line = f"tests = {test_name}\n" + with open(project_dir / "src" / "Makefile", "r") as original: + data = original.read() + with open(project_dir / "src" / "Makefile", "w") as modified: + modified.write(new_line + data) + + CRT_COPY_ITEMS = ("include", "src") + + def _copy_standalone_crt(self, source_dir, standalone_crt_dir): + output_crt_dir = source_dir / "standalone_crt" + for item in self.CRT_COPY_ITEMS: + src_path = os.path.join(standalone_crt_dir, item) + dst_path = output_crt_dir / item + if os.path.isdir(src_path): + shutil.copytree(src_path, dst_path) + else: + shutil.copy2(src_path, dst_path) + + def _disassemble_mlf(self, mlf_tar_path, source_dir): + with tempfile.TemporaryDirectory() as mlf_unpacking_dir_str: + mlf_unpacking_dir = pathlib.Path(mlf_unpacking_dir_str) + with tarfile.open(mlf_tar_path, "r:") as tar: + tar.extractall(mlf_unpacking_dir) + + model_dir = source_dir / "model" + model_dir.mkdir() + + # Copy C files from model. The filesnames and quantity + # depend on the target string, so we just copy all c files + source_dir = mlf_unpacking_dir / "codegen" / "host" / "src" + for file in source_dir.rglob(f"*.c"): + shutil.copy(file, model_dir) + + source_dir = mlf_unpacking_dir / "codegen" / "host" / "include" + for file in source_dir.rglob(f"*.h"): + shutil.copy(file, model_dir) + + # Return metadata.json for use in templating + with open(os.path.join(mlf_unpacking_dir, "metadata.json")) as f: + metadata = json.load(f) + return metadata + + CPP_FILE_EXTENSION_SYNONYMS = ("cc", "cxx") + + def _convert_includes(self, project_dir, source_dir): + """Changes all #include statements in project_dir to be relevant to their + containing file's location. + + """ + for ext in ("c", "h", "cpp"): + for filename in source_dir.rglob(f"*.{ext}"): + with filename.open("rb") as src_file: + lines = src_file.readlines() + with filename.open("wb") as dst_file: + for i, line in enumerate(lines): + line_str = str(line, "utf-8") + # Check if line has an include + result = re.search(r"#include\s*[<\"]([^>]*)[>\"]", line_str) + if not result: + dst_file.write(line) + else: + new_include = self._find_modified_include_path( + project_dir, filename, result.groups()[0] + ) + updated_line = f'#include "{new_include}"\n' + dst_file.write(updated_line.encode("utf-8")) + + # Most of the files we used to be able to point to directly are under "src/standalone_crt/include/". + # Howver, crt_config.h lives under "src/standalone_crt/crt_config/", and more exceptions might + # be added in the future. + POSSIBLE_BASE_PATHS = ["src/standalone_crt/include/", "src/standalone_crt/crt_config/"] + + def _find_modified_include_path(self, project_dir, file_path, include_path): + """Takes a single #include path, and returns the location it should point to. + + Examples + -------- + >>> _find_modified_include_path( + ... "/path/to/project/dir" + ... "/path/to/project/dir/src/standalone_crt/src/runtime/crt/common/ndarray.c" + ... "tvm/runtime/crt/platform.h" + ... 
) + "../../../../../../src/standalone_crt/include/tvm/runtime/crt/platform.h" + + """ + if include_path.endswith(".inc"): + include_path = re.sub(r"\.[a-z]+$", ".h", include_path) + + # Change includes referencing .cc and .cxx files to point to the renamed .cpp file + if include_path.endswith(self.CPP_FILE_EXTENSION_SYNONYMS): + include_path = re.sub(r"\.[a-z]+$", ".cpp", include_path) + + # If the include already works, don't modify it + if (file_path.parents[0] / include_path).exists(): + return include_path + + relative_path = file_path.relative_to(project_dir) + up_dirs_path = "../" * str(relative_path).count("/") + + for base_path in self.POSSIBLE_BASE_PATHS: + full_potential_path = project_dir / base_path / include_path + if full_potential_path.exists(): + return up_dirs_path + base_path + include_path + + # If we can't find the file, just leave it untouched + # It's probably a standard C/C++ header + return include_path + + def _copy_debug_data_files(self, project_dir): + if os.path.isdir(str(project_dir / ".." / "include")): + copy_tree(str(project_dir / ".." / "include"), str(project_dir / "src" / "model")) + + def generate_project(self, model_library_format_path, standalone_crt_dir, project_dir, options): + + # Reference key directories with pathlib + project_dir = pathlib.Path(project_dir) + project_dir.mkdir() + source_dir = project_dir / "src" + source_dir.mkdir() + extra_files_tar = options.get("extra_files_tar") + + # Copies files from the template folder to project_dir + shutil.copy2(API_SERVER_DIR / "microtvm_api_server.py", project_dir) + self._copy_project_files(API_SERVER_DIR, project_dir, options["project_type"]) + + # Copy standalone_crt into src folder + self._copy_standalone_crt(source_dir, standalone_crt_dir) + + # Populate crt-config.h + crt_config_dir = project_dir / "src" / "standalone_crt" / "crt_config" + crt_config_dir.mkdir() + shutil.copy2( + API_SERVER_DIR / "crt_config" / "crt_config.h", crt_config_dir / "crt_config.h" + ) + + # Unpack the MLF and copy the relevant files + metadata = self._disassemble_mlf(model_library_format_path, source_dir) + shutil.copy2(model_library_format_path, project_dir / MODEL_LIBRARY_FORMAT_RELPATH) + + # self._copy_debug_data_files(project_dir) + if extra_files_tar: + with tarfile.open(extra_files_tar, mode="r:*") as tf: + tf.extractall(project_dir) + for filename in project_dir.rglob(f"include/tvm/*.h"): + with filename.open("rb") as src_file: + lines = src_file.readlines() + new_lines = [] + for line in lines: + if "dlpack" not in str(line): + new_lines.append(line) + with filename.open("wb") as dst_file: + dst_file.writelines(new_lines) + + # Recursively change includes + self._convert_includes(project_dir, source_dir) + + def build(self, options): + subprocess.call( + "cd src && ./build.sh", + shell=True, + ) + + def flash(self, options): + test_name = options["project_type"].split("_")[0] + subprocess.call( + "cd src/build && spike --extension=gemmini %s" % (test_name + "-baremetal",), + shell=True, + ) + + def open_transport(self, options): + pass + + def close_transport(self): + pass + + def read_transport(self, n, timeout_sec): + pass + + def write_transport(self, data, timeout_sec): + pass + + +if __name__ == "__main__": + server.main(Handler()) diff --git a/apps/microtvm/gemmini/template_project/src/Makefile b/apps/microtvm/gemmini/template_project/src/Makefile new file mode 100644 index 000000000000..df459ba96121 --- /dev/null +++ b/apps/microtvm/gemmini/template_project/src/Makefile @@ -0,0 +1,76 @@ +# Licensed to 
the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +include $(abs_top_srcdir)/Makefrag + +tests_baremetal = $(tests:=-baremetal) + +ifeq ($(findstring spike,$(RUNNER)),spike) +# Currently don't support conv or conv-with-pool on spike +runs_baremetal = $(addsuffix .run,$(filter-out conv-baremetal conv_with_pool-baremetal,$(tests_baremetal))) +else +# Don't run very long benchmarks for RTL sim +runs_baremetal = $(addsuffix .run,$(filter-out tiled_matmul_cpu-baremetal tiled_matmul_option-baremetal,$(tests_baremetal))) +endif + +RISCV_TESTS = ${TVM_HOME}/3rdparty/gemmini/software/gemmini-rocc-tests/riscv-tests +BENCH_COMMON = ${RISCV_TESTS}/benchmarks/common +GEMMINI_HEADERS = $(abs_top_srcdir)/include/gemmini.h $(abs_top_srcdir)/include/gemmini_params.h $(abs_top_srcdir)/include/gemmini_testutils.h +STANDALONE_CRT = $(abs_top_srcdir)/standalone_crt +DEBUG_DATA_HEADERS = $(abs_top_srcdir)/../include/tvm + +CFLAGS := $(CFLAGS) \ + -DPREALLOCATE=1 \ + -DMULTITHREAD=1 \ + -mcmodel=medany \ + -std=gnu99 \ + -O2 \ + -ffast-math \ + -fno-common \ + -fno-builtin-printf \ + -march=rv64gc -Wa,-march=rv64gcxhwacha \ + -lgcc \ + -I${RISCV_TESTS} \ + -I${RISCV_TESTS}/env \ + -I$(abs_top_srcdir) \ + -I$(abs_top_srcdir)/include \ + -I$(BENCH_COMMON) \ + -I$(DEBUG_DATA_HEADERS) \ + -DID_STRING=$(ID_STRING) \ + -DPRINT_TILE=0 \ + +CFLAGS_BAREMETAL := \ + $(CFLAGS) \ + -nostartfiles \ + -static \ + -T $(BENCH_COMMON)/test.ld \ + -DBAREMETAL=1 \ + +all: $(tests_baremetal) + +vpath %.c $(src_dir) + +%-baremetal: %.c $(GEMMINI_HEADERS) + $(CC_BAREMETAL) $(CFLAGS_BAREMETAL) $< $(LFLAGS) -o $@ \ + $(wildcard $(BENCH_COMMON)/*.c) $(wildcard $(abs_top_srcdir)/model/*.c) $(wildcard $(BENCH_COMMON)/*.S) $(LIBS) -lm + +run-baremetal: $(runs_baremetal) + +%-baremetal.run: %-baremetal + $(RUNNER)$(abs_top_srcdir)/build/$^ + +junk += $(tests_baremetal) diff --git a/apps/microtvm/gemmini/template_project/src/Makefile.in b/apps/microtvm/gemmini/template_project/src/Makefile.in new file mode 100644 index 000000000000..ed017cc918ce --- /dev/null +++ b/apps/microtvm/gemmini/template_project/src/Makefile.in @@ -0,0 +1,34 @@ +prefix := @prefix@ +abs_top_srcdir := @abs_top_srcdir@ +XLEN := @XLEN@ +RISCVTOOLS := @RISCVTOOLS@ +ROCC = examples +RUNNER := "spike --extension=gemmini " + +.PHONY: all bareMetalC clean +all: bareMetalC + +vars = \ + abs_top_srcdir=$(abs_top_srcdir) \ + XLEN=$(XLEN) \ + PREFIX=$(ROCC)-$@ \ + src_dir=$(abs_top_srcdir) \ + RISCVTOOLS=$(RISCVTOOLS) + +bareMetalC: + $(MAKE) -f $(abs_top_srcdir)/Makefile $(vars) + +clean: + $(MAKE) -f $(abs_top_srcdir)/Makefile abs_top_srcdir=$(abs_top_srcdir) PREFIX=$(ROCC)-bareMetalC clean + +test-baremetal-bareMetalC: + make \ + -f $(abs_top_srcdir)/Makefile \ + TARGET_MAKEFILE=$(abs_top_srcdir)/Makefile \ + abs_top_srcdir=$(abs_top_srcdir) \ + 
src_dir=$(abs_top_srcdir) \ + XLEN=$(XLEN) \ + PREFIX=$(ROCC)-bareMetalC \ + RISCVTOOLS=$(RISCVTOOLS) \ + RUNNER=$(RUNNER) \ + run-baremetal diff --git a/apps/microtvm/gemmini/template_project/src/Makefrag.mk b/apps/microtvm/gemmini/template_project/src/Makefrag.mk new file mode 100644 index 000000000000..cb4e5ee72da9 --- /dev/null +++ b/apps/microtvm/gemmini/template_project/src/Makefrag.mk @@ -0,0 +1,42 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +XLEN ?= 64 + +CC_BAREMETAL := riscv$(XLEN)-unknown-elf-gcc + +CC_LINUX_PRESENT := $(shell command -v riscv$(XLEN)-unknown-linux-gnu-gcc 2> /dev/null) + +# Support Linux gcc from riscv-gnu-toolchain and from system packages +# riscv64-unknown-linux-gnu-gcc is built from riscv-gnu-toolchain, comes with Firesim's tools +# riscv64-linux-gnu-gcc comes from a system package +ifdef CC_LINUX_PRESENT + CC_LINUX := riscv$(XLEN)-unknown-linux-gnu-gcc +else + CC_LINUX := riscv$(XLEN)-linux-gnu-gcc +endif + +ENV_P = $(abs_top_srcdir)/riscv-tests/env/p +ENV_V = $(abs_top_srcdir)/riscv-tests/env/v + +.PHONY: all clean default + +default: all +src_dir = . + +clean: + rm -rf $(junk) diff --git a/apps/microtvm/gemmini/template_project/src/add.c b/apps/microtvm/gemmini/template_project/src/add.c new file mode 100644 index 000000000000..13aeb1a80e3f --- /dev/null +++ b/apps/microtvm/gemmini/template_project/src/add.c @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "assert.h" +#include "stddef.h" +#include "stdint.h" +#include "stdio.h" +#include "stdlib.h" +#ifndef BAREMETAL +#include "sys/mman.h" +#endif +#include "model/inputs.h" +#include "model/outputs.h" +#include "model/tvmgen_default.h" + +int8_t output_add[output_len]; + +int main() { + printf("Starting add test...\r\n"); +#ifndef BAREMETAL + if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + perror("mlockall failed"); + exit(1); + } +#endif + + struct tvmgen_default_inputs inputs; + inputs.serving_default_x_0 = input_1; + inputs.serving_default_y_0 = input_2; + struct tvmgen_default_outputs outputs; + outputs.PartitionedCall_0 = output_add; + int error_counter = 0; + + tvmgen_default_run(&inputs, &outputs); + + // Look for errors! + for (int i = 0; i < output_len; i++) { + if (output_add[i] != output[i]) { + error_counter += 1; + printf("ERROR IN ADD EXAMPLE! output_add[%d] (%d) != output[%d] (%d)\r\n", i, output_add[i], + i, output[i]); + // exit(1); + } + } + + // We allow for a very small percentage of errors, this could be related to rounding errors + float error_perc = ((float)(error_counter / output_len) * 100); + if (error_perc < 1) + printf("SUCCESS! (error_counter = %d)\r\n", error_counter); + else + printf("FAIL! (error_counter = %d)\r\n", error_counter); + exit(0); +} diff --git a/apps/microtvm/gemmini/template_project/src/add_example/add.c b/apps/microtvm/gemmini/template_project/src/add_example/add.c new file mode 100644 index 000000000000..f0ca93422efe --- /dev/null +++ b/apps/microtvm/gemmini/template_project/src/add_example/add.c @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "assert.h" +#include "stddef.h" +#include "stdint.h" +#include "stdio.h" +#include "stdlib.h" +#ifndef BAREMETAL +#include "sys/mman.h" +#endif +#include "input_1.h" +#include "input_2.h" +#include "model/tvmgen_default.h" +#include "output.h" + +int8_t output_add[OUTPUT_LEN]; + +int main() { + printf("Starting add test...\r\n"); +#ifndef BAREMETAL + if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + perror("mlockall failed"); + exit(1); + } +#endif + + struct tvmgen_default_inputs inputs; + inputs.serving_default_x_0 = input_1; + inputs.serving_default_y_0 = input_2; + struct tvmgen_default_outputs outputs; + outputs.PartitionedCall_0 = output_add; + int error_counter = 0; + + tvmgen_default_run(&inputs, &outputs); + + // Look for errors! + for (int i = 0; i < OUTPUT_LEN; i++) { + if (output_add[i] != output[i]) { + error_counter += 1; + printf("ERROR IN ADD EXAMPLE! 
output_add[%d] (%d) != output[%d] (%d)\r\n", i, output_add[i], + i, output[i]); + // exit(1); + } + } + + // We allow for a very small percentage of errors, this could be related to rounding errors + float error_perc = ((float)(error_counter / OUTPUT_LEN) * 100); + if (error_perc < 1) + printf("SUCCESS! (error_counter = %d)\r\n", error_counter); + else + printf("FAIL! (error_counter = %d)\r\n", error_counter); + exit(0); +} diff --git a/apps/microtvm/gemmini/template_project/src/conv2d.c b/apps/microtvm/gemmini/template_project/src/conv2d.c new file mode 100644 index 000000000000..22f1bcb1d281 --- /dev/null +++ b/apps/microtvm/gemmini/template_project/src/conv2d.c @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "assert.h" +#include "stddef.h" +#include "stdint.h" +#include "stdio.h" +#include "stdlib.h" +#ifndef BAREMETAL +#include "sys/mman.h" +#endif +#include "model/inputs.h" +#include "model/outputs.h" +#include "model/tvmgen_default.h" + +int8_t output_conv[output_len]; + +int main() { + printf("Starting conv2d test...\r\n"); +#ifndef BAREMETAL + if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + perror("mlockall failed"); + exit(1); + } +#endif + + struct tvmgen_default_inputs inputs; + inputs.serving_default_conv2d_input_0 = input; + struct tvmgen_default_outputs outputs; + outputs.StatefulPartitionedCall_0 = output_conv; + int error_counter = 0; + + tvmgen_default_run(&inputs, &outputs); + + // Look for errors! + for (int i = 0; i < output_len; i++) { + if (output_conv[i] != output[i]) { + error_counter += 1; + printf("ERROR IN CONV2D EXAMPLE! output_conv[%d] (%d) != output[%d] (%d)\r\n", i, + output_conv[i], i, output[i]); + // exit(1); + } + } + + // We allow for a very small percentage of errors, this could be related to rounding errors + if (((float)(error_counter / output_len) * 100) < 1) + printf("SUCCESS!\r\n"); + else + printf("FAIL!\r\n"); + exit(0); +} diff --git a/apps/microtvm/gemmini/template_project/src/conv2d_example/conv2d.c b/apps/microtvm/gemmini/template_project/src/conv2d_example/conv2d.c new file mode 100644 index 000000000000..6b91db406eaf --- /dev/null +++ b/apps/microtvm/gemmini/template_project/src/conv2d_example/conv2d.c @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "assert.h" +#include "stddef.h" +#include "stdint.h" +#include "stdio.h" +#include "stdlib.h" +#ifndef BAREMETAL +#include "sys/mman.h" +#endif +#include "input.h" +#include "model/tvmgen_default.h" +#include "output.h" + +int8_t output_conv[OUTPUT_LEN]; + +int main() { + printf("Starting conv2d test...\r\n"); +#ifndef BAREMETAL + if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + perror("mlockall failed"); + exit(1); + } +#endif + + struct tvmgen_default_inputs inputs; + inputs.serving_default_conv2d_input_0 = input; + struct tvmgen_default_outputs outputs; + outputs.StatefulPartitionedCall_0 = output_conv; + int error_counter = 0; + + tvmgen_default_run(&inputs, &outputs); + + // Look for errors! + for (int i = 0; i < OUTPUT_LEN; i++) { + if (output_conv[i] != output[i]) { + error_counter += 1; + printf("ERROR IN CONV2D EXAMPLE! output_conv[%d] (%d) != output[%d] (%d)\r\n", i, + output_conv[i], i, output[i]); + // exit(1); + } + } + + // We allow for a very small percentage of errors, this could be related to rounding errors + if (((float)(error_counter / OUTPUT_LEN) * 100) < 1) + printf("SUCCESS!\r\n"); + else + printf("FAIL!\r\n"); + exit(0); +} diff --git a/apps/microtvm/gemmini/template_project/src/dense.c b/apps/microtvm/gemmini/template_project/src/dense.c new file mode 100644 index 000000000000..414eeac88020 --- /dev/null +++ b/apps/microtvm/gemmini/template_project/src/dense.c @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "assert.h" +#include "stddef.h" +#include "stdint.h" +#include "stdio.h" +#include "stdlib.h" +#ifndef BAREMETAL +#include "sys/mman.h" +#endif +#include "model/inputs.h" +#include "model/outputs.h" +#include "model/tvmgen_default.h" + +int8_t output_gemm[output_len]; + +int main() { + printf("Starting dense test...\r\n"); +#ifndef BAREMETAL + if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + perror("mlockall failed"); + exit(1); + } +#endif + + struct tvmgen_default_inputs inputs; + inputs.serving_default_x_0 = input; + struct tvmgen_default_outputs outputs; + outputs.StatefulPartitionedCall_0 = output_gemm; + int error_counter = 0; + + tvmgen_default_run(&inputs, &outputs); + + // Look for errors! + for (int i = 0; i < output_len; i++) { + if (output_gemm[i] != output[i]) { + error_counter += 1; + printf("ERROR IN DENSE EXAMPLE! 
output_gemm[%d] (%d) != output[%d] (%d)\r\n", i, + output_gemm[i], i, output[i]); + // exit(1); + } + } + + // We allow for a very small percentage of errors, this could be related to rounding errors + if (((float)(error_counter / output_len) * 100) < 1) + printf("SUCCESS!\r\n"); + else + printf("FAIL!\r\n"); + exit(0); +} diff --git a/apps/microtvm/gemmini/template_project/src/dense_example/dense.c b/apps/microtvm/gemmini/template_project/src/dense_example/dense.c new file mode 100644 index 000000000000..64ca5b821d22 --- /dev/null +++ b/apps/microtvm/gemmini/template_project/src/dense_example/dense.c @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "assert.h" +#include "stddef.h" +#include "stdint.h" +#include "stdio.h" +#include "stdlib.h" +#ifndef BAREMETAL +#include "sys/mman.h" +#endif +#include "input.h" +#include "model/tvmgen_default.h" +#include "output.h" + +int8_t output_gemm[OUTPUT_LEN]; + +int main() { + printf("Starting dense test...\r\n"); +#ifndef BAREMETAL + if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + perror("mlockall failed"); + exit(1); + } +#endif + + struct tvmgen_default_inputs inputs; + inputs.serving_default_x_0 = input; + struct tvmgen_default_outputs outputs; + outputs.StatefulPartitionedCall_0 = output_gemm; + int error_counter = 0; + + tvmgen_default_run(&inputs, &outputs); + + // Look for errors! + for (int i = 0; i < OUTPUT_LEN; i++) { + if (output_gemm[i] != output[i]) { + error_counter += 1; + printf("ERROR IN DENSE EXAMPLE! output_gemm[%d] (%d) != output[%d] (%d)\r\n", i, + output_gemm[i], i, output[i]); + // exit(1); + } + } + + // We allow for a very small percentage of errors, this could be related to rounding errors + if (((float)(error_counter / OUTPUT_LEN) * 100) < 1) + printf("SUCCESS!\r\n"); + else + printf("FAIL!\r\n"); + exit(0); +} diff --git a/apps/microtvm/gemmini/template_project/src/dwconv2d.c b/apps/microtvm/gemmini/template_project/src/dwconv2d.c new file mode 100644 index 000000000000..ee125e2fdc25 --- /dev/null +++ b/apps/microtvm/gemmini/template_project/src/dwconv2d.c @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "assert.h" +#include "stddef.h" +#include "stdint.h" +#include "stdio.h" +#include "stdlib.h" +#ifndef BAREMETAL +#include "sys/mman.h" +#endif +#include "model/inputs.h" +#include "model/outputs.h" +#include "model/tvmgen_default.h" + +int8_t output_conv[output_len]; + +int main() { + printf("Starting dw conv2d test...\r\n"); +#ifndef BAREMETAL + if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + perror("mlockall failed"); + exit(1); + } +#endif + + struct tvmgen_default_inputs inputs; + inputs.serving_default_depthwise_conv2d_input_0 = input; + struct tvmgen_default_outputs outputs; + outputs.StatefulPartitionedCall_0 = output_conv; + int error_counter = 0; + + tvmgen_default_run(&inputs, &outputs); + + // Look for errors! + for (int i = 0; i < output_len; i++) { + if (output_conv[i] != output[i]) { + error_counter += 1; + printf("ERROR IN DW CONV2D EXAMPLE! output_conv[%d] (%d) != output[%d] (%d)\r\n", i, + output_conv[i], i, output[i]); + // exit(1); + } + } + + // We allow for a very small percentage of errors, this could be related to rounding errors + if (((float)(error_counter / output_len) * 100) < 1) + printf("SUCCESS!\r\n"); + else + printf("FAIL!\r\n"); + exit(0); +} diff --git a/apps/microtvm/gemmini/template_project/src/dwconv2d_example/dwconv2d.c b/apps/microtvm/gemmini/template_project/src/dwconv2d_example/dwconv2d.c new file mode 100644 index 000000000000..b352512e08a1 --- /dev/null +++ b/apps/microtvm/gemmini/template_project/src/dwconv2d_example/dwconv2d.c @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "assert.h" +#include "stddef.h" +#include "stdint.h" +#include "stdio.h" +#include "stdlib.h" +#ifndef BAREMETAL +#include "sys/mman.h" +#endif +#include "input.h" +#include "model/tvmgen_default.h" +#include "output.h" + +int8_t output_conv[OUTPUT_LEN]; + +int main() { + printf("Starting dw conv2d test...\r\n"); +#ifndef BAREMETAL + if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + perror("mlockall failed"); + exit(1); + } +#endif + + struct tvmgen_default_inputs inputs; + inputs.serving_default_depthwise_conv2d_input_0 = input; + struct tvmgen_default_outputs outputs; + outputs.StatefulPartitionedCall_0 = output_conv; + int error_counter = 0; + + tvmgen_default_run(&inputs, &outputs); + + // Look for errors! 
+ for (int i = 0; i < OUTPUT_LEN; i++) { + if (output_conv[i] != output[i]) { + error_counter += 1; + printf("ERROR IN DW CONV2D EXAMPLE! output_conv[%d] (%d) != output[%d] (%d)\r\n", i, + output_conv[i], i, output[i]); + // exit(1); + } + } + + // We allow for a very small percentage of errors, this could be related to rounding errors + if (((float)(error_counter / OUTPUT_LEN) * 100) < 1) + printf("SUCCESS!\r\n"); + else + printf("FAIL!\r\n"); + exit(0); +} diff --git a/apps/microtvm/gemmini/template_project/src/maxpool2d.c b/apps/microtvm/gemmini/template_project/src/maxpool2d.c new file mode 100644 index 000000000000..8f508333c492 --- /dev/null +++ b/apps/microtvm/gemmini/template_project/src/maxpool2d.c @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "assert.h" +#include "stddef.h" +#include "stdint.h" +#include "stdio.h" +#include "stdlib.h" +#ifndef BAREMETAL +#include "sys/mman.h" +#endif +#include "model/inputs.h" +#include "model/outputs.h" +#include "model/tvmgen_default.h" + +int8_t output_maxpool2d[output_len]; + +int main() { + printf("Starting max pooling 2D test...\r\n"); +#ifndef BAREMETAL + if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + perror("mlockall failed"); + exit(1); + } +#endif + + struct tvmgen_default_inputs inputs; + inputs.serving_default_x_0 = input; + struct tvmgen_default_outputs outputs; + outputs.PartitionedCall_0 = output_maxpool2d; + int error_counter = 0; + + tvmgen_default_run(&inputs, &outputs); + + // Look for errors! + for (int i = 0; i < output_len; i++) { + if (output_maxpool2d[i] != output[i]) { + error_counter += 1; + printf("ERROR IN MAX POOL 2D EXAMPLE! output_maxpool2d[%d] (%d) != output[%d] (%d)\r\n", i, + output_maxpool2d[i], i, output[i]); + // exit(1); + } + } + + // We allow for a very small percentage of errors, this could be related to rounding errors + if (((float)(error_counter / output_len) * 100) < 1) + printf("SUCCESS!\r\n"); + else + printf("FAIL!\r\n"); + exit(0); +} diff --git a/apps/microtvm/gemmini/template_project/src/maxpool2d_example/maxpool2d.c b/apps/microtvm/gemmini/template_project/src/maxpool2d_example/maxpool2d.c new file mode 100644 index 000000000000..a81bc7d3c612 --- /dev/null +++ b/apps/microtvm/gemmini/template_project/src/maxpool2d_example/maxpool2d.c @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "assert.h" +#include "stddef.h" +#include "stdint.h" +#include "stdio.h" +#include "stdlib.h" +#ifndef BAREMETAL +#include "sys/mman.h" +#endif +#include "input.h" +#include "model/tvmgen_default.h" +#include "output.h" + +int8_t output_maxpool2d[OUTPUT_LEN]; + +int main() { + printf("Starting max pooling 2D test...\r\n"); +#ifndef BAREMETAL + if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + perror("mlockall failed"); + exit(1); + } +#endif + + struct tvmgen_default_inputs inputs; + inputs.serving_default_x_0 = input; + struct tvmgen_default_outputs outputs; + outputs.PartitionedCall_0 = output_maxpool2d; + int error_counter = 0; + + tvmgen_default_run(&inputs, &outputs); + + // Look for errors! + for (int i = 0; i < OUTPUT_LEN; i++) { + if (output_maxpool2d[i] != output[i]) { + error_counter += 1; + printf("ERROR IN MAX POOL 2D EXAMPLE! output_maxpool2d[%d] (%d) != output[%d] (%d)\r\n", i, + output_maxpool2d[i], i, output[i]); + // exit(1); + } + } + + // We allow for a very small percentage of errors, this could be related to rounding errors + if (((float)(error_counter / OUTPUT_LEN) * 100) < 1) + printf("SUCCESS!\r\n"); + else + printf("FAIL!\r\n"); + exit(0); +} diff --git a/apps/microtvm/gemmini/template_project/src/mobilenet.c b/apps/microtvm/gemmini/template_project/src/mobilenet.c new file mode 100644 index 000000000000..45b606004653 --- /dev/null +++ b/apps/microtvm/gemmini/template_project/src/mobilenet.c @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "assert.h" +#include "stddef.h" +#include "stdint.h" +#include "stdio.h" +#include "stdlib.h" +#ifndef BAREMETAL +#include "sys/mman.h" +#endif +#include "model/inputs.h" +#include "model/outputs.h" +#include "model/tvmgen_default.h" + +uint8_t output_pred[1001]; + +int argmax(uint8_t* vec) { + int idx = 0; + uint8_t max_value = 0; + for (int i = 0; i < 1001; i++) { + if (vec[i] > max_value) { + idx = i; + max_value = vec[i]; + } + } + return idx; +} + +void get_top_5_labels(int* top_5, uint8_t* predicted_output) { + uint8_t prev_max_value = (uint8_t)255; + uint8_t current_max_value = 0; + int idx = 0; + for (int i = 0; i < 5; i++) { + current_max_value = 0; + idx = 0; + for (int j = 0; j < 1001; j++) { + if ((predicted_output[j] > current_max_value) && (predicted_output[j] < prev_max_value)) { + current_max_value = predicted_output[j]; + idx = j; + } + } + top_5[i] = idx; + prev_max_value = current_max_value; + } +} + +int main() { + printf("Starting MobileNet test...\r\n"); +#ifndef BAREMETAL + if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + perror("mlockall failed"); + exit(1); + } +#endif + + int top_5_labels[5]; + + struct tvmgen_default_inputs inputs; + inputs.input = input; + struct tvmgen_default_outputs outputs; + outputs.MobilenetV2_Predictions_Reshape = output_pred; + int error_counter = 0; + + tvmgen_default_run(&inputs, &outputs); + + // Look for errors! + /*for(int i = 0; i < output_len; i++) + { + if(output_pred[i] != output[i]) +{ +error_counter += 1; +printf("ERROR IN MOBILENET EXAMPLE! output_pred[%d] (%d) != output[%d] +(%d)\r\n",i,(int)output_pred[i],i,(int)output[i]); +//exit(1); +} + }*/ + + get_top_5_labels(top_5_labels, output_pred); + + printf("Real Top-5 output labels: [ "); + for (int i = 0; i < 5; i++) printf("%d ", (int)top_5_labels[i]); + printf("]\r\n"); + + printf("Expected Top-5 output labels: [ "); + for (int i = 0; i < 5; i++) printf("%d ", (int)output[i]); + printf("]\r\n"); + + /*for(int i = 0; i < 5; i++) + { + if(top_5_labels[i] != output[i]) + { + error_counter += 1; + printf("ERROR IN MOBILENET EXAMPLE! top_5_labels[%d] (%d) != output[%d] + (%d)\r\n",i,(int)top_5_labels[i],i,(int)output[i]); + //exit(1); + } + }*/ + + // printf("SUCCESS!\r\n"); + exit(0); + + // Take the argmax to get the predicted label, and the expected label + /*int predicted_label = argmax(output_pred); + int expected_label = argmax(output); + printf("Expected label = %d\r\n",expected_label); + printf("Predicted label = %d\r\n",predicted_label); + if(expected_label == predicted_label) printf("SUCCESS!\r\n"); + else printf("FAILED!\r\n"); + exit(0);*/ +} diff --git a/apps/microtvm/gemmini/template_project/src/mobilenet_example/mobilenet.c b/apps/microtvm/gemmini/template_project/src/mobilenet_example/mobilenet.c new file mode 100644 index 000000000000..70bd145da2a1 --- /dev/null +++ b/apps/microtvm/gemmini/template_project/src/mobilenet_example/mobilenet.c @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "assert.h" +#include "stddef.h" +#include "stdint.h" +#include "stdio.h" +#include "stdlib.h" +#ifndef BAREMETAL +#include "sys/mman.h" +#endif +#include "input.h" +#include "model/tvmgen_default.h" +#include "output.h" + +uint8_t output_pred[1001]; + +int argmax(uint8_t* vec) { + int idx = 0; + uint8_t max_value = 0; + for (int i = 0; i < 1001; i++) { + if (vec[i] > max_value) { + idx = i; + max_value = vec[i]; + } + } + return idx; +} + +void get_top_5_labels(int* top_5, uint8_t* predicted_output) { + uint8_t prev_max_value = (uint8_t)255; + uint8_t current_max_value = 0; + int idx = 0; + for (int i = 0; i < 5; i++) { + current_max_value = 0; + idx = 0; + for (int j = 0; j < 1001; j++) { + if ((predicted_output[j] > current_max_value) && (predicted_output[j] < prev_max_value)) { + current_max_value = predicted_output[j]; + idx = j; + } + } + top_5[i] = idx; + prev_max_value = current_max_value; + } +} + +int main() { + printf("Starting MobileNet test...\r\n"); +#ifndef BAREMETAL + if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + perror("mlockall failed"); + exit(1); + } +#endif + + int top_5_labels[5]; + + struct tvmgen_default_inputs inputs; + inputs.input = input; + struct tvmgen_default_outputs outputs; + outputs.MobilenetV2_Predictions_Reshape = output_pred; + int error_counter = 0; + + tvmgen_default_run(&inputs, &outputs); + + // Look for errors! + /*for(int i = 0; i < output_len; i++) + { + if(output_pred[i] != output[i]) +{ +error_counter += 1; +printf("ERROR IN MOBILENET EXAMPLE! output_pred[%d] (%d) != output[%d] +(%d)\r\n",i,(int)output_pred[i],i,(int)output[i]); +//exit(1); +} + }*/ + + get_top_5_labels(top_5_labels, output_pred); + + printf("Real Top-5 output labels: [ "); + for (int i = 0; i < 5; i++) printf("%d ", (int)top_5_labels[i]); + printf("]\r\n"); + + printf("Expected Top-5 output labels: [ "); + for (int i = 0; i < 5; i++) printf("%d ", (int)output[i]); + printf("]\r\n"); + + /*for(int i = 0; i < 5; i++) + { + if(top_5_labels[i] != output[i]) + { + error_counter += 1; + printf("ERROR IN MOBILENET EXAMPLE! top_5_labels[%d] (%d) != output[%d] + (%d)\r\n",i,(int)top_5_labels[i],i,(int)output[i]); + //exit(1); + } + }*/ + + // printf("SUCCESS!\r\n"); + exit(0); + + // Take the argmax to get the predicted label, and the expected label + /*int predicted_label = argmax(output_pred); + int expected_label = argmax(output); + printf("Expected label = %d\r\n",expected_label); + printf("Predicted label = %d\r\n",predicted_label); + if(expected_label == predicted_label) printf("SUCCESS!\r\n"); + else printf("FAILED!\r\n"); + exit(0);*/ +} diff --git a/apps/microtvm/zephyr/template_project/src/mlperftiny/tvmruntime.cc b/apps/microtvm/zephyr/template_project/src/mlperftiny/tvmruntime.cc new file mode 100644 index 000000000000..b16a1e711f8c --- /dev/null +++ b/apps/microtvm/zephyr/template_project/src/mlperftiny/tvmruntime.cc @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "tvmruntime.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "output_data.h" +#include "tvmgen_default.h" +#include "zephyr_uart.h" + +#ifdef CONFIG_ARCH_POSIX +#include "posix_board_if.h" +#endif + +// OUT_QUANT_SCALE and OUT_QUANT_ZERO are set in python. +#if TARGET_MODEL == 3 +float* g_output_data = output_data; +#else +int8_t* g_output_data = output_data; +float g_quant_scale = OUT_QUANT_SCALE; +int8_t g_quant_zero = OUT_QUANT_ZERO; +#endif +size_t g_output_data_len = output_data_len; + +// WORKSPACE_SIZE is defined in python +static uint8_t g_aot_memory[WORKSPACE_SIZE]; +tvm_workspace_t app_workspace; + +size_t TVMPlatformFormatMessage(char* out_buf, size_t out_buf_size_bytes, const char* fmt, + va_list args) { + return vsnprintk(out_buf, out_buf_size_bytes, fmt, args); +} + +void TVMLogf(const char* msg, ...) { + char buffer[128]; + int size; + va_list args; + va_start(args, msg); + size = TVMPlatformFormatMessage(buffer, 128, msg, args); + va_end(args); + TVMPlatformWriteSerial(buffer, (size_t)size); +} + +void __attribute__((noreturn)) TVMPlatformAbort(tvm_crt_error_t error) { + TVMLogf("TVMPlatformAbort: %08x\n", error); + sys_reboot(SYS_REBOOT_COLD); + for (;;) + ; +} + +tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) { + return StackMemoryManager_Allocate(&app_workspace, num_bytes, out_ptr); +} + +tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) { + return StackMemoryManager_Free(&app_workspace, ptr); +} + +void timer_expiry_function(struct k_timer* timer_id) { return; } + +#ifdef __cplusplus +extern "C" { +#endif +void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t nbytes, int dtype_code_hint, + int dtype_bits_hint) { + tvm_crt_error_t err = kTvmErrorNoError; + void* ptr = 0; + DLDevice dev = {(DLDeviceType)device_type, device_id}; + assert(nbytes > 0); + err = TVMPlatformMemoryAllocate(nbytes, dev, &ptr); + CHECK_EQ(err, kTvmErrorNoError, + "TVMBackendAllocWorkspace(%d, %d, %" PRIu64 ", %d, %d) -> %" PRId32, device_type, + device_id, nbytes, dtype_code_hint, dtype_bits_hint, err); + return ptr; +} + +int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) { + tvm_crt_error_t err = kTvmErrorNoError; + DLDevice dev = {(DLDeviceType)device_type, device_id}; + err = TVMPlatformMemoryFree(ptr, dev); + CHECK_EQ(err, kTvmErrorNoError, "TVMBackendFreeWorkspace(%d, %d)", device_type, device_id); + return err; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +void TVMRuntimeInit() { StackMemoryManager_Init(&app_workspace, g_aot_memory, WORKSPACE_SIZE); } + +void TVMInfer(void* input_ptr) { + struct tvmgen_default_inputs inputs = { +#if TARGET_MODEL == MODEL_KWS + .input_1 = input_ptr, +#elif TARGET_MODEL == 
MODEL_IC
+      .input_1_int8 = input_ptr,
+#elif TARGET_MODEL == MODEL_VWW
+      .input_1_int8 = input_ptr,
+#elif TARGET_MODEL == MODEL_AD
+      .input_1 = input_ptr,
+#else
+#error Wrong model.
+#endif
+  };
+
+  struct tvmgen_default_outputs outputs = {
+#if TARGET_MODEL == MODEL_KWS
+#if COMPILE_WITH_CMSISNN
+      .Identity = output_data,
+#else
+      .output = output_data,
+#endif
+#elif TARGET_MODEL == MODEL_IC
+      .Identity_int8 = output_data,
+#elif TARGET_MODEL == MODEL_VWW
+      .Identity_int8 = output_data,
+#elif TARGET_MODEL == MODEL_AD
+      .Identity = output_data,
+#endif
+  };
+
+  int ret_val = tvmgen_default_run(&inputs, &outputs);
+  if (ret_val != 0) {
+    TVMLogf("Error: %d\n", ret_val);
+  }
+}
+
+int8_t QuantizeFloatToInt8(float value, float scale, int zero_point) {
+  int32_t result = round(value / scale) + zero_point;
+  if (result < INT8_MIN) {
+    result = INT8_MIN;
+  }
+  if (result > INT8_MAX) {
+    result = INT8_MAX;
+  }
+  return (int8_t)(result);
+}
diff --git a/apps/microtvm/zephyr/template_project/src/mlperftiny/tvmruntime.h b/apps/microtvm/zephyr/template_project/src/mlperftiny/tvmruntime.h
new file mode 100644
index 000000000000..940d64634d59
--- /dev/null
+++ b/apps/microtvm/zephyr/template_project/src/mlperftiny/tvmruntime.h
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef APPS_MICROTVM_ZEPHYR_TEMPLATE_PROJECT_SRC_MLPERFTINY_TVMRUNTIME_H_
+#define APPS_MICROTVM_ZEPHYR_TEMPLATE_PROJECT_SRC_MLPERFTINY_TVMRUNTIME_H_
+
+#include
+#include
+#include
+
+#define MODEL_KWS 1
+#define MODEL_VWW 2
+#define MODEL_AD 3
+#define MODEL_IC 4
+
+extern const unsigned char g_wakeup_sequence[];
+extern size_t g_output_data_len;
+
+#if TARGET_MODEL == 3
+extern float* g_output_data;
+#else
+extern int8_t* g_output_data;
+#endif
+
+extern float g_quant_scale;
+extern int8_t g_quant_zero;
+
+/*!
+ * \brief Initialize TVM runtime.
+ */
+void TVMRuntimeInit();
+
+/*!
+ * \brief Run TVM inference.
+ */
+void TVMInfer(void* input_ptr);
+
+/*!
+ * \brief Quantize float to int8.
+ * \param value Input data in float.
+ * \param scale Quantization scale factor.
+ * \param zero_point Quantization zero point.
+ */
+int8_t QuantizeFloatToInt8(float value, float scale, int zero_point);
+
+#endif /* APPS_MICROTVM_ZEPHYR_TEMPLATE_PROJECT_SRC_MLPERFTINY_TVMRUNTIME_H_ */
diff --git a/apps/microtvm/zephyr/template_project/src/mlperftiny/zephyr_uart.cc b/apps/microtvm/zephyr/template_project/src/mlperftiny/zephyr_uart.cc
new file mode 100644
index 000000000000..9880eadd4d9b
--- /dev/null
+++ b/apps/microtvm/zephyr/template_project/src/mlperftiny/zephyr_uart.cc
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "zephyr_uart.h" + +#include +#include +#include + +#include "crt_config.h" + +static const struct device* g_microtvm_uart; + +static uint8_t uart_data[8]; + +// UART interrupt callback. +void uart_irq_cb(const struct device* dev, void* user_data) { + while (uart_irq_update(dev) && uart_irq_is_pending(dev)) { + struct ring_buf* rbuf = (struct ring_buf*)user_data; + if (uart_irq_rx_ready(dev) != 0) { + for (;;) { + // Read a small chunk of data from the UART. + int bytes_read = uart_fifo_read(dev, uart_data, sizeof(uart_data)); + if (bytes_read < 0) { + TVMPlatformAbort((tvm_crt_error_t)(0xbeef1)); + } else if (bytes_read == 0) { + break; + } + // Write it into the ring buffer. + int bytes_written = ring_buf_put(rbuf, uart_data, bytes_read); + if (bytes_read != bytes_written) { + TVMPlatformAbort((tvm_crt_error_t)(0xbeef2)); + } + } + } + } +} + +// Initialize the UART receiver. +void uart_rx_init(struct ring_buf* rbuf, const struct device* dev) { + uart_irq_callback_user_data_set(dev, uart_irq_cb, (void*)rbuf); + uart_irq_rx_enable(dev); +} + +// UART read. +char TVMPlatformUartRxRead() { + unsigned char c; + int ret = -1; + while (ret != 0) { + ret = uart_poll_in(g_microtvm_uart, &c); + } + return (char)c; +} + +// UART write. +uint32_t TVMPlatformWriteSerial(const char* data, uint32_t size) { + for (uint32_t i = 0; i < size; i++) { + uart_poll_out(g_microtvm_uart, data[i]); + } + return size; +} + +// Initialize UART. +void TVMPlatformUARTInit(uint32_t baudrate /* = TVM_UART_DEFAULT_BAUDRATE */) { + // Claim console device. + g_microtvm_uart = device_get_binding(DT_LABEL(DT_CHOSEN(zephyr_console))); + const struct uart_config config = {.baudrate = baudrate, + .parity = UART_CFG_PARITY_NONE, + .stop_bits = UART_CFG_STOP_BITS_1, + .data_bits = UART_CFG_DATA_BITS_8, + .flow_ctrl = UART_CFG_FLOW_CTRL_NONE}; + uart_configure(g_microtvm_uart, &config); +} diff --git a/apps/microtvm/zephyr/template_project/src/mlperftiny/zephyr_uart.h b/apps/microtvm/zephyr/template_project/src/mlperftiny/zephyr_uart.h new file mode 100644 index 000000000000..f10cf0262224 --- /dev/null +++ b/apps/microtvm/zephyr/template_project/src/mlperftiny/zephyr_uart.h @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef APPS_MICROTVM_ZEPHYR_TEMPLATE_PROJECT_SRC_MLPERFTINY_ZEPHYR_UART_H_ +#define APPS_MICROTVM_ZEPHYR_TEMPLATE_PROJECT_SRC_MLPERFTINY_ZEPHYR_UART_H_ + +#include + +#define TVM_UART_DEFAULT_BAUDRATE 115200 + +/*! + * \brief Read one character from the UART receive buffer, blocking until one is available. + * + * \return The character read. + */ +char TVMPlatformUartRxRead(); + +/*! + * \brief Write data to the serial port. + * \param data Pointer to data to write. + * \param size Size of data in bytes. + * + * \return Number of bytes written. + */ +uint32_t TVMPlatformWriteSerial(const char* data, uint32_t size); + +/*! + * \brief Initialize the UART. + * \param baudrate Desired UART baudrate. + */ +void TVMPlatformUARTInit(uint32_t baudrate = TVM_UART_DEFAULT_BAUDRATE); + +#endif /* APPS_MICROTVM_ZEPHYR_TEMPLATE_PROJECT_SRC_MLPERFTINY_ZEPHYR_UART_H_ */ diff --git a/cmake/config.cmake b/cmake/config.cmake index 8a7a0f1fdd29..723cd1f62bb4 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -296,6 +296,9 @@ set(USE_ANTLR OFF) # Whether use Relay debug mode set(USE_RELAY_DEBUG OFF) +# Whether to build the microTVM Gemmini integration +set(USE_GEMMINI OFF) + # Whether to build fast VTA simulator driver set(USE_VTA_FSIM OFF) diff --git a/cmake/modules/CRT.cmake b/cmake/modules/CRT.cmake new file mode 100644 index 000000000000..518a613dc102 --- /dev/null +++ b/cmake/modules/CRT.cmake @@ -0,0 +1,74 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more contributor +# license agreements. See the NOTICE file distributed with this work for +# additional information regarding copyright ownership. The ASF licenses this +# file to you under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. 
+ +if(USE_MICRO) + message(STATUS "Add CRT template project for microTVM") + + function(microtvm_add_crt) + list( + APPEND + CRT_TEMPLATE_FILE_COPY_JOBS + "src/runtime/crt/host microtvm_api_server.py -> crt" + "src/runtime/crt/host Makefile.template -> crt" + "src/runtime/crt crt_config-template.h -> crt" + "src/runtime/crt/host main.cc -> crt/src" + ) + + foreach(job_spec IN LISTS CRT_TEMPLATE_FILE_COPY_JOBS) + string(REPLACE " " ";" job_spec "${job_spec}") + list(LENGTH job_spec job_spec_length) + math(EXPR job_spec_length_mod "${job_spec_length} % 3") + if(NOT "${job_spec_length_mod}" EQUAL 1) + message( + FATAL_ERROR + "CRT copy job spec list length is ${job_spec_length}; parsed job spec is ${job_spec}" + ) + endif() + math(EXPR job_spec_stop "${job_spec_length} - 3") + + list(GET job_spec 0 job_src_base) + set(job_src_base "${CMAKE_CURRENT_SOURCE_DIR}/${job_src_base}") + foreach(copy_pattern_index RANGE 1 "${job_spec_stop}" 3) + list(GET job_spec ${copy_pattern_index} copy_pattern) + math(EXPR copy_dest_index "${copy_pattern_index} + 2") + list(GET job_spec ${copy_dest_index} copy_dest) + + file( + GLOB_RECURSE copy_files + RELATIVE "${job_src_base}" + "${job_src_base}/${copy_pattern}") + list(LENGTH copy_files copy_files_length) + if("${copy_files_length}" EQUAL 0) + message( + FATAL_ERROR + "CRT copy job matched 0 files: ${job_src_base}/${copy_pattern} -> ${copy_dest}" + ) + endif() + foreach(copy_src IN LISTS copy_files) + get_filename_component( + dest_path "${MICROTVM_TEMPLATE_PROJECTS}/${copy_dest}/${copy_src}" + ABSOLUTE) + tvm_micro_add_copy_file(crt_template_deps + ${job_src_base}/${copy_src} ${dest_path}) + endforeach() + endforeach() + endforeach() + + add_custom_target(crt DEPENDS ${crt_template_deps}) + endfunction() + + microtvm_add_crt() + +endif(USE_MICRO) diff --git a/cmake/modules/contrib/Gemmini.cmake b/cmake/modules/contrib/Gemmini.cmake new file mode 100644 index 000000000000..aaac04ec7ba1 --- /dev/null +++ b/cmake/modules/contrib/Gemmini.cmake @@ -0,0 +1,134 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +if(USE_GEMMINI) + message(STATUS "Add Gemmini for microTVM") + + function(microtvm_add_gemmini) + list( + APPEND + GEMMINI_FILE_COPY_JOBS + "apps/microtvm/gemmini/template_project microtvm_api_server.py -> gemmini" + + # Dense example project generation + "apps/microtvm/gemmini/template_project/src/dense_example dense.c -> gemmini/src/dense_example" + "apps/microtvm/gemmini/template_project/src Makefile -> gemmini/src/dense_example" + "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/dense_example" + "apps/microtvm/gemmini/template_project/src Makefrag.mk -> gemmini/src/dense_example" + "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/dense_example" + "3rdparty/gemmini/software/gemmini-rocc-tests configure.ac -> gemmini/src/dense_example" + "3rdparty/gemmini/software/gemmini-rocc-tests/include *.h -> gemmini/src/dense_example/include" + "3rdparty/gemmini/software/gemmini-rocc-tests/rocc-software/src *.h -> gemmini/src/dense_example/rocc-software/src" + + # CONV2D example project generation + "apps/microtvm/gemmini/template_project/src/conv2d_example conv2d.c -> gemmini/src/conv2d_example" + "apps/microtvm/gemmini/template_project/src Makefile -> gemmini/src/conv2d_example" + "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/conv2d_example" + "apps/microtvm/gemmini/template_project/src Makefrag.mk -> gemmini/src/conv2d_example" + "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/conv2d_example" + "3rdparty/gemmini/software/gemmini-rocc-tests configure.ac -> gemmini/src/conv2d_example" + "3rdparty/gemmini/software/gemmini-rocc-tests/include *.h -> gemmini/src/conv2d_example/include" + "3rdparty/gemmini/software/gemmini-rocc-tests/rocc-software/src *.h -> gemmini/src/conv2d_example/rocc-software/src" + + # DW CONV2D example project generation + "apps/microtvm/gemmini/template_project/src/dwconv2d_example dwconv2d.c -> gemmini/src/dwconv2d_example" + "apps/microtvm/gemmini/template_project/src Makefile -> gemmini/src/dwconv2d_example" + "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/dwconv2d_example" + "apps/microtvm/gemmini/template_project/src Makefrag.mk -> gemmini/src/dwconv2d_example" + "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/dwconv2d_example" + "3rdparty/gemmini/software/gemmini-rocc-tests configure.ac -> gemmini/src/dwconv2d_example" + "3rdparty/gemmini/software/gemmini-rocc-tests/include *.h -> gemmini/src/dwconv2d_example/include" + "3rdparty/gemmini/software/gemmini-rocc-tests/rocc-software/src *.h -> gemmini/src/dwconv2d_example/rocc-software/src" + + # ADD example project generation + "apps/microtvm/gemmini/template_project/src/add_example add.c -> gemmini/src/add_example" + "apps/microtvm/gemmini/template_project/src Makefile -> gemmini/src/add_example" + "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/add_example" + "apps/microtvm/gemmini/template_project/src Makefrag.mk -> gemmini/src/add_example" + "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/add_example" + "3rdparty/gemmini/software/gemmini-rocc-tests configure.ac -> gemmini/src/add_example" + "3rdparty/gemmini/software/gemmini-rocc-tests/include *.h -> gemmini/src/add_example/include" + "3rdparty/gemmini/software/gemmini-rocc-tests/rocc-software/src *.h -> gemmini/src/add_example/rocc-software/src" + + # Max pooling 2d example project generation + "apps/microtvm/gemmini/template_project/src/maxpool2d_example maxpool2d.c -> 
gemmini/src/maxpool2d_example" + "apps/microtvm/gemmini/template_project/src Makefile -> gemmini/src/maxpool2d_example" + "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/maxpool2d_example" + "apps/microtvm/gemmini/template_project/src Makefrag.mk -> gemmini/src/maxpool2d_example" + "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/maxpool2d_example" + "3rdparty/gemmini/software/gemmini-rocc-tests configure.ac -> gemmini/src/maxpool2d_example" + "3rdparty/gemmini/software/gemmini-rocc-tests/include *.h -> gemmini/src/maxpool2d_example/include" + "3rdparty/gemmini/software/gemmini-rocc-tests/rocc-software/src *.h -> gemmini/src/maxpool2d_example/rocc-software/src" + + # Mobilenet example project generation + "apps/microtvm/gemmini/template_project/src/mobilenet_example mobilenet.c -> gemmini/src/mobilenet_example" + "apps/microtvm/gemmini/template_project/src Makefile -> gemmini/src/mobilenet_example" + "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/mobilenet_example" + "apps/microtvm/gemmini/template_project/src Makefrag.mk -> gemmini/src/mobilenet_example" + "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/mobilenet_example" + "3rdparty/gemmini/software/gemmini-rocc-tests configure.ac -> gemmini/src/mobilenet_example" + "3rdparty/gemmini/software/gemmini-rocc-tests/include *.h -> gemmini/src/mobilenet_example/include" + "3rdparty/gemmini/software/gemmini-rocc-tests/rocc-software/src *.h -> gemmini/src/mobilenet_example/rocc-software/src" + ) + + foreach(job_spec IN LISTS GEMMINI_FILE_COPY_JOBS) + string(REPLACE " " ";" job_spec "${job_spec}") + list(LENGTH job_spec job_spec_length) + math(EXPR job_spec_length_mod "${job_spec_length} % 3") + if(NOT "${job_spec_length_mod}" EQUAL 1) + message( + FATAL_ERROR + "Gemmini copy job spec list length is ${job_spec_length}; parsed job spec is ${job_spec}" + ) + endif() + math(EXPR job_spec_stop "${job_spec_length} - 3") + + list(GET job_spec 0 job_src_base) + set(job_src_base "${CMAKE_SOURCE_DIR}/${job_src_base}") + foreach(copy_pattern_index RANGE 1 "${job_spec_stop}" 3) + list(GET job_spec ${copy_pattern_index} copy_pattern) + math(EXPR copy_dest_index "${copy_pattern_index} + 2") + list(GET job_spec ${copy_dest_index} copy_dest) + + file( + GLOB_RECURSE copy_files + RELATIVE "${job_src_base}" + "${job_src_base}/${copy_pattern}") + list(LENGTH copy_files copy_files_length) + if("${copy_files_length}" EQUAL 0) + message( + FATAL_ERROR + "Gemmini copy job matched 0 files: ${job_src_base}/${copy_pattern} -> ${copy_dest}" + ) + endif() + foreach(copy_src IN LISTS copy_files) + get_filename_component( + dest_path "${MICROTVM_TEMPLATE_PROJECTS}/${copy_dest}/${copy_src}" + ABSOLUTE) + tvm_micro_add_copy_file(gemmini_template_deps + ${job_src_base}/${copy_src} ${dest_path}) + endforeach() + endforeach() + endforeach() + + add_custom_target(gemmini DEPENDS ${gemmini_template_deps}) + endfunction() + + microtvm_add_gemmini() + generate_crt_config(gemmini "${CMAKE_CURRENT_BINARY_DIR}/microtvm_template_projects/gemmini/crt_config/crt_config.h") + +endif(USE_MICRO) diff --git a/cmake/utils/CRTConfig.cmake b/cmake/utils/CRTConfig.cmake index 42c523b08786..1d767cb72c13 100644 --- a/cmake/utils/CRTConfig.cmake +++ b/cmake/utils/CRTConfig.cmake @@ -30,6 +30,8 @@ function(generate_crt_config platform output_path) set(TVM_CRT_MAX_PACKET_SIZE_BYTES 512) elseif("${platform}" STREQUAL "arduino") set(TVM_CRT_MAX_PACKET_SIZE_BYTES 8*1024) + elseif("${platform}" STREQUAL "gemmini") + 
set(TVM_CRT_MAX_PACKET_SIZE_BYTES 8*1024) endif() configure_file("${CMAKE_CURRENT_SOURCE_DIR}/src/runtime/crt/crt_config.h.template" "${output_path}") endfunction() diff --git a/gallery/tutorial/micro_gemmini_add.py b/gallery/tutorial/micro_gemmini_add.py new file mode 100644 index 000000000000..2b1e85dbdeb8 --- /dev/null +++ b/gallery/tutorial/micro_gemmini_add.py @@ -0,0 +1,233 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Running TVM on the Gemmini accelerator - A single add layer example +====================================================================================== +**Author**: +`Federico Peccia `_ + +This tutorial shows how a quantized add layer can be compiled for execution on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension. + +Note: This is an **experimental** layer! +""" + +import tensorflow as tf +from tensorflow.keras import layers +import tarfile +import tempfile +import pathlib +import numpy as np +import os +import tvm.contrib.gemmini as gemmini +from tvm import relay +import tvm +from tvm.micro.testing.utils import create_header_file + +################################## +# Pre-requisites +# -------------------------------- +# +# After the installation of the Chipyard development tools, you should have an env.sh file in your Chipyard home directory. This file needs to be sourced before running this tutorial: +# +# .. code-block:: bash +# +# source /env.sh +# +# WARNING: if you have installed TVM in a virtual environment, FIRST activate the Chipyard environment, and THEN activate the TVM environment. + +################################## +# Baseline generation +# -------------------------------- +# +# In this section, we will generate the baseline input and expected output, which we will use to compare against the actual output obtained after running on the Gemmini accelerator. + +# We first define the parameters of the layer we want to test. In this case: +input_height = 16 +input_width = 16 +input_channels = 16 +activation = 0 + +# We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input. 
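# As an optional sanity check (an editorial sketch, not part of the original tutorial),
# the hypothetical helper below can be called on the converted model further down
# (after the converter.convert() step) to confirm that every input and output really
# uses an 8-bit integer type, which is what those converter flags are meant to
# guarantee. It only relies on the standard tf.lite.Interpreter API and on the
# tf/np imports above.
def model_is_fully_quantized(tflite_model_bytes):
    # Collect the dtype of every model input and output and check they are all 8-bit.
    interp = tf.lite.Interpreter(model_content=tflite_model_bytes)
    details = interp.get_input_details() + interp.get_output_details()
    return all(d["dtype"] in (np.int8, np.uint8) for d in details)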
+class Model(tf.Module): + def __init__(self, name=None): + super().__init__(name) + + @tf.function( + input_signature=[ + tf.TensorSpec( + shape=[1, input_height, input_width, input_channels], + dtype=tf.float32, + ), + tf.TensorSpec( + shape=[1, input_height, input_width, input_channels], + dtype=tf.float32, + ), + ] + ) + def add(self, x, y): + if activation == 0: + return x + y + else: + return layers.Activation("relu")(x + y) + + +model = Model() + +# Convert the concrete functions using TFLiteConverter +converter = tf.lite.TFLiteConverter.from_keras_model(model) + + +def representative_data_gen(): + dataset = [ + ( + np.array( + np.random.randint(-127, 128, size=(1, input_height, input_width, input_channels)), + dtype=np.float32, + ), + np.array( + np.random.randint(0, 128, size=(1, input_height, input_width, input_channels)), + dtype=np.float32, + ), + ) + for s in range(100) + ] + for input_value in dataset: + yield [input_value[0], input_value[1]] + + +converter.optimizations = [tf.lite.Optimize.DEFAULT] +converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] +converter.inference_input_type = tf.uint8 +converter.inference_output_type = tf.int8 +converter.representative_dataset = representative_data_gen +converter._experimental_disable_per_channel = True + +tflite_model = converter.convert() + +# Save the model. +tmpdir = tvm.contrib.utils.tempdir() +tflite_file = tmpdir / "add.tflite" +with open(tflite_file, "wb") as f: + f.write(tflite_model) + +# Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator. + +os.system("rm -rf generated-project/") +tflite_model_buf = open(tflite_file, "rb").read() +input_tensor = "layer1_input" +input_dtype = "uint8" + +try: + import tflite + + tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0) +except AttributeError: + import tflite.Model + + tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) + +# Load the TFLite model and allocate tensors. +interpreter = tf.lite.Interpreter( + model_path=str(tflite_file), experimental_preserve_all_tensors=True +) +interpreter.allocate_tensors() +input_details = interpreter.get_input_details() +output_details = interpreter.get_output_details() +tensor_details = interpreter.get_tensor_details() + +input_matrix_1 = np.random.randint( + 0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8 +) +input_matrix_2 = np.random.randint( + 0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8 +) + +interpreter.set_tensor(input_details[0]["index"], input_matrix_1) +interpreter.set_tensor(input_details[1]["index"], input_matrix_2) + +interpreter.invoke() +expected_output = interpreter.get_tensor(output_details[0]["index"]) + +################################## +# Compiling the model with TVM +# -------------------------------- +# +# In this section, we will compile the model using TVM and the Gemmini integration. + +# The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters. +gemmini.Environment.init_overwrite( + dim=16, acc_rows=1024, bank_rows=4096, use_experimental_qnn_add=True +) + +# The TFLite model generated in the previous steps is now imported into TVM. 
+ +mod, params = relay.frontend.from_tflite( + tflite_model, + shape_dict={ + "serving_default_x": (1, input_height, input_width, input_channels), + "serving_default_y": (1, input_height, input_width, input_channels), + }, + dtype_dict={"serving_default_x": input_dtype, "serving_default_y": input_dtype}, +) +mod = relay.transform.InferType()(mod) +mod["main"] + +# In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the gemmini.preprocess pass. Notice the changes in the "main" function after running the preprocess pass. + +mod = gemmini.preprocess_pass(mod) +mod["main"] + +# Now, we build the Relay Graph. Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs. +# The gemmini.build_config function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator. + +RUNTIME = tvm.relay.backend.Runtime("crt", {"system-lib": False}) +TARGET = tvm.target.target.Target({"kind": "c", "device": "gemmini"}) +EXECUTOR = tvm.relay.backend.Executor("aot", options={"interface-api": "c", "unpacked-api": 1}) + +with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]): + module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params) + +################################################# +# Exporting and testing the model using microTVM +# ----------------------------------------------- +# +# In this section, we will export the model using one of the provided example microTVM projects, we will compile it using the Chipyard tool, and then test the generated baremetal code on the Spike simulator. + +tmpdir = tvm.contrib.utils.tempdir() +model_library_format_tar_path = tvm.micro.export_model_library_format(module, tmpdir / "model.tar") +with tempfile.NamedTemporaryFile() as tar_temp_file: + with tarfile.open(tar_temp_file.name, "w:gz") as tar_file: + # Here, we create headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one. + create_header_file("input_1", input_matrix_1, "include/tvm", tar_file) + create_header_file("input_2", input_matrix_2, "include/tvm", tar_file) + create_header_file("output", expected_output, "include/tvm", tar_file) + + # Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects. + template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini")) + project_options = {"project_type": "add_example", "extra_files_tar": tar_temp_file.name} + + generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project") + generated_project = tvm.micro.generate_project( + template_project_path, module, generated_project_dir, project_options + ) + +# We build the project. This will generate an executable we can run on the Spike simulator. +generated_project.build() + +# Finally, we execute the compiled baremetal project on the Spike simulator. +# Note: if there are errors, these can be related to rounding errors. 
+generated_project.flash() diff --git a/gallery/tutorial/micro_gemmini_conv2d.py b/gallery/tutorial/micro_gemmini_conv2d.py new file mode 100644 index 000000000000..cc96a4a6a1bf --- /dev/null +++ b/gallery/tutorial/micro_gemmini_conv2d.py @@ -0,0 +1,217 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Running TVM on the Gemmini accelerator - A single 2d convolutional layer example +====================================================================================== +**Author**: +`Federico Peccia `_ + +This tutorials shows how a quantized 2d convolution layer can be compiled to be executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension. + +""" + +import tensorflow as tf +from tensorflow import keras +import tarfile +import tempfile +import pathlib +from tensorflow.keras import layers +import numpy as np +import os +import tvm.contrib.gemmini as gemmini +from tvm import relay +import tvm +from tvm.micro.testing.utils import create_header_file + +################################## +# Pre-requisites +# -------------------------------- +# +# After the installation of the Chipyard development tools, you should have an env.sh file in your Chipyard home directory. This file needs to be sourced before running this tutorial: +# +# .. code-block:: bash +# +# source /env.sh +# +# WARNING: if you have installed TVM in a virtual environment, FIRST activate the Chipyard environment, and THEN activate the tvm entironment. + +################################## +# Baseline generation +# -------------------------------- +# +# In this section, we will generate the baseline input and expected output, which we are going to use to compare with the actual obtained output after running on the Gemmini accelerator. + +# Then we define the parameters of the layer we want to test. In this case: +input_height = 16 +input_width = 16 +input_channels = 16 +output_channels = 16 +kernel_size = 3 +stride = 1 +padding = "valid" +activation = None +bias = True + +# We can add a max pooling layer after the convolution. This can be merged by the integration and can be executed together with the convolution on the Gemmini accelerator. +pool_size = 1 +pool_stride = 1 +pool_padding = "valid" +use_pool = False + +# We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input. 
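# As a quick reference (an editorial sketch, not part of the original tutorial), the
# expected spatial size of the convolution output can be derived from the parameters
# above: with "valid" padding it is (input - kernel) // stride + 1, i.e.
# (16 - 3) // 1 + 1 = 14 for the defaults used here. The helper name below is ours.
def conv2d_out_size(in_size, kernel, stride, pad):
    if pad == "valid":
        return (in_size - kernel) // stride + 1
    # "same" padding keeps ceil(in_size / stride) elements per dimension.
    return -(-in_size // stride)


print(
    "Expected conv2d output shape:",
    (
        1,
        conv2d_out_size(input_height, kernel_size, stride, padding),
        conv2d_out_size(input_width, kernel_size, stride, padding),
        output_channels,
    ),
)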
+ +layer_sequence = [ + layers.Conv2D( + output_channels, + kernel_size=kernel_size, + padding=padding, + activation=activation, + use_bias=True, + bias_initializer="ones", + input_shape=(input_height, input_width, input_channels), + strides=stride, + ) +] +if use_pool: + layer_sequence.append( + layers.MaxPool2D(pool_size=pool_size, strides=pool_stride, padding=pool_padding) + ) + +model = keras.Sequential(layer_sequence) + +# Convert the concrete functions using TFLiteConverter +converter = tf.lite.TFLiteConverter.from_keras_model(model) + + +def representative_data_gen(): + dataset = [ + np.array( + np.random.randint(0, 10, size=(100, input_height, input_width, input_channels)), + dtype=np.float32, + ) + for s in range(10) + ] + for input_value in dataset: + # Model has only one input so each data point has one element.s + yield [input_value] + + +converter.optimizations = [tf.lite.Optimize.DEFAULT] +converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] +converter.inference_input_type = tf.uint8 +converter.inference_output_type = tf.int8 +converter.representative_dataset = representative_data_gen +converter._experimental_disable_per_channel = True + +tflite_model = converter.convert() + +# Save the model. +tmpdir = tvm.contrib.utils.tempdir() +tflite_file = tmpdir / "conv.tflite" +with open(tflite_file, "wb") as f: + f.write(tflite_model) + +# Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator. + +os.system("rm -rf generated-project/") + +tflite_model_buf = open(tflite_file, "rb").read() +input_tensor = "layer1_input" +input_dtype = "uint8" + +# os.system("mkdir -p include") + +try: + import tflite + + tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0) +except AttributeError: + import tflite.Model + + tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) + +# Load the TFLite model and allocate tensors. +interpreter = tf.lite.Interpreter(model_path=str(tflite_file)) +interpreter.allocate_tensors() +input_details = interpreter.get_input_details() +output_details = interpreter.get_output_details() +input_matrix = np.random.randint( + 0, 127, (1, input_height, input_width, input_channels), dtype=np.uint8 +) +interpreter.set_tensor(input_details[0]["index"], input_matrix) +interpreter.invoke() +expected_output = interpreter.get_tensor(output_details[0]["index"]) + +################################## +# Compiling the model with TVM +# -------------------------------- +# +# In this section, we will compile the model using TVM and the Gemmini integration. + +# The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters. +gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096) + +# The TFLite model generated in the previous steps is now imported into TVM. +mod, params = relay.frontend.from_tflite( + tflite_model, + shape_dict={input_tensor: (input_height, input_width, input_channels)}, + dtype_dict={input_tensor: input_dtype}, +) +mod["main"] + +# In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the gemmini.preprocess pass. Notice the changes in the "main" function after running the preprocess pass. 
+mod = gemmini.preprocess_pass(mod) +mod["main"] + +# Now, we build the Relay Graph. Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs. +# The gemmini.build_config function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator. +RUNTIME = tvm.relay.backend.Runtime("crt", {"system-lib": False}) +TARGET = tvm.target.target.Target({"kind": "c", "device": "gemmini"}) +EXECUTOR = tvm.relay.backend.Executor("aot", options={"interface-api": "c", "unpacked-api": 1}) + +with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]): + module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params) + +################################################# +# Exporting and testing the model using microTVM +# ----------------------------------------------- +# +# In this section, we will export the model using one of the provided example microTVM projects, we will compile it using the Chipyard tool, and then test the generated baremetal code on the Spike simulator. + +tmpdir = tvm.contrib.utils.tempdir() +model_library_format_tar_path = tvm.micro.export_model_library_format(module, tmpdir / "model.tar") +with tempfile.NamedTemporaryFile() as tar_temp_file: + with tarfile.open(tar_temp_file.name, "w:gz") as tar_file: + # Here, we create headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one. + create_header_file("input", input_matrix, "include/tvm", tar_file) + create_header_file("output", expected_output, "include/tvm", tar_file) + + # Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects. + template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini")) + project_options = {"project_type": "conv2d_example", "extra_files_tar": tar_temp_file.name} + + generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project") + generated_project = tvm.micro.generate_project( + template_project_path, module, generated_project_dir, project_options + ) + +# We build the project. This will generate an executable we can run on the Spike simulator. +generated_project.build() + +# Finally, we execute the compiled baremetal project on the Spike simulator. +# Note: if there are errors, these can be related to rounding errors. +generated_project.flash() diff --git a/gallery/tutorial/micro_gemmini_dense.py b/gallery/tutorial/micro_gemmini_dense.py new file mode 100644 index 000000000000..c4fb5c82d01b --- /dev/null +++ b/gallery/tutorial/micro_gemmini_dense.py @@ -0,0 +1,210 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Running TVM on the Gemmini accelerator - A single dense layer example +====================================================================================== +**Author**: +`Federico Peccia `_ + +This tutorials shows how a quantized dense (fully connected) layer can be compiled to be executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension. + +""" + +import tensorflow as tf +import numpy as np +import tarfile +import tempfile +import pathlib +import os +import tvm.contrib.gemmini as gemmini +from tvm import relay +import tvm +from tvm.micro.testing.utils import create_header_file + +################################## +# Pre-requisites +# -------------------------------- +# +# After the installation of the Chipyard development tools, you should have an env.sh file in your Chipyard home directory. This file needs to be sourced before running this tutorial: +# +# .. code-block:: bash +# +# source /env.sh +# +# WARNING: if you have installed TVM in a virtual environment, FIRST activate the Chipyard environment, and THEN activate the tvm entironment. + +################################## +# Baseline generation +# -------------------------------- +# +# In this section, we will generate the baseline input and expected output, which we are going to use to compare with the actual obtained output after running on the Gemmini accelerator. + +# Then we define the parameters of the layer we want to test. In this case: +input_height = 32 +input_width = 32 +output_width = 32 + +# We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input. +class Model(tf.Module): + def __init__(self, name=None): + super().__init__(name) + self.w = tf.Variable(tf.random.normal([input_width, output_width]), name="w") + self.b = tf.Variable(tf.random.normal([output_width]), name="b") + + @tf.function( + input_signature=[ + tf.TensorSpec(shape=[input_height, input_width], dtype=tf.float32), + ] + ) + def matmul(self, x): + return tf.linalg.matmul(x, self.w, transpose_b=False) + self.b + + +model = Model() + +# Convert the concrete functions using TFLiteConverter +converter = tf.lite.TFLiteConverter.from_keras_model(model) + + +def representative_data_gen(): + dataset = [ + ( + np.array( + np.random.randint(-127, 128, size=(input_height, input_width)), dtype=np.float32 + ), + np.array( + np.random.randint(-127, 128, size=(input_width, output_width)), dtype=np.float32 + ), + ) + for s in range(100) + ] + for input_value in dataset: + # Model has only one input so each data point has one element. 
+ yield [input_value[0]] + + +converter.optimizations = [tf.lite.Optimize.DEFAULT] +converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] +converter.inference_input_type = tf.uint8 +converter.inference_output_type = tf.int8 +converter.representative_dataset = representative_data_gen +converter._experimental_disable_per_channel = True + +tflite_model = converter.convert() + +# Save the model. +tmpdir = tvm.contrib.utils.tempdir() +tflite_file = tmpdir / "matmul.tflite" +with open(tflite_file, "wb") as f: + f.write(tflite_model) + +# Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator. +os.system("rm -rf generated-project/") + +tflite_model_buf = open(tflite_file, "rb").read() +input_tensor = "layer1_input" +input_dtype = "uint8" + +os.system("mkdir -p include") + +try: + import tflite + + tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0) +except AttributeError: + import tflite.Model + + tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) + +# Load the TFLite model and allocate tensors. +interpreter = tf.lite.Interpreter( + model_path=str(tflite_file), experimental_preserve_all_tensors=True +) +interpreter.allocate_tensors() +input_details = interpreter.get_input_details() +output_details = interpreter.get_output_details() +tensor_details = interpreter.get_tensor_details() + +input1 = np.random.randint(0, 255, (input_height, input_width), dtype=np.uint8) +interpreter.set_tensor(input_details[0]["index"], input1) + +interpreter.invoke() +expected_output = interpreter.get_tensor(output_details[0]["index"]) + +################################## +# Compiling the model with TVM +# -------------------------------- +# +# In this section, we will compile the model using TVM and the Gemmini integration. + +# The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters. +gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096) + +# The TFLite model generated in the previous steps is now imported into TVM. +mod, params = relay.frontend.from_tflite( + tflite_model, + shape_dict={ + "serving_default_x:0": (input_height, input_width), + }, + dtype_dict={ + "serving_default_x:0": input_dtype, + }, +) +mod["main"] + +# In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the gemmini.preprocess pass. Notice the changes in the "main" function after running the preprocess pass. +mod = gemmini.preprocess_pass(mod) +mod["main"] + +# Now, we build the Relay Graph. Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs. +# The gemmini.build_config function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator. 
+RUNTIME = tvm.relay.backend.Runtime("crt", {"system-lib": False}) +TARGET = tvm.target.target.Target({"kind": "c", "device": "gemmini"}) +EXECUTOR = tvm.relay.backend.Executor("aot", options={"interface-api": "c", "unpacked-api": 1}) + +with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]): + module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params) + +################################################# +# Exporting and testing the model using microTVM +# ----------------------------------------------- +# +# In this section, we will export the model using one of the provided example microTVM projects, we will compile it using the Chipyard tool, and then test the generated baremetal code on the Spike simulator. + +tmpdir = tvm.contrib.utils.tempdir() +model_library_format_tar_path = tvm.micro.export_model_library_format(module, tmpdir / "model.tar") +with tempfile.NamedTemporaryFile() as tar_temp_file: + with tarfile.open(tar_temp_file.name, "w:gz") as tar_file: + # Here, we create headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one. + create_header_file("input", input1, "include/tvm", tar_file) + create_header_file("output", expected_output, "include/tvm", tar_file) + + # Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects. + template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini")) + project_options = {"project_type": "dense_example", "extra_files_tar": tar_temp_file.name} + + generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project") + generated_project = tvm.micro.generate_project( + template_project_path, module, generated_project_dir, project_options + ) + +# We build the project. This will generate an executable we can run on the Spike simulator. +generated_project.build() + +# Finally, we execute the compiled baremetal project on the Spike simulator. +# Note: if there are errors, these can be related to rounding errors. +generated_project.flash() diff --git a/gallery/tutorial/micro_gemmini_dwconv2d.py b/gallery/tutorial/micro_gemmini_dwconv2d.py new file mode 100644 index 000000000000..ebdf8c1be22c --- /dev/null +++ b/gallery/tutorial/micro_gemmini_dwconv2d.py @@ -0,0 +1,206 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+""" +Running TVM on the Gemmini accelerator - A single 2d depthwise convolutional layer example +=========================================================================================== +**Author**: +`Federico Peccia `_ + +This tutorials shows how a quantized 2D depthwise convolution layer can be compiled to be executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension. +""" + +import itertools +import tensorflow as tf +from tensorflow import keras +import tarfile +import tempfile +import pathlib +from tensorflow.keras import layers +import numpy as np +import os +import argparse +import random +import tvm.contrib.gemmini as gemmini +from tvm import relay +import tvm +from tvm.micro.testing.utils import create_header_file + +################################## +# Pre-requisites +# -------------------------------- +# +# After the installation of the Chipyard development tools, you should have an env.sh file in your Chipyard home directory. This file needs to be sourced before running this tutorial: +# +# .. code-block:: bash +# +# source /env.sh +# +# WARNING: if you have installed TVM in a virtual environment, FIRST activate the Chipyard environment, and THEN activate the tvm entironment. + +################################## +# Baseline generation +# -------------------------------- +# +# In this section, we will generate the baseline input and expected output, which we are going to use to compare with the actual obtained output after running on the Gemmini accelerator. + +# Then we define the parameters of the layer we want to test. In this case: +input_height = 112 +input_width = 112 +input_channels = 32 +kernel_size = 3 +stride = 1 +padding = "same" +activation = None +bias = True + +# We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input. +model = keras.Sequential( + [ + layers.DepthwiseConv2D( + kernel_size=kernel_size, + padding=padding, + activation=activation, + use_bias=True, + bias_initializer="ones", + input_shape=(input_height, input_width, input_channels), + strides=stride, + ) + ] +) + +# Convert the concrete functions using TFLiteConverter +converter = tf.lite.TFLiteConverter.from_keras_model(model) + + +def representative_data_gen(): + dataset = [ + np.array( + np.random.randint(0, 127, size=(10, input_height, input_width, input_channels)), + dtype=np.float32, + ) + for s in range(10) + ] + for input_value in dataset: + # Model has only one input so each data point has one element.s + yield [input_value] + + +converter.optimizations = [tf.lite.Optimize.DEFAULT] +converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] +converter.inference_input_type = tf.uint8 +converter.inference_output_type = tf.int8 +converter.representative_dataset = representative_data_gen +converter._experimental_disable_per_channel = True + +tflite_model = converter.convert() + +# Save the model. +tmpdir = tvm.contrib.utils.tempdir() +tflite_file = tmpdir / "dwconv.tflite" +with open(tflite_file, "wb") as f: + f.write(tflite_model) + +# Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator. 
+os.system("rm -rf generated-project/") + +tflite_model_buf = open(tflite_file, "rb").read() +input_tensor = "layer1_input" +input_dtype = "uint8" + +os.system("mkdir -p include") + +try: + import tflite + + tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0) +except AttributeError: + import tflite.Model + + tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) + +# Load the TFLite model and allocate tensors. +interpreter = tf.lite.Interpreter(model_path=str(tflite_file)) +interpreter.allocate_tensors() +input_details = interpreter.get_input_details() +output_details = interpreter.get_output_details() +tensor_details = interpreter.get_tensor_details() + +input = np.random.randint(0, 2, (1, input_height, input_width, input_channels), dtype=np.uint8) +interpreter.set_tensor(input_details[0]["index"], input) + +interpreter.invoke() +expected_output = interpreter.get_tensor(output_details[0]["index"]) + +################################## +# Compiling the model with TVM +# -------------------------------- +# +# In this section, we will compile the model using TVM and the Gemmini integration. + +# The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters. +gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096) + +# The TFLite model generated in the previous steps is now imported into TVM. +mod, params = relay.frontend.from_tflite( + tflite_model, + shape_dict={input_tensor: (input_height, input_width, input_channels)}, + dtype_dict={input_tensor: input_dtype}, +) +mod["main"] + +# In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the gemmini.preprocess pass. Notice the changes in the "main" function after running the preprocess pass. +mod = gemmini.preprocess_pass(mod) +mod["main"] + +# Now, we build the Relay Graph. Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs. +# The gemmini.build_config function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator. +RUNTIME = tvm.relay.backend.Runtime("crt", {"system-lib": False}) +TARGET = tvm.target.target.Target({"kind": "c", "device": "gemmini"}) +EXECUTOR = tvm.relay.backend.Executor("aot", options={"interface-api": "c", "unpacked-api": 1}) + +with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]): + module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params) + +################################################# +# Exporting and testing the model using microTVM +# ----------------------------------------------- +# +# In this section, we will export the model using one of the provided example microTVM projects, we will compile it using the Chipyard tool, and then test the generated baremetal code on the Spike simulator. 
+ +tmpdir = tvm.contrib.utils.tempdir() +model_library_format_tar_path = tvm.micro.export_model_library_format(module, tmpdir / "model.tar") +with tempfile.NamedTemporaryFile() as tar_temp_file: + with tarfile.open(tar_temp_file.name, "w:gz") as tar_file: + # Here, we create headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one. + create_header_file("input", input, "include/tvm", tar_file) + create_header_file("output", expected_output, "include/tvm", tar_file) + + # Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects. + template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini")) + project_options = {"project_type": "dwconv2d_example", "extra_files_tar": tar_temp_file.name} + + generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project") + generated_project = tvm.micro.generate_project( + template_project_path, module, generated_project_dir, project_options + ) + +# We build the project. This will generate an executable we can run on the Spike simulator. +generated_project.build() + +# Finally, we execute the compiled baremetal project on the Spike simulator. +# Note: if there are errors, these can be related to rounding errors. +generated_project.flash() diff --git a/gallery/tutorial/micro_gemmini_maxpool2d.py b/gallery/tutorial/micro_gemmini_maxpool2d.py new file mode 100644 index 000000000000..f4587d2d510d --- /dev/null +++ b/gallery/tutorial/micro_gemmini_maxpool2d.py @@ -0,0 +1,209 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Running TVM on the Gemmini accelerator - A single 2d max pooling layer example +====================================================================================== +**Author**: +`Federico Peccia `_ + +This tutorials shows how a quantized 2D max pooling layer can be compiled to be executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension. +""" + +import tensorflow as tf +from tensorflow.keras import layers +import tarfile +import tempfile +import pathlib +import numpy as np +import os +import tvm.contrib.gemmini as gemmini +from tvm import relay +import tvm +from tvm.micro.testing.utils import create_header_file + +################################## +# Pre-requisites +# -------------------------------- +# +# After the installation of the Chipyard development tools, you should have an env.sh file in your Chipyard home directory. 
This file needs to be sourced before running this tutorial: +# +# .. code-block:: bash +# +# source /env.sh +# +# WARNING: if you have installed TVM in a virtual environment, FIRST activate the Chipyard environment, and THEN activate the tvm entironment. + +################################## +# Baseline generation +# -------------------------------- +# +# In this section, we will generate the baseline input and expected output, which we are going to use to compare with the actual obtained output after running on the Gemmini accelerator. + +# Then we define the parameters of the layer we want to test. In this case: +input_height = 16 +input_width = 16 +input_channels = 16 +pool_size = 2 +pool_stride = 1 +pool_padding = "valid" + +# We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input. +class Model(tf.Module): + def __init__(self, name=None): + super().__init__(name) + + @tf.function( + input_signature=[ + tf.TensorSpec( + shape=[1, input_height, input_width, input_channels], + dtype=tf.float32, + ) + ] + ) + def maxpool(self, x): + return layers.MaxPool2D(pool_size=pool_size, strides=pool_stride, padding=pool_padding)(x) + + +model = Model() + +# Convert the concrete functions using TFLiteConverter +converter = tf.lite.TFLiteConverter.from_keras_model(model) + + +def representative_data_gen(): + dataset = [ + np.array( + np.random.randint(-127, 128, size=(1, input_height, input_width, input_channels)), + dtype=np.float32, + ) + for s in range(100) + ] + for input_value in dataset: + # Model has only one input so each data point has one element. + yield [input_value] + + +converter.optimizations = [tf.lite.Optimize.DEFAULT] +converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] +converter.inference_input_type = tf.uint8 +converter.inference_output_type = tf.int8 +converter.representative_dataset = representative_data_gen +converter._experimental_disable_per_channel = True + +tflite_model = converter.convert() + +# Save the model. +tmpdir = tvm.contrib.utils.tempdir() +tflite_file = tmpdir / "maxpool.tflite" +with open(tflite_file, "wb") as f: + f.write(tflite_model) + +# Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator. +os.system("rm -rf generated-project/") + +tflite_model_buf = open(tflite_file, "rb").read() +input_tensor = "layer1_input" +input_dtype = "uint8" + +os.system("mkdir -p include") + +try: + import tflite + + tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0) +except AttributeError: + import tflite.Model + + tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) + +# Load the TFLite model and allocate tensors. 
+interpreter = tf.lite.Interpreter( + model_path=str(tflite_file), experimental_preserve_all_tensors=True +) +interpreter.allocate_tensors() +input_details = interpreter.get_input_details() +output_details = interpreter.get_output_details() +tensor_details = interpreter.get_tensor_details() + +input_matrix_1 = np.random.randint( + 0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8 +) + +interpreter.set_tensor(input_details[0]["index"], input_matrix_1) + +interpreter.invoke() +expected_output = interpreter.get_tensor(output_details[0]["index"]) + +################################## +# Compiling the model with TVM +# -------------------------------- +# +# In this section, we will compile the model using TVM and the Gemmini integration. + +# The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters. +gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096) + +# The TFLite model generated in the previous steps is now imported into TVM. +mod, params = relay.frontend.from_tflite( + tflite_model, + shape_dict={"serving_default_x": (1, input_height, input_width, input_channels)}, + dtype_dict={"serving_default_x": input_dtype}, +) +mod["main"] + +# In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the gemmini.preprocess pass. Notice the changes in the "main" function after running the preprocess pass. +mod = gemmini.preprocess_pass(mod) +mod["main"] + +# Now, we build the Relay Graph. Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs. +# The gemmini.build_config function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator. +RUNTIME = tvm.relay.backend.Runtime("crt", {"system-lib": False}) +TARGET = tvm.target.target.Target({"kind": "c", "device": "gemmini"}) +EXECUTOR = tvm.relay.backend.Executor("aot", options={"interface-api": "c", "unpacked-api": 1}) + +with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]): + module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params) + +################################################# +# Exporting and testing the model using microTVM +# ----------------------------------------------- +# +# In this section, we will export the model using one of the provided example microTVM projects, we will compile it using the Chipyard tool, and then test the generated baremetal code on the Spike simulator. + +tmpdir = tvm.contrib.utils.tempdir() +model_library_format_tar_path = tvm.micro.export_model_library_format(module, tmpdir / "model.tar") +with tempfile.NamedTemporaryFile() as tar_temp_file: + with tarfile.open(tar_temp_file.name, "w:gz") as tar_file: + # Here, we create headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one. 
+ create_header_file("input", input_matrix_1, "include/tvm", tar_file) + create_header_file("output", expected_output, "include/tvm", tar_file) + + # Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects. + template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini")) + project_options = {"project_type": "maxpool2d_example", "extra_files_tar": tar_temp_file.name} + + generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project") + generated_project = tvm.micro.generate_project( + template_project_path, module, generated_project_dir, project_options + ) + +# We build the project. This will generate an executable we can run on the Spike simulator. +generated_project.build() + +# Finally, we execute the compiled baremetal project on the Spike simulator. +# Note: if there are errors, these can be related to rounding errors. +generated_project.flash() diff --git a/gallery/tutorial/micro_gemmini_mobilenet.py b/gallery/tutorial/micro_gemmini_mobilenet.py new file mode 100644 index 000000000000..b0df1573ffef --- /dev/null +++ b/gallery/tutorial/micro_gemmini_mobilenet.py @@ -0,0 +1,253 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Running TVM on the Gemmini accelerator - A complete MobileNet example +====================================================================================== +**Author**: +`Federico Peccia `_ + +This tutorials shows how a quantized MobileNet network can be compiled to be executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension. +""" + +import numpy as np +import tensorflow as tf +import os +import tvm.contrib.gemmini as gemmini +import tarfile +import tempfile +import pathlib +from tvm import relay +import tvm +from tvm.contrib.download import download_testdata +from tvm.micro.testing.utils import create_header_file + +################################## +# Pre-requisites +# -------------------------------- +# +# After the installation of the Chipyard development tools, you should have an env.sh file in your Chipyard home directory. This file needs to be sourced before running this tutorial: +# +# .. code-block:: bash +# +# source /env.sh +# +# WARNING: if you have installed TVM in a virtual environment, FIRST activate the Chipyard environment, and THEN activate the tvm entironment. 
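+#
+# For example, assuming Chipyard was cloned to ``~/chipyard`` and the virtual environment is
+# called ``tvm-venv`` (both names are only placeholders for this sketch), the order would be:
+#
+# .. code-block:: bash
+#
+#       source ~/chipyard/env.sh
+#       source tvm-venv/bin/activate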
+ +################################## +# Helper functions +# -------------------------------- +# +# This functions will help us generate the MobileNet model + + +def get_real_image(im_height, im_width): + from PIL import Image + + repo_base = "https://github.com/dmlc/web-data/raw/main/tensorflow/models/InceptionV1/" + img_name = "elephant-299.jpg" + image_url = os.path.join(repo_base, img_name) + img_path = download_testdata(image_url, img_name, module="data") + image = Image.open(img_path).resize((im_height, im_width)) + x = np.array(image).astype("uint8") + data = np.reshape(x, (1, im_height, im_width, 3)) + return data + + +def run_tflite_model(tflite_model_buf, input_data): + """Generic function to execute TFLite""" + try: + from tensorflow import lite as interpreter_wrapper + except ImportError: + from tensorflow.contrib import lite as interpreter_wrapper + + input_data = input_data if isinstance(input_data, list) else [input_data] + + interpreter = interpreter_wrapper.Interpreter(model_content=tflite_model_buf) + interpreter.allocate_tensors() + + input_details = interpreter.get_input_details() + output_details = interpreter.get_output_details() + + # set input + assert len(input_data) == len(input_details) + for i in range(len(input_details)): + interpreter.set_tensor(input_details[i]["index"], input_data[i]) + + # Run + interpreter.invoke() + + # get output + tflite_output = list() + for i in range(len(output_details)): + tflite_output.append(interpreter.get_tensor(output_details[i]["index"])) + + return tflite_output + + +def download_model(): + model_url = "http://download.tensorflow.org/models/tflite_11_05_08/mobilenet_v2_1.0_224.tgz" + + # Download model tar file and extract it to get mobilenet_v2_1.0_224.tflite + model_path = download_testdata( + model_url, "mobilenet_v2_1.0_224.tgz", module=["tf", "official", "mobilenet_v2"] + ) + model_dir = os.path.dirname(model_path) + + return model_dir, model_path + + +def extract(path): + import tarfile + + if path.endswith("tgz") or path.endswith("gz"): + dir_path = os.path.dirname(path) + tar = tarfile.open(path) + tar.extractall(path=dir_path) + tar.close() + else: + raise RuntimeError("Could not decompress the file: " + path) + + +def create_tflite_model(model_dir: str): + # tflite_model_name = [f for f in os.listdir(model_dir) if f.endswith(".tflite")][0] + # return f"{model_dir}/{tflite_model_name}" + def representative_data_gen(): + dataset = [ + np.array(np.random.randint(0, 255, size=(1, 224, 224, 3)), dtype=np.float32) + for s in range(100) + ] + for input_value in dataset: + # Model has only one input so each data point has one element.s + yield [input_value] + + pb_file = [f for f in os.listdir(model_dir) if f.endswith(".pb")][0] + converter = tf.compat.v1.lite.TFLiteConverter.from_frozen_graph( + f"{model_dir}/{pb_file}", + input_arrays=["input"], + input_shapes={"input": [1, 224, 224, 3]}, + output_arrays=["MobilenetV2/Predictions/Reshape"], + ) + converter.optimizations = [tf.lite.Optimize.DEFAULT] + converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] + # converter.target_spec.supported_ops = [tf.lite.OpsSet.SELECT_TF_OPS] + converter.inference_input_type = tf.uint8 + converter.inference_output_type = tf.uint8 + converter.representative_dataset = representative_data_gen + converter._experimental_disable_per_channel = True + + tflite_model = converter.convert() + tflite_model_name = pb_file.replace(".pb", ".tflite") + with open(f"{model_dir}/{tflite_model_name}", "wb") as f: + f.write(tflite_model) 
+ + return f"{model_dir}/{tflite_model_name}" + + +def generate_mobilenet_tflite_model(): + model_dir, model_path = download_model() + extract(model_path) + return create_tflite_model(model_dir) + + +################################## +# Baseline generation +# -------------------------------- +# +# In this section, we will generate the baseline input and expected output, which we are going to use to compare with the actual obtained output after running on the Gemmini accelerator. + +# We clean and prepare the workspace +os.system("rm -rf generated-project/") + +# We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input. +tflite_model_dir = generate_mobilenet_tflite_model() + +input_image = get_real_image(224, 224) + +tflite_model_file = os.path.join(tflite_model_dir) +tflite_model_buf = open(tflite_model_file, "rb").read() + +# Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator. +try: + import tflite + + tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0) +except AttributeError: + import tflite.Model + + tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) + +tflite_res = run_tflite_model(tflite_model_buf, input_image) +tflite_pred = np.squeeze(tflite_res).argsort()[-5:][::-1] +print("Expected argmax = %i" % (tflite_pred[0],)) +print("Expected max labels = %s" % (tflite_pred,)) + +################################## +# Compiling the model with TVM +# -------------------------------- +# +# In this section, we will compile the model using TVM and the Gemmini integration. + +# The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters. +gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096) + +# The TFLite model generated in the previous steps is now imported into TVM. +dtype_dict = {"input": input_image.dtype.name} +shape_dict = {"input": input_image.shape} + +mod, params = relay.frontend.from_tflite(tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict) +mod = relay.transform.InferType()(mod) + +# In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the gemmini.preprocess pass. Notice the changes in the "main" function after running the preprocess pass. +mod = gemmini.preprocess_pass(mod) + +# Now, we build the Relay Graph. Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs. +# The gemmini.build_config function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator. 
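+# As a rough sketch (for illustration only; the exact pass list lives in
+# tvm.contrib.gemmini.build_module), gemmini.build_config(usmp_alg="hill_climb", opt_level=3, ...)
+# behaves like a PassContext of the form:
+#
+#   tvm.transform.PassContext(
+#       opt_level=3,
+#       disabled_pass=["AlterOpLayout"],
+#       config={
+#           "tir.add_lower_pass": <Gemmini intrinsic injection passes>,
+#           "tir.disable_vectorize": True,
+#           "tir.usmp.enable": True,
+#           "tir.usmp.algorithm": "hill_climb",
+#       },
+#   )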
+RUNTIME = tvm.relay.backend.Runtime("crt", {"system-lib": False}) +TARGET = tvm.target.target.Target({"kind": "c", "device": "gemmini"}) +EXECUTOR = tvm.relay.backend.Executor("aot", options={"interface-api": "c", "unpacked-api": 1}) + +with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]): + module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params) + +################################################# +# Exporting and testing the model using microTVM +# ----------------------------------------------- +# +# In this section, we will export the model using one of the provided example microTVM projects, we will compile it using the Chipyard tool, and then test the generated baremetal code on the Spike simulator + +tmpdir = tvm.contrib.utils.tempdir() +model_library_format_tar_path = tvm.micro.export_model_library_format(module, tmpdir / "model.tar") +with tempfile.NamedTemporaryFile() as tar_temp_file: + with tarfile.open(tar_temp_file.name, "w:gz") as tar_file: + # Here, we create headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one. + create_header_file("input", input_image, "include/tvm", tar_file) + create_header_file("output", tflite_pred.astype(np.int32), "include/tvm", tar_file) + + # Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects. + template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini")) + project_options = {"project_type": "mobilenet_example", "extra_files_tar": tar_temp_file.name} + + generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project") + generated_project = tvm.micro.generate_project( + template_project_path, module, generated_project_dir, project_options + ) + +# We build the project. This will generate an executable we can run on the Spike simulator. +generated_project.build() + +# Finally, we execute the compiled baremetal project on the Spike simulator. 
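+# For the Gemmini microTVM template project, flash() launches the generated baremetal binary on
+# Spike, so the on-target result can be compared against the expected TFLite output exported in
+# the headers above.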
+generated_project.flash() diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index eee15d8d879f..583e5053db5b 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -473,9 +473,21 @@ def _build_func_common(measure_input, runtime=None, checks=None, build_option=No target, task, config = measure_input target, task.target_host = Target.canon_target_and_host(target, task.target_host) checks = checks or {} + with target: s, args = task.instantiate(config) + # if target is gemmini, we need to use gemmini build + if ( + hasattr(measure_input.target, "device_name") + and measure_input.target.device_name == "gemmini" + ): + # pylint: disable=import-outside-toplevel + import tvm.contrib.gemmini as gemmini + + func = gemmini.build(s, args, target=measure_input.target, runtime=runtime) + return func, tuple((get_const_tuple(x.shape), x.dtype) for x in args) + # check invalidity of template and code hash consistency if not config.valid(): raise InstantiationError(config.errors) diff --git a/python/tvm/contrib/gemmini/__init__.py b/python/tvm/contrib/gemmini/__init__.py new file mode 100644 index 000000000000..34abef4b085a --- /dev/null +++ b/python/tvm/contrib/gemmini/__init__.py @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Gemmini package is a TVM backend extension to support the Gemmini hardware accelerator +===================== +""" + +import tvm._ffi.base + +from tvm.relay.backend.contrib.gemmini import * +from .environment import Environment +from .build_module import build_config, lower, build, preprocess_pass +from .utils import * diff --git a/python/tvm/contrib/gemmini/build_module.py b/python/tvm/contrib/gemmini/build_module.py new file mode 100644 index 000000000000..fdabfd102bca --- /dev/null +++ b/python/tvm/contrib/gemmini/build_module.py @@ -0,0 +1,217 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+""" +Helpers and functions related to the build process to generate code for the Gemmini accelerator +===================== +""" + +import tvm +from tvm import relay +from .environment import Environment +from .transform import ( + InjectAMVINIntrin, + InjectAMVINIntrinTransposed, + InjectBMVINIntrin, + InjectBMVINIntrinTransposed, + InjectCMVOUTIntrin, + InjectCMVOUTIntrinTransposed, + InjectDMVINIntrin, + InjectDMVINIntrinTransposed, + InjectCMVINIntrin, + InjectCMVINIntrinTransposed, + InjectCMVINAccumIntrin, + InjectCMVINAccumIntrinTransposed, + InsertGemminiHeaderOperators, + InsertGemminiFenceOperator, +) +from .legalize import LegalizeGemmini + + +def preprocess_pass(mod): + """This is the preprocess pass to use the Gemmini accelerator, it groups the + + Args: + mod (tvm.ir.IRModule): IRModule to preprocess + + Returns: + tvm.ir.IRModule: preprocessed IRModule + """ + + # First, merge all dw and convs that can be merged! + pattern = relay.op.contrib.get_pattern_table("gemmini") + + mod = relay.transform.InferType()(mod) + mod = relay.transform.ConvertLayout({"qnn.conv2d": ["NHWC", "HWIO"]})(mod) + mod = relay.transform.SimplifyExpr()(mod) + mod = relay.transform.MergeComposite(pattern)(mod) + mod = relay.transform.InferType()(mod) + mod = relay.transform.InferType()(mod) + mod = relay.transform.SimplifyExpr()(mod) + mod = LegalizeGemmini()(mod) + mod = relay.transform.InferType()(mod) + return mod + + +def internal_build_configs(usmp_alg=""): + """Builds the internal configurations for the build process + + Args: + usmp_alg (str, optional): Which USMP algorithm to use. Defaults to "". + + Returns: + dict: configurations + """ + pass_list = [ + (0, tvm.tir.transform.StorageFlatten(16)), + (1, InjectAMVINIntrin()), + (1, InjectAMVINIntrinTransposed()), + (1, InjectBMVINIntrin()), + (1, InjectBMVINIntrinTransposed()), + (1, InjectCMVOUTIntrin()), + (1, InjectCMVOUTIntrinTransposed()), + (1, InjectDMVINIntrin()), + (1, InjectDMVINIntrinTransposed()), + (1, InjectCMVINIntrin()), + (1, InjectCMVINIntrinTransposed()), + (1, InjectCMVINAccumIntrin()), + (1, InjectCMVINAccumIntrinTransposed()), + (1, tvm.tir.transform.CorrectGemminisScratchpadAndAccumulatorPointers()), + (2, tvm.tir.transform.LowerDeviceStorageAccessInfo()), + (4, InsertGemminiHeaderOperators()), + (5, InsertGemminiFenceOperator()), + ] + + return { + "tir.add_lower_pass": pass_list, + "tir.disable_vectorize": True, + # "tir.CorrectGemminisScratchpadAndAccumulatorPointers": {"dim": env.DIM} + "tir.usmp.enable": bool(usmp_alg), + "tir.usmp.algorithm": usmp_alg, + } + + +def build_config(usmp_alg="", **kwargs): + """Creates the PassContext needed by the build process to correctly build the Gemmini operators + + Args: + usmp_alg (str, optional): Which USMP algorithm to use. Defaults to "". + + Returns: + tvm.transform.PassContext: PassContext with specific configurations + """ + + config = internal_build_configs(usmp_alg) + if kwargs.get("config"): + config.update(kwargs[config]) + del kwargs["config"] + + return tvm.transform.PassContext(config=config, **kwargs) + + +def lower(*args, **kwargs): + """Thin wrapper of tvm.lower + + This wrapper automatically applies Gemmini's build_config + if there is no user specified build_config in context. 
+ + See Also + -------- + tvm.lower : The original TVM's lower function + """ + pass_ctx = tvm.transform.PassContext.current() + if not pass_ctx.config.get("add_lower_pass"): + with build_config(): + return tvm.lower(*args, **kwargs) + return tvm.lower(*args, **kwargs) + + +def build(*args, **kwargs): + """Thin wrapper of tvm.build + + This wrapper automatically applies Gemmini's build_config + if there is no user specified build_config in context. + + See Also + -------- + tvm.build : The original TVM's build function + """ + pass_ctx = tvm.transform.PassContext.current() + if not pass_ctx.config.get("tir.add_lower_pass"): + with build_config(): + return tvm.build(*args, **kwargs) + return tvm.build(*args, **kwargs) + + +# The memory information for the compiler +@tvm.register_func(f"tvm.info.mem.{Environment.instance().scr_scope}") +def mem_info_inp_buffer(): + """Creates the information about the local.scratchpad memory node + + Returns: + node: The corresponding MemoryInfo node + """ + spec = Environment.instance() + return tvm.ir.make_node( + "MemoryInfo", + unit_bits=spec.inp_bits, + max_simd_bits=spec.DIM, + max_num_bits=int(spec.INP_SCR_ROWS * spec.DIM * spec.inp_bits), + # head_address=tvm.runtime.const(spec.INP_SCR_BASE_ADDRESS, "uint32"), + head_address=None, + ) + + +# The memory information for the compiler +@tvm.register_func(f"tvm.info.mem.{Environment.instance().scr_wgt_scope}") +def mem_info_wgt_buffer(): + """Creates the information about the local.scratchpad_weight memory node + + Returns: + node: The corresponding MemoryInfo node + """ + spec = Environment.instance() + return tvm.ir.make_node( + "MemoryInfo", + unit_bits=spec.wgt_bits, + max_simd_bits=spec.DIM, + max_num_bits=int(spec.WGT_SCR_ROWS * spec.DIM * spec.wgt_bits), + # head_address=tvm.runtime.const(spec.WGT_SCR_BASE_ADDRESS, "uint32"), + head_address=None, + ) + + +# The memory information for the compiler +@tvm.register_func(f"tvm.info.mem.{Environment.instance().acc_scope}") +def mem_info_acc_buffer(): + """Creates the information about the local.accumulator memory node + + Returns: + node: The corresponding MemoryInfo node + """ + Environment.instance() + return tvm.ir.make_node( + "MemoryInfo", + unit_bits=Environment.instance().inp_bits, + max_simd_bits=Environment.instance().DIM, + max_num_bits=int( + Environment.instance().ACC_ROWS + * Environment.instance().DIM + * Environment.instance().inp_bits + ), + # head_address=tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32"), + head_address=None, + ) diff --git a/python/tvm/contrib/gemmini/environment.py b/python/tvm/contrib/gemmini/environment.py new file mode 100644 index 000000000000..3e579a4f4870 --- /dev/null +++ b/python/tvm/contrib/gemmini/environment.py @@ -0,0 +1,394 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, exec-used +""" +Environment declaration. Contains Gemminis hardware parameters. +===================== +""" + +from __future__ import absolute_import as _abs +import re +from typing import List, Tuple, Dict, Callable +from .intrin import ( + gemm, + gemm_cisc, + conv2d_cisc, + dw_conv2d_cisc, + add_tensorize, + add_mvout_tensorize, +) +from .utils import COUNTERS + + +class Environment(object): + """Hardware configuration object. + + This object contains all the information + needed for compiling to a specific Gemmini backend. + + """ + + _instance = None + + @classmethod + def init_overwrite( + cls, + batch=1, + dim=32, + max_bytes=64, + inp_dtype="int8", + wgt_dtype="int8", + acc_dtype="int32", + acc_rows=4096, + bank_rows=8192, + bank_num=4, + debug=False, + enabled_counters: Dict = None, + supports_non_zero_padding: bool = False, + use_experimental_qnn_add: bool = False, + ): + """Overwrites the init function + + Args: + batch (int, optional): Batch size. Defaults to 1. + dim (int, optional): Gemminis systolic array dimensions (DIM). Defaults to 32. + max_bytes (int, optional): Limits maximum amount of mvin columns. Defaults to 64. + inp_dtype (str, optional): Type of the Gemmini scratchpad. Defaults to "int8". + wgt_dtype (str, optional): Type of the Gemmini weight scratchpad. Defaults to "int8". + acc_dtype (str, optional): Type of the Gemmini accumulator. Defaults to "int32". + acc_rows (int, optional): Rows of the accumulator. Defaults to 4096. + bank_rows (int, optional): Rows of each bank in the scratchpad. Defaults to 8192. + bank_num (int, optional): Banks for the scratchpad. Defaults to 4. + debug (bool, optional): Adds debug of Gemmini counters. Defaults to False. + enabled_counters (dict, optional): Enabled Gemmini counters for debug purposes. + Defaults to None. + supports_non_zero_padding (bool, optional): Gemmini supports instructions + with non-zero padding. Defaults to False. + use_experimental_qnn_add (bool, optional): Pattern matching for qnn.add. + Defaults to False. + """ + inst = Environment.instance() + inst.init( + batch=batch, + dim=dim, + max_bytes=max_bytes, + inp_dtype=inp_dtype, + wgt_dtype=wgt_dtype, + acc_dtype=acc_dtype, + acc_rows=acc_rows, + bank_rows=bank_rows, + bank_num=bank_num, + debug=debug, + enabled_counters=enabled_counters, + supports_non_zero_padding=supports_non_zero_padding, + use_experimental_qnn_add=use_experimental_qnn_add, + ) + + @classmethod + def instance(cls): + """Returns the current instance + + Returns: + _type_: _description_ + """ + if cls._instance is None: + cls._instance = cls.__new__(cls) + cls._instance.init() + return cls._instance + + def init( + self, + batch=1, + dim=16, + max_bytes=64, + inp_dtype="int8", + wgt_dtype="int8", + acc_dtype="int32", + acc_rows=1024, + bank_rows=4096, + bank_num=4, + debug=False, + enabled_counters: Dict = None, + supports_non_zero_padding: bool = False, + use_experimental_qnn_add: bool = False, + ): + """_summary_ + + Args: + batch (int, optional): Batch size. Defaults to 1. + dim (int, optional): Gemminis systolic array dimensions (DIM). Defaults to 32. + max_bytes (int, optional): Limits maximum amount of mvin columns. Defaults to 64. + inp_dtype (str, optional): Type of the Gemmini scratchpad. Defaults to "int8". + wgt_dtype (str, optional): Type of the Gemmini "logical" weight scratchpad. + Defaults to "int8". 
+ acc_dtype (str, optional): Type of the Gemmini accumulator. Defaults to "int32". + acc_rows (int, optional): Amount of rows of the accumulator. Defaults to 4096. + bank_rows (int, optional): Amount of rows of each bank in the scratchpad. + Defaults to 8192. + bank_num (int, optional): Amount of banks for the scratchpad. Defaults to 4. + debug (bool, optional): Adds debug of Gemmini counters. Defaults to False. + enabled_counters (dict, optional): Enabled Gemmini counters for debug purposes. + Defaults to None. + supports_non_zero_padding (bool, optional): Gemmini supports instructions + with non-zero padding. Defaults to False. + use_experimental_qnn_add (bool, optional): Pattern matching for qnn.add. + Defaults to False. + """ + + assert batch == 1, "Only batch size of 1 is currently supported" + self.debug = debug + + self.BATCH = batch + self.DIM = dim + self.MAX_BYTES = max_bytes + + self.inp_dtype = inp_dtype + self.wgt_dtype = wgt_dtype + self.acc_dtype = acc_dtype + + self.inp_bits = int( + re.match(r"((float)|(int)|(uint))(?P[0-9]+)", self.inp_dtype).group( + "width_bits" + ) + ) + self.wgt_bits = int( + re.match(r"((float)|(int)|(uint))(?P[0-9]+)", self.wgt_dtype).group( + "width_bits" + ) + ) + self.acc_bits = int( + re.match(r"((float)|(int)|(uint))(?P[0-9]+)", self.acc_dtype).group( + "width_bits" + ) + ) + + self.size_elem = int(self.inp_bits / 8) + self.size_acc = int(self.acc_bits / 8) + + self.ACC_ROWS = acc_rows + self.BANK_ROWS = bank_rows + self.BANK_NUM = bank_num + + self.WGT_SCR_BASE_ADDRESS = int(self.BANK_ROWS * self.BANK_NUM * 2 / 4) + self.WGT_SCR_ROWS = self.BANK_ROWS * self.BANK_NUM - self.WGT_SCR_BASE_ADDRESS + self.INP_SCR_BASE_ADDRESS = 0 + self.INP_SCR_ROWS = self.WGT_SCR_BASE_ADDRESS + self.OUT_ACC_BASE_ADDRESS = 0xC0000000 + + self.MAX_BLOCK_LEN = int(self.MAX_BYTES / self.DIM) + if self.DIM * self.size_acc <= self.MAX_BYTES: + self.MAX_BLOCK_LEN_ACC = int(self.MAX_BYTES / (self.DIM * self.size_acc)) + else: + self.MAX_BLOCK_LEN_ACC = 1 + + self.scr_scope = "local.scratchpad" + self.acc_scope = "local.accumulator" + # Actually, only one scratchpad should exist. + # But we do this logical partition to correctly manage the pointers + # to the buffers stored in this memories. + # Should see how we can fix this in the future. + self.scr_wgt_scope = "local.scratchpad_weight" + + self.A_mvin = "A_mvin" + self.B_mvin = "B_mvin" + self.D_mvin = "D_mvin" + self.C_mvin = "C_mvin" + self.C_mvin_accum = "C_mvin_accum" + self.C_mvout = "C_mvout" + self.C_mvout_acc_dtype = "C_mvout_acc_dtype" + + self.WEIGHT_STATIONARY = 1 + self.OUTPUT_STATIONARY = 0 + + self.mvin_scale_identity = 1.0 + self.max_matrix = 64 + + self.supports_non_zero_padding = supports_non_zero_padding + self.use_experimental_qnn_add = use_experimental_qnn_add + + self.enabled_counters = enabled_counters if enabled_counters is not None else COUNTERS + # Check that all enabled counters exist in the actual counters from Gemmini + for key, value in self.enabled_counters.items(): + assert ( + value == COUNTERS[key] + ), f"Enabled counter with key {key} does not exist \ + or has a different name in the actual counters dict!" + + def gemm( + self, + I: int, + K: int, + J: int, + stride: int = 1, + is_depthwise_conv2d: bool = False, + mode: int = 1, + accum_patch=None, + ) -> Callable: + """Wrapper to expose the gemm intrinsic + + Args: + I (int): output first axis dimension + K (int): reduction axis dimension + J (int): output second axis dimension + stride (int, optional): Stride, useful for convolutions. 
Defaults to 1. + is_depthwise_conv2d (bool, optional): Flag to explain if this is a + GEMM for a depthwise convolution. Defaults to False. + mode (int, optional): Systolic array mode (WS=1,OS=0). Defaults to 1. + accum_patch (_type_, optional): Var of the reduction axis loop. Defaults to None. + + Returns: + Callable: gemm instrinsic + """ + return gemm(self, I, K, J, stride, is_depthwise_conv2d, mode, accum_patch) + + def gemm_cisc( + self, + inp_shape: Tuple[int, ...], + wgt_shape: Tuple[int, ...], + bias_shape: Tuple[int, ...], + scale: float, + matmul_type: int, + ) -> Callable: + """Wrapper to expose the gemm_cisc intrinsic + + Args: + inp_shape (Tuple[int,...]): Input feature map shape + wgt_shape (Tuple[int,...]): Weights shape + bias_shape (Tuple[int,...]): Bias shape + scale (float): Output scaling factor + matmul_type (int): Systolic array mode (WS=1,OS=0) + + Returns: + Callable: gemm cisc intrinsic + """ + return gemm_cisc(self, inp_shape, wgt_shape, bias_shape, scale, matmul_type) + + def conv2d_cisc( + self, + inp_shape: Tuple[int, ...], + wgt_shape: Tuple[int, ...], + bias_shape: Tuple[int, ...], + out_shape: Tuple[int, ...], + strides: int, + padding: List[int], + padding_value: int, + activation: int, + scale: float, + pool_size: List[int], + pool_strides: List[int], + pool_dilation: List[int], + pool_padding: List[int], + ) -> Callable: + """Wrapper to expose the conv2d_cisc intrinsic + + Args: + inp_shape (Tuple[int,...]): Input feature map shape + wgt_shape (Tuple[int,...]): Weights shape + bias_shape (Tuple[int,...]): Bias shape + out_shape (Tuple[int,...]): Output feature map shape + strides (int): Convolution stride + padding (List[int]): Pixels to pad in each direction + padding_value (int): Value to use for padding + activation (int): Has activation? + scale (float): Output scaling factor + pool_size (List[int]): Size of the output pooling window + pool_strides (List[int]): Strides for the output pooling window + pool_dilation (List[int]): Dilation for the output pooling window + pool_padding (List[int]): Padding for the output pooling + + Returns: + Callable: conv2d cisc intrinsic + """ + return conv2d_cisc( + self, + inp_shape, + wgt_shape, + bias_shape, + out_shape, + strides, + padding, + padding_value, + activation, + scale, + pool_size, + pool_strides, + pool_dilation, + pool_padding, + ) + + def dw_conv2d_cisc( + self, + inp_shape: Tuple[int, ...], + wgt_shape: Tuple[int, ...], + bias_shape: Tuple[int, ...], + out_shape: Tuple[int, ...], + strides: int, + padding: List[int], + padding_value: int, + activation: int, + scale: float, + ) -> Callable: + """Wrapper to expose the dw_conv2d_cisc intrinsic + + Args: + inp_shape (Tuple[int,...]): Input feature map shape + wgt_shape (Tuple[int,...]): Weights shape + bias_shape (Tuple[int,...]): Bias shape + out_shape (Tuple[int,...]): Output feature map shape + strides (int): Convolution stride + padding (List[int]): Pixels to pad in each direction + padding_value (int): Value to use for padding + activation (int): Has activation? 
+ scale (float): Output scaling factor + + Returns: + Callable: dw conv2d cisc intrinsic + """ + return dw_conv2d_cisc( + self, + inp_shape, + wgt_shape, + bias_shape, + out_shape, + strides, + padding, + padding_value, + activation, + scale, + ) + + def add_tensorize(self, oshape: Tuple[int, ...]) -> Callable: + """Wrapper to expose the add_tensorize intrinsic + + Args: + oshape (Tuple[int,...]): Output feature map shape + + Returns: + Callable: add intrinsic + """ + return add_tensorize(self, oshape) + + def add_mvout_tensorize(self, oshape: Tuple[int, ...]) -> Callable: + """Wrapper to expose the add_mvout_tensorize intrinsic + + Args: + oshape (Tuple[int,...]): Output feature map shape + + Returns: + Callable: add mvout intrinsic + """ + return add_mvout_tensorize(self, oshape) diff --git a/python/tvm/contrib/gemmini/helpers.py b/python/tvm/contrib/gemmini/helpers.py new file mode 100644 index 000000000000..e0f99c3373e4 --- /dev/null +++ b/python/tvm/contrib/gemmini/helpers.py @@ -0,0 +1,72 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Miscellaneous helpers +===================== +""" + +from typing import List +from six.moves import range +from .environment import Environment + + +ENV = Environment.instance() + + +def get_divisors(x: int) -> List[int]: + """Gets all the numbers that perfectly divide x + + Args: + x (int): Number to divide + + Returns: + List[int]: list of divisors + """ + divs = [] + for i in range(1, x + 1): + if x % i == 0: + divs.append(i) + return divs + + +def get_greater_div(x, limit: int = None): + """Gets the greater divisor for all x + + Args: + x: _description_ + limit (int, optional): Max greater divisor to return. Defaults to None. + + Returns: + int: Greater divisor + """ + + limit = ENV.DIM if limit is None else limit + + if isinstance(x, int): + elements = [x] + elif isinstance(x, list): + elements = x + else: + assert False, "datatype of x not supported!" + + divisors = [] + for element in elements: + divs = get_divisors(element) + filtered = filter(lambda d: d <= limit, divs) + divisors.append(filtered) + + return max(set.intersection(*map(set, divisors))) diff --git a/python/tvm/contrib/gemmini/intrin.py b/python/tvm/contrib/gemmini/intrin.py new file mode 100644 index 000000000000..58f53b6b3e0a --- /dev/null +++ b/python/tvm/contrib/gemmini/intrin.py @@ -0,0 +1,867 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Gemmini related intrinsics +===================== +""" + +from __future__ import absolute_import as _abs + +from typing import List, Tuple +import tvm +from tvm import te + + +def gemm( + env, + dim_i: int, + dim_k: int, + dim_j: int, + stride: int = 1, + is_depthwise_conv2d: bool = True, + mode: int = 1, + accum_patch: tvm.tir.Var = None, +): + """Matrix-matrix multiply intrinsic, inserts the most basic Gemmini instructions + + Args: + env (Environment): Environment with configurations + dim_i (int): output first axis dimension + dim_k (int): reduction axis dimension + dim_j (int): output second axis dimension + stride (int, optional): Stride, useful for convolutions. Defaults to 1. + is_depthwise_conv2d (bool, optional): Flag to explain if this is a GEMM for + a depthwise convolution. Defaults to False. + mode (int, optional): Systolic array mode (WS=1,OS=0). Defaults to 1. + accum_patch (tvm.tir.Var, optional): Var of the reduction axis loop. Defaults to None. + + Returns: + TensorIntrin: gemm tensor intrinsic + """ + + # TODO (FP): add assertions here for dim_i, dim_k and dim_j? + + wgt_shape = (dim_k, dim_j) + + inp_shape = (dim_i, dim_k) + + out_shape = (dim_i, dim_j) + + wgt = te.placeholder(wgt_shape, dtype=env.wgt_dtype, name=env.scr_wgt_scope) + inp = te.placeholder(inp_shape, dtype=env.inp_dtype, name=env.scr_scope) + + bias = te.placeholder(out_shape, dtype=env.inp_dtype, name=env.scr_scope) + + k = te.reduce_axis((0, wgt_shape[0]), name="k") + + out_dtype = env.inp_dtype + + if is_depthwise_conv2d: + out = te.compute( + out_shape, + lambda i, j: te.sum( + inp[i * stride + k, j].astype(env.inp_dtype) * wgt[0, k].astype(env.inp_dtype) + + bias[i, j].astype(env.inp_dtype), + axis=[k], + ), + name="out", + ) + else: + out = te.compute( + out_shape, + lambda i, j: te.sum( + inp[i * stride, k].astype(env.inp_dtype) * wgt[k, j].astype(env.inp_dtype) + + bias[i, j].astype(env.inp_dtype), + axis=[k], + ), + name="out", + ) + wgt_layout = tvm.tir.decl_buffer( + wgt.shape, + wgt.dtype, + "wgt_buff", + scope=env.scr_wgt_scope, + strides=[te.var("wgt_k"), te.var("wgt_y")], + offset_factor=env.DIM, + ) + inp_layout = tvm.tir.decl_buffer( + inp.shape, + inp.dtype, + "inp_buff", + scope=env.scr_scope, + strides=[te.var("inp_x"), te.var("inp_k")], + offset_factor=env.DIM, + ) + bias_layout = tvm.tir.decl_buffer( + bias.shape, + bias.dtype, + "bias_buff", + scope=env.acc_scope, + strides=[te.var("inp_x"), te.var("inp_k")], + offset_factor=env.DIM, + ) + out_layout = tvm.tir.decl_buffer( + out.shape, + out_dtype, + "out_buff", + scope=env.acc_scope, + strides=[te.var("out_x"), te.var("out_y")], + offset_factor=env.DIM, + ) + + def intrin_func(ins, outs): + """Matrix-matrix multiply intrinsic function""" + dinp, dwgt, _ = ins + dout = outs[0] + + inp_base_address = tvm.runtime.const(env.INP_SCR_BASE_ADDRESS, "uint32") + wgt_base_address = tvm.runtime.const(env.WGT_SCR_BASE_ADDRESS, "uint32") + wgt_access_ptr = dwgt.access_ptr("r", "uint32") + out_base_address = tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32") + out_access_ptr = dout.access_ptr("w", "uint32") + + garbage = 
tvm.runtime.const(0xFFFFFFFF, "uint32") + + def _body(): + """Generate matrix-matrix multiply Gemmini instruction, + without accumulate (garbage address in compute_preloaded)""" + irb = tvm.tir.ir_builder.create() + + inp_access_ptr = dinp.access_ptr("r", "uint32") + + a_access_ptr = inp_base_address + inp_access_ptr + bd_access_ptr = ( + wgt_base_address + wgt_access_ptr if mode == env.WEIGHT_STATIONARY else garbage + ) + c_access_ptr = out_base_address + out_access_ptr + db_access_ptr = ( + garbage if mode == env.WEIGHT_STATIONARY else wgt_base_address + wgt_access_ptr + ) + + a_cols = dinp.shape[1] + a_rows = dinp.shape[0] + bd_cols = dwgt.shape[1] if mode == env.WEIGHT_STATIONARY else dout.shape[1] + bd_rows = dwgt.shape[0] if mode == env.WEIGHT_STATIONARY else dout.shape[0] + c_cols = dout.shape[1] + c_rows = dout.shape[0] + db_cols = c_cols if mode == env.WEIGHT_STATIONARY else dwgt.shape[1] + db_rows = c_rows if mode == env.WEIGHT_STATIONARY else dwgt.shape[0] + + with irb.if_scope(accum_patch == 0): + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_preload", + bd_access_ptr, + c_access_ptr, + bd_cols, + bd_rows, + c_cols, + c_rows, + ) + ) + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_compute_preloaded", + a_access_ptr, + db_access_ptr, + a_cols, + a_rows, + db_cols, + db_rows, + ) + ) + with irb.else_scope(): + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_preload", + garbage, + c_access_ptr, + bd_cols, + bd_rows, + c_cols, + c_rows, + ) + ) + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_compute_accumulated", + a_access_ptr, + db_access_ptr, + a_cols, + a_rows, + db_cols, + db_rows, + ) + ) + return irb.get() + + def _reduce_reset(): + irb = tvm.tir.ir_builder.create() + return irb.get() + + def _reduce_update(): + return _body() + + # return a triple of normal-set, reset, update + return (_body(), _reduce_reset(), _reduce_update()) + + return te.decl_tensor_intrin( + out.op, + intrin_func, + name="GEMM", + binds={inp: inp_layout, wgt: wgt_layout, bias: bias_layout, out: out_layout}, + ) + + +def gemm_cisc( + env, + inp_shape: Tuple[int, ...], + wgt_shape: Tuple[int, ...], + bias_shape: Tuple[int, ...], + scale: float, + matmul_type: int, +): + """Matrix-matrix multiply intrinsic, inserts the calls to the function + provided by the Gemmini developers to run matrix multiplication using the loop instructions + + Args: + env (Environment): Environment with configurations + inp_shape (Tuple[int,...]): Input feature map shape + wgt_shape (Tuple[int,...]): Weights shape + bias_shape (Tuple[int,...]): Bias shape + scale (float): Output scaling factor + matmul_type (int): Systolic array mode (WS=1,OS=0) + + Returns: + TensorIntrin: GEMM CISC tensor intrinsic + """ + + # TODO (FP): add assertions here for inp_shape, wgt_shape and bias_shape? 
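+    # A possible form of those assertions (a sketch, not part of the original patch):
+    #   assert inp_shape[1] == wgt_shape[0], "reduction dims of input and weights must match"
+    #   assert bias_shape[-1] == wgt_shape[1], "bias length must match the output columns"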
+ + wgt = te.placeholder(wgt_shape, dtype=env.inp_dtype, name=env.scr_wgt_scope) + inp = te.placeholder(inp_shape, dtype=env.inp_dtype, name=env.scr_scope) + bias = te.placeholder(bias_shape, dtype=env.acc_dtype, name=env.scr_scope) + + dim_k = wgt.shape[0] + dim_j = wgt.shape[1] + dim_i = inp.shape[0] + + k_reduce = te.reduce_axis((0, dim_k), name="dim_k") + + output_shape = (dim_i, dim_j) + + out = te.compute( + output_shape, + lambda x_, y_: te.sum( + inp[x_, k_reduce].astype(env.inp_dtype) * wgt[k_reduce, y_].astype(env.inp_dtype) + + bias[y_].astype(env.inp_dtype), + axis=[k_reduce], + ), + ) + + wgt_layout = tvm.tir.decl_buffer( + wgt_shape, + env.inp_dtype, + "wgt_buff", + ) + inp_layout = tvm.tir.decl_buffer( + inp_shape, + env.inp_dtype, + "inp_buff", + strides=[te.var("inp_x"), te.var("inp_y")], + ) + bias_layout = tvm.tir.decl_buffer( + bias_shape, + env.acc_dtype, + "bias_buff", + ) + out_layout = tvm.tir.decl_buffer( + output_shape, + env.inp_dtype, + "out_buff", + ) + + def intrin_func(ins, outs): + """Matrix-matrix multiply intrinsic function""" + dinp, dwgt, dbias = ins + dout = outs[0] + + def _body(): + irb = tvm.tir.ir_builder.create() + irb.emit( + tvm.tir.call_extern( + "", + "tiled_matmul_auto", + dinp.shape[0], # dim_I, + dwgt.shape[1], # dim_J, + dinp.shape[1], # dim_K, + dinp.access_ptr("r"), + dwgt.access_ptr("r"), + dbias.access_ptr("r"), + dout.access_ptr("w"), + dinp.shape[0], # stride_A + dwgt.shape[1], # stride_B + dwgt.shape[1], # stride_C + dwgt.shape[1], # stride_D + 1.0, # A_scale_factor + 1.0, # B_scale_factor + 1.0, # D_scale_factor + 0, # act + scale, + 0, # relu6_shift + 1, # repeating_bias + 0, # transpose_A + 0, # transpose_B + 0, # full_C + 0, # low_D + # 0, + 0, # weightA + matmul_type, + ) + ) + return irb.get() + + def _reduce_reset(): + irb = tvm.tir.ir_builder.create() + return irb.get() + + def _reduce_update(): + return _body() + + # return a triple of normal-set, reset, update + return (_body(), _reduce_reset(), _reduce_update()) + + return te.decl_tensor_intrin( + out.op, + intrin_func, + name="CONV2D_CISC", + binds={inp: inp_layout, wgt: wgt_layout, bias: bias_layout, out: out_layout}, + ) + + +def conv2d_cisc( + env, + inp_shape: Tuple[int, ...], + wgt_shape: Tuple[int, ...], + bias_shape: Tuple[int, ...], + out_shape: Tuple[int, ...], + strides: int, + padding: List[int], + padding_value: int, + activation: int, + scale: float, + pool_size: List[int], + pool_strides: List[int], + pool_dilation: List[int], + pool_padding: List[int], +): + """2D convolution intrinsic, inserts the calls to the function provided + by the Gemmini developers to run a 2D convolution using the loop instructions + + Args: + env (Environment): Environment with configurations + inp_shape (Tuple[int,...]): Input feature map shape + wgt_shape (Tuple[int,...]): Weights shape + bias_shape (Tuple[int,...]): Bias shape + out_shape (Tuple[int,...]): Output feature map shape + strides (int): Convolution stride + padding (List[int]): Pixels to pad in each direction + padding_value (int): Value to use for padding + activation (int): Has activation? + scale (float): Output scaling factor + pool_size (List[int]): Size of the output pooling window + pool_strides (List[int]): Strides for the output pooling window + pool_dilation (List[int]): Dilation for the output pooling window. Not used for now. 
+ pool_padding (List[int]): Padding for the output pooling + + Returns: + TensorIntrin: CONV2D CISC tensor intrinsic + """ + _ = pool_dilation + # TODO (FP): add assertions here for the supported parameters? + + wgt = te.placeholder(wgt_shape, dtype=env.inp_dtype, name=env.scr_wgt_scope) + inp = te.placeholder(inp_shape, dtype=env.inp_dtype, name=env.scr_scope) + bias = te.placeholder(bias_shape, dtype=env.acc_dtype, name=env.scr_scope) + + k_h = wgt.shape[0] + k_w = wgt.shape[1] + + i_c = inp.shape[3] + + ric = te.reduce_axis((0, i_c), name="ric") + rkh = te.reduce_axis((0, k_h), name="rkh") + rkw = te.reduce_axis((0, k_w), name="rkw") + + hstr = strides[0] + wstr = strides[1] + + out = te.compute( + out_shape, + lambda b_o, i, j, c_o: te.sum( + inp[b_o, i * hstr + rkh, j * wstr + rkw, ric].astype(env.inp_dtype) + * wgt[rkh, rkw, ric, c_o].astype(env.inp_dtype) + + bias[c_o].astype(env.inp_dtype), + axis=[rkh, rkw, ric], + ), + ) + + wgt_layout = tvm.tir.decl_buffer(wgt_shape, env.inp_dtype, "wgt_buff") + inp_layout = tvm.tir.decl_buffer( + inp_shape, + env.inp_dtype, + "inp_buff", + strides=[te.var("inp_x"), te.var("inp_y"), te.var("inp_b"), te.var("inp_k")], + ) + bias_layout = tvm.tir.decl_buffer( + bias_shape, + env.acc_dtype, + "bias_buff", + ) + out_layout = tvm.tir.decl_buffer( + out_shape, + env.inp_dtype, + "out_buff", + ) + + def intrin_func(ins, outs): + """2D convolution intrinsic function""" + dinp, dwgt, dbias = ins + dout = outs[0] + + def _body(): + irb = tvm.tir.ir_builder.create() + if env.supports_non_zero_padding: + irb.emit( + tvm.tir.call_extern( + "", + "tiled_conv_auto", + dinp.shape[0], # BATCH_SIZE, + dinp.shape[1], # IN_DIM, + dinp.shape[3], # IN_CHANNELS, + dout.shape[3], # OUT_CHANNELS, + dout.shape[1], # OUT_DIM, + strides[0], + 1, + 1, + padding[2], + padding_value, + dwgt.shape[0], + 0, + 0, + 0, + 0, + 0, + dinp.access_ptr("r"), + dwgt.access_ptr("r"), + dbias.access_ptr("r"), + dout.access_ptr("w"), + activation, + scale, + pool_size[0], + pool_strides[0], + pool_padding[0], + 1, + ) + ) + else: + irb.emit( + tvm.tir.call_extern( + "", + "tiled_conv_auto", + dinp.shape[0], # BATCH_SIZE, + dinp.shape[1], # IN_DIM, + dinp.shape[3], # IN_CHANNELS, + dout.shape[3], # OUT_CHANNELS, + dout.shape[1], # OUT_DIM, + strides[0], + 1, + 1, + padding[2], + dwgt.shape[0], + 0, + 0, + 0, + 0, + 0, + dinp.access_ptr("r"), + dwgt.access_ptr("r"), + dbias.access_ptr("r"), + dout.access_ptr("w"), + activation, + scale, + pool_size[0], + pool_strides[0], + pool_padding[0], + 1, + ) + ) + return irb.get() + + def _reduce_reset(): + irb = tvm.tir.ir_builder.create() + return irb.get() + + def _reduce_update(): + return _body() + + # return a triple of normal-set, reset, update + return (_body(), _reduce_reset(), _reduce_update()) + + return te.decl_tensor_intrin( + out.op, + intrin_func, + name="CONV2D_CISC", + binds={inp: inp_layout, wgt: wgt_layout, bias: bias_layout, out: out_layout}, + ) + + +def dw_conv2d_cisc( + env, + inp_shape: Tuple[int, ...], + wgt_shape: Tuple[int, ...], + bias_shape: Tuple[int, ...], + out_shape: Tuple[int, ...], + strides: int, + padding: List[int], + padding_value: int, + activation: int, + scale: float, +): + """2D depthwise convolution intrinsic, inserts the calls to the function + provided by the Gemmini developers to run a 2D depthwise convolution using the loop instructions + + Args: + env (Environment): Environment with configurations + inp_shape (Tuple[int,...]): Input feature map shape + wgt_shape (Tuple[int,...]): Weights shape + 
bias_shape (Tuple[int,...]): Bias shape + out_shape (Tuple[int,...]): Output feature map shape + strides (int): Convolution stride + padding (List[int]): Pixels to pad in each direction + padding_value (int): Value to use for padding + activation (int): Has activation? + scale (float): Output scaling factor + + Returns: + TensorIntrin: depthwise convolution 2d tensor intrinsic + """ + + # TODO (FP): add assertions here for the supported parameters? + + wgt = te.placeholder(wgt_shape, dtype=env.inp_dtype, name=env.scr_wgt_scope) + inp = te.placeholder(inp_shape, dtype=env.inp_dtype, name=env.scr_scope) + bias = te.placeholder(bias_shape, dtype=env.acc_dtype, name=env.scr_scope) + + k_h = wgt.shape[1] + k_w = wgt.shape[2] + + rkh = te.reduce_axis((0, k_h), name="rkh") + rkw = te.reduce_axis((0, k_w), name="rkw") + + hstr = strides[0] + wstr = strides[1] + + out = te.compute( + out_shape, + lambda b_o, i, j, c_o: te.sum( + inp[b_o, i * hstr + rkh, j * wstr + rkw, c_o].astype(env.inp_dtype) + * wgt[c_o, rkh, rkw].astype(env.inp_dtype) + + bias[c_o].astype(env.inp_dtype), + axis=[rkh, rkw], + ), + ) + + wgt_layout = tvm.tir.decl_buffer( + wgt_shape, + env.inp_dtype, + "wgt_buff", + # strides=[te.var("wgt_i"),te.var("wgt_j")] + ) + inp_layout = tvm.tir.decl_buffer( + inp_shape, + env.inp_dtype, + "inp_buff", + strides=[te.var("inp_x"), te.var("inp_y"), te.var("inp_b"), te.var("inp_k")], + ) + bias_layout = tvm.tir.decl_buffer( + bias_shape, + env.acc_dtype, + "bias_buff", + ) + out_layout = tvm.tir.decl_buffer( + out_shape, + env.inp_dtype, + "out_buff", + ) + + def intrin_func(ins, outs): + """2D depthwise convolution intrinsic function""" + dinp, dwgt, dbias = ins + dout = outs[0] + + def _body(): + irb = tvm.tir.ir_builder.create() + if env.supports_non_zero_padding: + irb.emit( + tvm.tir.call_extern( + "", + "tiled_conv_dw_auto", + dinp.shape[0], # BATCH_SIZE, + dinp.shape[1], # IN_DIM, + dinp.shape[3], # IN_CHANNELS, + # dout.shape[3],#OUT_CHANNELS, + dout.shape[1], # OUT_DIM, + strides[0], + # 1, 1, + padding[2], + padding_value, + dwgt.shape[1], + # 0, 0, 0, 0, 0, + dinp.access_ptr("r"), + dwgt.access_ptr("r"), + dbias.access_ptr("r"), + dout.access_ptr("w"), + activation, + scale, + 1, + 0, + 0, + 1, + ) + ) + else: + irb.emit( + tvm.tir.call_extern( + "", + "tiled_conv_dw_auto", + dinp.shape[0], # BATCH_SIZE, + dinp.shape[1], # IN_DIM, + dinp.shape[3], # IN_CHANNELS, + # dout.shape[3],#OUT_CHANNELS, + dout.shape[1], # OUT_DIM, + strides[0], + # 1, 1, + padding[2], + dwgt.shape[1], + # 0, 0, 0, 0, 0, + dinp.access_ptr("r"), + dwgt.access_ptr("r"), + dbias.access_ptr("r"), + dout.access_ptr("w"), + activation, + scale, + 1, + 0, + 0, + 1, + ) + ) + + return irb.get() + + def _reduce_reset(): + irb = tvm.tir.ir_builder.create() + return irb.get() + + def _reduce_update(): + return _body() + + # return a triple of normal-set, reset, update + return (_body(), _reduce_reset(), _reduce_update()) + + return te.decl_tensor_intrin( + out.op, + intrin_func, + name="DWCONV2D_CISC", + binds={inp: inp_layout, wgt: wgt_layout, bias: bias_layout, out: out_layout}, + ) + + +def add_tensorize(env, oshape: Tuple[int, ...]): + """Add intrinsic, inserts the most basic Gemmini instructions to support the qnn.add operator + + Args: + env (Environment): Environment with configurations + oshape (Tuple[int,...]): Output feature map shape + + Returns: + TensorIntrin: add tensor intrinsic + """ + + # TODO (FP): add assertions here for the supported parameters? 
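+    # A possible starting point for those checks (a sketch, not part of the original patch):
+    #   assert len(oshape) == 2, "add intrinsic is declared over 2D tiles"
+    #   assert oshape[0] <= env.DIM, "a single mvin moves at most DIM rows"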
+ + ifm1 = te.placeholder(oshape, dtype=env.inp_dtype, name=env.acc_scope) + ifm2 = te.placeholder(oshape, dtype=env.inp_dtype, name=env.acc_scope) + + out = te.compute( + oshape, lambda i, j: ifm1[i, j].astype(env.inp_dtype) + ifm2[i, j].astype(env.inp_dtype) + ) + + ifm1_dtype = env.inp_dtype + + ifm1_layout = tvm.tir.decl_buffer( + oshape, + ifm1_dtype, + "ifm1_buff", + strides=[te.var("out_b"), te.var("out_x")], + offset_factor=env.DIM, + ) + ifm2_layout = tvm.tir.decl_buffer( + oshape, + env.inp_dtype, + "ifm2_buff", + scope=env.acc_scope, + strides=[te.var("out_b"), te.var("out_x")], + offset_factor=env.DIM, + ) + out_layout = tvm.tir.decl_buffer( + oshape, + env.inp_dtype, + "out_buff", + scope=env.acc_scope, + strides=[te.var("out_b"), te.var("out_x")], + offset_factor=env.DIM, + ) + + def intrin_func(ins, outs): + """Add intrinsic function""" + difm1, difm2 = ins + _ = outs + + def _body(): + irb = tvm.tir.ir_builder.create() + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvin2", + difm1.access_ptr("r"), + tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32") + + difm2.access_ptr("w", "uint32"), + difm1.shape[1], + difm1.shape[0], + ) + ) + + return irb.get() + + def _reduce_reset(): + irb = tvm.tir.ir_builder.create() + return irb.get() + + def _reduce_update(): + return _body() + + # return a triple of normal-set, reset, update + return (_body(), _reduce_reset(), _reduce_update()) + + return te.decl_tensor_intrin( + out.op, + intrin_func, + name="ADD", + binds={ifm1: ifm1_layout, ifm2: ifm2_layout, out: out_layout}, + ) + + +def add_mvout_tensorize(env, oshape: Tuple[int, ...]): + """Helper for the add intrinsic + + Args: + env (Environment): Environment with configurations + oshape (Tuple[int,...]): Output feature map shape + + Returns: + TensorIntrin: add mvout tensor intrinsic + """ + + # TODO (FP): add assertions here for the supported parameters? 
+ + ifm1 = te.placeholder(oshape, dtype=env.inp_dtype, name=env.acc_scope) + ifm2 = te.placeholder(oshape, dtype=env.inp_dtype, name=env.acc_scope) + + out = te.compute( + oshape, lambda i, j: ifm1[i, j].astype(env.inp_dtype) + ifm2[i, j].astype(env.inp_dtype) + ) + + ifm1_dtype = env.inp_dtype + + ifm1_layout = tvm.tir.decl_buffer( + oshape, + ifm1_dtype, + "ifm1_buff", + scope=env.acc_scope, + strides=[te.var("out_b"), te.var("out_x")], + offset_factor=env.DIM, + ) + ifm2_layout = tvm.tir.decl_buffer( + oshape, + env.inp_dtype, + "ifm2_buff", + scope=env.acc_scope, + strides=[te.var("out_b"), te.var("out_x")], + offset_factor=env.DIM, + ) + out_layout = tvm.tir.decl_buffer( + oshape, + env.inp_dtype, + "out_buff", + strides=[te.var("out_b"), te.var("out_x")], + offset_factor=env.DIM, + ) + + def intrin_func(ins, outs): + """Add mvout intrinsic function""" + difm1, difm2 = ins + dout = outs[0] + + def _body(): + irb = tvm.tir.ir_builder.create() + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvout", + dout.access_ptr("w"), + tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32") + + difm2.access_ptr("w", "uint32") + - tvm.runtime.const(0x40000000, "uint32"), + difm1.shape[1], + difm1.shape[0], + ) + ) + + return irb.get() + + def _reduce_reset(): + irb = tvm.tir.ir_builder.create() + return irb.get() + + def _reduce_update(): + return _body() + + # return a triple of normal-set, reset, update + return (_body(), _reduce_reset(), _reduce_update()) + + return te.decl_tensor_intrin( + out.op, + intrin_func, + name="ADD_MVOUT", + binds={ifm1: ifm1_layout, ifm2: ifm2_layout, out: out_layout}, + ) diff --git a/python/tvm/contrib/gemmini/legalize.py b/python/tvm/contrib/gemmini/legalize.py new file mode 100644 index 000000000000..c9a72eadbc07 --- /dev/null +++ b/python/tvm/contrib/gemmini/legalize.py @@ -0,0 +1,570 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+""" +A set of passes to legalize the Gemmini operators +===================== +""" + +from typing import Tuple +import tvm # type: ignore +from tvm import relay +from tvm import ir +from tvm.relay.dataflow_pattern import DFPatternCallback # type: ignore +from tvm.relay.dataflow_pattern import wildcard +from tvm.relay.dataflow_pattern import rewrite + +from tvm.relay.op import _make # type: ignore + +from .pattern_table import AddParams, CONV2DParams, GEMMParams, MaxPoolParams # type: ignore + + +def gemmini_gemm( + ifm1: tvm.relay.Expr, + ifm2: tvm.relay.Expr, + bias: tvm.relay.Expr, + ifm_scale: float, + ifm_offset: float, + bias_scale: float, + bias_offset: float, + ofm_scale: float, + ofm_offset: float, +) -> tvm.relay.Call: + """Generates the call to the contrib.gemmini.gemm operator + + Args: + ifm1 (tvm.relay.Expr): Input feature map 1 + ifm2 (tvm.relay.Expr): Input feature map 2 (weights) + bias (tvm.relay.Expr): Biases + ifm_scale (float): Input feature map scaling factor + ifm_offset (float): Input feature map offset + bias_scale (float): Biases scaling factor + bias_offset (float): Biases offset + ofm_scale (float): Output feature map scaling factor + ofm_offset (float): Output feature map offset + + Returns: + tvm.relay.Call: Call to the contrib.gemmini.gemm operator + """ + return _make.gemmini_gemm( + ifm1, ifm2, bias, ifm_scale, ifm_offset, bias_scale, bias_offset, ofm_scale, ofm_offset + ) + + +def gemmini_add( + ifm1: tvm.relay.Expr, + ifm2: tvm.relay.Expr, + ifm1_scale: float, + ifm1_offset: float, + ifm2_scale: float, + ifm2_offset: float, + ofm_scale: float, + ofm_offset: float, + shape: Tuple[int, ...], +) -> tvm.relay.Call: + """Generates the call to the contrib.gemmini.add operator + + Args: + ifm1 (tvm.relay.Expr): Input feature map 1 + ifm2 (tvm.relay.Expr): Input feature map 2 + ifm1_scale (float): Input feature map 1 scaling factor + ifm1_offset (float): Input feature map 1 offset + ifm2_scale (float): Input feature map 2 scaling factor + ifm2_offset (float): Input feature map 2 offset + ofm_scale (float): Output feature map scaling factor + ofm_offset (float): Output feature map offset + shape (Tuple[int,...]): Shape of the input feature maps and the output feature map + + Returns: + tvm.relay.Call: Call to the contrib.gemmini.add operator + """ + return _make.gemmini_add( + ifm1, + ifm2, + ifm1_scale, + ifm1_offset, + ifm2_scale, + ifm2_offset, + ofm_scale, + ofm_offset, + shape, + ) + + +def gemmini_conv2d( + data: tvm.relay.Expr, + weights: tvm.relay.Expr, + bias: tvm.relay.Expr, + strides: tuple, + padding: tuple, + ifm_scale: float, + ifm_offset: float, + weights_scale: float, + weights_offset: float, + bias_scale: float, + bias_offset: float, + ofm_scale: float, + ofm_offset: float, + activation: bool, + has_pool: bool, + pool_size: tvm.relay.Expr, + pool_strides: tvm.relay.Expr, + pool_dilation: tvm.relay.Expr, + pool_padding: tvm.relay.Expr, + input_req_offset_out: tvm.relay.Expr, + has_activation: bool, + activation_scale_in: tvm.relay.Expr, + activation_offset_in: tvm.relay.Expr, + activation_scale_out: tvm.relay.Expr, + activation_offset_out: tvm.relay.Expr, +) -> tvm.relay.Call: + """Generates the call to the contrib.gemmini.conv2d operator + + Args: + data (tvm.relay.Expr): Input feature map + weights (tvm.relay.Expr): Convolution weights matrix + bias (tvm.relay.Expr): Convolution biases matrix + strides (tuple): Convolution strides + padding (tuple): Convolution paddings in each direction + ifm_scale (float): Input feature map scaling factor + 
ifm_offset (float): Input feature map offset + weights_scale (float): Weights scaling factor + weights_offset (float): Convolution weights offset + bias_scale (float): Biases scaling factor + bias_offset (float): Biases weights offset + ofm_scale (float): Output feature map scaling factor + ofm_offset (float): Output feature map offset + activation (bool): TODO (FP): see if this can be deleted! Has activation? + has_pool (bool): Has pooling layer after the output of the convolution? + pool_size (tvm.relay.Expr): Pooling window size + pool_strides (tvm.relay.Expr): Pooling window strides + pool_dilation (tvm.relay.Expr): Pooling window dilation + pool_padding (tvm.relay.Expr): Pooling padding in each direction + input_req_offset_out (tvm.relay.Expr): Requantize layer output offset + has_activation (bool): Has activation? + activation_scale_in (tvm.relay.Expr): TODO (FP): check if this can be deleted + and made more simple. Activation layer input scaling factor + activation_offset_in (tvm.relay.Expr): TODO (FP): check if this can be deleted + and made more simple. Activation layer input offset + activation_scale_out (tvm.relay.Expr): TODO (FP): check if this can be deleted + and made more simple. Activation layer output scaling factor + activation_offset_out (tvm.relay.Expr): TODO (FP): check if this can be deleted + and made more simple. Activation layer output offset + + Returns: + tvm.relay.Call: Call to the contrib.gemmini.conv2d operator + """ + return _make.gemmini_conv2d( + data, + weights, + bias, + strides, + padding, + ifm_scale, + ifm_offset, + weights_scale, + weights_offset, + bias_scale, + bias_offset, + ofm_scale, + ofm_offset, + activation, + has_pool, + pool_size, + pool_strides, + pool_dilation, + pool_padding, + input_req_offset_out, + has_activation, + activation_scale_in, + activation_offset_in, + activation_scale_out, + activation_offset_out, + ) + + +def gemmini_depthwise_conv2d( + data: tvm.relay.Expr, + weights: tvm.relay.Expr, + bias: tvm.relay.Expr, + strides: tuple, + padding: tuple, + ifm_scale: float, + ifm_offset: float, + weights_scale: float, + weights_offset: float, + bias_scale: float, + bias_offset: float, + ofm_scale: float, + ofm_offset: float, + activation: bool, +) -> tvm.relay.Call: + """Generates the call to the contrib.gemmini.depthwiseconv2d operator + + Args: + data (tvm.relay.Expr): Input feature map + weights (tvm.relay.Expr): Convolution weights matrix + bias (tvm.relay.Expr): Convolution biases matrix + strides (tuple): Convolution strides + padding (tuple): Convolution paddings in each direction + ifm_scale (float): Input feature map scaling + ifm_offset (float): Input feature map offset + weights_scale (float): Convolution weights scaling factor + weights_offset (float): Convolution weights offset + bias_scale (float): Convolution biases scaling factor + bias_offset (float): Convolution biases offset + ofm_scale (float): Output feature map scaling + ofm_offset (float): Output feature map offset + activation (bool): Has activation? 
+ + Returns: + tvm.relay.Call: Call to the contrib.gemmini.depthwiseconv2d operator + """ + return _make.gemmini_depthwise_conv2d( + data, + weights, + bias, + strides, + padding, + ifm_scale, + ifm_offset, + weights_scale, + weights_offset, + bias_scale, + bias_offset, + ofm_scale, + ofm_offset, + activation, + ) + + +def gemmini_max_pool2d( + ifm: tvm.relay.Expr, + pool_size: tvm.relay.Expr, + pool_strides: tvm.relay.Expr, + pool_dilation: tvm.relay.Expr, + pool_padding: tvm.relay.Expr, + shape: tuple, +) -> tvm.relay.Call: + """Generates the call to the contrib.gemmini.max_pool2d operator + + Args: + ifm (tvm.relay.Expr): Input feature map + pool_size (tvm.relay.Expr): Pooling window size + pool_strides (tvm.relay.Expr): Pooling window strides + pool_dilation (tvm.relay.Expr): Pooling window dilation + pool_padding (tvm.relay.Expr): Pooling padding in each direction + shape (tuple): Input shape + + Returns: + tvm.relay.Call: Call to the contrib.gemmini.max_pool2d operator + """ + return _make.gemmini_max_pool2d( + ifm, pool_size, pool_strides, pool_dilation, pool_padding, shape + ) + + +class AddRewriter(DFPatternCallback): + """Convert add related composite functions into contrib.gemmini.add operators""" + + def __init__(self): + super().__init__(require_type=True) + self.pattern = (wildcard().has_attr({"Composite": AddParams.composite_name}))( + wildcard(), wildcard() + ) + + def callback( + self, pre: tvm.relay.Expr, post: tvm.relay.Expr, node_map: tvm.ir.container.Map + ) -> tvm.relay.Expr: + params = AddParams(post.op.body) + gemmini_add_op = gemmini_add( + post.args[0], + post.args[1], + params.ifm1_scale, + params.ifm1_offset, + params.ifm2_scale, + params.ifm2_offset, + params.ofm_scale, + params.ofm_offset, + params.output_shape, + ) + return gemmini_add_op + + +class GEMMRewriter(DFPatternCallback): + """Convert gemm related composite functions into contrib.gemmini.gemm operators""" + + def __init__(self): + super().__init__(require_type=True) + self.pattern = (wildcard().has_attr({"Composite": GEMMParams.composite_name}))( + wildcard(), wildcard(), wildcard() + ) + + def callback( + self, pre: tvm.relay.Expr, post: tvm.relay.Expr, node_map: tvm.ir.container.Map + ) -> tvm.relay.Expr: + params = GEMMParams(post.op.body) + gemmini_gemm_op = gemmini_gemm( + post.args[0], + post.args[1], + post.args[2], + params.ifm_scale, + params.ifm_offset, + params.bias_scale, + params.bias_offset, + params.ofm_scale, + params.ofm_offset, + ) + return gemmini_gemm_op + + +class CONV2DRewriter(DFPatternCallback): + """Convert conv2d related composite functions into contrib.gemmini.conv2d operators""" + + def __init__(self): + super().__init__(require_type=True) + self.pattern = (wildcard().has_attr({"Composite": CONV2DParams.composite_name}))( + wildcard(), wildcard(), wildcard() + ) + self.data_index = 0 + self.weights_index = 1 + self.bias_index = 2 + + def callback( + self, pre: tvm.relay.Expr, post: tvm.relay.Expr, node_map: tvm.ir.container.Map + ) -> tvm.relay.Expr: + params = CONV2DParams(post.op.body) + if params.has_external_pad: + self.weights_index = 2 + self.bias_index = 3 + else: + self.weights_index = 1 + self.bias_index = 2 + + bias = post.args[self.bias_index] + + if params.has_input_requantize: + data = relay.cast(post.args[self.data_index], "int8") + else: + data = post.args[self.data_index] + + if params.is_depthwise: + reshaped_weights = relay.squeeze( + relay.transpose(post.args[self.weights_index], [3, 0, 1, 2]), axis=[3] + ) + gemmini_depthwise_conv2d_op = 
gemmini_depthwise_conv2d( + data=data, + weights=reshaped_weights, + bias=bias, + strides=params.strides, + padding=params.padding, + ifm_scale=params.ifm_scale, + ifm_offset=params.ifm_offset, + weights_scale=params.weights_scale, + weights_offset=params.weights_offset, + bias_scale=params.bias_scale, + bias_offset=params.bias_offset, + ofm_scale=params.ofm_scale, + ofm_offset=params.ofm_offset, + activation=params.activation, + ) + return gemmini_depthwise_conv2d_op + else: + gemmini_conv2d_op = gemmini_conv2d( + data=data, + weights=post.args[self.weights_index], + bias=bias, + strides=params.strides, + padding=params.padding, + ifm_scale=params.ifm_scale, + ifm_offset=params.ifm_offset, + weights_scale=params.weights_scale, + weights_offset=params.weights_offset, + bias_scale=params.bias_scale, + bias_offset=params.bias_offset, + ofm_scale=params.ofm_scale, + ofm_offset=params.ofm_offset, + activation=params.activation, + has_pool=params.has_pool, + pool_size=params.pool_size, + pool_strides=params.pool_strides, + pool_dilation=params.pool_dilation, + pool_padding=params.pool_padding, + input_req_offset_out=params.input_offset_out, + has_activation=params.has_activation, + activation_scale_in=params.activation_scale_in, + activation_offset_in=params.activation_offset_in, + activation_scale_out=params.activation_scale_out, + activation_offset_out=params.activation_offset_out, + ) + return gemmini_conv2d_op + + +class CONV2DExternalPadRewriter(CONV2DRewriter): + def __init__(self): + super().__init__() + self.pattern = (wildcard().has_attr({"Composite": CONV2DParams.composite_name}))( + wildcard(), wildcard(), wildcard(), wildcard() + ) + self.data_index = 0 + + +class CONV2DExternalPadAndRelu6Rewriter(CONV2DRewriter): + def __init__(self): + super().__init__() + self.pattern = (wildcard().has_attr({"Composite": CONV2DParams.composite_name}))( + wildcard(), wildcard(), wildcard(), wildcard(), wildcard() + ) + self.data_index = 0 + self.min_index = 4 + + +class MAXPOOL2DRewriter(DFPatternCallback): + """Convert conv2d related composite functions into gemmini_max_pool2d operators""" + + def __init__(self): + super().__init__(require_type=True) + self.pattern = (wildcard().has_attr({"Composite": MaxPoolParams.composite_name}))( + wildcard() + ) + self.data_index = 0 + + def callback( + self, pre: tvm.relay.Expr, post: tvm.relay.Expr, node_map: tvm.ir.container.Map + ) -> tvm.relay.Expr: + params = MaxPoolParams(post.op.body) + + data = post.args[self.data_index] + + gemmini_max_pool2d_op = gemmini_max_pool2d( + ifm=data, + pool_size=params.pool_size, + pool_strides=params.pool_strides, + pool_dilation=params.pool_dilation, + pool_padding=params.pool_padding, + shape=params.shape, + ) + return gemmini_max_pool2d_op + + +@ir.transform.module_pass(opt_level=1) +class LegalizeAdd: + """This is the pass that wraps the AddRewriter""" + + def transform_module(self, mod: tvm.ir.IRModule, _) -> tvm.ir.IRModule: + for global_var, func in mod.functions.items(): + func = rewrite(AddRewriter(), func) + mod.update_func(global_var, func) + return mod + + def __call__(self, *args, **kwargs): + pass + + +@ir.transform.module_pass(opt_level=1) +class LegalizeMaxPool2D: + """This is the pass that wraps the MAXPOOL2DRewriter""" + + def transform_module(self, mod: tvm.ir.IRModule, _) -> tvm.ir.IRModule: + for global_var, func in mod.functions.items(): + func = rewrite(MAXPOOL2DRewriter(), func) + mod.update_func(global_var, func) + return mod + + def __call__(self, *args, **kwargs): + pass + + 
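+# Usage sketch (assumed typical flow; the actual driver code is outside this file):
+# the individual Legalize* wrappers above and below are normally not run one by one
+# but chained through the LegalizeGemmini pass defined at the end of this module,
+# after the graph has been merged against the "gemmini" pattern table, e.g.:
+#
+#     from tvm.relay import transform
+#     from tvm.relay.op.contrib.register import get_pattern_table
+#     from tvm.contrib.gemmini.legalize import LegalizeGemmini
+#
+#     mod = transform.MergeComposite(get_pattern_table("gemmini"))(mod)  # mod: relay.IRModule
+#     mod = LegalizeGemmini()(mod)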
+@ir.transform.module_pass(opt_level=1) +class LegalizeGEMM: + """This is the pass that wraps the GEMMRewriter""" + + def transform_module(self, mod: tvm.ir.IRModule, _) -> tvm.ir.IRModule: + for global_var, func in mod.functions.items(): + func = rewrite(GEMMRewriter(), func) + mod.update_func(global_var, func) + return mod + + def __call__(self, *args, **kwargs): + pass + + +@ir.transform.module_pass(opt_level=1) +class LegalizeCONV2D: + """This is the pass that wraps the CONV2DRewriter""" + + def transform_module(self, mod: tvm.ir.IRModule, _) -> tvm.ir.IRModule: + for global_var, func in mod.functions.items(): + func = rewrite(CONV2DRewriter(), func) + mod.update_func(global_var, func) + return mod + + def __call__(self, *args, **kwargs): + pass + + +@ir.transform.module_pass(opt_level=1) +class LegalizeCONV2DExternalPad: + """This is the pass that wraps the CONV2DExternalPadRewriter""" + + def transform_module(self, mod: tvm.ir.IRModule, _) -> tvm.ir.IRModule: + for global_var, func in mod.functions.items(): + func = rewrite(CONV2DExternalPadRewriter(), func) + mod.update_func(global_var, func) + return mod + + def __call__(self, *args, **kwargs): + pass + + +@ir.transform.module_pass(opt_level=1) +class LegalizeCONV2DExternalPadAndRelu6: + """This is the pass that wraps the CONV2DExternalPadAndRelu6Rewriter""" + + def transform_module(self, mod: tvm.ir.IRModule, _) -> tvm.ir.IRModule: + for global_var, func in mod.functions.items(): + func = rewrite(CONV2DExternalPadAndRelu6Rewriter(), func) + mod.update_func(global_var, func) + return mod + + def __call__(self, *args, **kwargs): + pass + + +@ir.transform.module_pass(opt_level=1) +class LegalizeGemmini: + """This is the pass to call graph-rewrites to perform graph transformation + in a way such that the operations are replaced with hardware/codegen supported + operations. + """ + + def transform_module(self, mod: tvm.ir.IRModule, _) -> tvm.ir.IRModule: + """This is the method that replaces the operations with hardware/codegen supported + operations. + """ + mod = LegalizeCONV2DExternalPadAndRelu6()(mod) + mod = LegalizeCONV2DExternalPad()(mod) + mod = LegalizeAdd()(mod) + mod = LegalizeCONV2D()(mod) + mod = LegalizeGEMM()(mod) + mod = LegalizeMaxPool2D()(mod) + return mod + + def __call__(self, *args, **kwargs): + # pylint is unable figure out the decorated + # class is callable, thus adding this to + # suppress the warning. + pass diff --git a/python/tvm/contrib/gemmini/pattern_table.py b/python/tvm/contrib/gemmini/pattern_table.py new file mode 100644 index 000000000000..8240640ac4e5 --- /dev/null +++ b/python/tvm/contrib/gemmini/pattern_table.py @@ -0,0 +1,465 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+""" +Pattern table declaring the supported Gemmini operators +===================== +""" + +from typing import Callable, List, Tuple + +import tvm # type: ignore +from tvm import relay +from tvm.relay.op.contrib.register import register_pattern_table # type: ignore +from tvm.relay.dataflow_pattern import is_constant, wildcard, is_op +from tvm.relay.frontend.common import infer_shape as _infer_shape +from .utils import QDenseArgs, RequantArgs, BinaryElementwiseArgs, QConv2DArgs + +from .environment import Environment + +ENV = Environment.instance() + + +class GEMMParams: + """ + This class will parse a Call to a gemmini.gemm composite function + """ + + composite_name = "gemmini.gemm" + + def __init__(self, func_body: tvm.relay.Function): + + dense_op = func_body.args[0] + self.weights = func_body.args[1] + requantize_op = func_body + + bias_add = requantize_op.args[0] + self.bias = bias_add.args[1] + dense_op = bias_add.args[0] + self.ifm_scale = dense_op.args[QDenseArgs.IFM_SCALE.value] + self.ifm_offset = dense_op.args[QDenseArgs.IFM_ZERO_POINT.value] + + if requantize_op.op.name == "qnn.requantize": + self.merge_requantize = True + self.bias_scale = requantize_op.args[RequantArgs.IFM_SCALE.value] + self.bias_offset = requantize_op.args[RequantArgs.IFM_ZERO_POINT.value] + self.ofm_scale = requantize_op.args[RequantArgs.OFM_SCALE.value] + self.ofm_offset = requantize_op.args[RequantArgs.OFM_ZERO_POINT.value] + else: + self.merge_requantize = False + self.bias_scale = tvm.relay.const([1.0], "float") + self.bias_offset = tvm.relay.const(0, "int32") + self.ofm_scale = tvm.relay.const(1.0, "float") + self.ofm_offset = tvm.relay.const(0, "int32") + + def is_valid(self) -> bool: + """ + This function checks whether gemmini.gemm has compatible attributes with the Gemmini + """ + # TODO (FP): complete this validation + return True + + +class AddParams: + """ + This class will parse a Call to a gemmini.add composite function + """ + + composite_name = "gemmini.add" + activation_map = {"clip": "CLIP"} + + def __init__(self, func_body: tvm.relay.Function): + if str(func_body.op) in self.activation_map: + add_op = func_body.args[0] + else: + add_op = func_body + + self.ifm1_scale = add_op.args[BinaryElementwiseArgs.IFM1_SCALE.value] + self.ifm1_offset = add_op.args[BinaryElementwiseArgs.IFM1_ZERO_POINT.value] + self.ifm2_scale = add_op.args[BinaryElementwiseArgs.IFM2_SCALE.value] + self.ifm2_offset = add_op.args[BinaryElementwiseArgs.IFM2_ZERO_POINT.value] + self.ofm_scale = add_op.args[BinaryElementwiseArgs.OFM_SCALE.value] + self.ofm_offset = add_op.args[BinaryElementwiseArgs.OFM_ZERO_POINT.value] + self.output_shape = _infer_shape(add_op) + self.ifm1_shape = _infer_shape(add_op.args[0]) + self.ifm2_shape = _infer_shape(add_op.args[1]) + + def is_valid(self) -> bool: + """ + This function checks whether gemmini.add has compatible attributes with the Gemmini + """ + # TODO (FP): complete this validation + # We only support 4 dimensions add operators... 
for now + if len(self.output_shape) != 4: + return False + if self.ifm1_shape != self.ifm2_shape: + return False + return True + + +class CONV2DParams: + """ + This class will parse a Call to a gemmini.conv2d composite function + """ + + composite_name = "gemmini.conv2d" + activation_map = {"clip": "CLIP"} + + def __init__(self, func_body: tvm.relay.Function): + activation = None + self.pool_size = [0, 0] + self.pool_strides = [0, 0] + self.pool_padding = [0, 0, 0, 0] + self.pool_dilation = [0, 0] + self.has_pool = False + self.has_activation = False + self.a_min = None + self.a_max = None + self.has_external_pad = False + self.activation_scale_in = tvm.relay.const(1.0, "float") + self.activation_offset_in = tvm.relay.const(0, "int32") + self.activation_scale_out = tvm.relay.const(1.0, "float") + self.activation_offset_out = tvm.relay.const(0, "int32") + + _op = func_body + + if _op.args[0].op.name != "nn.bias_add": + + if _op.op.name == "clip": + _op = _op.args[0] + else: + + if _op.op.name == "nn.max_pool2d": + max_pool = _op + self.pool_size = max_pool.attrs.pool_size + self.pool_strides = max_pool.attrs.strides + self.pool_padding = max_pool.attrs.padding + self.pool_dilation = max_pool.attrs.dilation + self.has_pool = True + _op = max_pool.args[0] + + if _op.op.name == "clip": + _op = _op.args[0] + elif _op.args[0].op.name == "clip": + self.activation_scale_in = _op.args[RequantArgs.IFM_SCALE.value] + self.activation_offset_in = _op.args[RequantArgs.IFM_ZERO_POINT.value] + self.activation_scale_out = _op.args[RequantArgs.OFM_SCALE.value] + self.activation_offset_out = _op.args[RequantArgs.OFM_ZERO_POINT.value] + clip = _op.args[0] + self.has_activation = True + _min = clip.args[0] + self.a_min = clip.attrs.a_min + self.a_max = clip.attrs.a_max + _op = _min.args[0] + + requantize_op = _op + + bias_add = requantize_op.args[0] + + conv2d_op = bias_add.args[0] + + self.has_input_requantize = False + self.input_scale_in = tvm.relay.const(1.0, "float") + self.input_offset_in = tvm.relay.const(0, "int32") + self.input_scale_out = tvm.relay.const(1.0, "float") + self.input_offset_out = tvm.relay.const(0, "int32") + + self.output_shape = _infer_shape(conv2d_op) + self.strides = conv2d_op.attrs.strides + self.padding = conv2d_op.attrs.padding + self.groups = conv2d_op.attrs.groups + self.is_depthwise = self.groups == conv2d_op.attrs.channels and self.groups != 1 + self.data = conv2d_op.args[0] + self.input_shape = _infer_shape(self.data) + if ( + not isinstance(self.data, relay.expr.Var) + and not isinstance(self.data.op, relay.function.Function) + and self.data.op.name == "nn.pad" + ): + padding = self.data.attrs.pad_width + self.padding = [padding[1][0], padding[2][0], padding[1][1], padding[2][1]] + self.has_external_pad = True + self.weights = conv2d_op.args[1] + self.weights_shape = _infer_shape(self.weights) + self.bias = bias_add.args[1] + self.ifm_scale = float(conv2d_op.args[QConv2DArgs.IFM_SCALE.value].data.numpy()) + self.ifm_offset = conv2d_op.args[QConv2DArgs.IFM_ZERO_POINT.value] + self.ifm_offset_const = conv2d_op.args[QConv2DArgs.IFM_ZERO_POINT.value] + self.weights_scale = 1.0 + self.weights_offset = 0.0 + + if requantize_op.op.name == "qnn.requantize": + self.bias_scale = requantize_op.args[RequantArgs.IFM_SCALE.value] + self.bias_offset = requantize_op.args[RequantArgs.IFM_ZERO_POINT.value] + self.ofm_scale = requantize_op.args[RequantArgs.OFM_SCALE.value] + self.ofm_offset = requantize_op.args[RequantArgs.OFM_ZERO_POINT.value] + else: + self.bias_scale = 
tvm.relay.const([1.0], "float") + self.bias_offset = tvm.relay.const(0, "int32") + self.ofm_scale = tvm.relay.const(1.0, "float") + self.ofm_offset = tvm.relay.const(0, "int32") + + if activation is not None: + self.activation = False + else: + self.activation = False + + def is_valid(self) -> bool: + """ + This function checks whether gemmini.conv2d has compatible attributes with the Gemmini + """ + # TODO (FP): complete this validation + if len(set(self.pool_padding)) != 1 or len(set(self.pool_strides)) != 1: + return False + + if self.has_input_requantize: + if ( + self.input_scale_in.data.numpy() != self.input_scale_out.data.numpy() + or self.input_offset_in.data.numpy() != 0 + ): + # Only this specific cases are supported, for now... + return False + + if self.a_max is not None and self.a_max != 127: + return False + + return True + + +class DepthwiseCONV2DParams(CONV2DParams): + """ + This class will parse a Call to a gemmini.depthwiseconv2d composite function + """ + + composite_name = "gemmini.depthwiseconv2d" + activation_map = {"clip": "CLIP"} + + +class MaxPoolParams: + """ + This class will parse a Call to a gemmini.max_pool2d composite function + """ + + composite_name = "gemmini.max_pool2d" + + def __init__(self, func_body: tvm.relay.Function): + self.pool_size = func_body.attrs.pool_size + self.pool_strides = func_body.attrs.strides + self.pool_padding = func_body.attrs.padding + self.pool_dilation = func_body.attrs.dilation + self.shape = _infer_shape(func_body) + + def is_valid(self) -> bool: + """ + This function checks whether max_pool2d has compatible attributes with the Gemmini + """ + # TODO (FP): complete this validation? + if len(set(self.pool_padding)) != 1: + return False + if (self.shape[1] != self.shape[2]) or self.shape[1] == 1: + return False + return True + + +def make_dense_pattern() -> tvm.relay.dataflow_pattern.DFPattern: + """Create patterns related to qnn.dense. + + Parameters + ---------- + + Returns + ------- + dense_out : CallPattern + Call node sequence. + """ + data = wildcard() + weight = wildcard() + bias = wildcard() + dense = is_op("qnn.dense")( + data, weight, is_constant(), is_constant(), is_constant(), is_constant() + ) + bias_add = is_op("nn.bias_add")( + dense, + bias, + ) + req = is_op("qnn.requantize")( + bias_add, is_constant(), is_constant(), is_constant(), is_constant() + ) + return req + + +def make_add_pattern() -> tvm.relay.dataflow_pattern.DFPattern: + """Create patterns related to qnn.add. + + Parameters + ---------- + + Returns + ------- + add_out : CallPattern + Call node sequence. + """ + ifm1 = wildcard() + ifm2 = wildcard() + add_out = is_op("qnn.add")( + ifm1, + ifm2, + is_constant(), + is_constant(), + is_constant(), + is_constant(), + is_constant(), + is_constant(), + ) + clip_or_req = add_out.optional(is_op("clip")) + return clip_or_req + + +def make_conv2d_pattern( + with_padded_input: bool = False, with_maxpool: bool = False, with_relu_6: bool = False +) -> tvm.relay.dataflow_pattern.DFPattern: + """Create patterns related to qnn.conv2d. + + Parameters + ---------- + + Returns + ------- + conv2d_out : CallPattern + Call node sequence. 
+ """ + data = wildcard() + if with_padded_input: + data = is_op("nn.pad")(data, wildcard()) + weight = wildcard() + bias = wildcard() + conv2d_out = is_op("qnn.conv2d")( + data, weight, is_constant(), is_constant(), is_constant(), is_constant() + ) + bias_add = is_op("nn.bias_add")( + conv2d_out, + bias, + ) + output = is_op("qnn.requantize")( + bias_add, is_constant(), is_constant(), is_constant(), is_constant() + ) + if with_relu_6: + output = is_op("minimum")(output, wildcard()) + output = is_op("clip")(output) + output = is_op("qnn.requantize")( + output, is_constant(), is_constant(), is_constant(), is_constant() + ) + else: + output = output.optional(is_op("clip")) + if with_maxpool: + output = output.optional(is_op("nn.max_pool2d")) + return output + else: + return output + + +def make_depthwiseconv2d_pattern() -> tvm.relay.dataflow_pattern.DFPattern: + """Create patterns related to qnn.conv2d, but only if it is a depthwise convolution. + + Parameters + ---------- + + Returns + ------- + conv2d_out : CallPattern + Call node sequence. + """ + data = wildcard() + weight = wildcard() + bias = wildcard() + conv2d_out = is_op("qnn.conv2d")( + data, weight, is_constant(), is_constant(), is_constant(), is_constant() + ).has_attr({"kernel_layout": "HWOI"}) + bias_add = is_op("nn.bias_add")( + conv2d_out, + bias, + ) + output = is_op("qnn.requantize")( + bias_add, is_constant(), is_constant(), is_constant(), is_constant() + ) + clip_or_req = output.optional(is_op("clip")) + return clip_or_req + + +def make_maxpool_pattern() -> tvm.relay.dataflow_pattern.DFPattern: + """Create patterns related to nn.max_pool2d. + + Parameters + ---------- + + Returns + ------- + max_pool2d : CallPattern + Call node sequence. + """ + max_pool2d = is_op("nn.max_pool2d")(wildcard()) + return max_pool2d + + +@register_pattern_table("gemmini") +def pattern_table() -> List[Tuple[str, tvm.relay.dataflow_pattern.DFPattern, Callable]]: + """Declares Gemminis pattern table + + Returns: + List[Tuple[str, tvm.relay.dataflow_pattern.DFPattern, Callable]]: + List of pattern, callable tuples + """ + + pattern_table_filters = [] + pattern_table_filters.append( + ( + GEMMParams.composite_name, + make_dense_pattern(), + lambda pat: GEMMParams(pat).is_valid(), + ) + ) + + for pad in [True, False]: + for max_pool in [True, False]: + for relu6 in [True, False]: + pattern_table_filters.append( + ( + CONV2DParams.composite_name, + make_conv2d_pattern( + with_padded_input=pad, with_maxpool=max_pool, with_relu_6=relu6 + ), + lambda pat: CONV2DParams(pat).is_valid(), + ) + ) + + pattern_table_filters.append( + ( + MaxPoolParams.composite_name, + make_maxpool_pattern(), + lambda pat: MaxPoolParams(pat).is_valid(), + ) + ) + + if ENV.use_experimental_qnn_add: + pattern_table_filters.append( + ( + AddParams.composite_name, + make_add_pattern(), + lambda pat: AddParams(pat).is_valid(), + ) + ) + + return pattern_table_filters diff --git a/python/tvm/contrib/gemmini/transform.py b/python/tvm/contrib/gemmini/transform.py new file mode 100644 index 000000000000..bd377771ce32 --- /dev/null +++ b/python/tvm/contrib/gemmini/transform.py @@ -0,0 +1,782 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=len-as-condition, no-else-return, unused-argument, invalid-name +""" +Transformation passes for Gemmini +===================== +""" + +import ast +from typing import Dict +import tvm +from tvm.tir.ir_builder import IRBuilder + +from .environment import Environment + +env = Environment.instance() + + +def _get_counters(irb: IRBuilder): + """Generates calls to print the values of the configured timers + + Args: + irb (IRBuilder): IRBuilder + """ + irb.emit(tvm.tir.call_extern("", "counter_snapshot_take")) + irb.emit(tvm.tir.call_extern("", "printf", "Counter values:\\r\\n")) + counter_vars = [] + for i, (_, value) in enumerate(env.enabled_counters.items()): + counter_var = irb.let( + value.lower() + "_var", tvm.tir.call_extern("uint32", "counter_read", i) + ) + counter_vars.append(counter_var) + irb.emit(tvm.tir.call_extern("", "printf", tvm.tir.StringImm(f"{value},"))) + irb.emit(tvm.tir.call_extern("", "printf", "\\r\\n")) + for c in counter_vars: + irb.emit(tvm.tir.call_extern("", "printf", tvm.tir.StringImm("%lu,"), c)) + irb.emit(tvm.tir.call_extern("", "printf", "\\r\\n")) + + +def _configure_timers(irb: IRBuilder): + """Generates calls to configure the enabled counters + + Args: + irb (IRBuilder): IRBuilder + """ + for i, (key, _) in enumerate(env.enabled_counters.items()): + irb.emit(tvm.tir.call_extern("", "counter_configure", i, key)) + + +def _reset_counters(irb: IRBuilder): + """Generates calls to reset all Gemmini counters + + Args: + irb (IRBuilder): IRBuilder + """ + irb.emit(tvm.tir.call_extern("", "counter_reset")) + irb.emit(tvm.tir.call_extern("", "counter_snapshot_reset")) + + +def _match_pragma(stmt, key): + """Internal helper to match stmt to pragma stmt. 
+ + Parameters + ---------- + stmt : Stmt + The AttrStmt + + key : str + The pragma key + """ + return (stmt.attr_key == "pragma_" + key) or ( + stmt.attr_key == "pragma_scope" and stmt.value.value == key + ) + + +def _get_config_dict_from_str(str_value: str) -> Dict: + """Returns a configuration dictionary from its string representation + + Args: + str_value (str): Dictionary encoded in a string + + Returns: + Dict: Configuration dictionary + """ + return ast.literal_eval(str(str_value).replace("'", '"').replace('"{', "{").replace('}"', "}")) + + +def _gen_debug_header(irb: IRBuilder): + """If the debug flag is activated in the environment, generate the debug headers for the code + + Args: + irb (IRBuilder): _description_ + """ + if env.debug: + _configure_timers(irb) + _reset_counters(irb) + + +def _gen_debug_tail(irb: IRBuilder): + """If the debug flag is activated in the environment, generate the debug tails for the code + + Args: + irb (IRBuilder): _description_ + """ + if env.debug: + _get_counters(irb) + + +def InsertGemminiHeaderOperators(): + """Pass to generate the calls to the Gemmini configuration instructions""" + + def _do_fold(stmt): + if _match_pragma(stmt, "add_start"): + irb = tvm.tir.ir_builder.create() + _gen_debug_header(irb) + + irb.emit(tvm.tir.call_extern("", "gemmini_flush", 0)) + + config_dict = _get_config_dict_from_str(stmt.body.value) + A_size = config_dict["A_size"] + B_size = config_dict["B_size"] + C_size = config_dict["C_size"] + A_private_stride = config_dict["A_private_stride"] + B_private_stride = config_dict["B_private_stride"] + execution_stride = config_dict["execution_stride"] + activation = config_dict["activation"] + mode = config_dict["mode"] + max_pixels_per_row = config_dict["max_pixels_per_row"] + ifm1_scale = config_dict["ifm1_scale"] + ifm2_scale = config_dict["ifm2_scale"] + scale = config_dict["scale"] + act = 1 if activation else 0 + + shrunk = 1 + irb.emit(tvm.tir.call_extern("", "gemmini_config_ex", mode, act, 0)) + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended4_config_ld", + A_size, + ifm1_scale, + shrunk, + A_private_stride, + 0, + ) + ) + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended4_config_ld", + B_size, + ifm2_scale, + shrunk, + B_private_stride, + 1, + ) + ) + irb.emit( + tvm.tir.call_extern( + "", "gemmini_extended4_config_ld", C_size * 4, scale, 0, env.DIM, 2 + ) + ) + irb.emit(tvm.tir.call_extern("", "gemmini_extended_config_st", C_size, act, scale)) + + return tvm.tir.SeqStmt([irb.get(), stmt]) + elif _match_pragma(stmt, "gemm_start"): + irb = tvm.tir.ir_builder.create() + _gen_debug_header(irb) + + irb.emit(tvm.tir.call_extern("", "gemmini_flush", 0)) + + config_dict = _get_config_dict_from_str(stmt.body.value) + A_size = config_dict["A_size"] + B_size = config_dict["B_size"] + C_size = config_dict["C_size"] + A_private_stride = config_dict["A_private_stride"] + B_private_stride = config_dict["B_private_stride"] + execution_stride = config_dict["execution_stride"] + activation = config_dict["activation"] + mode = config_dict["mode"] + max_pixels_per_row = config_dict["max_pixels_per_row"] + scale = config_dict["scale"] + padding_value = config_dict["padding_value"] + act = 1 if activation else 0 + + irb.emit( + tvm.tir.call_extern( + "", "gemmini_extended_config_ex", mode, act, 0, execution_stride, 0, 0 + ) + ) + if padding_value == 0: + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended5_config_ld", + A_size, + 1.0, + 0, + A_private_stride, + max_pixels_per_row, + 0, + ) + ) + else: + 
irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended6_config_ld", + A_size, + 1.0, + 0, + A_private_stride, + max_pixels_per_row, + padding_value, + 0, + ) + ) + irb.emit( + tvm.tir.call_extern( + "", "gemmini_extended5_config_ld", B_size, 1.0, 0, B_private_stride, 1, 1 + ) + ) + irb.emit(tvm.tir.call_extern("", "gemmini_extended4_config_ld", 0, 1.0, 0, env.DIM, 2)) + irb.emit(tvm.tir.call_extern("", "gemmini_extended_config_st", C_size, act, scale)) + + return tvm.tir.SeqStmt([irb.get(), stmt]) + elif _match_pragma(stmt, "gemm_cisc_start"): + irb = tvm.tir.ir_builder.create() + _gen_debug_header(irb) + + irb.emit(tvm.tir.call_extern("", "gemmini_flush", 0)) + return tvm.tir.SeqStmt([irb.get(), stmt]) + elif _match_pragma(stmt, "conv2d_cisc_start") or _match_pragma( + stmt, "dw_conv2d_cisc_start" + ): + irb = tvm.tir.ir_builder.create() + _gen_debug_header(irb) + + return tvm.tir.SeqStmt([irb.get(), stmt]) + return None + + def _ftransform(f, mod, ctx): + return f.with_body( + tvm.tir.stmt_functor.ir_transform(f.body, _do_fold, None, ["tir.AttrStmt"]) + ) + + return tvm.tir.transform.prim_func_pass( + _ftransform, opt_level=0, name="tir.gemmini.insert_header_operators" + ) + + +def InsertGemminiFenceOperator(): + """Pass to generate the call to the fence instruction at the end of the operator""" + + def _do_fold(stmt): + if _match_pragma(stmt, "gemm_end"): + irb = tvm.tir.ir_builder.create() + irb.emit(tvm.tir.call_extern("", "gemmini_fence")) + _gen_debug_tail(irb) + + return tvm.tir.SeqStmt([stmt, irb.get()]) + return None + + def _ftransform(f, mod, ctx): + return f.with_body( + tvm.tir.stmt_functor.ir_transform(f.body, _do_fold, None, ["tir.AttrStmt"]) + ) + + return tvm.tir.transform.prim_func_pass( + _ftransform, opt_level=0, name="tir.gemmini.insert_fence_operators" + ) + + +def InjectAMVINIntrin(): + """Pass to inject A mvin intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... + _ = pad_value + if dst.scope() == "global": + raise RuntimeError("A mvin should have a local destination") + if src.scope() == "global": + # Load + irb = tvm.tir.ir_builder.create() + if len(src.shape) == 1: + cols = 1 + else: + cols = src.shape[1] + rows = src.shape[0] + dst_access_ptr = dst.access_ptr("w", "uint32") + + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvin", + src.access_ptr("r"), + tvm.runtime.const(env.INP_SCR_BASE_ADDRESS, "uint8") + dst_access_ptr, + cols, + rows, + ) + ) + + return irb.get() + else: + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") + + return tvm.tir.transform.InjectCopyIntrin(env.A_mvin, _inject_copy) + + +def InjectAMVINIntrinTransposed(): + """Pass to inject A mvin intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... + _ = pad_value + if dst.scope() == "global": + raise RuntimeError("A mvin should have a local destination") + if src.scope() == "global": + # Load + irb = tvm.tir.ir_builder.create() + # TODO (FP): check this pointers types again! 
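+            # Same lowering as InjectAMVINIntrin above, except rows and cols are taken
+            # from the swapped shape dimensions, since this variant (registered under
+            # the pragma name env.A_mvin + "_t") handles the transposed data layout.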
+ if len(src.shape) == 1: + rows = 1 + else: + rows = src.shape[1] + cols = src.shape[0] + dst_access_ptr = dst.access_ptr("w", "uint32") + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvin", + src.access_ptr("r"), + tvm.runtime.const(env.INP_SCR_BASE_ADDRESS, "uint8") + dst_access_ptr, + cols, + rows, + ) + ) + return irb.get() + else: + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") + + return tvm.tir.transform.InjectCopyIntrin(env.A_mvin + "_t", _inject_copy) + + +def InjectBMVINIntrin(): + """Pass to inject B mvin intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... + _ = pad_value + wgt_base_address = tvm.runtime.const(env.WGT_SCR_BASE_ADDRESS, "int32") + if dst.scope() == "global": + raise RuntimeError("B mvin should have a local destination") + if src.scope() == "global": + # Load + irb = tvm.tir.ir_builder.create() + if len(src.shape) == 1: + cols = 1 + else: + cols = src.shape[1] + rows = src.shape[0] + dst_access_ptr = dst.access_ptr("r", "int32") + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvin2", + src.access_ptr("r"), + wgt_base_address + dst_access_ptr, + cols, + rows, + ) + ) + return irb.get() + else: + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") + + return tvm.tir.transform.InjectCopyIntrin(env.B_mvin, _inject_copy) + + +def InjectBMVINIntrinTransposed(): + """Pass to inject B mvin intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... + _ = pad_value + if dst.scope() == "global": + raise RuntimeError("B mvin should have a local destination") + if src.scope() == "global": + # Load + irb = tvm.tir.ir_builder.create() + if len(src.shape) == 1: + rows = 1 + else: + rows = src.shape[1] + cols = src.shape[0] + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvin2", + src.access_ptr("r"), + tvm.runtime.const(env.WGT_SCR_BASE_ADDRESS, "uint32") + + dst.access_ptr("w", "uint32"), + cols, + rows, + ) + ) + return irb.get() + else: + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") + + return tvm.tir.transform.InjectCopyIntrin(env.B_mvin + "_t", _inject_copy) + + +def InjectDMVINIntrin(): + """Pass to inject D mvin intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... + _ = pad_value + if dst.scope() == "global": + raise RuntimeError("D mvin should have a local destination") + if src.scope() == "global": + # Load + irb = tvm.tir.ir_builder.create() + if len(src.shape) == 1: + cols = 1 + else: + cols = src.shape[1] + rows = src.shape[0] + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvin3", + src.access_ptr("r"), + tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32") + + dst.access_ptr("w", "uint32") + - tvm.runtime.const(0x40000000, "uint32"), + cols, + rows, + ) + ) + return irb.get() + else: + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") + + return tvm.tir.transform.InjectCopyIntrin(env.D_mvin, _inject_copy) + + +def InjectDMVINIntrinTransposed(): + """Pass to inject D mvin intrinsics. 
+ + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... + _ = pad_value + if dst.scope() == "global": + raise RuntimeError("D mvin should have a local destination") + if src.scope() == "global": + # Load + irb = tvm.tir.ir_builder.create() + if len(src.shape) == 1: + rows = 1 + else: + rows = src.shape[1] + cols = src.shape[0] + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvin3", + src.access_ptr("r"), + tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32") + + dst.access_ptr("w", "uint32") + - tvm.runtime.const(0x40000000, "uint32"), + cols, + rows, + ) + ) + return irb.get() + else: + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") + + return tvm.tir.transform.InjectCopyIntrin(env.D_mvin + "_t", _inject_copy) + + +def InjectCMVOUTIntrin(): + """Pass to inject C mvout intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... + _ = pad_value + if src.scope() == "global": + raise RuntimeError("C mvout should have a local source") + if dst.scope() == "global": + # Store + irb = tvm.tir.ir_builder.create() + if len(dst.shape) == 1: + cols = 1 + else: + cols = dst.shape[1] + rows = dst.shape[0] + out_access_ptr = src.access_ptr("w", "uint32") + get_full_width = tvm.runtime.const(0x00000000, "uint32") + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvout", + dst.access_ptr("w"), + tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32") + + out_access_ptr + - tvm.runtime.const(0x40000000, "uint32") + + get_full_width, + cols, + rows, + ) + ) + return irb.get() + else: + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") + + return tvm.tir.transform.InjectCopyIntrin(env.C_mvout, _inject_copy) + + +def InjectCMVOUTIntrinTransposed(): + """Pass to inject C mvout intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... + _ = pad_value + if src.scope() == "global": + raise RuntimeError("C mvout should have a local source") + if dst.scope() == "global": + # Store + irb = tvm.tir.ir_builder.create() + # TODO (FP): check this pointers types again! + if len(dst.shape) == 1: + rows = 1 + else: + rows = dst.shape[1] + cols = dst.shape[0] + out_access_ptr = src.access_ptr("w", "uint32") + get_full_width = tvm.runtime.const(0x00000000, "uint32") + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvout", + dst.access_ptr("w"), + tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32") + + out_access_ptr + - tvm.runtime.const(0x40000000, "uint32") + + get_full_width, + cols, + rows, + ) + ) + return irb.get() + else: + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") + + return tvm.tir.transform.InjectCopyIntrin(env.C_mvout + "_t", _inject_copy) + + +def InjectCMVINIntrin(): + """Pass to inject C mvin intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... 
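+        # Destination address below: OUT_ACC_BASE_ADDRESS plus the buffer offset,
+        # with 0x40000000 subtracted (as in the D mvin passes above). The subtraction
+        # presumably clears the accumulate bit, so this mvin overwrites the
+        # accumulator contents instead of adding onto them.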
+ _ = pad_value + if dst.scope() == "global": + raise RuntimeError("C mvin should have a local destination") + if src.scope() == "global": + # Load + irb = tvm.tir.ir_builder.create() + if len(src.shape) == 1: + cols = 1 + else: + cols = src.shape[1] + rows = src.shape[0] + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvin", + src.access_ptr("r"), + tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32") + + dst.access_ptr("w", "uint32") + - tvm.runtime.const(0x40000000, "uint32"), + cols, + rows, + ) + ) + return irb.get() + else: + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") + + return tvm.tir.transform.InjectCopyIntrin(env.C_mvin, _inject_copy) + + +def InjectCMVINIntrinTransposed(): + """Pass to inject C mvin intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... + _ = pad_value + if dst.scope() == "global": + raise RuntimeError("C mvin should have a local destination") + if src.scope() == "global": + # Load + irb = tvm.tir.ir_builder.create() + if len(src.shape) == 1: + rows = 1 + else: + rows = src.shape[1] + cols = src.shape[0] + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvin", + src.access_ptr("r"), + tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32") + + dst.access_ptr("w", "uint32") + - tvm.runtime.const(0x40000000, "uint32"), + cols, + rows, + ) + ) + return irb.get() + else: + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") + + return tvm.tir.transform.InjectCopyIntrin(env.C_mvin + "_t", _inject_copy) + + +def InjectCMVINAccumIntrin(): + """Pass to inject C mvin accum intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... + _ = pad_value + if dst.scope() == "global": + raise RuntimeError("C mvin should have a local destination") + if src.scope() == "global": + # Load + irb = tvm.tir.ir_builder.create() + if len(src.shape) == 1: + cols = 1 + else: + cols = src.shape[1] + rows = src.shape[0] + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvin3", + src.access_ptr("r"), + tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32") + + dst.access_ptr("w", "uint32"), + cols, + rows, + ) + ) + return irb.get() + else: + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") + + return tvm.tir.transform.InjectCopyIntrin(env.C_mvin_accum, _inject_copy) + + +def InjectCMVINAccumIntrinTransposed(): + """Pass to inject C mvin accum intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... 
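+        # Unlike the plain C mvin passes above, the accumulating variants keep the
+        # 0x40000000 offset in the destination address, so the gemmini_extended_mvin3
+        # emitted below presumably adds the moved-in data onto what is already held
+        # in the accumulator rather than overwriting it.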
+ _ = pad_value + if dst.scope() == "global": + raise RuntimeError("C mvin should have a local destination") + if src.scope() == "global": + # Load + irb = tvm.tir.ir_builder.create() + if len(src.shape) == 1: + rows = 1 + else: + rows = src.shape[1] + cols = src.shape[0] + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvin3", + src.access_ptr("r"), + tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32") + + dst.access_ptr("w", "uint32"), + cols, + rows, + ) + ) + return irb.get() + else: + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") + + return tvm.tir.transform.InjectCopyIntrin(env.C_mvin_accum + "_t", _inject_copy) diff --git a/python/tvm/contrib/gemmini/utils.py b/python/tvm/contrib/gemmini/utils.py new file mode 100644 index 000000000000..22428ee2d3c7 --- /dev/null +++ b/python/tvm/contrib/gemmini/utils.py @@ -0,0 +1,141 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Useful enumerations and others +===================== +""" + +from enum import Enum + +COUNTERS = { + 1: "MAIN_LD_CYCLES", + 2: "MAIN_ST_CYCLES", + 3: "MAIN_EX_CYCLES", + 4: "MAIN_LD_ST_CYCLES", + 5: "MAIN_LD_EX_CYCLES", + 6: "MAIN_ST_EX_CYCLES", + 7: "MAIN_LD_ST_EX_CYCLES", + 8: "LOAD_DMA_WAIT_CYCLE", + 9: "LOAD_ACTIVE_CYCLE", + 10: "LOAD_SCRATCHPAD_WAIT_CYCLE", + 11: "STORE_DMA_WAIT_CYCLE", + 12: "STORE_ACTIVE_CYCLE", + 13: "STORE_POOLING_CYCLE", + 14: "STORE_SCRATCHPAD_WAIT_CYCLE", + 15: "DMA_TLB_MISS_CYCLE", + 16: "DMA_TLB_HIT_REQ", + 17: "DMA_TLB_TOTAL_REQ", + 18: "RDMA_ACTIVE_CYCLE", + 19: "RDMA_TLB_WAIT_CYCLES", + 20: "RDMA_TL_WAIT_CYCLES", + 21: "WDMA_ACTIVE_CYCLE", + 22: "WDMA_TLB_WAIT_CYCLES", + 23: "WDMA_TL_WAIT_CYCLES", + 24: "EXE_ACTIVE_CYCLE", + 25: "EXE_FLUSH_CYCLE", + 26: "EXE_CONTROL_Q_BLOCK_CYCLE", + 27: "EXE_PRELOAD_HAZ_CYCLE", + 28: "EXE_OVERLAP_HAZ_CYCLE", + 29: "SCRATCHPAD_A_WAIT_CYCLE", + 30: "SCRATCHPAD_B_WAIT_CYCLE", + 31: "SCRATCHPAD_D_WAIT_CYCLE", + 32: "ACC_A_WAIT_CYCLE", + 33: "ACC_B_WAIT_CYCLE", + 34: "ACC_D_WAIT_CYCLE", + 35: "A_GARBAGE_CYCLES", + 36: "B_GARBAGE_CYCLES", + 37: "D_GARBAGE_CYCLES", + 38: "IM2COL_MEM_CYCLES", + 39: "IM2COL_ACTIVE_CYCLES", + 40: "IM2COL_TRANSPOSER_WAIT_CYCLE", + 41: "RESERVATION_STATION_FULL_CYCLES", + 42: "RESERVATION_STATION_ACTIVE_CYCLES", + 43: "LOOP_MATMUL_ACTIVE_CYCLES", + 44: "TRANSPOSE_PRELOAD_UNROLLER_ACTIVE_CYCLES", + 45: "RESERVATION_STATION_LD_COUNT", + 46: "RESERVATION_STATION_ST_COUNT", + 47: "RESERVATION_STATION_EX_COUNT", + 48: "RDMA_BYTES_REC", + 49: "WDMA_BYTES_SENT", + 50: "RDMA_TOTAL_LATENCY", + 51: "WDMA_TOTAL_LATENCY", +} + + +class ClipArgs(Enum): + """ + This is a helper enums to obtain the correct index + of clip arguments. 
+ """ + + A_MIN = 1 + A_MAX = 2 + + +class BinaryElementwiseArgs(Enum): + """This is a helper enums to access the correct index + of binary elementwise arguments + """ + + IFM1 = 0 + IFM2 = 1 + IFM1_SCALE = 2 + IFM1_ZERO_POINT = 3 + IFM2_SCALE = 4 + IFM2_ZERO_POINT = 5 + OFM_SCALE = 6 + OFM_ZERO_POINT = 7 + + +class QDenseArgs(Enum): + """ + This is a helper enum to access the correct index of + qnn.dense arguments + """ + + IFM = 0 + WEIGHTS = 1 + IFM_ZERO_POINT = 2 + WEIGHTS_ZERO_POINT = 3 + IFM_SCALE = 4 + WEIGHTS_SCALE = 5 + + +class QConv2DArgs(Enum): + """ + This is a helper enum to obtain the correct index + of qnn.conv2d arguments. + """ + + IFM = 0 + WEIGHTS = 1 + IFM_ZERO_POINT = 2 + WEIGHTS_ZERO_POINT = 3 + IFM_SCALE = 4 + WEIGHTS_SCALE = 5 + + +class RequantArgs(Enum): + """ + This is a helper enum to obtain the correct index + of qnn.requantize arguments. + """ + + IFM_SCALE = 1 + IFM_ZERO_POINT = 2 + OFM_SCALE = 3 + OFM_ZERO_POINT = 4 diff --git a/python/tvm/micro/build.py b/python/tvm/micro/build.py index df7d1fc7196d..d20f03e75492 100644 --- a/python/tvm/micro/build.py +++ b/python/tvm/micro/build.py @@ -40,6 +40,7 @@ class MicroTVMTemplateProject(enum.Enum): ZEPHYR = "zephyr" ARDUINO = "arduino" CRT = "crt" + GEMMINI = "gemmini" @classmethod def list(cls): diff --git a/python/tvm/micro/model_library_format.py b/python/tvm/micro/model_library_format.py index e54f4bfed1dd..4248064252d2 100644 --- a/python/tvm/micro/model_library_format.py +++ b/python/tvm/micro/model_library_format.py @@ -576,7 +576,7 @@ def _export_operator_model_library_format(mod: build_module.OperatorModule, temp """ targets = [] for target in mod.ir_module_by_target.keys(): - if str(target.kind) not in ("llvm", "c"): + if str(target.kind) not in ("llvm", "c", "gemmini"): raise UnsupportedInModelLibraryFormatError( f"Operator has non-DSO-exportable target {target!s}, which is not yet supported in " "Model Library Format" diff --git a/python/tvm/micro/testing/utils.py b/python/tvm/micro/testing/utils.py index 755a85839d02..88aa7ac2d423 100644 --- a/python/tvm/micro/testing/utils.py +++ b/python/tvm/micro/testing/utils.py @@ -168,7 +168,7 @@ def create_header_file( header_file.write("#include \n") header_file.write("#include \n") header_file.write("#include \n") - header_file.write(f"const size_t {tensor_name}_len = {npy_data.size};\n") + header_file.write(f"#define {tensor_name.upper()}_LEN {npy_data.size}\n") header_file.write(f"{_npy_dtype_to_ctype(npy_data)} {tensor_name}[] =") header_file.write("{") diff --git a/python/tvm/relay/backend/contrib/gemmini/__init__.py b/python/tvm/relay/backend/contrib/gemmini/__init__.py new file mode 100644 index 000000000000..6cb685ffe3d1 --- /dev/null +++ b/python/tvm/relay/backend/contrib/gemmini/__init__.py @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Gemmini operators compute and schedule declarations +===================== +""" + +from . import op diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py new file mode 100644 index 000000000000..90a8eb72088b --- /dev/null +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py @@ -0,0 +1,209 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIsch, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=unused-argument +""" +Add operator declaration and schedule registration for Gemmini +===================== +""" + +import numpy as np +import tvm +from tvm import te +from tvm import autotvm +from tvm import topi + +from tvm.contrib.gemmini.environment import Environment +from tvm.contrib.gemmini.helpers import get_greater_div + + +ENV = Environment.instance() + + +@autotvm.register_topi_compute("contrib.gemmini.add") +def add( + cfg: tvm.autotvm.task.space.FallbackConfigEntity, + ifm1: tvm.te.tensor.Tensor, + ifm2: tvm.te.tensor.Tensor, + ofm_offset: tvm.te.tensor.Tensor, + ifm1_scale: float, + ifm2_scale: float, +) -> tvm.te.tensor.Tensor: + """Computation definition for Gemmini's add operator + + Args: + cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity + ifm1 (tvm.te.tensor.Tensor): input tensor 1 + ifm2 (tvm.te.tensor.Tensor): input tensor 2 + ofm_offset (tvm.te.tensor.Tensor): offset tensor + ifm1_scale (float): scaling factor for input tensor 1 + ifm2_scale (float): scaling factor for input tensor 2 + + Raises: + topi.InvalidShapeError: if input shapes are not supported + + Returns: + tvm.te.tensor.Tensor: add operator result + """ + + # Make sure that the input shapes make sense + if len(ifm1.shape) != 4 or len(ifm2.shape) != 4 or len(ofm_offset.shape) != 4: + raise topi.InvalidShapeError() + + # Derive shapes + oshape = topi.utils.get_const_tuple(ifm1.shape) + + tensor_type = ENV.inp_dtype + + ofm_offset_stage = te.compute( + oshape, + lambda b, x, y, c: ofm_offset[b, x, y, c].astype(tensor_type), + name="ofm_offset.local", + tag="ofm_offset", + ) + ifm2_stage = te.compute( + oshape, + lambda b, x, y, c: ifm2[b, x, y, c].astype(tensor_type) + + ofm_offset_stage[b, x, y, c].astype(tensor_type), + name="ifm2.local", + tag="ifm2", + ) + res = te.compute( + oshape, + lambda b, x, y, c: ifm1[b, x, y, c].astype(tensor_type) + + ifm2_stage[b, x, y, c].astype(tensor_type), + name="res", + tag="add", + attrs={ + "ifm1_scale": ifm1_scale, + "ifm2_scale": ifm2_scale, + }, + ) + + cfg.add_flop( + 3 * np.prod(topi.utils.get_const_tuple(oshape)) + + 2 # element additions needed + * np.prod( + topi.utils.get_const_tuple(oshape) + ) # element multiplications needed (input scaling) + ) + + 
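+    # Note: ofm_offset, ifm2 and ifm1 are kept as three separate compute stages so that
+    # the schedule below can map each of them to its own accumulator move-in. The
+    # ifm1_scale / ifm2_scale factors are not applied in this compute definition; they
+    # are only recorded as attributes and forwarded to the Gemmini configuration
+    # dictionary by the schedule.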
return res + + +@autotvm.register_topi_schedule("contrib.gemmini.add") +def schedule_add( + cfg: tvm.autotvm.task.space.FallbackConfigEntity, outs: tvm.ir.container.Array +) -> tvm.te.schedule.Schedule: + """Schedule definition for Gemmini's add operator + + Args: + cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity + outs (tvm.ir.container.Array): Output tensors + + Returns: + tvm.te.schedule.Schedule: transformed schedule + """ + + assert len(outs) == 1 + output = outs[0] + + add_stage = output.op.output(0) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + sch = te.create_schedule([x.op for x in outs]) + + ifm1, ifm2_op = add_stage.op.input_tensors + ifm2, ofm_offset_op = ifm2_op.op.input_tensors + + # Prepare the scope of each buffer + cifm1 = sch.cache_read(ifm1, ENV.acc_scope, [add_stage]) + sch[ifm2_op].set_scope(ENV.acc_scope) + sch[ofm_offset_op].set_scope(ENV.acc_scope) + + # Split axis, taking into account the maximum value of rows and columns + # that can be moved into Gemminis accumulator (DIM) + y_factor = get_greater_div(int(sch[add_stage].op.axis[3].dom.extent)) + x_factor = get_greater_div(int(sch[add_stage].op.axis[2].dom.extent)) + y_o, y_i = sch[add_stage].split(sch[add_stage].op.axis[3], factor=y_factor) + x_o, x_i = sch[add_stage].split(sch[add_stage].op.axis[2], factor=x_factor) + sch[add_stage].reorder(x_o, y_o, x_i, y_i) + + # Compute the stages in the correct position + sch[cifm1].compute_at(sch[add_stage], y_o) + sch[ifm2_op].compute_at(sch[add_stage], y_o) + sch[ofm_offset_op].compute_at(sch[add_stage], y_o) + + # Split axis, taking into account the maximum value of rows and columns + # that can be moved into Gemminis accumulator (DIM) + cifm1_ax_0_1, cifm1_ax_0_2 = sch[cifm1].split(sch[cifm1].op.axis[2], factor=ENV.DIM) + cifm1_ax_1_1, cifm1_ax_1_2 = sch[cifm1].split( + sch[cifm1].op.axis[3], factor=ENV.MAX_BLOCK_LEN_ACC * ENV.DIM + ) + sch[cifm1].reorder(cifm1_ax_0_1, cifm1_ax_1_1, cifm1_ax_0_2, cifm1_ax_1_2) + + cifm2_ax_0_1, cifm2_ax_0_2 = sch[ifm2_op].split(sch[ifm2_op].op.axis[2], factor=ENV.DIM) + cifm2_ax_1_1, cifm2_ax_1_2 = sch[ifm2_op].split( + sch[ifm2_op].op.axis[3], factor=ENV.MAX_BLOCK_LEN_ACC * ENV.DIM + ) + sch[ifm2_op].reorder(cifm2_ax_0_1, cifm2_ax_1_1, cifm2_ax_0_2, cifm2_ax_1_2) + + cofm_offset_ax_0_1, cofm_offset_ax_0_2 = sch[ofm_offset_op].split( + sch[ofm_offset_op].op.axis[2], factor=ENV.DIM + ) + cofm_offset_ax_1_1, cofm_offset_ax_1_2 = sch[ofm_offset_op].split( + sch[ofm_offset_op].op.axis[3], factor=ENV.MAX_BLOCK_LEN_ACC * ENV.DIM + ) + sch[ofm_offset_op].reorder( + cofm_offset_ax_0_1, cofm_offset_ax_1_1, cofm_offset_ax_0_2, cofm_offset_ax_1_2 + ) + + # Set pragmas to insert mvin instructions + oshape = (x_factor, y_factor) + if x_factor == 1: + sch[cifm1].pragma(cifm1_ax_0_2, ENV.C_mvin + "_t") + sch[ofm_offset_op].pragma(cofm_offset_ax_0_2, ENV.C_mvin_accum + "_t") + else: + sch[cifm1].pragma(cifm1_ax_0_2, ENV.C_mvin) + sch[ofm_offset_op].pragma(cofm_offset_ax_0_2, ENV.C_mvin_accum) + + # Tensorize + sch[ifm2_op].tensorize(cifm2_ax_0_2, ENV.add_tensorize(oshape)) + sch[add_stage].tensorize(x_i, ENV.add_mvout_tensorize(oshape)) + + # Create configuration dictionary + config_dict = {} + config_dict["A_size"] = int(ifm1.shape[3]) + config_dict["B_size"] = int(ifm2.shape[3]) + config_dict["C_size"] = int(output.shape[3]) + config_dict["A_private_stride"] = ENV.DIM + config_dict["B_private_stride"] = ENV.DIM + config_dict["execution_stride"] = 1 + config_dict["activation"] = 0 + 
config_dict["mode"] = ENV.WEIGHT_STATIONARY + config_dict["max_pixels_per_row"] = 1 + config_dict["ifm1_scale"] = float(add_stage.op.attrs["ifm1_scale"]) + config_dict["ifm2_scale"] = float(add_stage.op.attrs["ifm2_scale"]) + config_dict["scale"] = 1.0 + + # Set pragmas to configure the start and end of the Gemmini code + sch[output].pragma(sch[output].op.axis[0], "add_start") + sch[output].pragma(sch[output].op.axis[0], "configs", str(config_dict)) + sch[output].pragma(sch[output].op.axis[0], "gemm_end") + + # print(lower(sch,[ifm1,ifm2,ofm_offset,output])) + # breakpoint() + + return sch diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py new file mode 100644 index 000000000000..44d10ca89306 --- /dev/null +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py @@ -0,0 +1,238 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIsch, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=unused-argument +""" +Conv2d operator declaration and schedule registration for Gemmini's CISC instructions +===================== +""" + +import numpy as np +import tvm +from tvm import te +from tvm import autotvm +from tvm import topi + +from tvm.contrib.gemmini.environment import Environment + +ENV = Environment.instance() + + +@autotvm.register_topi_compute("contrib.gemmini.conv2d_cisc") +def conv2d_cisc( + cfg: tvm.autotvm.task.space.FallbackConfigEntity, + orig_data: tvm.te.tensor.Tensor, + kernel: tvm.te.tensor.Tensor, + bias: tvm.te.tensor.Tensor, + strides: tvm.ir.container.Array, + padding: tvm.ir.container.Array, + ifm_offset: int, + activation: int, + gemmini_scale: float, + pool_size: tvm.ir.container.Array, + pool_strides: tvm.ir.container.Array, + pool_dilation: tvm.ir.container.Array, + pool_padding: tvm.ir.container.Array, +) -> tvm.te.tensor.Tensor: + """Computation definition for Gemmini's conv2d operator using CISC instructions + + Args: + cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity + orig_data (tvm.te.tensor.Tensor): Input feature map + kernel (tvm.te.tensor.Tensor): Layer weights + bias (tvm.te.tensor.Tensor): Layer biases + strides (tvm.ir.container.Array): convolution strides + padding (tvm.ir.container.Array): input feature map padding + ifm_offset (int): input feature map offset (used for the padding of the input feature map) + activation (int): has activation? + gemmini_scale (float): output scaling factor + pool_size (tvm.ir.container.Array): size of the output pooling window + pool_strides (tvm.ir.container.Array): strides for the output pooling window + pool_dilation (tvm.ir.container.Array): dilation for the output pooling window (not used!) 
+ pool_padding (tvm.ir.container.Array): padding for the output pooling window + + Returns: + tvm.te.tensor.Tensor: conv2d operator result + """ + assert len(orig_data.shape) == 4 + assert len(kernel.shape) == 4 + assert len(bias.shape) == 1 + assert ( + orig_data.shape[1] == orig_data.shape[2] + ), "GEMMINIs Conv2d CISC schedule only supports square inputs!" + + o_c = kernel.shape[3] + k_h = kernel.shape[0] + k_w = kernel.shape[1] + + n = orig_data.shape[0] + i_h = orig_data.shape[1] + i_w = orig_data.shape[2] + i_c = orig_data.shape[3] + + hstr = strides[0] + wstr = strides[1] + top_pad = padding[0] + left_pad = padding[1] + bottom_pad = padding[2] + right_pad = padding[3] + + o_h = topi.utils.get_const_int(tvm.tir.div((i_h + (top_pad + bottom_pad) - k_h), hstr) + 1) + o_w = topi.utils.get_const_int(tvm.tir.div((i_w + (left_pad + right_pad) - k_w), wstr) + 1) + + ric = te.reduce_axis((0, i_c), name="ric") + rkh = te.reduce_axis((0, k_h), name="rkh") + rkw = te.reduce_axis((0, k_w), name="rkw") + + oshape = (n, o_h, o_w, o_c) + + if len(set(padding)) == 1 and (ENV.supports_non_zero_padding or ifm_offset == 0): + # If the padding is the same for all borders, there is no need to use topi.nn.pad, + # because Gemminis CISC instructions support equal padding + data = orig_data + else: + # If not, then pad before calling Gemminis functions + data = topi.nn.pad( + orig_data, + [0, top_pad, left_pad, 0], + [0, bottom_pad, right_pad, 0], + pad_value=ifm_offset, + name="pad_data", + ) + + res = te.compute( + oshape, + lambda b_o, i, j, c_o: te.sum( + data[b_o, i * hstr + rkh, j * wstr + rkw, ric].astype(ENV.inp_dtype) + * kernel[rkh, rkw, ric, c_o].astype(ENV.inp_dtype) + + bias[c_o].astype(ENV.inp_dtype), + axis=[rkh, rkw, ric], + ), + name="res", + tag="conv2d", + attrs={ + "activation": activation, + "strides": [hstr, wstr], + "padding": padding, + "padding_value": ifm_offset, + "scale": gemmini_scale, + "pool_size": pool_size, + "pool_strides": pool_strides, + "pool_dilation": pool_dilation, + "pool_padding": pool_padding, + }, + ) + + cfg.add_flop( + np.prod(topi.utils.get_const_tuple(oshape)) * k_h * k_w * i_c + + np.prod(topi.utils.get_const_tuple(oshape)) + * (k_h * k_w * i_c - 1) # Multiplications and additions needed + + np.prod( # Additions needed + topi.utils.get_const_tuple(oshape) + ) # Output scaling multiplications + ) + + return res + + +@autotvm.register_topi_schedule("contrib.gemmini.conv2d_cisc") +def schedule_conv2d_cisc( + cfg: tvm.autotvm.task.space.FallbackConfigEntity, outs: tvm.ir.container.Array +) -> tvm.te.schedule.Schedule: + """Schedule definition for Gemmini's conv2d operator using CISC instructions + + Args: + cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity + outs (tvm.ir.container.Array): Output tensors + + Returns: + tvm.te.schedule.Schedule: transformed schedule + """ + assert len(outs) == 1 + output = outs[0] + const_ops = [] + ewise_inputs = [] + ewise_ops = [] + conv2d_res = [] + + def _traverse(op): + if topi.tag.is_broadcast(op.tag): + if not op.same_as(output.op): + if not op.axis: + const_ops.append(op) + else: + ewise_ops.append(op) + for tensor in op.input_tensors: + if isinstance(tensor.op, tvm.te.PlaceholderOp): + ewise_inputs.append((op, tensor)) + else: + _traverse(tensor.op) + else: + if op.tag == "conv2d": + conv2d_res.append(op) + else: + for tensor in op.input_tensors: + _traverse(tensor.op) + + _traverse(output.op) + assert len(conv2d_res) == 1 + conv2d_stage = conv2d_res[0].output(0) + sch = 
te.create_schedule(output.op) + + data, kernel, bias = conv2d_stage.op.input_tensors + + if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag: + temp = data.op.input_tensors[0] + pad_data = data + data = temp + else: + pad_data = data + + x_bo, _, _, _ = sch[conv2d_stage].op.axis + + x_bo_o, x_bo_i = sch[conv2d_stage].split(x_bo, factor=pad_data.shape[0]) + + axis_for_start = x_bo_o + + # If topi.nn.pad was added, its because the padding was not equal in all dimensions. + padding = conv2d_stage.op.attrs["padding"] if pad_data == data else [0, 0, 0, 0] + padding_value = conv2d_stage.op.attrs["padding_value"] if pad_data == data else 0 + + # Apply tensorization + sch[conv2d_stage].tensorize( + x_bo_i, + ENV.conv2d_cisc( + pad_data.shape, + kernel.shape, + bias.shape, + conv2d_stage.shape, + conv2d_stage.op.attrs["strides"], + padding, + padding_value, + conv2d_stage.op.attrs["activation"], + conv2d_stage.op.attrs["scale"], + conv2d_stage.op.attrs["pool_size"], + conv2d_stage.op.attrs["pool_strides"], + conv2d_stage.op.attrs["pool_dilation"], + conv2d_stage.op.attrs["pool_padding"], + ), + ) + + # Tag loops with pragmas to delimit the start and end of the Gemmini related code + sch[conv2d_stage].pragma(axis_for_start, "conv2d_cisc_start") + sch[conv2d_stage].pragma(axis_for_start, "gemm_end") + + return sch diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py new file mode 100644 index 000000000000..e9da2903bc87 --- /dev/null +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py @@ -0,0 +1,378 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIsch, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
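+# NOTE: this file implements the fine-grained, AutoTVM-tunable dense schedule that
+# drives Gemmini through explicit move-in / move-out and GEMM intrinsics. The
+# single-call ("CISC") lowering of the same operator lives in gemmini_dense_cisc.py;
+# both are registered as strategies for contrib.gemmini.gemm in op.py.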
+# pylint: disable=unused-argument +""" +Dense (GEMM) operator declaration and schedule registration for Gemmini's intrinsic instructions +===================== +""" + +import numpy as np +import tvm +from tvm import te +from tvm import autotvm +from tvm import topi +from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity + +from tvm.contrib.gemmini.environment import Environment +from tvm.contrib.gemmini.helpers import get_greater_div + +ENV = Environment.instance() + + +@autotvm.register_topi_compute("contrib.gemmini.gemm") +def gemm( + cfg: tvm.autotvm.task.space.FallbackConfigEntity, + data: tvm.te.tensor.Tensor, + weight: tvm.te.tensor.Tensor, + bias: tvm.te.tensor.Tensor, + scale: float, +) -> tvm.te.tensor.Tensor: + """Computation definition for Gemmini's dense operator using intrinsic instructions + + Args: + cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity + data (tvm.te.tensor.Tensor): Input feature map + weight (tvm.te.tensor.Tensor): Layer weights + bias (tvm.te.tensor.Tensor): Layer biases + scale (float): output scaling factor + + Returns: + tvm.te.tensor.Tensor: dense operator result + """ + + # Derive shapes + ishape = topi.utils.get_const_tuple(data.shape) + wshape = topi.utils.get_const_tuple(weight.shape) + oshape = (data.shape[0], weight.shape[1]) + + # Reduction axes (input channel) + assert ishape[1] == wshape[0] + k_o = te.reduce_axis((0, wshape[0]), name="k_o") + + bias_stage = te.compute( + oshape, + lambda x_o, y_o: bias[y_o].astype(ENV.inp_dtype), + name="bias.local.accumulator", + tag="bias_add", + ) + + res = te.compute( + oshape, + lambda x_o, y_o: te.sum( + data[x_o, k_o].astype(ENV.inp_dtype) * weight[k_o, y_o].astype(ENV.inp_dtype) + + bias_stage[x_o, y_o].astype(ENV.inp_dtype), + axis=[k_o], + ), + name="res", + tag="dense", + attrs={"scale": scale}, + ) + + cfg.add_flop( + (2 * np.prod(topi.utils.get_const_tuple(oshape)) * ishape[1]) # element multiplications + + np.prod(topi.utils.get_const_tuple(oshape)) # bias additions + ) + + return res + + +@autotvm.register_topi_schedule("contrib.gemmini.gemm") +def schedule_gemm( + cfg: tvm.autotvm.task.space.FallbackConfigEntity, outs: tvm.ir.container.Array +) -> tvm.te.schedule.Schedule: + """Schedule definition for Gemmini's dense operator using intrinsic instructions + + Args: + cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity + outs (tvm.ir.container.Array): Output tensors + + Returns: + tvm.te.schedule.Schedule: transformed schedule + """ + + assert len(outs) == 1 + output = outs[0] + + dense_stage = output.op.output(0) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + sch = te.create_schedule([x.op for x in outs]) + + data, weight, bias_op = dense_stage.op.input_tensors + + ##### space definition begin ##### + x, y = sch[dense_stage].op.axis + (z_axis,) = sch[dense_stage].op.reduce_axis + + # TODO (FP): add limits for scratchpad and accumulator sizes perhaps? 
+ cfg.define_split( + "tile_xo", + x, + num_outputs=3, + policy="power2", + filter=lambda ax: ( + ax.size[-1] == get_greater_div(int(data.shape[0])) + if (data.shape[0] >= ENV.DIM) + else ax.size[-1] <= ENV.DIM + ), + ) + + cfg.define_split( + "tile_yo", + y, + num_outputs=3, + policy="power2", + filter=lambda ax: ( + ax.size[-1] == get_greater_div(int(weight.shape[1])) + if (weight.shape[1] >= ENV.DIM) + else ax.size[-1] <= ENV.DIM + ), + ) + + cfg.define_split( + "tile_zo", + z_axis, + num_outputs=3, + policy="power2", + filter=lambda ax: ( + ax.size[-1] == get_greater_div(int(weight.shape[0])) + if (weight.shape[0] >= ENV.DIM) + else ax.size[-1] <= ENV.DIM + ), + ) + + # accumulate_multiple_patches knob + # 2: only one patch is computed in the accumulator + # 1: More than one patch is computed in the accumulator, depends on tile_yo + # 0: More than one patch is computed in the accumulator, depends on tile_yo AND tile_xo + cfg.define_knob("accumulate_multiple_patches", [0, 1, 2]) + # exchange axis + # exchange the order of axis x and y + cfg.define_knob("exchange_axis", [False, True]) + # WS/OS + # 0: Gemmini will be configured as output stationary + # 1: Gemmini will be configured as weight stationary + cfg.define_knob("WS/OS", [ENV.WEIGHT_STATIONARY, ENV.OUTPUT_STATIONARY]) + # mvout_big_block + # False: generate mvout instructions moving as maximum DIM columns + # True: generate mvout instructions moving more than DIM columns + cfg.define_knob("mvout_big_block", [True, False]) + if cfg.is_fallback: + # Load default split values + cfg["tile_xo"] = SplitEntity([-1, 8, get_greater_div(int(data.shape[0]))]) + cfg["tile_yo"] = SplitEntity([-1, 8, get_greater_div(int(weight.shape[1]))]) + cfg["tile_zo"] = SplitEntity([-1, 8, get_greater_div(int(weight.shape[0]))]) + cfg["accumulate_multiple_patches"] = OtherOptionEntity(0) + cfg["exchange_axis"] = OtherOptionEntity(False) + cfg["mvout_big_block"] = OtherOptionEntity(True) + cfg["WS/OS"] = OtherOptionEntity(ENV.WEIGHT_STATIONARY) + + ###### space definition end ###### + + cdata = sch.cache_read(data, ENV.scr_scope, [dense_stage]) + cweight = sch.cache_read(weight, ENV.scr_wgt_scope, [dense_stage]) + dense_stage_acc = sch.cache_write(output, ENV.acc_scope) + sch[bias_op].set_scope(ENV.acc_scope) + (x_axis, y_axis) = sch[dense_stage_acc].op.axis + (z_axis_int,) = sch[dense_stage_acc].op.reduce_axis + + # Split loops to generate the inner dimensions specified by knobs tile_xo and tile_yo + b_y, yo_axis, yi_axis = cfg["tile_yo"].apply(sch, output, sch[output].op.axis[1]) + b_x, xo_axis, xi_axis = cfg["tile_xo"].apply(sch, output, sch[output].op.axis[0]) + + # Apply the exchange_axis knob + if cfg["exchange_axis"].val: + sch[output].reorder(b_y, b_x, yo_axis, xo_axis, yi_axis, xi_axis) + else: + sch[output].reorder(b_x, b_y, xo_axis, yo_axis, xi_axis, yi_axis) + + # Apply the accumulate_multiple_patches knob + if cfg["accumulate_multiple_patches"].val == 0: + axis_for_output = b_x if cfg["exchange_axis"].val else b_y + elif cfg["accumulate_multiple_patches"].val == 1: + axis_for_output = yo_axis if cfg["exchange_axis"].val else xo_axis + else: + axis_for_output = xo_axis if cfg["exchange_axis"].val else yo_axis + + axis_gemm_start = b_y if cfg["exchange_axis"].val else b_x + + # Move the dense_stage_acc stage to the correct axis of the output stage + sch[dense_stage_acc].compute_at(sch[output], axis_for_output) + + # # Split loops to generate the inner dimensions specified by knob tile_zo + xo_o, xi_o = sch[dense_stage_acc].split(x_axis, 
factor=ENV.DIM) + yo_o, yi_o = sch[dense_stage_acc].split(y_axis, factor=ENV.DIM) + b_z, zo_o, zi_o = cfg["tile_zo"].apply(sch, dense_stage_acc, z_axis_int) + + # Apply the exchange_axis knob + if cfg["exchange_axis"].val: + sch[dense_stage_acc].reorder(b_z, xo_o, yo_o, zo_o, xi_o, yi_o, zi_o) + else: + sch[dense_stage_acc].reorder(b_z, yo_o, xo_o, zo_o, yi_o, xi_o, zi_o) + + # Generate knobs to move the copy of data across different loops + axis_to_input_data = [b_x, b_z, xo_o, zo_o] + axis_to_input_weights = [b_y, b_z, yo_o, zo_o] + stages_to_input_data = [output, dense_stage_acc, dense_stage_acc, dense_stage_acc] + cfg.define_knob("axis_for_cdata", [0, 1, 2, 3]) + cfg.define_knob("axis_for_cweight", [0, 1, 2, 3]) + if cfg.is_fallback: + cfg["axis_for_cdata"] = OtherOptionEntity(0) + cfg["axis_for_cweight"] = OtherOptionEntity(0) + + # Compute the move of the bias in the correct loop + sch[bias_op].compute_at(sch[output], axis_for_output) + + # We assert here that the mvin of data does not use more space + # than the available one in the scratchpad + if cfg["axis_for_cdata"].val == 0: + assert ( + cfg["tile_xo"].size[1] * cfg["tile_xo"].size[2] * data.shape[1] + <= ENV.INP_SCR_ROWS * ENV.DIM + ), "Data matrix will not fit in scratchpad!" + elif cfg["axis_for_cdata"].val == 1: + assert ( + cfg["tile_xo"].size[2] * data.shape[1] <= ENV.INP_SCR_ROWS * ENV.DIM + ), "Data matrix will not fit in scratchpad!" + if cfg["axis_for_cweight"].val == 0: + assert ( + cfg["tile_yo"].size[1] * cfg["tile_yo"].size[2] * weight.shape[0] + <= ENV.WGT_SCR_ROWS * ENV.DIM + ), "Weight matrix will not fit in scratchpad!" + elif cfg["axis_for_cweight"].val == 1: + assert ( + cfg["tile_yo"].size[2] * weight.shape[0] <= ENV.WGT_SCR_ROWS * ENV.DIM + ), "Weight matrix will not fit in scratchpad!" + + # And here we assert that there is enough place available in the accumulator + if cfg["accumulate_multiple_patches"].val == 0: + assert ( + cfg["tile_xo"].size[1] + * cfg["tile_xo"].size[2] + * cfg["tile_yo"].size[1] + * cfg["tile_yo"].size[2] + <= ENV.ACC_ROWS * ENV.DIM + ), "Result matrix will not fit in accumulator!" + elif cfg["accumulate_multiple_patches"].val == 1: + assert ( + cfg["tile_xo"].size[2] * cfg["tile_yo"].size[1] * cfg["tile_yo"].size[2] + <= ENV.ACC_ROWS * ENV.DIM + ), "Result matrix will not fit in accumulator!" 
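+    # Illustrative arithmetic for the first capacity check above (not part of the
+    # schedule itself), assuming a hypothetical Gemmini build with DIM = 16 and
+    # ACC_ROWS = 1024: with tile_xo = [-1, 8, 16] and tile_yo = [-1, 8, 16] the
+    # accumulated patch holds (8 * 16) * (8 * 16) = 16384 elements, which exactly
+    # fills the 1024 * 16 = 16384 accumulator entries, so any larger middle factor
+    # would trip the assertion.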
+ + # Move the data and weight move instructions into the correct loops selected + # by the axis_for_cdata and axis_for_cweight knobs + axis_for_cdata = axis_to_input_data[cfg["axis_for_cdata"].val] + axis_for_cweight = axis_to_input_weights[cfg["axis_for_cweight"].val] + sch[cdata].compute_at(sch[stages_to_input_data[cfg["axis_for_cdata"].val]], axis_for_cdata) + sch[cweight].compute_at( + sch[stages_to_input_data[cfg["axis_for_cweight"].val]], axis_for_cweight + ) + + # Split input moves because Gemmini's mvin only supports mvins with + # rows <= DIM and cols <= MAX_BLOCK_LEN + cdata_ax_0_1, cdata_ax_0_2 = sch[cdata].split(sch[cdata].op.axis[0], factor=ENV.DIM) + cdata_ax_1_1, cdata_ax_1_2 = sch[cdata].split( + sch[cdata].op.axis[1], factor=ENV.MAX_BLOCK_LEN * ENV.DIM + ) + sch[cdata].reorder(cdata_ax_0_1, cdata_ax_1_1, cdata_ax_0_2, cdata_ax_1_2) + + cweight_ax_0_1, cweight_ax_0_2 = sch[cweight].split(sch[cweight].op.axis[0], factor=ENV.DIM) + cweight_ax_1_1, cweight_ax_1_2 = sch[cweight].split( + sch[cweight].op.axis[1], factor=ENV.MAX_BLOCK_LEN * ENV.DIM + ) + sch[cweight].reorder(cweight_ax_0_1, cweight_ax_1_1, cweight_ax_0_2, cweight_ax_1_2) + + cbias_ax_0_1, cbias_ax_0_2 = sch[bias_op].split(sch[bias_op].op.axis[0], factor=ENV.DIM) + cbias_ax_1_1, cbias_ax_1_2 = sch[bias_op].split( + sch[bias_op].op.axis[1], factor=ENV.MAX_BLOCK_LEN_ACC * ENV.DIM + ) + sch[bias_op].reorder(cbias_ax_0_1, cbias_ax_1_1, cbias_ax_0_2, cbias_ax_1_2) + + # Mvout preparation + if cfg["exchange_axis"].val: + sch[output].reorder(yo_axis, yi_axis, xo_axis, xi_axis) + else: + sch[output].reorder(xo_axis, xi_axis, yo_axis, yi_axis) + if cfg["accumulate_multiple_patches"].val == 0: + fused_x = sch[output].fuse(xo_axis, xi_axis) + fused_y = sch[output].fuse(yo_axis, yi_axis) + elif cfg["accumulate_multiple_patches"].val == 1: + if cfg["exchange_axis"].val: + fused_x = sch[output].fuse(xo_axis, xi_axis) + fused_y = yi_axis + else: + fused_x = xi_axis + fused_y = sch[output].fuse(yo_axis, yi_axis) + else: + fused_x = xi_axis + fused_y = yi_axis + + fused_x_1, fused_x_2 = sch[output].split(fused_x, factor=ENV.DIM) + fused_y_1, fused_y_2 = sch[output].split( + fused_y, factor=ENV.MAX_BLOCK_LEN * ENV.DIM if cfg["mvout_big_block"].val else ENV.DIM + ) + sch[output].reorder(fused_x_1, fused_y_1, fused_x_2, fused_y_2) + + # Tag loops with pragmas, in order to insert the move in and move out instructions + sch[cweight].pragma(cweight_ax_0_2, ENV.B_mvin) + if data.shape[0] == 1 and weight.shape[1] > 1: + sch[cdata].pragma(cdata_ax_0_2, ENV.A_mvin + "_t") + sch[bias_op].pragma(cbias_ax_0_2, ENV.D_mvin + "_t") + sch[output].pragma(fused_x_2, ENV.C_mvout + "_t") + else: + sch[cdata].pragma(cdata_ax_0_2, ENV.A_mvin) + sch[bias_op].pragma(cbias_ax_0_2, ENV.D_mvin) + sch[output].pragma(fused_x_2, ENV.C_mvout) + + # Apply tensorize + dim_i = data.shape[0] if data.shape[0] < ENV.DIM else cfg["tile_xo"].size[-1] + dim_k = weight.shape[0] if weight.shape[0] < ENV.DIM else cfg["tile_zo"].size[-1] + dim_j = weight.shape[1] if weight.shape[1] < ENV.DIM else cfg["tile_yo"].size[-1] + + sch[dense_stage_acc].tensorize( + xi_o if cfg["exchange_axis"].val else yi_o, + ENV.gemm( + dim_i, + dim_k, + dim_j, + mode=cfg["WS/OS"].val, + accum_patch=tvm.tir.IntImm("uint8", 0) + if cfg["exchange_axis"].val or cfg["tile_zo"].size[1] != 1 + else xo_o.var, + ), + ) + + # Generate configuration dictionary, in order to correctly generate + # the calls to the configuration instructions + config_dict = {} + config_dict["A_size"] = int(data.shape[1]) + 
config_dict["B_size"] = int(weight.shape[1]) + config_dict["C_size"] = int(output.shape[1]) + config_dict["A_private_stride"] = ENV.DIM + config_dict["B_private_stride"] = ENV.DIM + config_dict["execution_stride"] = 1 + config_dict["activation"] = 0 + config_dict["mode"] = cfg["WS/OS"].val + config_dict["max_pixels_per_row"] = 1 + config_dict["scale"] = float(dense_stage.op.attrs["scale"]) + config_dict["padding_value"] = 0 + + # Tag loops with pragmas to delimit the start and end of the Gemmini related code + sch[output].pragma(axis_gemm_start, "gemm_start") + sch[output].pragma(axis_gemm_start, "configs", str(config_dict)) + sch[output].pragma(axis_gemm_start, "gemm_end") + + return sch diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py new file mode 100644 index 000000000000..872b017d1f4b --- /dev/null +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py @@ -0,0 +1,132 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIsch, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
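+# NOTE: in contrast to the tiled intrinsic schedule in gemmini_dense.py, this "CISC"
+# variant hands the whole GEMM to Gemmini through a single tensorize call
+# (ENV.gemm_cisc), so the only tuning knob exposed here is the weight-stationary /
+# output-stationary dataflow selection ("WS/OS").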
+# pylint: disable=unused-argument +""" +Dense (GEMM) operator declaration and schedule registration for Gemmini's CISC instructions +===================== +""" + +import numpy as np +import tvm +from tvm import te +from tvm import autotvm +from tvm import topi +from tvm.autotvm.task.space import OtherOptionEntity + +from tvm.contrib.gemmini.environment import Environment + +ENV = Environment.instance() + + +@autotvm.register_topi_compute("contrib.gemmini.gemm_cisc") +def gemm_cisc( + cfg: tvm.autotvm.task.space.FallbackConfigEntity, + data: tvm.te.tensor.Tensor, + weight: tvm.te.tensor.Tensor, + bias: tvm.te.tensor.Tensor, + scale: float, +) -> tvm.te.tensor.Tensor: + """Computation definition for Gemmini's dense operator using CISC instructions + + Args: + cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity + data (tvm.te.tensor.Tensor): Input feature map + weight (tvm.te.tensor.Tensor): Layer weights + bias (tvm.te.tensor.Tensor): Layer biases + scale (float): output scaling factor + + Returns: + tvm.te.tensor.Tensor: dense operator result + """ + + # Derive shapes + ishape = topi.utils.get_const_tuple(data.shape) + wshape = topi.utils.get_const_tuple(weight.shape) + oshape = (data.shape[0], weight.shape[1]) + + # Reduction axes (input channel) + assert ishape[1] == wshape[0] + k_o = te.reduce_axis((0, wshape[0]), name="k_o") + + res = te.compute( + oshape, + lambda x_o, y_o: te.sum( + data[x_o, k_o].astype(ENV.inp_dtype) * weight[k_o, y_o].astype(ENV.inp_dtype) + + bias[y_o].astype(ENV.inp_dtype), + axis=[k_o], + ), + name="res", + tag="dense", + attrs={"scale": scale}, + ) + + cfg.add_flop( + (2 * np.prod(topi.utils.get_const_tuple(oshape)) * ishape[1]) # element multiplications + + np.prod(topi.utils.get_const_tuple(oshape)) # bias additions + ) + + return res + + +@autotvm.register_topi_schedule("contrib.gemmini.gemm_cisc") +def schedule_gemm_cisc( + cfg: tvm.autotvm.task.space.FallbackConfigEntity, outs: tvm.ir.container.Array +) -> tvm.te.schedule.Schedule: + """Schedule definition for Gemmini's dense operator using CISC instructions + + Args: + cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity + outs (tvm.ir.container.Array): Output tensors + + Returns: + tvm.te.schedule.Schedule: transformed schedule + """ + assert len(outs) == 1 + output = outs[0] + + dense_stage = output.op.output(0) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + sch = te.create_schedule([x.op for x in outs]) + + data, weight, bias = dense_stage.op.input_tensors + + # WS/OS + # 0: Gemmini will be configured as output stationary + # 1: Gemmini will be configured as weight stationary + cfg.define_knob("WS/OS", [ENV.WEIGHT_STATIONARY, ENV.OUTPUT_STATIONARY]) + if cfg.is_fallback: + cfg["WS/OS"] = OtherOptionEntity(ENV.WEIGHT_STATIONARY) + + x_axis, _ = sch[dense_stage].op.axis + + x_o, x_i = sch[dense_stage].split(x_axis, factor=data.shape[0]) + + axis_for_start = x_o + + # Apply tensorization + sch[dense_stage].tensorize( + x_i, + ENV.gemm_cisc( + data.shape, weight.shape, bias.shape, dense_stage.op.attrs["scale"], cfg["WS/OS"].val + ), + ) + + # Tag loops with pragmas to delimit the start and end of the Gemmini related code + sch[dense_stage].pragma(axis_for_start, "gemm_cisc_start") + sch[dense_stage].pragma(axis_for_start, "gemm_end") + + return sch diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py new file mode 100644 
index 000000000000..1fc35df9e182 --- /dev/null +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py @@ -0,0 +1,220 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIsch, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=unused-argument +""" +Depthwise conv2d operator declaration and schedule registration for Gemmini's CISC instructions +===================== +""" + +import numpy as np +import tvm +from tvm import te +from tvm import autotvm +from tvm import topi + +from tvm.contrib.gemmini.environment import Environment + +ENV = Environment.instance() + + +@autotvm.register_topi_compute("contrib.gemmini.depthwiseconv2d_cisc") +def depthwise_conv2d_cisc( + cfg: tvm.autotvm.task.space.FallbackConfigEntity, + orig_data: tvm.te.tensor.Tensor, + orig_kernel: tvm.te.tensor.Tensor, + bias: tvm.te.tensor.Tensor, + strides: tvm.ir.container.Array, + padding: tvm.ir.container.Array, + ifm_offset: int, + activation: int, + gemmini_scale: float, +) -> tvm.te.tensor.Tensor: + """Computation definition for Gemmini's depthwise conv2d operator using CISC instructions + + Args: + cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity + orig_data (tvm.te.tensor.Tensor): Input feature map + orig_kernel (tvm.te.tensor.Tensor): Layer weights + bias (tvm.te.tensor.Tensor): Layer biases + strides (tvm.ir.container.Array): convolution strides + padding (tvm.ir.container.Array): input feature map padding + ifm_offset (int): input feature map offset (used for the padding of the input feature map) + activation (int): has activation? + gemmini_scale (float): output scaling factor + + Returns: + tvm.te.tensor.Tensor: depthwise conv2d operator result + """ + + assert len(orig_data.shape) == 4 + assert len(orig_kernel.shape) == 3 + assert len(bias.shape) == 1 + assert ( + orig_data.shape[1] == orig_data.shape[2] + ), "GEMMINIs depthwise conv2d CISC schedule only supports square inputs!" 
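+    # Layout expectations, as used by the indexing below: the input feature map is
+    # NHWC, the depthwise kernel is laid out as (channels, kernel_h, kernel_w), and
+    # the bias is a per-channel vector.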
+ + o_c = orig_kernel.shape[0] + k_h = orig_kernel.shape[1] + k_w = orig_kernel.shape[2] + + kernel = orig_kernel + + n = orig_data.shape[0] + i_h = orig_data.shape[1] + i_w = orig_data.shape[2] + + hstr = strides[0] + wstr = strides[1] + top_pad = padding[0] + left_pad = padding[1] + bottom_pad = padding[2] + right_pad = padding[3] + + o_h = topi.utils.get_const_int(tvm.tir.div((i_h + (top_pad + bottom_pad) - k_h), hstr) + 1) + o_w = topi.utils.get_const_int(tvm.tir.div((i_w + (left_pad + right_pad) - k_w), wstr) + 1) + + if len(set(padding)) == 1 and ENV.supports_non_zero_padding: + # If the padding is the same for all borders, there is no need to use topi.nn.pad, + # because Gemminis CISC instructions support equal padding + data = orig_data + else: + # If not, then pad before calling Gemminis functions + data = topi.nn.pad( + orig_data, + [0, top_pad, left_pad, 0], + [0, bottom_pad, right_pad, 0], + pad_value=ifm_offset, + name="pad_data", + ) + + rkh = te.reduce_axis((0, k_h), name="rkh") + rkw = te.reduce_axis((0, k_w), name="rkw") + + oshape = (n, o_h, o_w, o_c) + + res = te.compute( + oshape, + lambda b_o, i, j, c_o: te.sum( + data[b_o, i * hstr + rkh, j * wstr + rkw, c_o].astype(ENV.inp_dtype) + * kernel[c_o, rkh, rkw].astype(ENV.inp_dtype) + + bias[c_o].astype(ENV.inp_dtype), + axis=[rkh, rkw], + ), + name="res", + tag="conv2d", + attrs={ + "activation": activation, + "strides": [hstr, wstr], + "padding": padding, + "padding_value": ifm_offset, + "scale": gemmini_scale, + }, + ) + + cfg.add_flop( + np.prod(topi.utils.get_const_tuple(oshape)) * k_h * k_w + + np.prod(topi.utils.get_const_tuple(oshape)) + * (k_h * k_w - 1) # Multiplications and additions needed + + np.prod(topi.utils.get_const_tuple(oshape)) # Output scaling factor multiplications + ) + + return res + + +@autotvm.register_topi_schedule("contrib.gemmini.depthwiseconv2d_cisc") +def schedule_depthwise_conv2d_cisc( + cfg: tvm.autotvm.task.space.FallbackConfigEntity, outs: tvm.ir.container.Array +) -> tvm.te.schedule.Schedule: + """Schedule definition for Gemmini's depthwise conv2d operator using CISC instructions + + Args: + cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity + outs (tvm.ir.container.Array): Output tensors + + Returns: + tvm.te.schedule.Schedule: transformed schedule + """ + assert len(outs) == 1 + output = outs[0] + const_ops = [] + ewise_inputs = [] + ewise_ops = [] + conv2d_res = [] + + def _traverse(op): + if topi.tag.is_broadcast(op.tag): + if not op.same_as(output.op): + if not op.axis: + const_ops.append(op) + else: + ewise_ops.append(op) + for tensor in op.input_tensors: + if isinstance(tensor.op, tvm.te.PlaceholderOp): + ewise_inputs.append((op, tensor)) + else: + _traverse(tensor.op) + else: + if op.tag == "conv2d": + conv2d_res.append(op) + else: + for tensor in op.input_tensors: + _traverse(tensor.op) + + _traverse(output.op) + assert len(conv2d_res) == 1 + conv2d_stage = conv2d_res[0].output(0) + sch = te.create_schedule(output.op) + + data, kernel, bias = conv2d_stage.op.input_tensors + if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag: + temp = data.op.input_tensors[0] + pad_data = data + data = temp + else: + pad_data = data + + x_bo, _, _, _ = sch[conv2d_stage].op.axis + + x_bo_o, x_bo_i = sch[conv2d_stage].split(x_bo, factor=pad_data.shape[0]) + + axis_for_start = x_bo_o + + # If topi.nn.pad was added, its because the padding was not equal in all dimensions. 
+ padding = conv2d_stage.op.attrs["padding"] if pad_data == data else [0, 0, 0, 0] + padding_value = conv2d_stage.op.attrs["padding_value"] if pad_data == data else 0 + + # Apply tensorization + sch[conv2d_stage].tensorize( + x_bo_i, + ENV.dw_conv2d_cisc( + pad_data.shape, + kernel.shape, + bias.shape, + conv2d_stage.shape, + conv2d_stage.op.attrs["strides"], + padding, + padding_value, + conv2d_stage.op.attrs["activation"], + conv2d_stage.op.attrs["scale"], + ), + ) + + # Tag loops with pragmas to delimit the start and end of the Gemmini related code + sch[conv2d_stage].pragma(axis_for_start, "dw_conv2d_cisc_start") + sch[conv2d_stage].pragma(axis_for_start, "gemm_end") + + return sch diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py new file mode 100644 index 000000000000..0cc7bde80812 --- /dev/null +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py @@ -0,0 +1,148 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIsch, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=unused-argument +""" +MaxPool2D operator declaration and schedule registration for Gemmini's CISC instructions +===================== +""" + +import tvm +from tvm import te +from tvm import autotvm + +from tvm.contrib.gemmini.environment import Environment + +ENV = Environment.instance() + + +@autotvm.register_topi_compute("contrib.gemmini.max_pool2d") +# def conv2d(args,attrs): +def max_pool2d( + cfg: tvm.autotvm.task.space.FallbackConfigEntity, + data: tvm.te.tensor.Tensor, + weights: tvm.te.tensor.Tensor, + pool_size: tvm.ir.container.Array, + pool_strides: tvm.ir.container.Array, + pool_dilation: tvm.ir.container.Array, + pool_padding: tvm.ir.container.Array, +) -> tvm.te.tensor.Tensor: + """Computation definition to run a max pooling layer on Gemmini. + Uses a trick: we call a dw convolution + max pooling, but all weights are 1. + So the depthwise convolution does nothing, and the Gemmini accelerator takes care + internally of applying the max pooling. + + Args: + cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity + data (tvm.te.tensor.Tensor): Input feature map + weights (tvm.te.tensor.Tensor): Weights... 
just all ones, needed by the called function + pool_size (tvm.ir.container.Array): Pooling window size + pool_strides (tvm.ir.container.Array): Pooling window strides + pool_dilation (tvm.ir.container.Array): Pooling window dilation (not used for now) + pool_padding (tvm.ir.container.Array): Pooling window padding + + Returns: + tvm.te.tensor.Tensor: max pool2d operator result + """ + + assert len(data.shape) == 4 + + def irb_builder_func(ins, outs): + irb = tvm.tir.ir_builder.create() + + if ENV.supports_non_zero_padding: + irb.emit( + tvm.tir.call_extern( + "", + "tiled_conv_dw_auto", + ins[0].shape[0], # BATCH_SIZE, + ins[0].shape[1], # IN_DIM, + ins[0].shape[3], # IN_CHANNELS, + ins[0].shape[1], # OUT_DIM, + 1, + 0, + 0, + 1, + ins[0].access_ptr("r"), + ins[1].access_ptr("r"), + 0, + outs[0].access_ptr("w"), + 0, + 1.0, + pool_size[0], + pool_strides[0], + pool_padding[0], + 1, + ) + ) + else: + irb.emit( + tvm.tir.call_extern( + "", + "tiled_conv_dw_auto", + ins[0].shape[0], # BATCH_SIZE, + ins[0].shape[1], # IN_DIM, + ins[0].shape[3], # IN_CHANNELS, + ins[0].shape[1], # OUT_DIM, + 1, + 0, + 1, + ins[0].access_ptr("r"), + ins[1].access_ptr("r"), + 0, + outs[0].access_ptr("w"), + 0, + 1.0, + pool_size[0], + pool_strides[0], + pool_padding[0], + 1, + ) + ) + irb.emit(tvm.tir.call_extern("", "gemmini_fence")) + + return irb.get() + + res = te.extern( + (1,), + [data, weights], + lambda ins, outs: irb_builder_func(ins, outs), # pylint: disable=W0108 + dtype="int8", + ) + + # TODO (FP): add correct FLOPS + # cfg.add_flop(2 * np.prod(topi.utils.get_const_tuple(oshape)) * KH * KW * IC) + + return res + + +@autotvm.register_topi_schedule("contrib.gemmini.max_pool2d") +def schedule_max_pool2d( + cfg: tvm.autotvm.task.space.FallbackConfigEntity, outs: tvm.ir.container.Array +) -> tvm.te.schedule.Schedule: + """Schedule definition for Gemmini's max pool2d operator + + Args: + cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity + outs (tvm.ir.container.Array): Output tensors + + Returns: + tvm.te.schedule.Schedule: transformed schedule + """ + assert len(outs) == 1 + output = outs[0] + sch = te.create_schedule(output.op) + return sch diff --git a/python/tvm/relay/backend/contrib/gemmini/op.py b/python/tvm/relay/backend/contrib/gemmini/op.py new file mode 100644 index 000000000000..990ae11f9808 --- /dev/null +++ b/python/tvm/relay/backend/contrib/gemmini/op.py @@ -0,0 +1,277 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
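+# NOTE: when several implementations are registered for the same operator (for example
+# the intrinsic gemm at plevel=9 and gemm_cisc at plevel=10 below), TVM selects the one
+# with the highest priority level unless AutoTVM tuning records show that another
+# implementation performs better.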
+# pylint: disable=unused-argument, ungrouped-imports +""" +Namespace for the supported Relay operators on Gemmini +===================== +""" + +from __future__ import absolute_import as _abs + +import tvm + +from tvm.relay.op import strategy as _strategy +from tvm.relay.op.op import OpStrategy +from tvm.contrib.gemmini.environment import Environment +from .gemmini_dense import gemm, schedule_gemm +from .gemmini_dense_cisc import gemm_cisc, schedule_gemm_cisc +from .gemmini_conv2d_cisc import conv2d_cisc, schedule_conv2d_cisc +from .gemmini_depthwise_conv2d_cisc import depthwise_conv2d_cisc, schedule_depthwise_conv2d_cisc +from .gemmini_add import add, schedule_add +from .gemmini_max_pool2d import max_pool2d, schedule_max_pool2d + +ENV = Environment.instance() + + +def wrap_max_pool2d_topi_compute(topi_compute): + """Wrapper for the max pool2d compute + + Args: + topi_compute (function): function to wrap + """ + + def wrapper(attrs, inputs, out_type): + return [ + topi_compute( + *inputs, + attrs.pool_size, + attrs.pool_strides, + attrs.pool_dilation, + attrs.pool_padding, + ) + ] + + return wrapper + + +@tvm.ir.register_op_attr("contrib.gemmini.max_pool2d", "FTVMStrategy") +def max_pool2d_strategy_gemmini(attrs, inputs, out_type, target): + """Strategy implementations for Gemmini's max_pool2d operator + + Args: + attrs (tvm.runtime.object.Object): attributes for the strategy + inputs (tvm.ir.container.Array): inputs + out_type (tvm.ir.tensor_type.TensorType): output type + target (tvm.target.target.Target): target for the strategy + + Returns: + OpStrategy: strategies implementation + """ + if len(inputs) == 2: + strategy = OpStrategy() + strategy.add_implementation( + wrap_max_pool2d_topi_compute(max_pool2d), + _strategy.wrap_topi_schedule(schedule_max_pool2d), + name="contrib.gemmini.max_pool2d", + plevel=10, + ) + return strategy + return None + + +def wrap_add_topi_compute(topi_compute): + """Wrapper for the add compute + + Args: + topi_compute (function): function to wrap + """ + + def wrapper(attrs, inputs, out_type): + ifm1_scale = float(attrs.ifm1_scale.data.numpy() / attrs.ofm_scale.data.numpy()) + ifm2_scale = float(attrs.ifm2_scale.data.numpy() / attrs.ofm_scale.data.numpy()) + return [topi_compute(*inputs, ifm1_scale, ifm2_scale)] + + return wrapper + + +@tvm.ir.register_op_attr("contrib.gemmini.add", "FTVMStrategy") +def add_strategy_gemmini(attrs, inputs, out_type, target): + """Strategy implementations for Gemmini's add operator + + Args: + attrs (tvm.runtime.object.Object): attributes for the strategy + inputs (tvm.ir.container.Array): inputs + out_type (tvm.ir.tensor_type.TensorType): output type + target (tvm.target.target.Target): target for the strategy + + Returns: + OpStrategy: strategies implementation + """ + if len(inputs) == 3: + strategy = OpStrategy() + strategy.add_implementation( + wrap_add_topi_compute(add), + _strategy.wrap_topi_schedule(schedule_add), + name="contrib.gemmini.add", + plevel=10, + ) + return strategy + return None + + +def wrap_gemm_topi_compute(topi_compute): + """Wrapper for the GEMM compute + + Args: + topi_compute (function): function to wrap + """ + + def wrapper(attrs, inputs, out_type): + return [ + topi_compute( + *inputs, float(attrs.bias_scale.data.numpy() / attrs.ofm_scale.data.numpy()) + ) + ] + + return wrapper + + +@tvm.ir.register_op_attr("contrib.gemmini.gemm", "FTVMStrategy") +def gemm_strategy_gemmini(attrs, inputs, out_type, target): + """Strategy implementations for Gemmini's GEMM operator + + Args: + attrs 
(tvm.runtime.object.Object): attributes for the strategy + inputs (tvm.ir.container.Array): inputs + out_type (tvm.ir.tensor_type.TensorType): output type + target (tvm.target.target.Target): target for the strategy + + Returns: + OpStrategy: strategies implementation + """ + if len(inputs) == 3: + strategy = OpStrategy() + strategy.add_implementation( + wrap_gemm_topi_compute(gemm), + _strategy.wrap_topi_schedule(schedule_gemm), + name="contrib.gemmini.gemm", + plevel=9, + ) + strategy.add_implementation( + wrap_gemm_topi_compute(gemm_cisc), + _strategy.wrap_topi_schedule(schedule_gemm_cisc), + name="contrib.gemmini.gemm_cisc", + plevel=10, # Higher -> used over the other one, unless AutoTVM says the other is better + ) + return strategy + return None + + +def wrap_conv2d_topi_compute(topi_compute): + """Wrapper for the conv2d compute + + Args: + topi_compute (function): function to wrap + """ + + def wrapper(attrs, inputs, out_type): + if attrs.has_activation: + gemmini_scale = float( + attrs.activation_scale_in.data.numpy() / attrs.activation_scale_out.data.numpy() + ) * float(attrs.bias_scale.data.numpy() / attrs.ofm_scale.data.numpy()) + else: + gemmini_scale = float(attrs.bias_scale.data.numpy() / attrs.ofm_scale.data.numpy()) + return [ + topi_compute( + *inputs, + attrs.strides, + attrs.padding, + int(attrs.ifm_offset.data.numpy()), + attrs.activation, + gemmini_scale, + attrs.pool_size, + attrs.pool_strides, + attrs.pool_dilation, + attrs.pool_padding, + ) + ] + + return wrapper + + +@tvm.ir.register_op_attr("contrib.gemmini.conv2d", "FTVMStrategy") +def conv2d_strategy_gemmini(attrs, inputs, out_type, target): + """Strategy implementations for Gemmini's conv2d operator + + Args: + attrs (tvm.runtime.object.Object): attributes for the strategy + inputs (tvm.ir.container.Array): inputs + out_type (tvm.ir.tensor_type.TensorType): output type + target (tvm.target.target.Target): target for the strategy + + Returns: + OpStrategy: strategies implementation + """ + if len(inputs[0].shape) == 4: + strategy = OpStrategy() + if inputs[0].shape[1] == inputs[0].shape[2]: + strategy.add_implementation( + wrap_conv2d_topi_compute(conv2d_cisc), + _strategy.wrap_topi_schedule(schedule_conv2d_cisc), + name="contrib.gemmini.conv2d_cisc", + plevel=10, + ) + return strategy + return None + + +def wrap_depthwise_conv2d_topi_compute(topi_compute): + """Wrapper for the depthwise conv2d compute + + Args: + topi_compute (function): function to wrap + """ + + def wrapper(attrs, inputs, out_type): + return [ + topi_compute( + *inputs, + attrs.strides, + attrs.padding, + int(attrs.ifm_offset.data.numpy()), + attrs.activation, + float(attrs.bias_scale.data.numpy() / attrs.ofm_scale.data.numpy()), + ) + ] + + return wrapper + + +@tvm.ir.register_op_attr("contrib.gemmini.depthwiseconv2d", "FTVMStrategy") +def depthwise_conv2d_strategy_gemmini(attrs, inputs, out_type, target): + """Strategy implementations for Gemmini's depthwiseconv2d operator + + Args: + attrs (tvm.runtime.object.Object): attributes for the strategy + inputs (tvm.ir.container.Array): inputs + out_type (tvm.ir.tensor_type.TensorType): output type + target (tvm.target.target.Target): target for the strategy + + Returns: + OpStrategy: strategies implementation + """ + if len(inputs[0].shape) == 4: + strategy = OpStrategy() + if inputs[0].shape[1] == inputs[0].shape[2]: + strategy.add_implementation( + wrap_depthwise_conv2d_topi_compute(depthwise_conv2d_cisc), + _strategy.wrap_topi_schedule(schedule_depthwise_conv2d_cisc), + 
name="contrib.gemmini.depthwiseconv2d_cisc", + plevel=10, + ) + return strategy + return None diff --git a/python/tvm/runtime/script_printer.py b/python/tvm/runtime/script_printer.py index 269cab8e5d4d..9b92910989f7 100644 --- a/python/tvm/runtime/script_printer.py +++ b/python/tvm/runtime/script_printer.py @@ -136,7 +136,6 @@ def script( The prefix of AST nodes from tvm.ir tir_prefix : str = "T" The prefix of AST nodes from tvm.tir - buffer_dtype : str = "float32" The default data type of buffer int_dtype : str = "int32" @@ -228,7 +227,6 @@ def show( The prefix of AST nodes from tvm.ir tir_prefix : str = "T" The prefix of AST nodes from tvm.tir - buffer_dtype : str = "float32" The default data type of buffer int_dtype : str = "int32" diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py index ffaeb85f7458..c6f91e1f9933 100644 --- a/python/tvm/tir/transform/transform.py +++ b/python/tvm/tir/transform/transform.py @@ -215,6 +215,17 @@ def InjectRollingBuffer(): return _ffi_api.InjectRollingBuffer() # type: ignore +def CorrectGemminisScratchpadAndAccumulatorPointers(): + """Corrects the pointer addresses of buffers inside Gemmini's scratchpad and accumulator + + Returns + ------- + fpass : tvm.transform.Pass + The result pass + """ + return _ffi_api.CorrectGemminisScratchpadAndAccumulatorPointers() # type: ignore + + def StorageRewrite(): """Rewrite storage allocation pattern. diff --git a/python/tvm/topi/hexagon/qnn/global_avg_pool2d.py b/python/tvm/topi/hexagon/qnn/global_avg_pool2d.py old mode 100644 new mode 100755 diff --git a/python/tvm/topi/hexagon/slice_ops/global_avg_pool2d.py b/python/tvm/topi/hexagon/slice_ops/global_avg_pool2d.py old mode 100644 new mode 100755 diff --git a/python/tvm/topi/x86/batch_matmul.py b/python/tvm/topi/x86/batch_matmul.py index 96b143326847..0a0545fa947b 100644 --- a/python/tvm/topi/x86/batch_matmul.py +++ b/python/tvm/topi/x86/batch_matmul.py @@ -29,6 +29,8 @@ from .dense import dense_amx_int8_schedule, dense_int8_schedule from .injective import schedule_injective_from_existing +# from .utils import target_has_avx512, target_has_amx + @autotvm.register_topi_compute("batch_matmul_int8.x86") def batch_matmul_int8_compute(cfg, x, y, *_): diff --git a/src/node/script_printer.cc b/src/node/script_printer.cc index 8293af402ed9..abb109f387bf 100644 --- a/src/node/script_printer.cc +++ b/src/node/script_printer.cc @@ -49,7 +49,6 @@ PrinterConfig::PrinterConfig(Map config_dict) { if (auto v = config_dict.Get("tir_prefix")) { n->tir_prefix = Downcast(v); } - if (auto v = config_dict.Get("buffer_dtype")) { n->buffer_dtype = DataType(runtime::String2DLDataType(Downcast(v))); } diff --git a/src/relay/op/contrib/gemmini/add.cc b/src/relay/op/contrib/gemmini/add.cc new file mode 100644 index 000000000000..b27ad4717d14 --- /dev/null +++ b/src/relay/op/contrib/gemmini/add.cc @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/op/contrib/gemmini/add.cc + * \brief Add operator definition for Gemmini. + * \author Federico Peccia + */ +#include + +#include "../../../qnn/op/op_common.h" +#include "../../../qnn/utils.h" +#include "../../op_common.h" + +namespace tvm { +namespace relay { +namespace op { +namespace contrib { +namespace gemmini { + +/*! \brief Attributes used by the Gemmini Add operators */ +struct GemminiAddAttrs : public tvm::AttrsNode { + Expr ifm1_scale; + Expr ifm1_offset; + Expr ifm2_scale; + Expr ifm2_offset; + Expr ofm_scale; + Expr ofm_offset; + Array shape; + + TVM_DECLARE_ATTRS(GemminiAddAttrs, "relay.attrs.GemminiAddAttrs") { + TVM_ATTR_FIELD(ifm1_scale).describe("Input feature map 1 quantization scale"); + TVM_ATTR_FIELD(ifm1_offset).describe("Input feature map 1 quantization offset"); + TVM_ATTR_FIELD(ifm2_scale).describe("Input feature map 2 quantization scale"); + TVM_ATTR_FIELD(ifm2_offset).describe("Input feature map 2 quantization offset"); + TVM_ATTR_FIELD(ofm_scale).describe("Output feature map quantization scale"); + TVM_ATTR_FIELD(ofm_offset).describe("Output feature map quantization offset"); + TVM_ATTR_FIELD(shape).describe("Output shape"); + } +}; + +TVM_REGISTER_NODE_TYPE(GemminiAddAttrs); + +bool GemminiAddRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + const int ifm1_index = 0; + const int ifm2_index = 1; + const int result_index = 3; + ICHECK_EQ(types.size(), result_index + 1); + + const auto* ifm1 = types[ifm1_index].as(); + const auto* ifm2 = types[ifm2_index].as(); + ICHECK(ifm1 != nullptr) << "ifm1 cannot be nullptr."; + ICHECK(ifm2 != nullptr) << "ifm2 cannot be nullptr."; + + const auto* param = attrs.as(); + ICHECK(param != nullptr) << "GemminiAddAttrs cannot be nullptr."; + + DataType ofm_dtype = DataType::Int(8); + + // Assign ofm type + Array ofm_shape({ifm1->shape[0], ifm2->shape[1], ifm2->shape[2], ifm2->shape[3]}); + reporter->Assign(types[result_index], TensorType(ofm_shape, ofm_dtype)); + return true; +} + +Expr MakeGemminiAdd(Expr ifm1, Expr ifm2, Expr ifm1_scale, Expr ifm1_offset, Expr ifm2_scale, + Expr ifm2_offset, Expr ofm_scale, Expr ofm_offset, Array shape) { + auto attrs = make_object(); + attrs->ifm1_scale = std::move(ifm1_scale); + attrs->ifm1_offset = std::move(ifm1_offset); + attrs->ifm2_scale = std::move(ifm2_scale); + attrs->ifm2_offset = std::move(ifm2_offset); + attrs->ofm_scale = std::move(ofm_scale); + attrs->ofm_offset = std::move(ofm_offset); + attrs->shape = std::move(shape); + + static const Op& op = Op::Get("contrib.gemmini.add"); + + auto requantized_ifm1 = ifm1; + + auto requantized_ifm2 = ifm2; + + auto ofm_offset_tensor = Full(attrs->ofm_offset, attrs->shape, DataType::Float(32)); + auto ifm1_offset_tensor = Multiply(Divide(attrs->ifm1_scale, attrs->ofm_scale), + Cast(attrs->ifm1_offset, DataType::Float(32))); + auto ifm2_offset_tensor = Multiply(Divide(attrs->ifm2_scale, attrs->ofm_scale), + Cast(attrs->ifm2_offset, DataType::Float(32))); + ofm_offset_tensor = Subtract(Subtract(ofm_offset_tensor, ifm1_offset_tensor), 
ifm2_offset_tensor); + + auto final_offset_tensor = tvm::relay::qnn::RequantizeOrUpcast( + ofm_offset_tensor, MakeConstantScalar(DataType::Float(32), 1), + MakeConstantScalar(DataType::Float(32), 0), MakeConstantScalar(DataType::Float(32), 1), + MakeConstantScalar(DataType::Float(32), 0), attrs->shape, -1); + + auto add_output = + Call(op, {requantized_ifm1, requantized_ifm2, final_offset_tensor}, Attrs(attrs), {}); + return add_output; +} + +TVM_REGISTER_GLOBAL("relay.op._make.gemmini_add").set_body_typed(MakeGemminiAdd); + +RELAY_REGISTER_OP("contrib.gemmini.add") + .describe("Gemmini Add operator.") + .set_attrs_type() + .set_num_inputs(3) + .add_argument("ifm1", "Tensor", "The Input 1 Feature Map tensor.") + .add_argument("ifm2", "Tensor", "The Input 2 Feature Map tensor.") + .add_argument("ofm_offset_tensor", "Tensor", "The output offset tensor.") + .set_support_level(11) + .set_attr("TOpPattern", kOpaque) + .add_type_rel("GemminiAdd", GemminiAddRel); + +} // namespace gemmini +} // namespace contrib +} // namespace op +} // namespace relay +} // namespace tvm diff --git a/src/relay/op/contrib/gemmini/convolution.cc b/src/relay/op/contrib/gemmini/convolution.cc new file mode 100644 index 000000000000..78c7f249c51c --- /dev/null +++ b/src/relay/op/contrib/gemmini/convolution.cc @@ -0,0 +1,220 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/op/contrib/gemmini/convolution.cc + * \brief 2D convolution operator definition for Gemmini. + * \author Federico Peccia + */ +#include + +#include "../../../qnn/utils.h" +#include "../../op_common.h" + +namespace tvm { +namespace relay { +namespace op { +namespace contrib { +namespace gemmini { + +/*! 
\brief Attributes used by the Gemmini 2D convolution operator */ +struct GemminiConv2dAttrs : public tvm::AttrsNode { + Array strides; + Array padding; + double ifm_scale; + Expr ifm_offset; + double weights_scale; + double weights_offset; + Expr bias_scale; + Expr bias_offset; + Expr ofm_scale; + Expr ofm_offset; + bool activation; + bool has_pool; + Array pool_size; + Array pool_strides; + Array pool_dilation; + Array pool_padding; + Expr input_req_offset_out; + Expr activation_scale_in; + Expr activation_offset_in; + Expr activation_scale_out; + Expr activation_offset_out; + bool has_activation; + + TVM_DECLARE_ATTRS(GemminiConv2dAttrs, "relay.attrs.GemminiConv2dAttrs") { + TVM_ATTR_FIELD(strides) + .set_default(Array({1, 1})) + .describe("The 2 dimensional strides as (stride_height, stride_width)."); + TVM_ATTR_FIELD(padding) + .set_default(Array({0, 0, 0, 0})) + .describe("The 4 dimensional padding."); + TVM_ATTR_FIELD(ifm_scale).set_default(1.0).describe("Input quantization scale"); + TVM_ATTR_FIELD(ifm_offset).describe("Input quantization offset"); + TVM_ATTR_FIELD(weights_scale).set_default(1.0).describe("Weights quantization scale"); + TVM_ATTR_FIELD(weights_offset).set_default(0.0).describe("Weights quantization offset"); + TVM_ATTR_FIELD(bias_scale).describe("Bias quantization scale"); + TVM_ATTR_FIELD(bias_offset).describe("Bias quantization offset"); + TVM_ATTR_FIELD(ofm_scale).describe("Output quantization scale"); + TVM_ATTR_FIELD(ofm_offset).describe("Output quantization offset"); + TVM_ATTR_FIELD(activation) + .set_default(false) + .describe("If it has a ReLu activation (True) or not (False)"); + TVM_ATTR_FIELD(has_pool).set_default(false).describe( + "If it has a pool layer (True) or not (False)"); + TVM_ATTR_FIELD(pool_size).describe("Pooling window size"); + TVM_ATTR_FIELD(pool_strides).describe("Pooling window strides"); + TVM_ATTR_FIELD(pool_dilation).describe("Pooling window dilation"); + TVM_ATTR_FIELD(pool_padding).describe("Pooling padding"); + TVM_ATTR_FIELD(input_req_offset_out).describe("Requantization output offset"); + TVM_ATTR_FIELD(activation_scale_in).describe("Activation input scaling factor"); + TVM_ATTR_FIELD(activation_offset_in).describe("Activation input offset"); + TVM_ATTR_FIELD(activation_scale_out).describe("Activation output scaling factor"); + TVM_ATTR_FIELD(activation_offset_out).describe("Activation output offset"); + TVM_ATTR_FIELD(has_activation).describe("Has activation?"); + } +}; + +TVM_REGISTER_NODE_TYPE(GemminiConv2dAttrs); + +bool GemminiConv2dRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + const int data_index = 0; + const int weights_index = 1; + const int bias_index = 2; + const int result_index = 3; + + const auto* data = types[data_index].as(); + const auto* weights = types[weights_index].as(); + const auto* bias = types[bias_index].as(); + if (data == nullptr) return false; + if (weights == nullptr) return false; + if (bias == nullptr) return false; + + const auto* params = attrs.as(); + ICHECK(params != nullptr) << "GemminiConv2dAttrs cannot be nullptr."; + + DataType ofm_dtype = DataType::Int(8); + + // Assign ofm type + PrimExpr conv2d_output_h = + ((data->shape[1] + (params->padding[0] + params->padding[2]) - weights->shape[0]) / + params->strides[0]) + + 1; + PrimExpr conv2d_output_w = + ((data->shape[2] + (params->padding[1] + params->padding[3]) - weights->shape[1]) / + params->strides[1]) + + 1; + PrimExpr max_pool2d_h = conv2d_output_h; + PrimExpr max_pool2d_w = 
conv2d_output_w;
+  if (params->has_pool) {
+    max_pool2d_h = ((conv2d_output_h + (params->pool_padding[0] + params->pool_padding[2]) -
+                     params->pool_size[0]) /
+                    params->pool_strides[0]) +
+                   1;
+    max_pool2d_w = ((conv2d_output_w + (params->pool_padding[1] + params->pool_padding[3]) -
+                     params->pool_size[1]) /
+                    params->pool_strides[1]) +
+                   1;
+  }
+  Array<IndexExpr> ofm_shape({data->shape[0], max_pool2d_h, max_pool2d_w, weights->shape[3]});
+  reporter->Assign(types[result_index], TensorType(ofm_shape, ofm_dtype));
+  return true;
+}
+
+Expr MakeGemminiConv2d(Expr data, Expr weights, Expr bias, Array<IndexExpr> strides,
+                       Array<IndexExpr> padding, double ifm_scale, Expr ifm_offset,
+                       double weights_scale, double weights_offset, Expr bias_scale,
+                       Expr bias_offset, Expr ofm_scale, Expr ofm_offset, bool activation,
+                       bool has_pool, Array<IndexExpr> pool_size, Array<IndexExpr> pool_strides,
+                       Array<IndexExpr> pool_dilation, Array<IndexExpr> pool_padding,
+                       Expr input_req_offset_out, bool has_activation, Expr activation_scale_in,
+                       Expr activation_offset_in, Expr activation_scale_out,
+                       Expr activation_offset_out) {
+  auto attrs = make_object<GemminiConv2dAttrs>();
+  attrs->strides = std::move(strides);
+  attrs->padding = std::move(padding);
+  attrs->activation = std::move(activation);
+  attrs->ifm_scale = std::move(ifm_scale);
+  attrs->ifm_offset = std::move(ifm_offset);
+  attrs->weights_scale = std::move(weights_scale);
+  attrs->weights_offset = std::move(weights_offset);
+  attrs->bias_scale = std::move(bias_scale);
+  attrs->bias_offset = std::move(bias_offset);
+  attrs->ofm_scale = std::move(ofm_scale);
+  attrs->ofm_offset = std::move(ofm_offset);
+  attrs->has_pool = std::move(has_pool);
+  attrs->pool_size = std::move(pool_size);
+  attrs->pool_strides = std::move(pool_strides);
+  attrs->pool_dilation = std::move(pool_dilation);
+  attrs->pool_padding = std::move(pool_padding);
+  attrs->input_req_offset_out = std::move(input_req_offset_out);
+  attrs->activation_scale_in = std::move(activation_scale_in);
+  attrs->activation_offset_in = std::move(activation_offset_in);
+  attrs->activation_scale_out = std::move(activation_scale_out);
+  attrs->activation_offset_out = std::move(activation_offset_out);
+  attrs->has_activation = std::move(has_activation);
+
+  static const Op& op = Op::Get("contrib.gemmini.conv2d");
+
+  auto zero_const = MakeConstantScalar(DataType::Int(32), 0);
+  auto one_const = MakeConstantScalar(DataType::Int(32), 1);
+
+  auto new_bias = bias;
+  // Bias change
+  // Term 3
+  auto reduced_t3 = Sum(Cast(weights, DataType::Int(32)), {0, 1, 2}, false, false);
+  auto term3 = Multiply(attrs->ifm_offset, reduced_t3);
+  auto input_req_bias_term = Multiply(attrs->input_req_offset_out, reduced_t3);
+
+  new_bias = Add(Subtract(bias, term3), input_req_bias_term);
+  auto scale_1 = Divide(attrs->bias_scale, attrs->ofm_scale);
+  auto bias_fix = Divide(Cast(attrs->ofm_offset, DataType::Float(32)), scale_1);
+  new_bias = Add(new_bias, Cast(bias_fix, DataType::Int(32)));
+
+  if (attrs->has_activation) {
+    auto scale_2 = Divide(attrs->activation_scale_in, attrs->activation_scale_out);
+    auto term_1 = Cast(attrs->activation_offset_in, DataType::Float(32));
+    auto term_2 = Divide(Cast(attrs->activation_offset_out, DataType::Float(32)), scale_2);
+    auto bias_fix = Divide(Subtract(term_2, term_1), scale_1);
+    new_bias = Add(new_bias, Cast(bias_fix, DataType::Int(32)));
+  }
+
+  auto conv2d_output = Call(op, {data, weights, new_bias}, Attrs(attrs), {});
+  return conv2d_output;
+}
+
+TVM_REGISTER_GLOBAL("relay.op._make.gemmini_conv2d").set_body_typed(MakeGemminiConv2d);
+
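The bias rewrite in MakeGemminiConv2d above ("Term 3") folds the input zero point into the bias using the identity sum((x - z) * w) = sum(x * w) - z * sum(w), which is what lets the accelerator consume the raw int8 activations while the zero-point correction rides along in the bias. A minimal NumPy sketch of that identity (illustrative only, not part of the patch):

    import numpy as np

    rng = np.random.default_rng(0)
    x = rng.integers(-128, 128, size=(3, 3))   # toy activation patch (int64 for headroom)
    w = rng.integers(-128, 128, size=(3, 3))   # toy weights
    z = 5                                      # input zero point (ifm_offset)

    lhs = np.sum((x - z) * w)                  # zero-point-aware dot product
    rhs = np.sum(x * w) - z * np.sum(w)        # raw dot product plus folded "Term 3" correction
    assert lhs == rhs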
+RELAY_REGISTER_OP("contrib.gemmini.conv2d") + .describe("Gemmini 2D convolution operator") + .set_attrs_type() + .set_num_inputs(3) + .add_argument("data", "Tensor", "The Input Feature Map tensor.") + .add_argument("weights", "Tensor", "The Weights tensor.") + .add_argument("bias", "Tensor", "The bias tensor.") + .set_support_level(11) + .set_attr("TOpPattern", kOpaque) + .add_type_rel("GemminiConv2d", GemminiConv2dRel); + +} // namespace gemmini +} // namespace contrib +} // namespace op +} // namespace relay +} // namespace tvm diff --git a/src/relay/op/contrib/gemmini/depthwise_convolution.cc b/src/relay/op/contrib/gemmini/depthwise_convolution.cc new file mode 100644 index 000000000000..c956c5e1b815 --- /dev/null +++ b/src/relay/op/contrib/gemmini/depthwise_convolution.cc @@ -0,0 +1,158 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/op/contrib/gemmini/depthwise_convolution.cc + * \brief 2D depthwise convolution operator definition for Gemmini. + * \author Federico Peccia + */ +#include + +#include "../../../qnn/utils.h" +#include "../../op_common.h" + +namespace tvm { +namespace relay { +namespace op { +namespace contrib { +namespace gemmini { + +/*! 
\brief Attributes used by the Gemmini 2D depthwise convolution operator */ +struct GemminiDepthwiseConv2dAttrs : public tvm::AttrsNode { + Array strides; + Array padding; + double ifm_scale; + Expr ifm_offset; + double weights_scale; + double weights_offset; + Expr bias_scale; + Expr bias_offset; + Expr ofm_scale; + Expr ofm_offset; + bool activation; + + TVM_DECLARE_ATTRS(GemminiDepthwiseConv2dAttrs, "relay.attrs.GemminiDepthwiseConv2dAttrs") { + TVM_ATTR_FIELD(strides) + .set_default(Array({1, 1})) + .describe("The 2 dimensional strides as (stride_height, stride_width)."); + TVM_ATTR_FIELD(padding) + .set_default(Array({0, 0, 0, 0})) + .describe("The 4 dimensional padding."); + TVM_ATTR_FIELD(ifm_scale).set_default(1.0).describe("Input quantization scale"); + TVM_ATTR_FIELD(ifm_offset).describe("Input quantization offset"); + TVM_ATTR_FIELD(weights_scale).set_default(1.0).describe("Weights quantization scale"); + TVM_ATTR_FIELD(weights_offset).set_default(0.0).describe("Weights quantization offset"); + TVM_ATTR_FIELD(bias_scale).describe("Bias quantization scale"); + TVM_ATTR_FIELD(bias_offset).describe("Bias quantization offset"); + TVM_ATTR_FIELD(ofm_scale).describe("Output quantization scale"); + TVM_ATTR_FIELD(ofm_offset).describe("Output quantization offset"); + TVM_ATTR_FIELD(activation) + .set_default(false) + .describe("If it has a ReLu activation (True) or not (False)"); + } +}; + +TVM_REGISTER_NODE_TYPE(GemminiDepthwiseConv2dAttrs); + +bool GemminiDepthwiseConv2dRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + const int data_index = 0; + const int weights_index = 1; + const int bias_index = 2; + const int result_index = 3; + + const auto* data = types[data_index].as(); + const auto* weights = types[weights_index].as(); + const auto* bias = types[bias_index].as(); + if (data == nullptr) return false; + if (weights == nullptr) return false; + if (bias == nullptr) return false; + + const auto* params = attrs.as(); + ICHECK(params != nullptr) << "GemminiDepthwiseConv2dAttrs cannot be nullptr."; + + DataType ofm_dtype = DataType::Int(8); + + // Assign ofm type + Array ofm_shape( + {data->shape[0], + ((data->shape[1] + (params->padding[0] + params->padding[2]) - weights->shape[1]) / + params->strides[0]) + + 1, + ((data->shape[2] + (params->padding[1] + params->padding[3]) - weights->shape[2]) / + params->strides[1]) + + 1, + weights->shape[0]}); + reporter->Assign(types[result_index], TensorType(ofm_shape, ofm_dtype)); + return true; +} + +Expr MakeGemminiDepthwiseConv2d(Expr data, Expr weights, Expr bias, Array strides, + Array padding, double ifm_scale, Expr ifm_offset, + double weights_scale, double weights_offset, Expr bias_scale, + Expr bias_offset, Expr ofm_scale, Expr ofm_offset, + bool activation) { + auto attrs = make_object(); + attrs->strides = std::move(strides); + attrs->padding = std::move(padding); + attrs->activation = std::move(activation); + attrs->ifm_scale = std::move(ifm_scale); + attrs->ifm_offset = std::move(ifm_offset); + attrs->weights_scale = std::move(weights_scale); + attrs->weights_offset = std::move(weights_offset); + attrs->bias_scale = std::move(bias_scale); + attrs->bias_offset = std::move(bias_offset); + attrs->ofm_scale = std::move(ofm_scale); + attrs->ofm_offset = std::move(ofm_offset); + + static const Op& op = Op::Get("contrib.gemmini.depthwiseconv2d"); + + // Bias change + // Term 3 + auto reduced_t3 = Sum(Cast(weights, DataType::Int(32)), {1, 2}, false, false); + auto term3 = 
Multiply(attrs->ifm_offset, reduced_t3); + + auto new_bias = Subtract(bias, term3); + auto scale = Divide(attrs->bias_scale, attrs->ofm_scale); + auto bias_fix = Divide(Cast(attrs->ofm_offset, DataType::Float(32)), scale); + new_bias = Add(new_bias, Cast(bias_fix, DataType::Int(32))); + + auto conv2d_output = Call(op, {data, weights, new_bias}, Attrs(attrs), {}); + return conv2d_output; +} + +TVM_REGISTER_GLOBAL("relay.op._make.gemmini_depthwise_conv2d") + .set_body_typed(MakeGemminiDepthwiseConv2d); + +RELAY_REGISTER_OP("contrib.gemmini.depthwiseconv2d") + .describe("Gemmini 2D depthwise convolution operator.") + .set_attrs_type() + .set_num_inputs(3) + .add_argument("data", "Tensor", "The Input Feature Map tensor.") + .add_argument("weights", "Tensor", "The Weights tensor.") + .add_argument("bias", "Tensor", "The bias tensor.") + .set_support_level(11) + .set_attr("TOpPattern", kOpaque) + .add_type_rel("GemminiDepthwiseConv2d", GemminiDepthwiseConv2dRel); + +} // namespace gemmini +} // namespace contrib +} // namespace op +} // namespace relay +} // namespace tvm diff --git a/src/relay/op/contrib/gemmini/gemm.cc b/src/relay/op/contrib/gemmini/gemm.cc new file mode 100644 index 000000000000..eacbabafdc77 --- /dev/null +++ b/src/relay/op/contrib/gemmini/gemm.cc @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/op/contrib/gemmini/gemm.cc + * \brief GEMM operator definition for Gemmini. + * \author Federico Peccia + */ +#include + +#include "../../../qnn/utils.h" +#include "../../op_common.h" + +namespace tvm { +namespace relay { +namespace op { +namespace contrib { +namespace gemmini { + +/*! 
\brief Attributes used by the Gemmini GEMM operator */ +struct GemminiGEMMAttrs : public tvm::AttrsNode { + Expr ifm_scale; + Expr ifm_offset; + Expr bias_scale; + Expr bias_offset; + Expr ofm_scale; + Expr ofm_offset; + + TVM_DECLARE_ATTRS(GemminiGEMMAttrs, "relay.attrs.GemminiGEMMAttrs") { + TVM_ATTR_FIELD(ifm_scale).describe("Data quantization scale"); + TVM_ATTR_FIELD(ifm_offset).describe("Data quantization offset"); + TVM_ATTR_FIELD(bias_scale).describe("Bias quantization scale"); + TVM_ATTR_FIELD(bias_offset).describe("Bias quantization offset"); + TVM_ATTR_FIELD(ofm_scale).describe("Output quantization scale"); + TVM_ATTR_FIELD(ofm_offset).describe("Output quantization offset"); + } +}; + +TVM_REGISTER_NODE_TYPE(GemminiGEMMAttrs); + +bool GemminiGEMMRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + const int ifm1_index = 0; + const int ifm2_index = 1; + const int bias_index = 2; + const int result_index = 3; + + const auto* ifm1 = types[ifm1_index].as(); + const auto* ifm2 = types[ifm2_index].as(); + const auto* bias = types[bias_index].as(); + if (ifm1 == nullptr) return false; + if (ifm2 == nullptr) return false; + if (bias == nullptr) return false; + + const auto* param = attrs.as(); + ICHECK(param != nullptr) << "GemminiGEMMAttrs cannot be nullptr."; + + DataType ofm_dtype = DataType::Int(8); + + // Assign ofm type + Array ofm_shape({ifm1->shape[0], ifm2->shape[1]}); + reporter->Assign(types[result_index], TensorType(ofm_shape, ofm_dtype)); + return true; +} + +Expr MakeGemminiGEMM(Expr data, Expr weights, Expr bias, Expr ifm_scale, Expr ifm_offset, + Expr bias_scale, Expr bias_offset, Expr ofm_scale, Expr ofm_offset) { + auto attrs = make_object(); + attrs->ifm_scale = std::move(ifm_scale); + attrs->ifm_offset = std::move(ifm_offset); + attrs->bias_scale = std::move(bias_scale); + attrs->bias_offset = std::move(bias_offset); + attrs->ofm_scale = std::move(ofm_scale); + attrs->ofm_offset = std::move(ofm_offset); + + static const Op& op = Op::Get("contrib.gemmini.gemm"); + + auto weights_transposed = MakeTranspose(weights, {1, 0}); + auto reduced_t3 = Sum(Cast(weights_transposed, DataType::Int(32)), {0}, false, false); + auto term3 = Multiply(attrs->ifm_offset, reduced_t3); + + auto scale = Divide(attrs->bias_scale, attrs->ofm_scale); + auto bias_fix = Divide(Cast(attrs->ofm_offset, DataType::Float(32)), scale); + + auto new_bias = Add(Subtract(bias, term3), Cast(bias_fix, DataType::Int(32))); + + auto gemm_output = Call(op, {data, weights_transposed, new_bias}, Attrs(attrs), {}); + return gemm_output; +} + +TVM_REGISTER_GLOBAL("relay.op._make.gemmini_gemm").set_body_typed(MakeGemminiGEMM); + +RELAY_REGISTER_OP("contrib.gemmini.gemm") + .describe("Gemmini GEMM operator") + .set_attrs_type() + .set_num_inputs(3) + .add_argument("ifm1", "Tensor", "The Input Feature Map tensor.") + .add_argument("ifm2", "Tensor", "The Weights tensor.") + .add_argument("bias", "Tensor", "The bias tensor") + .set_support_level(11) + .set_attr("TOpPattern", kOpaque) + .add_type_rel("GemminiGEMM", GemminiGEMMRel); + +} // namespace gemmini +} // namespace contrib +} // namespace op +} // namespace relay +} // namespace tvm diff --git a/src/relay/op/contrib/gemmini/max_pool2d.cc b/src/relay/op/contrib/gemmini/max_pool2d.cc new file mode 100644 index 000000000000..082a4492547b --- /dev/null +++ b/src/relay/op/contrib/gemmini/max_pool2d.cc @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license 
agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/op/contrib/gemmini/max_pool2d.cc + * \brief 2D max pool operator definition for Gemmini. + * \author Federico Peccia + */ +#include + +#include "../../../qnn/utils.h" +#include "../../op_common.h" + +namespace tvm { +namespace relay { +namespace op { +namespace contrib { +namespace gemmini { + +/*! \brief Attributes used by the Gemmini GEMM operators */ +struct GemminiMaxPool2DAttrs : public tvm::AttrsNode { + Array pool_size; + Array pool_strides; + Array pool_dilation; + Array pool_padding; + Array shape; + + TVM_DECLARE_ATTRS(GemminiMaxPool2DAttrs, "relay.attrs.GemminiMaxPool2DAttrs") { + TVM_ATTR_FIELD(pool_size).describe("Pooling window size"); + TVM_ATTR_FIELD(pool_strides).describe("Pooling window strides"); + TVM_ATTR_FIELD(pool_dilation).describe("Pooling window dilation"); + TVM_ATTR_FIELD(pool_padding).describe("Pooling padding"); + TVM_ATTR_FIELD(shape).describe("Input shape"); + } +}; + +TVM_REGISTER_NODE_TYPE(GemminiMaxPool2DAttrs); + +bool GemminiMaxPool2DRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + const int data_index = 0; + const int result_index = 2; + + const auto* data = types[data_index].as(); + if (data == nullptr) return false; + + const auto* params = attrs.as(); + ICHECK(params != nullptr) << "GemminiMaxPool2DAttrs cannot be nullptr."; + + DataType ofm_dtype = DataType::Int(8); + + // Assign ofm type + PrimExpr max_pool2d_h = ((data->shape[1] + (params->pool_padding[0] + params->pool_padding[2]) - + params->pool_size[0]) / + params->pool_strides[0]) + + 1; + PrimExpr max_pool2d_w = ((data->shape[2] + (params->pool_padding[1] + params->pool_padding[3]) - + params->pool_size[1]) / + params->pool_strides[1]) + + 1; + Array ofm_shape({data->shape[0], max_pool2d_h, max_pool2d_w, data->shape[3]}); + reporter->Assign(types[result_index], TensorType(ofm_shape, ofm_dtype)); + return true; +} + +Expr MakeGemminiMaxPool2D(Expr data, Array pool_size, Array pool_strides, + Array pool_dilation, Array pool_padding, + Array shape) { + auto attrs = make_object(); + attrs->pool_size = std::move(pool_size); + attrs->pool_strides = std::move(pool_strides); + attrs->pool_dilation = std::move(pool_dilation); + attrs->pool_padding = std::move(pool_padding); + attrs->shape = std::move(shape); + + static const Op& op = Op::Get("contrib.gemmini.max_pool2d"); + + // Trick to be able to accelerate the max pooling operation using the dw convolution function of + // Gemmini ;) + auto weights = + Full(MakeConstantScalar(DataType::Int(8), 1), {attrs->shape[3], 1, 1}, DataType::Int(8)); + + auto max_pool2d_output = Call(op, {data, weights}, Attrs(attrs), {}); + + return max_pool2d_output; +} + +TVM_REGISTER_GLOBAL("relay.op._make.gemmini_max_pool2d").set_body_typed(MakeGemminiMaxPool2D); + 
+RELAY_REGISTER_OP("contrib.gemmini.max_pool2d") + .describe("Gemmini 2D max pooling operator") + .set_attrs_type() + .set_num_inputs(2) + .add_argument("data", "Tensor", "The Input Feature Map tensor.") + .add_argument("weights", "Tensor", "The Weights dummy tensor.") + .set_support_level(11) + .set_attr("TOpPattern", kOpaque) + .add_type_rel("GemminiMaxPool2D", GemminiMaxPool2DRel); + +} // namespace gemmini +} // namespace contrib +} // namespace op +} // namespace relay +} // namespace tvm diff --git a/src/target/metadata_module.cc b/src/target/metadata_module.cc index c8c099171c96..d827aac35647 100644 --- a/src/target/metadata_module.cc +++ b/src/target/metadata_module.cc @@ -229,7 +229,8 @@ runtime::Module CreateMetadataModule( // TODO(@manupa-arm) : we should be able to use csource_metadata // if the variables are empty when all the runtime modules implement get_func_names if (symbol_const_vars.empty() && is_targeting_crt && mod->IsDSOExportable() && - (target->kind->name == "c" || target->kind->name == "llvm")) { + (target->kind->name == "c" || target->kind->name == "llvm" || + target->kind->name == "gemmini")) { crt_exportable_modules.push_back(mod); } else { non_crt_exportable_modules.push_back(mod); diff --git a/src/target/source/codegen_c_host.cc b/src/target/source/codegen_c_host.cc index 1d8071774e9e..e3f87b0954b0 100644 --- a/src/target/source/codegen_c_host.cc +++ b/src/target/source/codegen_c_host.cc @@ -66,6 +66,9 @@ void CodeGenCHost::Init(bool output_ssa, bool emit_asserts, bool emit_fwd_func_d decl_stream << "#include \n"; decl_stream << "#include \n"; } + if (target_str.find("gemmini") != std::string::npos) { + decl_stream << "#include \"gemmini_testutils.h\"\n"; + } CodeGenC::Init(output_ssa); } diff --git a/src/tir/ir/stmt.cc b/src/tir/ir/stmt.cc index 1d1e674a9dd1..819f7b17c15b 100644 --- a/src/tir/ir/stmt.cc +++ b/src/tir/ir/stmt.cc @@ -35,11 +35,12 @@ namespace tir { LetStmt::LetStmt(Var var, PrimExpr value, Stmt body, Span span) { ICHECK(value.defined()); ICHECK(body.defined()); - auto vdtype = value.dtype(); // It is still valid to bind a pointer type // var to a value that is of type handle. if (var->type_annotation.as()) { - ICHECK(vdtype.is_handle()); + // TODO(FP): Is this check really necessary? + // auto vdtype = value.dtype(); + // ICHECK(vdtype.is_handle()); } else { ICHECK_EQ(value.dtype(), var.dtype()); } diff --git a/src/tir/transforms/inject_gemmini_pointer_correction.cc b/src/tir/transforms/inject_gemmini_pointer_correction.cc new file mode 100644 index 000000000000..4a9260ff014c --- /dev/null +++ b/src/tir/transforms/inject_gemmini_pointer_correction.cc @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \brief Correct pointer addresses in scratchpad and accumulator of Gemmini + * \file inject_gemmini_pointer_correction.cc + * \author Federico Peccia + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../runtime/thread_storage_scope.h" +#include "ir_utils.h" + +namespace tvm { +namespace tir { + +struct CorrectGemminisScratchpadAndAccumulatorPointersConfigNode + : public tvm::AttrsNode { + int dim; + + TVM_DECLARE_ATTRS(CorrectGemminisScratchpadAndAccumulatorPointersConfigNode, + "tir.transform.CorrectGemminisScratchpadAndAccumulatorPointersConfig") { + TVM_ATTR_FIELD(dim).describe("Systolic array DIM").set_default(16); + } +}; + +class CorrectGemminisScratchpadAndAccumulatorPointersConfig : public Attrs { + public: + TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS( + CorrectGemminisScratchpadAndAccumulatorPointersConfig, Attrs, + CorrectGemminisScratchpadAndAccumulatorPointersConfigNode); +}; + +TVM_REGISTER_NODE_TYPE(CorrectGemminisScratchpadAndAccumulatorPointersConfigNode); +TVM_REGISTER_PASS_CONFIG_OPTION("tir.CorrectGemminisScratchpadAndAccumulatorPointers", + CorrectGemminisScratchpadAndAccumulatorPointersConfig); + +class CorrectGemminisScratchpadAndAccumulatorPointersInjector : public StmtExprMutator { + public: + explicit CorrectGemminisScratchpadAndAccumulatorPointersInjector(int dim) : dim_(dim) {} + + Stmt Inject(Stmt stmt) { return this->VisitStmt(stmt); } + + PrimExpr VisitExpr_(const CallNode* op) final { + /* + This pass is used to modify the access ptr + */ + auto node = Downcast(StmtExprMutator::VisitExpr_(op)); + if (node->op.same_as(builtin::tvm_access_ptr())) { + const VarNode* buffer = node->args[1].as(); + + if (std::string(buffer->name_hint).find("local") != std::string::npos) { + PrimExpr offset = this->VisitExpr(node->args[2]); + PrimExpr extent = this->VisitExpr(node->args[3]); + + const auto* ptr_type = buffer->type_annotation.as(); + ICHECK(ptr_type) << "The provided variable is not of pointer type"; + auto scope = ptr_type->storage_scope; + auto info = GetMemoryInfo(scope); + ICHECK(info.defined()) << "Cannot find memory info of " << scope; + + int div = dim_; + + PrimExpr inner_offset = indexmod(offset, extent); + PrimExpr outer_offset = offset - inner_offset; + PrimExpr outer_offset_corrected = indexdiv(outer_offset, div); + PrimExpr offset_corrected = outer_offset_corrected + inner_offset; + + return Call(node->dtype, node->op, + {node->args[0], node->args[1], offset_corrected, extent, node->args[4]}); + } + } + return StmtExprMutator::VisitExpr_(op); + } + + private: + int dim_; +}; + +namespace transform { + +Pass CorrectGemminisScratchpadAndAccumulatorPointers() { + auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) { + auto* n = f.CopyOnWrite(); + auto cfg = ctx->GetConfig( + "tir.CorrectGemminisScratchpadAndAccumulatorPointers"); + if (!cfg.defined()) { + cfg = AttrsWithDefaultValues(); + } + n->body = CorrectGemminisScratchpadAndAccumulatorPointersInjector(cfg.value()->dim) + .Inject(std::move(n->body)); + return f; + }; + return CreatePrimFuncPass(pass_func, 0, "tir.CorrectGemminisScratchpadAndAccumulatorPointers", + {}); +} + +TVM_REGISTER_GLOBAL("tir.transform.CorrectGemminisScratchpadAndAccumulatorPointers") + .set_body_typed(CorrectGemminisScratchpadAndAccumulatorPointers); + +} // namespace transform + +} // namespace tir +} // namespace tvm diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_global_avg_pool2d.py 
b/tests/python/contrib/test_hexagon/topi/slice_op/test_global_avg_pool2d.py old mode 100644 new mode 100755 diff --git a/tests/python/topi/python/test_topi_transform.py b/tests/python/topi/python/test_topi_transform.py index 0f64b486f375..e34905f15379 100644 --- a/tests/python/topi/python/test_topi_transform.py +++ b/tests/python/topi/python/test_topi_transform.py @@ -859,6 +859,7 @@ def test_dynamic_strided_slice(): verify_dynamic_strided_slice((3, 4, 3), [0, 2, 0], [1, 2, 3]) +@tvm.testing.requires_gpu @tvm.testing.uses_gpu def test_strided_set(): verify_strided_set((3, 4, 3), (3, 2, 2), [0, 3, 0], [4, 1, 4], [1, -1, 2]) diff --git a/tests/python/unittest/test_arith_detect_cse.py b/tests/python/unittest/test_arith_detect_cse.py old mode 100644 new mode 100755 diff --git a/tests/python/unittest/test_micro_ms_tuning.py b/tests/python/unittest/test_micro_ms_tuning.py index edb27396e324..58ffa7845470 100644 --- a/tests/python/unittest/test_micro_ms_tuning.py +++ b/tests/python/unittest/test_micro_ms_tuning.py @@ -27,9 +27,39 @@ from tvm import meta_schedule as ms +def create_relay_module(): + data_shape = (1, 3, 16, 16) + weight_shape = (8, 3, 5, 5) + data = relay.var("data", relay.TensorType(data_shape, "float32")) + weight = relay.var("weight", relay.TensorType(weight_shape, "float32")) + y = relay.nn.conv2d( + data, + weight, + padding=(2, 2), + kernel_size=(5, 5), + kernel_layout="OIHW", + out_dtype="float32", + ) + f = relay.Function([data, weight], y) + mod = tvm.IRModule.from_expr(f) + mod = relay.transform.InferType()(mod) + + weight_sample = np.random.rand( + weight_shape[0], weight_shape[1], weight_shape[2], weight_shape[3] + ).astype("float32") + params = {mod["main"].params[1].name_hint: weight_sample} + + model_info = { + "in_tensor": "data", + "in_shape": data_shape, + "in_dtype": "float32", + } + + return mod, params, model_info + + @tvm.testing.requires_micro def test_micro_tuning_with_meta_schedule(): - from tests.micro.zephyr.test_ms_tuning import create_relay_module from tvm.contrib.micro.meta_schedule.local_builder_micro import get_local_builder_micro from tvm.contrib.micro.meta_schedule.rpc_runner_micro import get_rpc_runner_micro