diff --git a/docker/Dockerfile.demo_vitis_ai b/docker/Dockerfile.demo_vitis_ai
index 58326b66bf0c..8cc623e2f38c 100644
--- a/docker/Dockerfile.demo_vitis_ai
+++ b/docker/Dockerfile.demo_vitis_ai
@@ -20,10 +20,13 @@ FROM xilinx/vitis-ai:latest
 
 RUN apt-get update --fix-missing
 
-
 COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
 RUN bash /install/ubuntu_install_core.sh
 
+# Install Vitis-AI ubuntu dependencies
+COPY install/ubuntu_install_vitis_ai_core.sh /install/ubuntu_install_vitis_ai_core.sh
+RUN bash /install/ubuntu_install_vitis_ai_core.sh
+
 COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh
 RUN bash /install/ubuntu_install_python.sh
 
@@ -43,10 +46,6 @@ ENV PATH $PATH:$CARGO_HOME/bin:/usr/lib/go-1.10/bin
 COPY install/ubuntu_install_java.sh /install/ubuntu_install_java.sh
 RUN bash /install/ubuntu_install_java.sh
 
-# Install Vitis-AI ubuntu dependencies
-COPY install/ubuntu_install_vitis_ai_core.sh /install/ubuntu_install_vitis_ai_core.sh
-RUN bash /install/ubuntu_install_vitis_ai_core.sh
-
 # Install dependencies inside vitis-ai-tensorflow conda
 RUN . $VAI_ROOT/conda/etc/profile.d/conda.sh && \
     conda activate vitis-ai-tensorflow && \
diff --git a/docker/install/ubuntu_install_vitis_ai_core.sh b/docker/install/ubuntu_install_vitis_ai_core.sh
index ea05ffd170fe..a2d7c2ebe332 100644
--- a/docker/install/ubuntu_install_vitis_ai_core.sh
+++ b/docker/install/ubuntu_install_vitis_ai_core.sh
@@ -21,9 +21,9 @@ set -u
 set -o pipefail
 
 # install libraries for building Vitis-AI on ubuntu
-apt-get update && apt-get install -y --no-install-recommends \
-    graphviz\
-    gnupg2
-
-apt-get update && apt-get install -y gcc-aarch64-linux-gnu
-
+apt-get update && apt-get install -y \
+    graphviz \
+    gnupg2 \
+    gpg-agent \
+    gcc-aarch64-linux-gnu \
+    && rm -rf /var/lib/apt/lists/*
diff --git a/docs/deploy/vitis_ai.rst b/docs/deploy/vitis_ai.rst
index df29f16f9d8d..7de8f58ce54f 100755
--- a/docs/deploy/vitis_ai.rst
+++ b/docs/deploy/vitis_ai.rst
@@ -304,15 +304,22 @@ Edge hardware setup
   This section provides instructions for setting up with the `Pynq <http://www.pynq.io/>`__ platform but
   Petalinux based flows are also supported.
 
-1. Download the Pynq v2.5 image for your target (use Z1 or Z2 for
+1. Download the Pynq v2.6 image for your target (use Z1 or Z2 for
    Ultra96 target depending on board version) Link to image:
-   https://github.com/Xilinx/PYNQ/releases/tag/v2.5
+   https://github.com/Xilinx/PYNQ/releases/tag/v2.6.0
 2. Follow Pynq instructions for setting up the board: `pynq
    setup <https://pynq.readthedocs.io/en/latest/getting_started.html>`__
-3. After connecting to the board, make sure to run as root. Execute
+3. After connecting to the board, make sure to run as root. **Execute**
    ``su``
-4. Set up DPU on Pynq by following the steps here: `DPU Pynq
-   setup <https://github.com/Xilinx/DPU-PYNQ>`__
+4. Set up DPU on Pynq:
+
+    .. code:: bash
+
+     git clone --branch v1.2.0 --recursive --shallow-submodules https://github.com/Xilinx/DPU-PYNQ.git
+     cd DPU-PYNQ/upgrade
+     make
+     pip3 install pynq-dpu==1.2.0
+
 5. Run the following command to download the DPU bitstream:
 
    .. code:: bash
@@ -343,7 +350,7 @@ interface between TVM and Vitis-AI tools.
    .. code:: bash
 
       apt-get install libhdf5-dev
-      pip3 install pydot h5py
+      pip3 install pydot==1.4.1 h5py==2.8.0
 
 2. Install PyXIR
 
@@ -362,16 +369,17 @@ interface between TVM and Vitis-AI tools.
       mkdir build
       cp cmake/config.cmake build
       cd build
+      echo set\(USE_LLVM OFF\) >> config.cmake
       echo set\(USE_VITIS_AI ON\) >> config.cmake
       cmake ..
-      make
+      make tvm_runtime -j$(nproc)
 
 4. Install TVM
 
    .. code:: bash
 
       cd tvm/python
-      pip3 install -e . --user
+      pip3 install -e .
 
 5. Check whether the setup was successful in the Python shell:
 
@@ -441,7 +449,7 @@ TVM.
    import tvm
    import tvm.relay as relay
    from tvm.contrib.target import vitis_ai
-   from tvm.contrib import util, graph_runtime
+   from tvm.contrib import utils, graph_runtime
    from tvm.relay.build_module import bind_params_by_name
    from tvm.relay.op.contrib.vitis_ai import annotation
 
@@ -524,6 +532,8 @@ model in TVM with Vitis-AI at the edge. The first couple of steps will
 have to be run on the host machine and take care of quantization and
 compilation for deployment at the edge.
 
+A complete ResNet 18 example can be found `here <https://github.com/Xilinx/pyxir/tree/master/examples/tvm>`__.
+
 Host steps
 ^^^^^^^^^^
 
@@ -541,7 +551,7 @@ TVM.
    import tvm
    import tvm.relay as relay
    from tvm.contrib.target import vitis_ai
-   from tvm.contrib import util, graph_runtime
+   from tvm.contrib import utils, graph_runtime
    from tvm.relay.build_module import bind_params_by_name
    from tvm.relay.op.contrib.vitis_ai import annotation
 
@@ -549,12 +559,47 @@ After importing a convolutional neural network model using the usual
 Relay API's, annotate the Relay expression for the given Vitis-AI DPU
 target and partition the graph.
 
+.. note::
+
+    We recommend converting DPU convolutions' data layouts to NHWC and CPU convolutions'
+    data layouts to NCHW for best DPU and out of the box CPU performance. You can use the
+    ConvertLayout transformation pass two times to achieve this as demonstrated in the code
+    block underneath. You can also leave the CPU convolution layouts in NHWC and tune ARM CPU
+    performance for this data layout to avoid the layout transformation overheads introduced by
+    executing DPU convolutions in NHWC and CPU convolutions in NCHW
+    (check out the `AutoScheduling <https://tvm.apache.org/docs/tutorials/index.html#autoscheduler-template-free-auto-scheduling>`__
+    and `AutoTuning <https://tvm.apache.org/docs/tutorials/autotvm/tune_relay_arm.html>`__
+    tutorials for this).
+
 .. code:: python
 
    mod["main"] = bind_params_by_name(mod["main"], params)
+   
+   # For edge DPU we recommend converting the convolutions' data layout
+   #    to NHWC for best performance. Therefore, we first convert the layouts
+   #    of all convolutions to NHWC before partitioning. Afterwards, we can
+   #    convert any remaining convolutions (to be executed on CPU) back to NCHW.
+   desired_layouts = {'nn.conv2d': ['NHWC', 'default']}
+   seq = tvm.transform.Sequential([relay.transform.RemoveUnusedFunctions(),
+                                   relay.transform.ConvertLayout(desired_layouts),
+                                   relay.transform.FoldConstant()])
+   with tvm.transform.PassContext(opt_level=3):
+       mod = seq(mod)
+            
+   # Annotate and partition the Relay expression for the given target
    mod = annotation(mod, params, target)
    mod = relay.transform.MergeCompilerRegions()(mod)
    mod = relay.transform.PartitionGraph()(mod)
+   
+   # After partitioning we recommend transforming the remaining convolutions
+   #    (that will be executed on CPU, if any) back to NCHW data layout
+   #    for best CPU performance
+   desired_layouts = {'nn.conv2d': ['NCHW', 'default']}
+   seq = tvm.transform.Sequential([relay.transform.RemoveUnusedFunctions(),
+                                   relay.transform.ConvertLayout(desired_layouts),
+                                   relay.transform.FoldConstant()])
+   with tvm.transform.PassContext(opt_level=3):
+       mod = seq(mod)
 
 Now, we can build the TVM runtime library for executing the model. The
 TVM target is 'llvm' as the operations that can't be handled by the DPU
@@ -572,13 +617,9 @@ can be included.
 
 .. code:: python
 
-   from tvm.contrib import util
-
-   temp = util.tempdir()
-
    tvm_target = 'llvm'
    target='DPUCZDX8G-zcu104'
-   export_rt_mod_file = temp.relpath("vitis_ai.rtmod")
+   export_rt_mod_file = "vitis_ai.rtmod"
 
    with tvm.transform.PassContext(opt_level=3, config= {'relay.ext.vitis_ai.options.target': target,
    						        'relay.ext.vitis_ai.options.export_runtime_module': export_rt_mod_file}):
@@ -604,9 +645,9 @@ Save the TVM lib module so that the Vitis-AI runtime module will also be exporte
 
 .. code:: python
 
-   from tvm.contrib import util
+   from tvm.contrib import utils
 
-   temp = util.tempdir()
+   temp = utils.tempdir()
    lib.export_library(temp.relpath("tvm_lib.so"))
 
 After quantizing and compiling the model for Vitis-AI acceleration using the
@@ -638,15 +679,31 @@ Edge steps
 ^^^^^^^^^^
 
 After setting up TVM with Vitis-AI on the edge device, you can now load
-the TVM runtime module into memory and feed inputs for inference.
+the TVM runtime module into memory and feed inputs for inference. A nearly
+complete runtiem script can be found underneath. Make sure to run the script
+as root (execute ``su`` in terminal to log into root).
+
+
+.. note::
+
+    You will see a warning about the 'cpu-tf' runtime not being found. This warning is
+    expected on the board and can be ignored. Note also that you **shouldn't** import the
+    PyXIR targets in the run script (``import pyxir.contrib.target.DPUCZDX8G``).
 
 .. code:: python
 
+   import pyxir
+   import tvm
+   from tvm.contrib import graph_runtime
+
    ctx = tvm.cpu()
+   
+   # input_name = ...
+   # input_data = ...
 
    # load the module into memory
    lib = tvm.runtime.load_module("tvm_dpu_arm.so")
 
    module = graph_runtime.GraphModule(lib["default"](tvm.cpu()))
-   module.set_input(name, data)
+   module.set_input(input_name, input_data)
    module.run()
diff --git a/python/tvm/contrib/target/vitis_ai.py b/python/tvm/contrib/target/vitis_ai.py
index d4931d9e3f48..f319fd799829 100644
--- a/python/tvm/contrib/target/vitis_ai.py
+++ b/python/tvm/contrib/target/vitis_ai.py
@@ -132,14 +132,14 @@ def vitis_ai_compiler(ref):
         layers = xgraph.get_layers()
 
         # Get the output tensor names using XGraph and output Relay ids
-        out_tensor_names = []
+        out_tensor_names = ["unknown_name"] * len(output_relay_ids)
         for layer in layers:
             if not layer.internal:
                 for relay_id in layer.attrs["relay_id"]:
                     if relay_id in output_relay_ids:
-                        out_tensor_names.append(layer.name)
+                        out_tensor_names[output_relay_ids.index(relay_id)] = layer.name
                         break
-        if not out_tensor_names:
+        if any([name == "unkown_name" for name in out_tensor_names]):
             raise ValueError(
                 "During codegeneration the loading of subexpression \
                              failed due to output tensor name mismatch in Relay PyXIR interface."
diff --git a/python/tvm/relay/op/contrib/vitis_ai.py b/python/tvm/relay/op/contrib/vitis_ai.py
index fa17c63fc00a..aaa9f99e61ed 100644
--- a/python/tvm/relay/op/contrib/vitis_ai.py
+++ b/python/tvm/relay/op/contrib/vitis_ai.py
@@ -85,6 +85,10 @@ def visit_call(self, call):
 
 def annotation(mod, params, target):
     """Annotate Relay expression for Vitis-AI DPU accelerators"""
+    # We need type information for supporting models that contain operations that don't
+    #   have a Relay to XLayer translation
+    mod = relay.transform.InferType()(mod)
+
     xgraph = pyxir.frontend.tvm.from_relay(mod, params, postprocessing=None)
     xgraph = pyxir.partition(xgraph, targets=[target])