From 2a422058b02ff13853f13efe27d017aa20de5187 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Tue, 14 Mar 2017 23:12:36 -0700
Subject: [PATCH 1/2] [VERILOG] VPI Mem Interface/ VPI MMap

---
 dmlc-core                                     |   2 +-
 include/tvm/runtime/c_runtime_api.h           |   7 +
 include/tvm/runtime/packed_func.h             |   2 +
 python/tvm/__init__.py                        |   2 +-
 python/tvm/_ctypes/_ndarray.py                |  13 +-
 python/tvm/addon/verilog.py                   |  14 +-
 python/tvm/ndarray.py                         |   2 +-
 src/codegen/verilog/vpi_device_api.cc         | 416 ++++++++++++++++++
 src/codegen/verilog/vpi_session.cc            | 171 +++----
 src/codegen/verilog/vpi_session.h             |  48 +-
 src/pass/make_api.cc                          |   2 +-
 src/runtime/c_runtime_api.cc                  |   2 +-
 src/runtime/cpu_device_api.cc                 |   7 +-
 src/runtime/device_api.h                      |   1 +
 src/runtime/opencl/opencl_device_api.cc       |   2 +-
 tests/python/unittest/test_runtime_ndarray.py |   5 +-
 tests/verilog/test_counter.py                 |   1 -
 tests/verilog/test_loop.py                    |   1 -
 tests/verilog/test_vpi_mem_interface.py       | 131 ++++++
 tests/verilog/test_vpi_mem_interface.v        |  57 +++
 tests/verilog/test_vpi_mmap.py                |  56 +++
 tests/verilog/test_vpi_mmap.v                 |  53 +++
 verilog/tvm_vpi.cc                            |  58 ++-
 verilog/tvm_vpi.h                             |  17 +-
 verilog/tvm_vpi_mem_interface.v               |  43 ++
 verilog/tvm_vpi_mmap.v                        |  43 ++
 26 files changed, 1035 insertions(+), 121 deletions(-)
 create mode 100644 src/codegen/verilog/vpi_device_api.cc
 create mode 100644 tests/verilog/test_vpi_mem_interface.py
 create mode 100644 tests/verilog/test_vpi_mem_interface.v
 create mode 100644 tests/verilog/test_vpi_mmap.py
 create mode 100644 tests/verilog/test_vpi_mmap.v
 create mode 100644 verilog/tvm_vpi_mem_interface.v
 create mode 100644 verilog/tvm_vpi_mmap.v
diff --git a/dmlc-core b/dmlc-core
index c2871f5db508..2b75a0ce6f19 160000
--- a/dmlc-core
+++ b/dmlc-core
@@ -1 +1 @@
-Subproject commit c2871f5db50830f5278ff6e323e8e51a6d5516dd
+Subproject commit 2b75a0ce6f191ad0fcb5319039b41e990968542a
diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h
index 9b10f16008c2..91f33d495cc8 100644
--- a/include/tvm/runtime/c_runtime_api.h
+++ b/include/tvm/runtime/c_runtime_api.h
@@ -38,6 +38,13 @@
 TVM_EXTERN_C {
 /*! \brief type of array index. */
 typedef int64_t tvm_index_t;
+
+/*! \brief Extension device types in TVM */
+typedef enum {
+  /*! \brief Simulated on board RAM */
+  kVPI = 9
+} TVMDeviceExtType;
+
 /*!
  * \brief The type code in TVMType
  * \note TVMType is used in two places.
diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index 592b418e5918..0d5064dcff00 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -274,6 +274,7 @@ class TVMArgValue : public TVMPODValue_ {
     return value_.v_type;
   }
   operator PackedFunc() const {
+    if (type_code_ == kNull) return PackedFunc();
     TVM_CHECK_TYPE_CODE(type_code_, kFuncHandle);
     return *ptr<PackedFunc>();
   }
@@ -350,6 +351,7 @@ class TVMRetValue : public TVMPODValue_ {
     return value_.v_type;
   }
   operator PackedFunc() const {
+    if (type_code_ == kNull) return PackedFunc();
     TVM_CHECK_TYPE_CODE(type_code_, kFuncHandle);
     return *ptr<PackedFunc>();
   }
diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py
index 4e294bdafdf2..51cb4a179436 100644
--- a/python/tvm/__init__.py
+++ b/python/tvm/__init__.py
@@ -15,7 +15,7 @@
 from . import module
 
 from . import ndarray as nd
-from .ndarray import cpu, gpu, opencl, cl
+from .ndarray import cpu, gpu, opencl, cl, vpi
 
 from ._base import TVMError
 from .api import *
diff --git a/python/tvm/_ctypes/_ndarray.py b/python/tvm/_ctypes/_ndarray.py
index 05f300ef4062..72ba6402c150 100644
--- a/python/tvm/_ctypes/_ndarray.py
+++ b/python/tvm/_ctypes/_ndarray.py
@@ -18,7 +18,8 @@ class TVMContext(ctypes.Structure):
     MASK2STR = {
         1 : 'cpu',
         2 : 'gpu',
-        4 : 'opencl'
+        4 : 'opencl',
+        9 : 'vpi'
     }
     def __init__(self, device_id, device_type):
         super(TVMContext, self).__init__()
@@ -76,6 +77,16 @@ def opencl(dev_id=0):
     """
     return TVMContext(dev_id, 4)
 
+def vpi(dev_id=0):
+    """Construct a VPI simulated device
+
+    Parameters
+    ----------
+    dev_id : int, optional
+        The integer device id
+    """
+    return TVMContext(dev_id, 9)
+
 
 def numpyasarray(np_data):
     """Return a TVMArray representation of a numpy array.
diff --git a/python/tvm/addon/verilog.py b/python/tvm/addon/verilog.py
index 1321c966e2fd..9ec36fa880ae 100644
--- a/python/tvm/addon/verilog.py
+++ b/python/tvm/addon/verilog.py
@@ -17,6 +17,7 @@ def __init__(self, handle):
         super(VPISession, self).__init__(handle)
         self.proc = None
         self.execpath = None
+        self.yield_callbacks = []
 
     def __del__(self):
         self.proc.kill()
@@ -47,6 +48,8 @@ def __getattr__(self, name):
 
     def yield_until_posedge(self):
         """Yield until next posedge"""
+        for f in self.yield_callbacks:
+            f()
         return _api_internal._vpi_SessYield(self)
 
     def shutdown(self):
@@ -222,7 +225,16 @@ def session(file_name):
     env['TVM_HREAD_PIPE'] = str(read_host)
     env['TVM_HWRITE_PIPE'] = str(write_host)
 
-    proc = subprocess.Popen(cmd, env=env, close_fds=False)
+    try:
+        # close_fds does not work well for all python3
+        # Use pass_fds instead.
+        # pylint: disable=unexpected-keyword-arg
+        pass_fds = (read_device, write_device, read_host, write_host)
+        proc = subprocess.Popen(cmd, pass_fds=pass_fds, env=env)
+    except TypeError:
+        # This is effective for python2
+        proc = subprocess.Popen(cmd, close_fds=False, env=env)
+
     # close device side pipe
     os.close(read_device)
     os.close(write_device)
diff --git a/python/tvm/ndarray.py b/python/tvm/ndarray.py
index 1d56e60b6ce9..4a88968143a8 100644
--- a/python/tvm/ndarray.py
+++ b/python/tvm/ndarray.py
@@ -7,7 +7,7 @@
 import numpy as _np
 
 from ._ctypes._ndarray import TVMContext, TVMType, NDArrayBase
-from ._ctypes._ndarray import cpu, gpu, opencl, empty, sync
+from ._ctypes._ndarray import cpu, gpu, opencl, vpi, empty, sync
 from ._ctypes._ndarray import _init_ndarray_module
 from ._ctypes._function import Function
 
diff --git a/src/codegen/verilog/vpi_device_api.cc b/src/codegen/verilog/vpi_device_api.cc
new file mode 100644
index 000000000000..384156cae200
--- /dev/null
+++ b/src/codegen/verilog/vpi_device_api.cc
@@ -0,0 +1,416 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file vpi_device.cc
+ * \brief Simulated VPI RAM device.
+ */
+#include <tvm/runtime/registry.h>
+#include <tvm/packed_func_ext.h>
+#include <cstdlib>
+#include <unordered_map>
+#include <map>
+#include <queue>
+#include "../../runtime/device_api.h"
+#include "./vpi_session.h"
+
+namespace tvm {
+namespace codegen {
+
+/*! \brief Simulated device ram */
+class VPIDeviceAPI : public runtime::DeviceAPI {
+ public:
+  VPIDeviceAPI() {
+    static const size_t kAllocAlign = 32U;
+    const char* s_ram_size = getenv("TVM_VPI_RAM_SIZE_MB");
+    // 16 MB ram.
+    int ram_size = 32;
+    if (s_ram_size != nullptr) {
+      ram_size = atoi(s_ram_size);
+    }
+    ram_.resize(ram_size << 17);
+    ram_head_ = kAllocAlign;
+    ram_max_ = ram_.size() * sizeof(int64_t);
+    LOG(INFO) << "Initialize VPI simulated ram " << ram_size << "MB ...";
+  }
+  // convert address to real address
+  void* RealAddr(const void* addr, size_t size) const {
+    int64_t ptr = reinterpret_cast<int64_t>(addr);
+    CHECK_LE(ptr + size, ram_max_)
+        << "VPI: Illegal memory access";
+    return (char*)(&ram_[0]) + ptr;  // NOLINT(*)
+  }
+  // convert address to real address
+  void* RealAddrSafe(const void* addr, size_t size) const {
+    int64_t ptr = reinterpret_cast<int64_t>(addr);
+    if (ptr + size >= ram_max_) return nullptr;
+    return (char*)(&ram_[0]) + ptr;  // NOLINT(*)
+  }
+  void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment) final {
+    static const size_t kAllocAlign = 32U;
+    // always align to 32 bytes at least.
+    CHECK_LE(alignment, kAllocAlign);
+    alignment = kAllocAlign;
+    // always allocate block with aligned size.
+    size += alignment - (size % alignment);
+    // This is not thread safe, but fine for simulation.
+    auto it = free_blocks_.lower_bound(size);
+    if (it != free_blocks_.end()) {
+      size_t head = it->second;
+      free_blocks_.erase(it);
+      Block& b = block_map_.at(head);
+      CHECK(b.is_free);
+      b.is_free = false;
+      return reinterpret_cast<void*>(head);
+    } else {
+      CHECK_EQ(ram_head_ % kAllocAlign, 0U);
+      Block b;
+      b.size = size;
+      b.is_free = false;
+      CHECK_LE(ram_head_ + size, ram_max_)
+          << "VPI: Out of memory";
+      block_map_[ram_head_] = b;
+      void* ret = reinterpret_cast<void*>(ram_head_);
+      ram_head_ += size;
+      return ret;
+    }
+  }
+  void FreeDataSpace(TVMContext ctx, void* ptr) final {
+    size_t head = reinterpret_cast<size_t>(ptr);
+    Block& b = block_map_.at(head);
+    b.is_free = true;
+    free_blocks_.insert({b.size, head});
+  }
+  void CopyDataFromTo(const void* from,
+                      void* to,
+                      size_t size,
+                      TVMContext ctx_from,
+                      TVMContext ctx_to,
+                      TVMStreamHandle stream) final {
+    if (static_cast<int>(ctx_from.device_type) == kVPI) {
+      from = RealAddr(from, size);
+    }
+    if (static_cast<int>(ctx_to.device_type) == kVPI) {
+      to = RealAddr(to, size);
+    }
+    memcpy(to, from, size);
+  }
+  void StreamSync(TVMContext ctx, TVMStreamHandle stream) final {
+  }
+  static VPIDeviceAPI* Global() {
+    static VPIDeviceAPI inst;
+    return &inst;
+  }
+
+ private:
+  // allocator block for reuse
+  struct Block {
+    // The size of the block
+    size_t size;
+    // Whether this is already freed.
+    bool is_free{true};
+  };
+  // head -> blocks
+  std::unordered_map<size_t, Block> block_map_;
+  // size -> free heads.
+  std::multimap<size_t, size_t> free_blocks_;
+  // top of the ram
+  size_t ram_head_, ram_max_;
+  // The ram space.
+  std::vector<int64_t> ram_;
+};
+
+/* !\brief vector buffer to help read/write */
+class VPIVecBuffer {
+ public:
+  // Put data into vec
+  void put_vec(const VPIHandle& h, size_t nwords,
+               const void* dptr, size_t size) {
+    wbuf_.resize(nwords);
+    vbuf_.resize(nwords);
+    memcpy(&wbuf_[0], dptr, size);
+    for (size_t i = 0; i < nwords; ++i) {
+      vbuf_[i].aval = wbuf_[i];
+      vbuf_[i].bval = 0;
+    }
+    h.put_vec(vbuf_);
+  }
+  // read data from vec.
+  void get_vec(const VPIHandle& h, void* dptr, size_t size) {
+    h.get_vec(&vbuf_);
+    wbuf_.resize(vbuf_.size());
+    for (size_t i = 0; i < vbuf_.size(); ++i) {
+      wbuf_[i] = vbuf_[i].aval;
+      CHECK_EQ(vbuf_[i].bval, 0)
+          << "Write indetermined value to RAM";
+    }
+    memcpy(dptr, &wbuf_[0], size);
+  }
+
+ private:
+  // Temporal buffers.
+  std::vector<int32_t> wbuf_;
+  std::vector<vpi::VPIVecVal> vbuf_;
+};
+
+/*!
+ * \brief Memory interface for VPI memory.
+ */
+class VPIMemoryInterface {
+ public:
+  // Initialize the FSM.
+  void Init(VPIHandle module) {
+    device_ = VPIDeviceAPI::Global();
+    in_rst_ = module["rst"];
+    // read ports
+    in_read_dequeue_ = module["read_en"];
+    out_reg_read_data_ = module["reg_read_data"];
+    // Write ports
+    in_write_enqueue_ = module["write_en"];
+    in_write_data_ = module["write_data_in"];
+    // Status port
+    out_reg_read_valid_ = module["reg_read_valid"];
+    out_reg_write_ready_ = module["reg_write_ready"];
+    // memory control signal
+    ctrl_read_req_ = module["host_read_req"];
+    ctrl_read_addr_ = module["host_read_addr"];
+    ctrl_read_size_ = module["host_read_size"];
+    ctrl_write_req_ = module["host_write_req"];
+    ctrl_write_addr_ = module["host_write_addr"];
+    ctrl_write_size_ = module["host_write_size"];
+    // The bit and bytes;
+    size_t read_bits =  out_reg_read_data_.size();
+    size_t write_bits =  in_write_data_.size();
+    CHECK_EQ(read_bits % 8U, 0)
+        << "Read/write unit have to be multiple of 8 bit(bytes)";
+    CHECK_EQ(write_bits % 8U, 0)
+        << "Read/write unit have to be multiple of 8 bit(bytes)";
+    read_unit_bytes_ = read_bits / 8U;
+    write_unit_bytes_ = write_bits / 8U;
+  }
+  // Callback at post-edge.
+  void AtPosEedge() {
+    // reset
+    if (in_rst_.get_int()) {
+      CHECK_EQ(pending_read_.size, 0U);
+      CHECK_EQ(pending_write_.size, 0U);
+      CHECK(read_tasks_.empty());
+      CHECK(write_tasks_.empty());
+      out_reg_write_ready_.put_int(0);
+      out_reg_read_valid_.put_int(0);
+      return;
+    }
+    // read write tasks
+    if (in_read_dequeue_.get_int() || !out_reg_read_valid_.get_int()) {
+      ReadFromFIFO();
+    }
+    // update write full
+    if (in_write_enqueue_.get_int()) {
+      CHECK(out_reg_write_ready_.get_int());
+      WriteToFIFO();
+    }
+    if (pending_write_.size || write_tasks_.size()) {
+      out_reg_write_ready_.put_int(1);
+    } else {
+      out_reg_write_ready_.put_int(0);
+    }
+    // Control tasks
+    if (ctrl_read_addr_.get_int() != 0) {
+      FIFOTask tsk;
+      tsk.addr = reinterpret_cast<char*>(ctrl_read_addr_.get_int());
+      tsk.size = static_cast<size_t>(ctrl_read_size_.get_int());
+      read_tasks_.push(tsk);
+    }
+    // Control tasks
+    if (ctrl_write_addr_.get_int() != 0) {
+      FIFOTask tsk;
+      tsk.addr = reinterpret_cast<char*>(ctrl_write_addr_.get_int());
+      tsk.size = static_cast<size_t>(ctrl_write_size_.get_int());
+      write_tasks_.push(tsk);
+    }
+  }
+
+ private:
+  // The FIFO tasks
+  struct FIFOTask {
+    char* addr{nullptr};
+    size_t size{0};
+  };
+  // handle dequeue event
+  void ReadFromFIFO() {
+    if (pending_read_.size == 0) {
+      if (!read_tasks_.empty()) {
+        pending_read_ = read_tasks_.front();
+        read_tasks_.pop();
+        // translate to real memory addr
+        pending_read_.addr = static_cast<char*>(
+            device_->RealAddr(
+                pending_read_.addr, pending_read_.size));
+      }
+    }
+    if (pending_read_.size != 0) {
+      // The size to be read
+      size_t nread = std::min(pending_read_.size, read_unit_bytes_);
+      // Read from the data
+      size_t nwords = (read_unit_bytes_ + 3) / 4;
+      vbuf_.put_vec(out_reg_read_data_, nwords,
+                    pending_read_.addr, nread);
+      // Update the pointer
+      pending_read_.size -= nread;
+      pending_read_.addr += nread;
+      // read into the vector
+      out_reg_read_valid_.put_int(1);
+    } else {
+      out_reg_read_valid_.put_int(0);
+    }
+  }
+  // handle write event
+  void WriteToFIFO() {
+    if (pending_write_.size == 0) {
+      if (!write_tasks_.empty()) {
+        pending_write_ = write_tasks_.front();
+        write_tasks_.pop();
+        // translate to real memory addr
+        pending_write_.addr = static_cast<char*>(
+            device_->RealAddr(
+                pending_write_.addr, pending_write_.size));
+      }
+    }
+    if (pending_write_.size != 0) {
+      // write to the ram.
+      size_t nwrite = std::min(pending_write_.size, write_unit_bytes_);
+      vbuf_.get_vec(in_write_data_, pending_write_.addr, nwrite);
+      // Update the pointer
+      pending_write_.size -= nwrite;
+      pending_write_.addr += nwrite;
+    }
+  }
+  // Device API
+  VPIDeviceAPI* device_{nullptr};
+  // Input clock and reset
+  VPIHandle in_rst_;
+  // Read FIFO signal
+  VPIHandle in_read_dequeue_;
+  // Write FIFO signal
+  VPIHandle in_write_enqueue_;
+  VPIHandle in_write_data_;
+  // Read memory controler signals
+  VPIHandle ctrl_read_req_;
+  VPIHandle ctrl_read_addr_;
+  VPIHandle ctrl_read_size_;
+  // Write memory controler signal signals
+  VPIHandle ctrl_write_req_;
+  VPIHandle ctrl_write_addr_;
+  VPIHandle ctrl_write_size_;
+  // Read FIFO outputs
+  VPIHandle out_reg_read_data_;
+  VPIHandle out_reg_read_valid_;
+  // Write FIFO outputs
+  VPIHandle out_reg_write_ready_;
+  // Size of current pending read.
+  FIFOTask pending_read_;
+  FIFOTask pending_write_;
+  // The read/write task queues.
+  std::queue<FIFOTask> read_tasks_;
+  std::queue<FIFOTask> write_tasks_;
+  // Unit bytes for read/writing
+  size_t read_unit_bytes_;
+  size_t write_unit_bytes_;
+  // Temporal buffers.
+  VPIVecBuffer vbuf_;
+};
+
+// Read only memory map.
+class VPIMemMapBase {
+ public:
+  // Initialize the FSM.
+  void Init(VPIHandle module, const std::string& data_port) {
+    device_ = VPIDeviceAPI::Global();
+    // intiatialize the connections
+    rst_ = module["rst"];
+    addr_ = module["addr"];
+    data_ = module[data_port];
+    mmap_addr_ = module["mmap_addr"];
+    size_t unit_bits =  data_.size();
+    CHECK_EQ(unit_bits % 8U, 0)
+        << "Read/write unit have to be multiple of 8 bit(bytes)";
+    unit_bytes_ = unit_bits / 8U;
+  }
+  void* RealAddr() {
+    int byte_offset = addr_.get_int() * unit_bytes_;
+    void* ptr =
+        device_->RealAddrSafe(
+            reinterpret_cast<void*>(mmap_addr_.get_int() + byte_offset), 1);
+    return ptr;
+  }
+
+ protected:
+  // Device API
+  VPIDeviceAPI* device_{nullptr};
+  VPIHandle rst_;
+  VPIHandle addr_;
+  VPIHandle data_;
+  VPIHandle mmap_addr_;
+  size_t unit_bytes_;
+  VPIVecBuffer vbuf_;
+};
+
+class VPIReadMemMap : public VPIMemMapBase {
+ public:
+  void Init(VPIHandle module) {
+    VPIMemMapBase::Init(module, "reg_data");
+  }
+  void AtPosEedge() {
+    void* ptr = RealAddr();
+    if (ptr == nullptr) return;
+    size_t nwords = (unit_bytes_ + 3) / 4;
+    vbuf_.put_vec(data_, nwords, ptr, unit_bytes_);
+  }
+};
+
+// Write only memory map.
+class VPIWriteMemMap : public VPIMemMapBase {
+ public:
+  void Init(VPIHandle module) {
+    VPIMemMapBase::Init(module, "data_in");
+    enable_ = module["en"];
+  }
+  void AtPosEedge() {
+    if (!enable_.get_int() || rst_.get_int()) return;
+    void* ptr = RealAddr();
+    CHECK(ptr != nullptr)
+        << "Illegal write to VPI RAM";
+    vbuf_.get_vec(data_, ptr, unit_bytes_);
+  }
+
+ private:
+  VPIHandle enable_;
+};
+
+TVM_REGISTER_GLOBAL(_device_api_vpi)
+.set_body([](runtime::TVMArgs args, runtime::TVMRetValue* rv) {
+    runtime::DeviceAPI* ptr = VPIDeviceAPI::Global();
+    *rv = static_cast<void*>(ptr);
+  });
+
+template<typename T>
+void TVMVPIHook(runtime::TVMArgs args, runtime::TVMRetValue* rv) {
+  VPIHandle m = args[0];
+  std::shared_ptr<T> p = std::make_shared<T>();
+  p->Init(m);
+  LOG(INFO) << "Hook " << m.name() << " to tvm vpi simulation...";
+  PackedFunc pf([p](const runtime::TVMArgs&, runtime::TVMRetValue*) {
+      p->AtPosEedge();
+    });
+  *rv = pf;
+}
+
+TVM_REGISTER_GLOBAL(_vpi_module_tvm_vpi_mem_interface)
+.set_body(TVMVPIHook<VPIMemoryInterface>);
+
+TVM_REGISTER_GLOBAL(_vpi_module_tvm_vpi_read_mmap)
+.set_body(TVMVPIHook<VPIReadMemMap>);
+
+TVM_REGISTER_GLOBAL(_vpi_module_tvm_vpi_write_mmap)
+.set_body(TVMVPIHook<VPIWriteMemMap>);
+
+}  // namespace codegen
+}  // namespace tvm
diff --git a/src/codegen/verilog/vpi_session.cc b/src/codegen/verilog/vpi_session.cc
index 14cc815debd1..7a3190b5beec 100644
--- a/src/codegen/verilog/vpi_session.cc
+++ b/src/codegen/verilog/vpi_session.cc
@@ -11,20 +11,19 @@ namespace codegen {
 
 using namespace vpi;
 
-/*! \brief Container for session. */
-class VPISessionNode : public Node {
+// helper class to get the node.
+class VPISessionEntry {
  public:
   // Whether in control.
   bool in_control{false};
   // Internal reader and writer.
   common::Pipe reader;
   common::Pipe writer;
-
   // internal constructor
-  VPISessionNode(int h_pipe_read, int h_pipe_write)
+  VPISessionEntry(int h_pipe_read, int h_pipe_write)
       : reader(h_pipe_read), writer(h_pipe_write) {
   }
-  ~VPISessionNode() {
+  ~VPISessionEntry() {
     if (in_control) {
       VPIReturnCode cd;
       writer.Write(kShutDown);
@@ -33,40 +32,11 @@ class VPISessionNode : public Node {
     reader.Close();
     writer.Close();
   }
-  // visit all attributes
-  void VisitAttrs(AttrVisitor* v) final {
-  }
   void ReadExpect(VPIReturnCode rcode) {
     VPIReturnCode code;
     CHECK(reader.Read(&code));
     CHECK_EQ(code, rcode) << "Error in simulation";
   }
-
-  static constexpr const char* _type_key = "VPISession";
-  TVM_DECLARE_NODE_TYPE_INFO(VPISessionNode, Node);
-};
-
-/*! \brief Container for handle */
-class VPIHandleNode : public Node {
- public:
-  // The internal session.
-  VPISession sess;
-  // Internal handle
-  VPIRawHandle handle;
-
-  void VisitAttrs(AttrVisitor* v) final {
-    v->Visit("sess", &sess);
-  }
-  static VPIHandle make(const VPISession& sess, VPIRawHandle handle) {
-    std::shared_ptr<VPIHandleNode> n =
-        std::make_shared<VPIHandleNode>();
-    n->sess = sess;
-    n->handle = handle;
-    return VPIHandle(n);
-  }
-
-  static constexpr const char* _type_key = "VPIHandle";
-  TVM_DECLARE_NODE_TYPE_INFO(VPIHandleNode, Node);
 };
 
 // Inline implementations
@@ -77,34 +47,99 @@ inline VPIHandleNode* VPIHandle::get() const {
   return static_cast<VPIHandleNode*>(node_.get());
 }
 
-VPISession VPISession::make(int h_pipe_read, int h_pipe_write) {
-  std::shared_ptr<VPISessionNode> n = std::make_shared<VPISessionNode>(
-      h_pipe_read, h_pipe_write);
-  n->ReadExpect(kPosEdgeTrigger);
-  n->in_control = true;
-  return VPISession(n);
+VPIHandle VPIHandleCreate(
+    const std::shared_ptr<VPISessionEntry>& sess,
+    VPIRawHandle handle) {
+  std::shared_ptr<VPIHandleNode> n = std::make_shared<VPIHandleNode>();
+  n->sess = sess;
+  n->handle = handle;
+  return VPIHandle(n);
 }
 
-VPIHandle VPISession::operator[](const std::string& name) const {
-  return GetByName(name, nullptr);
-}
-
-VPIHandle VPISession::GetByName(const std::string& name, VPIRawHandle handle) const {
-  VPISessionNode* n = get();
+VPIHandle GetHandleByName(
+    const std::shared_ptr<VPISessionEntry>& sess,
+    const std::string& name,
+    VPIRawHandle handle,
+    bool allow_undefined) {
+  VPISessionEntry* n = sess.get();
   CHECK(n->in_control);
   n->writer.Write(kGetHandleByName);
   n->writer.Write(name);
   n->writer.Write(handle);
   n->ReadExpect(kSuccess);
   CHECK(n->reader.Read(&handle));
-  CHECK(handle != nullptr)
-      << "Cannot find handle with name=" << name;
-  return VPIHandleNode::make(*this, handle);
+  if (handle != nullptr) {
+    return VPIHandleCreate(sess, handle);
+  } else {
+    CHECK(allow_undefined)
+        << "Cannot find handle with name=" << name;
+    return VPIHandle();
+  }
+}
+
+std::string VPIGetStrProp(VPIHandleNode* h, int code) {
+  VPISessionEntry* n = h->sess.get();
+  CHECK(n->in_control);
+  n->writer.Write(kGetStrProp);
+  n->writer.Write(code);
+  n->writer.Write(h->handle);
+  n->ReadExpect(kSuccess);
+  std::string str;
+  CHECK(n->reader.Read(&str));
+  return str;
+}
+
+int VPIGetIntProp(VPIHandleNode* h, int code) {
+  VPISessionEntry* n = h->sess.get();
+  CHECK(n->in_control);
+  n->writer.Write(kGetIntProp);
+  n->writer.Write(code);
+  n->writer.Write(h->handle);
+  n->ReadExpect(kSuccess);
+  int value;
+  CHECK(n->reader.Read(&value));
+  return value;
+}
+
+VPISession VPISession::make(int h_pipe_read, int h_pipe_write) {
+  std::shared_ptr<VPISessionNode> n = std::make_shared<VPISessionNode>();
+  n->sess = std::make_shared<VPISessionEntry>(h_pipe_read, h_pipe_write);
+  n->sess->in_control = true;
+  VPISession sess(n);
+  // The custom module handles
+  std::vector<VPIRawHandle> mod_handles;
+  n->sess->reader.Read(&mod_handles);
+  n->sess->ReadExpect(kPosEdgeTrigger);
+  // start Initialize the callbacks
+  for (VPIRawHandle raw_h : mod_handles) {
+    VPIHandle h = VPIHandleCreate(n->sess, raw_h);
+    CHECK_EQ(VPIGetIntProp(h.get(), kVPIType), kVPIModule)
+        << "Expect pass modules to $tvm_session after clk";
+    std::string def = VPIGetStrProp(h.get(), kVPIDefName);
+    std::string callback_name = "_vpi_module_" + def;
+    const PackedFunc* f = runtime::Registry::Get(callback_name);
+    CHECK(f != nullptr)
+        << "Cannot find definition for tvm vpi module " << def;
+    PackedFunc cb = (*f)(h);
+    n->posedge_end_callbacks.push_back(cb);
+  }
+  return sess;
+}
+
+VPIHandle VPISession::operator[](const std::string& name) const {
+  return GetHandleByName(get()->sess, name, nullptr, false);
+}
+VPIHandle VPISession::GetByName(const std::string& name,
+                                bool allow_undefined) const {
+  return GetHandleByName(get()->sess, name, nullptr, true);
 }
 
 void VPISession::yield() {
-  VPISessionNode* n = get();
+  VPISessionEntry* n = get()->sess.get();
   CHECK(n->in_control);
+  for (const PackedFunc& f : get()->posedge_end_callbacks) {
+    f();
+  }
   n->writer.Write(kYield);
   n->ReadExpect(kSuccess);
   n->in_control = false;
@@ -113,7 +148,7 @@ void VPISession::yield() {
 }
 
 void VPISession::shutdown() {
-  VPISessionNode* n = get();
+  VPISessionEntry* n = get()->sess.get();
   if (n->in_control) {
     n->writer.Write(kShutDown);
     n->ReadExpect(kSuccess);
@@ -122,20 +157,12 @@ void VPISession::shutdown() {
 }
 
 int VPIHandle::size() const {
-  VPIHandleNode* h = get();
-  VPISessionNode* n = h->sess.get();
-  CHECK(n->in_control);
-  n->writer.Write(kGetSize);
-  n->writer.Write(h->handle);
-  n->ReadExpect(kSuccess);
-  int value;
-  CHECK(n->reader.Read(&value));
-  return value;
+  return VPIGetIntProp(get(), kVPISize);
 }
 
 void VPIHandle::put_int(int value) {
   VPIHandleNode* h = get();
-  VPISessionNode* n = h->sess.get();
+  VPISessionEntry* n = h->sess.get();
   CHECK(n->in_control);
   n->writer.Write(kPutInt32);
   n->writer.Write(h->handle);
@@ -145,7 +172,7 @@ void VPIHandle::put_int(int value) {
 
 int VPIHandle::get_int() const {
   VPIHandleNode* h = get();
-  VPISessionNode* n = h->sess.get();
+  VPISessionEntry* n = h->sess.get();
   CHECK(n->in_control);
   n->writer.Write(kGetInt32);
   n->writer.Write(h->handle);
@@ -156,20 +183,12 @@ int VPIHandle::get_int() const {
 }
 
 std::string VPIHandle::name() const {
-  VPIHandleNode* h = get();
-  VPISessionNode* n = h->sess.get();
-  CHECK(n->in_control);
-  n->writer.Write(kGetName);
-  n->writer.Write(h->handle);
-  n->ReadExpect(kSuccess);
-  std::string str;
-  CHECK(n->reader.Read(&str));
-  return str;
+  return VPIGetStrProp(get(), kVPIFullName);
 }
 
 void VPIHandle::put_vec(const std::vector<VPIVecVal>& vec) const {
   VPIHandleNode* h = get();
-  VPISessionNode* n = h->sess.get();
+  VPISessionEntry* n = h->sess.get();
   CHECK(n->in_control);
   n->writer.Write(kPutVec);
   n->writer.Write(h->handle);
@@ -179,17 +198,17 @@ void VPIHandle::put_vec(const std::vector<VPIVecVal>& vec) const {
 
 void VPIHandle::get_vec(std::vector<VPIVecVal>* vec) const {
   VPIHandleNode* h = get();
-  VPISessionNode* n = h->sess.get();
+  VPISessionEntry* n = h->sess.get();
   CHECK(n->in_control);
-  n->writer.Write(kPutVec);
+  n->writer.Write(kGetVec);
   n->writer.Write(h->handle);
   n->ReadExpect(kSuccess);
-  CHECK(n->reader.Read(&vec));
+  CHECK(n->reader.Read(vec));
 }
 
 VPIHandle VPIHandle::operator[](const std::string& name) const {
   VPIHandleNode* h = get();
-  return h->sess.GetByName(name, h->handle);
+  return GetHandleByName(h->sess, name, h->handle, false);
 }
 
 // API registration
diff --git a/src/codegen/verilog/vpi_session.h b/src/codegen/verilog/vpi_session.h
index bc648adf7cba..88a7f2f1906e 100644
--- a/src/codegen/verilog/vpi_session.h
+++ b/src/codegen/verilog/vpi_session.h
@@ -14,10 +14,14 @@
 
 namespace tvm {
 namespace codegen {
+
 // node containers
 class VPISessionNode;
 class VPIHandleNode;
 class VPIHandle;
+class VPISessionEntry;
+
+using runtime::PackedFunc;
 
 /*! \brief Environment */
 class VPISession : public NodeRef {
@@ -29,6 +33,12 @@ class VPISession : public NodeRef {
    * \param name The name of the handle.
    */
   VPIHandle operator[](const std::string& name) const;
+  /*!
+   * \brief Get handle by name.
+   * \param name The name of the handle.
+   * \param allow_undefined whether allow undefined
+   */
+  VPIHandle GetByName(const std::string& name, bool allow_undefined) const;
   /*!
    * \brief Yield control back to the simulator
    *  Block until next cycle.
@@ -46,12 +56,7 @@ class VPISession : public NodeRef {
   static VPISession make(int h_pipe_read, int h_pipe_write);
   // Internal methods.
   using ContainerType = VPISessionNode;
-
- private:
-  friend class VPIHandle;
   inline VPISessionNode* get() const;
-  // Get handle by name
-  VPIHandle GetByName(const std::string& name, vpi::VPIRawHandle handle) const;
 };
 
 /*! \brief VPI Handle */
@@ -91,10 +96,39 @@ class VPIHandle : public NodeRef {
   void get_vec(std::vector<vpi::VPIVecVal>* vec) const;
   // Internal methods
   using ContainerType = VPIHandleNode;
-
- private:
   inline VPIHandleNode* get() const;
 };
+
+/*! \brief Container for session. */
+class VPISessionNode : public Node {
+ public:
+  // internal session.
+  std::shared_ptr<VPISessionEntry> sess;
+  // callbacks at pos edge end.
+  std::vector<PackedFunc> posedge_end_callbacks;
+
+  // visit all attributes
+  void VisitAttrs(AttrVisitor* v) final {
+  }
+  static constexpr const char* _type_key = "VPISession";
+  TVM_DECLARE_NODE_TYPE_INFO(VPISessionNode, Node);
+};
+
+/*! \brief Container for handle */
+class VPIHandleNode : public Node {
+ public:
+  // internal session.
+  std::shared_ptr<VPISessionEntry> sess;
+  // Internal handle
+  vpi::VPIRawHandle handle;
+
+  void VisitAttrs(AttrVisitor* v) final {
+  }
+
+  static constexpr const char* _type_key = "VPIHandle";
+  TVM_DECLARE_NODE_TYPE_INFO(VPIHandleNode, Node);
+};
+
 }  // namespace codegen
 }  // namespace tvm
 #endif  // TVM_CODEGEN_VERILOG_VPI_SESSION_H_
diff --git a/src/pass/make_api.cc b/src/pass/make_api.cc
index ead3d38d8aec..33c2a93615fa 100644
--- a/src/pass/make_api.cc
+++ b/src/pass/make_api.cc
@@ -106,7 +106,7 @@ LoweredFunc MakeAPI(Stmt body,
 
   for (int i = 0; i < static_cast<int>(api_args.size()); ++i) {
     Var v_arg = f_arg_decl(i);
-    if (i < static_cast<size_t>(num_packed_args)) {
+    if (i < num_packed_args) {
       seq_init.emplace_back(LetStmt::make(
           v_arg, f_arg_value(v_arg.type(), i), nop));
     } else {
diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc
index 63a324027294..c0b876f772ff 100644
--- a/src/runtime/c_runtime_api.cc
+++ b/src/runtime/c_runtime_api.cc
@@ -23,7 +23,7 @@ namespace runtime {
 
 class DeviceAPIManager {
  public:
-  static const int kMaxDeviceAPI = 16;
+  static const int kMaxDeviceAPI = 32;
   // Get API
   static DeviceAPI* Get(TVMContext ctx) {
     return Global()->GetAPI(ctx.device_type);
diff --git a/src/runtime/cpu_device_api.cc b/src/runtime/cpu_device_api.cc
index 3204c4e27f5e..7cb29bd2e99f 100644
--- a/src/runtime/cpu_device_api.cc
+++ b/src/runtime/cpu_device_api.cc
@@ -1,11 +1,7 @@
 /*!
  *  Copyright (c) 2016 by Contributors
- * \file device_api_gpu.h
- * \brief GPU specific API
+ * \file cpu_device_api.cc
  */
-#ifndef TVM_RUNTIME_DEVICE_API_CPU_H_
-#define TVM_RUNTIME_DEVICE_API_CPU_H_
-
 #include <dmlc/logging.h>
 #include <tvm/runtime/registry.h>
 #include <cstdlib>
@@ -58,4 +54,3 @@ TVM_REGISTER_GLOBAL(_device_api_cpu)
   });
 }  // namespace runtime
 }  // namespace tvm
-#endif  // TVM_RUNTIME_DEVICE_API_CPU_H_
diff --git a/src/runtime/device_api.h b/src/runtime/device_api.h
index 9d626075d484..699655b0c4b3 100644
--- a/src/runtime/device_api.h
+++ b/src/runtime/device_api.h
@@ -64,6 +64,7 @@ inline std::string DeviceName(DLDeviceType type) {
     case kCPU: return "cpu";
     case kGPU: return "gpu";
     case kOpenCL: return "opencl";
+    case kVPI: return "vpi";
     default: LOG(FATAL) << "unknown type =" << type; return "Unknown";
   }
 }
diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc
index 585b32814a00..af4e8cbd0f99 100644
--- a/src/runtime/opencl/opencl_device_api.cc
+++ b/src/runtime/opencl/opencl_device_api.cc
@@ -134,7 +134,7 @@ bool InitOpenCL(TVMArgs args, TVMRetValue* rv) {
   // matching conditions
   std::string platform_name, device_type;
 
-  for (size_t i = 0; i < args.num_args; ++i) {
+  for (int i = 0; i < args.num_args; ++i) {
     std::string arg = args[i];
     size_t pos = arg.find_first_of('=');
     CHECK_EQ(pos, std::string::npos)
diff --git a/tests/python/unittest/test_runtime_ndarray.py b/tests/python/unittest/test_runtime_ndarray.py
index 1c3e5f70d178..0dc6e3f1f93d 100644
--- a/tests/python/unittest/test_runtime_ndarray.py
+++ b/tests/python/unittest/test_runtime_ndarray.py
@@ -5,7 +5,10 @@ def enabled_ctx_list():
     if tvm.module.enabled("opencl"):
         tvm.module.init_opencl()
 
-    ctx_list = [('cpu', tvm.cpu(0)), ('gpu', tvm.gpu(0)), ('cl', tvm.opencl(0))]
+    ctx_list = [('cpu', tvm.cpu(0)),
+                ('gpu', tvm.gpu(0)),
+                ('cl', tvm.opencl(0)),
+                ('cpu', tvm.vpi(0))]
     ctx_list = [x[1] for x in ctx_list if tvm.module.enabled(x[0])]
     return ctx_list
 
diff --git a/tests/verilog/test_counter.py b/tests/verilog/test_counter.py
index 795b2d691a24..340e27afcf26 100644
--- a/tests/verilog/test_counter.py
+++ b/tests/verilog/test_counter.py
@@ -1,5 +1,4 @@
 import tvm
-import os
 from tvm.addon import verilog
 
 def test_counter():
diff --git a/tests/verilog/test_loop.py b/tests/verilog/test_loop.py
index 2ba19f771cc4..65a7ffaa1651 100644
--- a/tests/verilog/test_loop.py
+++ b/tests/verilog/test_loop.py
@@ -1,5 +1,4 @@
 import tvm
-import os
 from tvm.addon import verilog
 
 def test_loop():
diff --git a/tests/verilog/test_vpi_mem_interface.py b/tests/verilog/test_vpi_mem_interface.py
new file mode 100644
index 000000000000..4a97061bd64c
--- /dev/null
+++ b/tests/verilog/test_vpi_mem_interface.py
@@ -0,0 +1,131 @@
+import tvm
+import numpy as np
+from tvm.addon import verilog
+
+class FIFOReader(object):
+    """Auxiliary class to read from FIFO """
+    def __init__(self, read_data, read_valid):
+        self.read_data = read_data
+        self.read_valid = read_valid
+        self.data = []
+
+    def __call__(self):
+        if self.read_valid.get_int():
+            self.data.append(self.read_data.get_int())
+
+class FIFOWriter(object):
+    """Auxiliary class to write to FIFO """
+    def __init__(self, write_data, write_enable, write_pend, data):
+        self.write_data = write_data
+        self.write_enable = write_enable
+        self.write_pend = write_pend
+        self.data = data
+
+    def __call__(self):
+        if self.data and self.write_pend.get_int():
+            self.write_enable.put_int(1)
+            self.write_data.put_int(int(self.data[0]))
+            del self.data[0]
+        else:
+            self.write_enable.put_int(0)
+
+
+def test_ram_read():
+    n = 10
+    # context for VPI RAM
+    ctx = tvm.vpi(0)
+    a_np = np.arange(n).astype('int8')
+    a = tvm.nd.array(a_np, ctx)
+
+    # head ptr of a
+    a_ptr = int(a.handle[0].data)
+    sess = verilog.session([
+        verilog.find_file("test_vpi_mem_interface.v"),
+        verilog.find_file("tvm_vpi_mem_interface.v")
+    ])
+    rst = sess.main.rst
+    read_data = sess.main.read_data
+    read_valid = sess.main.read_data_valid
+    read_en = sess.main.read_en
+    host_read_req = sess.main.read_req
+    host_read_addr = sess.main.read_addr
+    host_read_size = sess.main.read_size
+    rst.put_int(1)
+    sess.yield_until_posedge()
+    rst.put_int(0)
+    # hook up reader
+    reader = FIFOReader(read_data, read_valid)
+    sess.yield_callbacks.append(reader)
+    # request read
+    host_read_req.put_int(1)
+    host_read_addr.put_int(a_ptr)
+    host_read_size.put_int(a.shape[0])
+
+    sess.yield_until_posedge()
+    # second read request
+    host_read_addr.put_int(a_ptr + 2)
+    host_read_size.put_int(a.shape[0] - 2)
+
+    sess.yield_until_posedge()
+    host_read_req.put_int(0)
+    read_deq.put_int(1)
+
+    # yield until read is done
+    for i in range(a.shape[0] * 2):
+        sess.yield_until_posedge()
+    # check if result matches
+    r = np.concatenate((a_np, a_np[2:]))
+    np.testing.assert_equal(np.array(reader.data), r)
+
+
+def test_ram_write():
+    n = 10
+    # read from offset
+    offset = 2
+    # context for VPI RAM
+    ctx = tvm.vpi(0)
+    a_np = np.zeros(n).astype('int8')
+    a = tvm.nd.array(a_np, ctx)
+    w_data = range(2, n)
+    r_data = np.array(w_data, dtype='int8')
+
+    # head ptr of a
+    a_ptr = int(a.handle[0].data)
+
+    sess = verilog.session([
+        verilog.find_file("test_vpi_mem_interface.v"),
+        verilog.find_file("tvm_vpi_mem_interface.v")
+    ])
+    rst = sess.main.rst
+    write_data = sess.main.write_data
+    write_en = sess.main.write_en
+    write_ready = sess.main.write_data_ready
+    host_write_req = sess.main.write_req
+    host_write_addr = sess.main.write_addr
+    host_write_size = sess.main.write_size
+
+    rst.put_int(1)
+    sess.yield_until_posedge()
+    rst.put_int(0)
+    # hook up writeer
+    writer = FIFOWriter(write_data, write_en, write_ready, w_data)
+
+    sess.yield_callbacks.append(writer)
+    # request write
+    host_write_req.put_int(1)
+    host_write_addr.put_int(a_ptr + offset)
+    host_write_size.put_int(a.shape[0] - offset)
+
+    sess.yield_until_posedge()
+    host_write_req.put_int(0)
+
+    # yield until write is done
+    for i in range(a.shape[0]+2):
+        sess.yield_until_posedge()
+
+    # check if result matches
+    np.testing.assert_equal(a.asnumpy()[2:], r_data)
+
+
+if __name__ == "__main__":
+    test_ram_write()
diff --git a/tests/verilog/test_vpi_mem_interface.v b/tests/verilog/test_vpi_mem_interface.v
new file mode 100644
index 000000000000..def8905edc5a
--- /dev/null
+++ b/tests/verilog/test_vpi_mem_interface.v
@@ -0,0 +1,57 @@
+module main();
+   parameter PER = 10;
+   parameter WIDTH = 8;
+
+   reg clk;
+   reg rst;
+   // read channels
+   reg read_en;
+   wire [WIDTH-1:0] read_data;
+   wire             read_data_valid;
+   // write channels
+   reg              write_en;
+   reg [WIDTH-1:0]  write_data;
+   wire             write_data_ready;
+   // controls
+   reg              read_req;
+   reg [31:0]       read_addr;
+   reg [31:0]       read_size;
+   reg              write_req;
+   reg [31:0]       write_addr;
+   reg [31:0]       write_size;
+
+
+   always begin
+      #(PER/2) clk =~ clk;
+   end
+
+   tvm_vpi_mem_interface #
+     (
+      .READ_WIDTH(WIDTH),
+      .WRITE_WIDTH(WIDTH),
+      .ADDR_WIDTH(32),
+      .SIZE_WIDTH(32)
+    )
+   mem
+     (
+      .clk(clk),
+      .rst(rst),
+      .read_en(read_en),
+      .read_data_out(read_data),
+      .read_data_valid(read_data_valid),
+      .write_en(write_en),
+      .write_data_in(write_data),
+      .write_data_ready(write_data_ready),
+      .host_read_req(read_req),
+      .host_read_addr(read_addr),
+      .host_read_size(read_size),
+      .host_write_req(write_req),
+      .host_write_addr(write_addr),
+      .host_write_size(write_size)
+      );
+
+   initial begin
+      // pass myram to session to hook it up with simulation
+      $tvm_session(clk, mem);
+   end
+endmodule
diff --git a/tests/verilog/test_vpi_mmap.py b/tests/verilog/test_vpi_mmap.py
new file mode 100644
index 000000000000..67072003f241
--- /dev/null
+++ b/tests/verilog/test_vpi_mmap.py
@@ -0,0 +1,56 @@
+import tvm
+import numpy as np
+from tvm.addon import verilog
+
+def test_mmap():
+    n = 10
+    # context for VPI RAM
+    ctx = tvm.vpi(0)
+    a_np = np.arange(n).astype('int8')
+    a = tvm.nd.array(a_np, ctx)
+
+    # head ptr of a
+    a_ptr = int(a.handle[0].data)
+    sess = verilog.session([
+        verilog.find_file("test_vpi_mmap.v"),
+        verilog.find_file("tvm_vpi_mmap.v")
+    ])
+    rst = sess.main.rst
+    read_addr = sess.main.read_addr
+    read_data = sess.main.read_data
+    write_addr = sess.main.write_addr
+    write_data = sess.main.write_data
+    write_en = sess.main.write_en
+    mmap_addr = sess.main.mmap_addr
+
+    # setup memory map.
+    rst.put_int(1)
+    sess.yield_until_posedge()
+    rst.put_int(0)
+    write_en.put_int(0)
+    mmap_addr.put_int(a_ptr)
+    sess.yield_until_posedge()
+
+    # read test
+    for i in range(n):
+        read_addr.put_int(i)
+        sess.yield_until_posedge()
+        # read addr get set this cycle
+        sess.yield_until_posedge()
+        # get the data out
+        assert(read_data.get_int() == i)
+
+    # write test
+    for i in reversed(range(n)):
+        write_addr.put_int(i)
+        write_en.put_int(1)
+        write_data.put_int(i + 1)
+        sess.yield_until_posedge()
+        write_en.put_int(0)
+        sess.yield_until_posedge()
+
+    np.testing.assert_equal(a.asnumpy(), a_np + 1)
+
+
+if __name__ == "__main__":
+    test_mmap()
diff --git a/tests/verilog/test_vpi_mmap.v b/tests/verilog/test_vpi_mmap.v
new file mode 100644
index 000000000000..c922c43b4575
--- /dev/null
+++ b/tests/verilog/test_vpi_mmap.v
@@ -0,0 +1,53 @@
+module main();
+   parameter PER = 10;
+   parameter DATA_WIDTH = 8;
+   parameter ADDR_WIDTH = 8;
+   reg clk;
+   reg rst;
+   // read channels
+   reg [ADDR_WIDTH-1:0] read_addr;
+   wire [DATA_WIDTH-1:0] read_data;
+   // write channels
+   reg [ADDR_WIDTH-1:0]  write_addr;
+   reg [DATA_WIDTH-1:0]  write_data;
+   reg                   write_en;
+   // mmap base
+   reg [31:0]            mmap_addr;
+
+   always begin
+      #(PER/2) clk =~ clk;
+   end
+
+   tvm_vpi_read_mmap #
+     (
+      .DATA_WIDTH(DATA_WIDTH),
+      .ADDR_WIDTH(ADDR_WIDTH)
+      )
+   rmmap
+     (
+      .clk(clk),
+      .rst(rst),
+      .addr(read_addr),
+      .data_out(read_data),
+      .mmap_addr(mmap_addr)
+      );
+
+   tvm_vpi_write_mmap #
+     (
+      .DATA_WIDTH(DATA_WIDTH),
+      .ADDR_WIDTH(ADDR_WIDTH)
+    )
+   wmmap
+     (
+      .clk(clk),
+      .rst(rst),
+      .addr(write_addr),
+      .data_in(write_data),
+      .en(write_en),
+      .mmap_addr(mmap_addr)
+      );
+
+   initial begin
+      $tvm_session(clk, rmmap, wmmap);
+   end
+endmodule
diff --git a/verilog/tvm_vpi.cc b/verilog/tvm_vpi.cc
index d1a41e0bdd2d..3c654b095fb3 100644
--- a/verilog/tvm_vpi.cc
+++ b/verilog/tvm_vpi.cc
@@ -13,8 +13,17 @@
 
 namespace tvm {
 namespace vpi {
+// standard consistency checks
 static_assert(sizeof(vpiHandle) == sizeof(VPIRawHandle),
-              "VPI handle condition");
+              "VPI standard");
+// type codes
+static_assert(vpiModule == kVPIModule, "VPI standard");
+// Property code
+static_assert(vpiType == kVPIType, "VPI standard");
+static_assert(vpiFullName == kVPIFullName, "VPI standard");
+static_assert(vpiSize == kVPISize, "VPI standard");
+static_assert(vpiDefName == kVPIDefName, "VPI standard");
+
 // IPC client for VPI
 class IPCClient {
  public:
@@ -26,8 +35,11 @@ class IPCClient {
     vpiHandle argv = vpi_handle(vpiSysTfCall, 0);
     vpiHandle arg_iter = vpi_iterate(vpiArgument, argv);
     clock_ = vpi_scan(arg_iter);
-    CHECK(vpi_scan(arg_iter) == nullptr)
-        << "tvm_session can only take in one clock";
+    std::vector<VPIRawHandle> handles;
+    while (vpiHandle h = vpi_scan(arg_iter)) {
+      handles.push_back(h);
+    }
+    writer_.Write(handles);
     PutInt(clock_, 0);
   }
   int Callback() {
@@ -74,12 +86,21 @@ class IPCClient {
           writer_.Write(handle);
           break;
         }
-        case kGetName: {
+        case kGetStrProp: {
+          CHECK(reader_.Read(&value));
+          CHECK(reader_.Read(&handle));
+          std::string prop = vpi_get_str(
+              value, static_cast<vpiHandle>(handle));
+          writer_.Write(kSuccess);
+          writer_.Write(prop);
+          break;
+        }
+        case kGetIntProp: {
+          CHECK(reader_.Read(&value));
           CHECK(reader_.Read(&handle));
-          std::string name = vpi_get_str(
-              vpiFullName, static_cast<vpiHandle>(handle));
+          value = vpi_get(value, static_cast<vpiHandle>(handle));
           writer_.Write(kSuccess);
-          writer_.Write(name);
+          writer_.Write(value);
           break;
         }
         case kGetInt32: {
@@ -97,13 +118,6 @@ class IPCClient {
           writer_.Write(kSuccess);
           break;
         }
-        case kGetSize: {
-          CHECK(reader_.Read(&handle));
-          value = vpi_get(vpiSize, static_cast<vpiHandle>(handle));
-          writer_.Write(kSuccess);
-          writer_.Write(value);
-          break;
-        }
         case kGetVec: {
           CHECK(reader_.Read(&handle));
           vpiHandle h = static_cast<vpiHandle>(handle);
@@ -126,17 +140,19 @@ class IPCClient {
           CHECK(reader_.Read(&vec_buf_));
           CHECK(handle != clock_) << "Cannot write to clock";
           vpiHandle h = static_cast<vpiHandle>(handle);
-          size_t nwords = vec_buf_.size();
-          svec_buf_.resize(nwords);
-          reader_.Read(&vec_buf_[0], nwords * sizeof(s_vpi_vecval));
+          svec_buf_.resize(vec_buf_.size());
           for (size_t i = 0; i < vec_buf_.size(); ++i) {
             svec_buf_[i].aval = vec_buf_[i].aval;
             svec_buf_[i].bval = vec_buf_[i].bval;
           }
           s_vpi_value  value_s;
+          s_vpi_time time_s;
+          time_s.type = vpiSimTime;
+          time_s.high = 0;
+          time_s.low  = 0;
           value_s.format = vpiVectorVal;
           value_s.value.vector = &svec_buf_[0];
-          vpi_put_value(h, &value_s, 0, vpiNoDelay);
+          vpi_put_value(h, &value_s, &time_s, vpiInertialDelay);
           writer_.Write(kSuccess);
           break;
         }
@@ -183,9 +199,13 @@ class IPCClient {
   // Put integer into handle.
   static void PutInt(vpiHandle h, int value) {
     s_vpi_value  value_s;
+    s_vpi_time time_s;
+    time_s.type = vpiSimTime;
+    time_s.high = 0;
+    time_s.low  = 0;
     value_s.format = vpiIntVal;
     value_s.value.integer = value;
-    vpi_put_value(h, &value_s, 0, vpiNoDelay);
+    vpi_put_value(h, &value_s, &time_s, vpiInertialDelay);
   }
   // Handles
   vpiHandle clock_;
diff --git a/verilog/tvm_vpi.h b/verilog/tvm_vpi.h
index 5696438f371d..3925e8aedda4 100644
--- a/verilog/tvm_vpi.h
+++ b/verilog/tvm_vpi.h
@@ -12,10 +12,10 @@ namespace vpi {
 enum VPICallCode : int {
   kGetHandleByName,
   kGetHandleByIndex,
-  kGetName,
+  kGetStrProp,
+  kGetIntProp,
   kGetInt32,
   kPutInt32,
-  kGetSize,
   kGetVec,
   kPutVec,
   kYield,
@@ -28,6 +28,19 @@ enum VPIReturnCode : int {
   kFail = 2
 };
 
+// VPI type code as in IEEE standard.
+enum VPITypeCode {
+  kVPIModule = 32
+};
+
+// VPI property code as in IEEE standard.
+enum VPIPropCode {
+  kVPIType = 1,
+  kVPIFullName = 3,
+  kVPISize = 4,
+  kVPIDefName = 9
+};
+
 /*! \brief The vector value used in trasmission */
 struct VPIVecVal {
   int aval;
diff --git a/verilog/tvm_vpi_mem_interface.v b/verilog/tvm_vpi_mem_interface.v
new file mode 100644
index 000000000000..932af8d06280
--- /dev/null
+++ b/verilog/tvm_vpi_mem_interface.v
@@ -0,0 +1,43 @@
+// Memory controller to access TVM VPI simulated RAM.
+//
+// You only see the wires and registers but no logics here.
+// The real computation is implemented via TVM VPI
+//
+// Usage: create and pass instance to additional arguments of $tvm_session.
+// Then  it will be automatically hook up the RAM logic.
+//
+module tvm_vpi_mem_interface
+  #(
+    parameter READ_WIDTH = 8,
+    parameter WRITE_WIDTH = 8,
+    parameter ADDR_WIDTH = 32,
+    parameter SIZE_WIDTH = 32
+    )
+   (
+    input                   clk,
+    input                   rst,
+    // Read Ports
+    input                   read_en, // Read buffer enable
+    output [READ_WIDTH-1:0] read_data_out, // The data port for read
+    output                  read_data_valid, // Read is valid.
+    // Write ports
+    input                   write_en, // Write buffer enable
+    input [WRITE_WIDTH-1:0] write_data_in, // Input data to write.
+    output                  write_data_ready, // There are still pending write
+    // Status port
+    // Control signal ports to issue tasks
+    input                   host_read_req,   // Read request
+    input [ADDR_WIDTH-1:0]  host_read_addr,  // The address to issue a read task
+    input [SIZE_WIDTH-1:0]  host_read_size,  // The size of a read
+    input                   host_write_req,  // Write request.
+    input [ADDR_WIDTH-1:0]  host_write_addr, // The write address
+    input [SIZE_WIDTH-1:0]  host_write_size  // The write size
+    );
+   reg [READ_WIDTH-1:0]    reg_read_data;
+   reg                     reg_read_valid;
+   reg                     reg_write_ready;
+   // The wires up.
+   assign read_data_out = reg_read_data;
+   assign read_data_valid = reg_read_valid;
+   assign write_data_ready = reg_write_ready;
+endmodule
diff --git a/verilog/tvm_vpi_mmap.v b/verilog/tvm_vpi_mmap.v
new file mode 100644
index 000000000000..1f7793954cf5
--- /dev/null
+++ b/verilog/tvm_vpi_mmap.v
@@ -0,0 +1,43 @@
+// TVM mmap maps virtual DRAM into interface of SRAM.
+// This allows create testcases that directly access DRAM.
+
+// Read only memory map, one cycle read.
+// Usage: create and pass instance to additional arguments of $tvm_session.
+module tvm_vpi_read_mmap
+  #(
+    parameter DATA_WIDTH = 8,
+    parameter ADDR_WIDTH = 8,
+    parameter BASE_ADDR_WIDTH = 32
+    )
+   (
+    input                       clk,
+    input                       rst,
+    // Read Ports
+    input [ADDR_WIDTH-1:0]      addr, // Local offset in terms of number of units
+    output [DATA_WIDTH-1:0]     data_out, // The data port for read
+    // Configure port
+    input [BASE_ADDR_WIDTH-1:0] mmap_addr // The base address of memory map.
+    );
+   reg [DATA_WIDTH-1:0]         reg_data;
+   assign data_out = reg_data;
+endmodule
+
+// Write only memory map, one cycle write.
+// Usage: create and pass instance to additional arguments of $tvm_session.
+module tvm_vpi_write_mmap
+  #(
+    parameter DATA_WIDTH = 8,
+    parameter ADDR_WIDTH = 8,
+    parameter BASE_ADDR_WIDTH = 32
+    )
+   (
+    input                       clk,
+    input                       rst,
+    // Write Ports
+    input [ADDR_WIDTH-1:0]      addr, // Local offset in terms of number of units
+    input [DATA_WIDTH-1:0]      data_in, // The data port for write
+    input                       en, // The enable port for write
+    // Configure port
+    input [BASE_ADDR_WIDTH-1:0] mmap_addr // The base address of memap
+    );
+endmodule

From c53243145ce118f3ff66d47501d13436a6cce78b Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Fri, 17 Mar 2017 09:38:42 -0700
Subject: [PATCH 2/2] fix test issues

---
 .travis.yml                             | 2 ++
 src/codegen/verilog/vpi_device_api.cc   | 4 ++--
 tests/verilog/test_vpi_mem_interface.py | 7 ++++---
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index f5a0b4a3c170..37496edeca74 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,6 +6,8 @@ os:
   - linux
   - osx
 
+osx_image: xcode8
+
 env:
   # code analysis
   - TASK=all_test
diff --git a/src/codegen/verilog/vpi_device_api.cc b/src/codegen/verilog/vpi_device_api.cc
index 384156cae200..93ef4563fbe4 100644
--- a/src/codegen/verilog/vpi_device_api.cc
+++ b/src/codegen/verilog/vpi_device_api.cc
@@ -213,14 +213,14 @@ class VPIMemoryInterface {
       out_reg_write_ready_.put_int(0);
     }
     // Control tasks
-    if (ctrl_read_addr_.get_int() != 0) {
+    if (ctrl_read_req_.get_int()) {
       FIFOTask tsk;
       tsk.addr = reinterpret_cast<char*>(ctrl_read_addr_.get_int());
       tsk.size = static_cast<size_t>(ctrl_read_size_.get_int());
       read_tasks_.push(tsk);
     }
     // Control tasks
-    if (ctrl_write_addr_.get_int() != 0) {
+    if (ctrl_write_req_.get_int()) {
       FIFOTask tsk;
       tsk.addr = reinterpret_cast<char*>(ctrl_write_addr_.get_int());
       tsk.size = static_cast<size_t>(ctrl_write_size_.get_int());
diff --git a/tests/verilog/test_vpi_mem_interface.py b/tests/verilog/test_vpi_mem_interface.py
index 4a97061bd64c..32c01bab0e63 100644
--- a/tests/verilog/test_vpi_mem_interface.py
+++ b/tests/verilog/test_vpi_mem_interface.py
@@ -68,10 +68,10 @@ def test_ram_read():
 
     sess.yield_until_posedge()
     host_read_req.put_int(0)
-    read_deq.put_int(1)
+    read_en.put_int(1)
 
     # yield until read is done
-    for i in range(a.shape[0] * 2):
+    for i in range(a.shape[0] * 3):
         sess.yield_until_posedge()
     # check if result matches
     r = np.concatenate((a_np, a_np[2:]))
@@ -86,7 +86,7 @@ def test_ram_write():
     ctx = tvm.vpi(0)
     a_np = np.zeros(n).astype('int8')
     a = tvm.nd.array(a_np, ctx)
-    w_data = range(2, n)
+    w_data = list(range(2, n))
     r_data = np.array(w_data, dtype='int8')
 
     # head ptr of a
@@ -128,4 +128,5 @@ def test_ram_write():
 
 
 if __name__ == "__main__":
+    test_ram_read()
     test_ram_write()