apache · tmoreau89 · Mar 9, 2020 · Mar 2, 2020 · Mar 2, 2020 · Mar 2, 2020
diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake
@@ -101,7 +101,9 @@ elseif(PYTHON)
        ${VTA_TARGET} STREQUAL "ultra96")
       target_link_libraries(vta ${__cma_lib})
     elseif(${VTA_TARGET} STREQUAL "de10nano")  # DE10-Nano rules
-      target_compile_definitions(vta PUBLIC VTA_MAX_XFER=2097152) # (1<<21)
+      # Disabled (TODO: remove if no longer needed): target_compile_definitions(vta PUBLIC VTA_MAX_XFER=2097152) # (1<<21)
+      target_include_directories(vta PUBLIC vta/src/de10nano)
+      target_include_directories(vta PUBLIC 3rdparty)
       target_include_directories(vta PUBLIC
         "/usr/local/intelFPGA_lite/18.1/embedded/ds-5/sw/gcc/arm-linux-gnueabihf/include")
     endif()

diff --git a/docs/vta/install.md b/docs/vta/install.md
@@ -146,8 +146,8 @@ Tips regarding the Pynq RPC Server:
 Before running the examples on your development machine, you'll need to configure your host environment as follows:
 ```bash
 # On the Host-side
-export VTA_PYNQ_RPC_HOST=192.168.2.99
-export VTA_PYNQ_RPC_PORT=9091
+export VTA_RPC_HOST=192.168.2.99
+export VTA_RPC_PORT=9091
 ```
 
 In addition, you'll need to edit the `vta_config.json` file on the host to indicate that we are targeting the Pynq platform, by setting the `TARGET` field to `"pynq"`.

diff --git a/vta/config/de10nano_sample.json b/vta/config/de10nano_sample.json
@@ -7,7 +7,7 @@
   "LOG_BATCH" : 0,
   "LOG_BLOCK" : 4,
   "LOG_UOP_BUFF_SIZE" : 15,
-  "LOG_INP_BUFF_SIZE" :15,
+  "LOG_INP_BUFF_SIZE" : 15,
   "LOG_WGT_BUFF_SIZE" : 18,
   "LOG_ACC_BUFF_SIZE" : 17
 }
diff --git a/vta/config/pynq_sample.json b/vta/config/pynq_sample.json
@@ -7,7 +7,7 @@
   "LOG_BATCH" : 0,
   "LOG_BLOCK" : 4,
   "LOG_UOP_BUFF_SIZE" : 15,
-  "LOG_INP_BUFF_SIZE" :15,
+  "LOG_INP_BUFF_SIZE" : 15,
   "LOG_WGT_BUFF_SIZE" : 18,
   "LOG_ACC_BUFF_SIZE" : 17
 }
diff --git a/vta/config/ultra96_sample.json b/vta/config/ultra96_sample.json
@@ -7,7 +7,7 @@
   "LOG_BATCH" : 0,
   "LOG_BLOCK" : 4,
   "LOG_UOP_BUFF_SIZE" : 15,
-  "LOG_INP_BUFF_SIZE" :15,
+  "LOG_INP_BUFF_SIZE" : 15,
   "LOG_WGT_BUFF_SIZE" : 18,
   "LOG_ACC_BUFF_SIZE" : 17
 }
diff --git a/vta/hardware/chisel/Makefile b/vta/hardware/chisel/Makefile
@@ -32,16 +32,36 @@ ifeq (, $(VERILATOR_INC_DIR))
   endif
 endif
 
-CONFIG = DefaultPynqConfig
+CONFIG = DefaultDe10Config
 TOP = VTA
 TOP_TEST = Test
 BUILD_NAME = build
+# Set USE_TRACE = 1 to generate a trace during simulation.
 USE_TRACE = 0
+# With USE_TRACE = 1, default trace format is VCD.
+# Set USE_TRACE_FST = 1 to use the FST format.
+# Note that although FST is around two orders of magnitude smaller than VCD,
+# it is also currently much slower to produce (Verilator limitation). If you
+# are low on disk space, however, it may be your only option.
+USE_TRACE_FST = 0
+# With USE_TRACE = 1, setting USE_TRACE_DETAILED = 1 generates traces that also
+# include non-interface internal signal names starting with an underscore.
+# This significantly increases the trace size and should only be used
+# as needed for difficult debugging problems.
+USE_TRACE_DETAILED = 0
 USE_THREADS = $(shell nproc)
 VTA_LIBNAME = libvta_hw
 UNITTEST_NAME = all
 CXX = g++
+# A debug build with DEBUG = 1 is useful to trace the simulation with a
+# debugger.
 DEBUG = 0
+# With DEBUG = 1, SANITIZE = 1 turns on address sanitizing to verify that
+# the Verilator build is sane. Use only if you know what you are doing.
+SANITIZE = 0
+
+CXX_MAJOR := $(shell $(CXX) -dumpversion | sed 's/\..*//')
+CXX_HAS_ALIGN_NEW := $(shell [ $(CXX_MAJOR) -ge 7 ] && echo true)
 
 config_test = $(TOP_TEST)$(CONFIG)
 vta_dir = $(abspath ../../)
@@ -61,11 +81,15 @@ verilator_opt += -Mdir ${verilator_build_dir}
 verilator_opt += -I$(chisel_build_dir)
 
 ifeq ($(DEBUG), 0)
-  cxx_flags = -O2 -Wall
+  cxx_flags = -O2 -Wall -fvisibility=hidden
 else
   cxx_flags = -O0 -g -Wall
 endif
-cxx_flags += -fvisibility=hidden -std=c++11
+
+cxx_flags += -std=c++11 -Wno-maybe-uninitialized
+ifeq ($(CXX_HAS_ALIGN_NEW),true)
+  cxx_flags += -faligned-new
+endif
 cxx_flags += -DVL_TSIM_NAME=V$(TOP_TEST)
 cxx_flags += -DVL_PRINTF=printf
 cxx_flags += -DVL_USER_FINISH
@@ -82,13 +106,33 @@ cxx_flags += -I$(tvm_dir)/3rdparty/dlpack/include
 
 ld_flags = -fPIC -shared
 
+ifeq ($(SANITIZE), 1)
+  ifeq ($(DEBUG), 1)
+    cxx_flags += -fno-omit-frame-pointer -fsanitize=address -fsanitize-recover=address
+     ld_flags += -fno-omit-frame-pointer -fsanitize=address -fsanitize-recover=address
+  endif
+endif
+
 cxx_objs = $(verilator_build_dir)/verilated.o $(verilator_build_dir)/verilated_dpi.o $(verilator_build_dir)/tsim_device.o
 
 ifneq ($(USE_TRACE), 0)
-  verilator_opt += --trace
   cxx_flags += -DVM_TRACE=1
-  cxx_flags += -DTSIM_TRACE_FILE=$(verilator_build_dir)/$(TOP_TEST).vcd
-  cxx_objs += $(verilator_build_dir)/verilated_vcd_c.o
+  ifeq ($(USE_TRACE_FST), 1)
+    cxx_flags += -DVM_TRACE_FST
+    verilator_opt += --trace-fst
+  else
+    verilator_opt += --trace
+  endif
+  ifeq ($(USE_TRACE_DETAILED), 1)
+    verilator_opt += --trace-underscore --trace-structs
+  endif
+  ifeq ($(USE_TRACE_FST), 1)
+    cxx_flags += -DTSIM_TRACE_FILE=$(verilator_build_dir)/$(TOP_TEST).fst
+    cxx_objs += $(verilator_build_dir)/verilated_fst_c.o
+  else
+    cxx_flags += -DTSIM_TRACE_FILE=$(verilator_build_dir)/$(TOP_TEST).vcd
+    cxx_objs += $(verilator_build_dir)/verilated_vcd_c.o
+  endif
 else
   cxx_flags += -DVM_TRACE=0
 endif

diff --git a/vta/hardware/chisel/src/main/scala/core/Compute.scala b/vta/hardware/chisel/src/main/scala/core/Compute.scala
@@ -45,6 +45,7 @@ class Compute(debug: Boolean = false)(implicit p: Parameters) extends Module {
     val wgt = new TensorMaster(tensorType = "wgt")
     val out = new TensorMaster(tensorType = "out")
     val finish = Output(Bool())
+    val acc_wr_event = Output(Bool())
   })
   val sIdle :: sSync :: sExe :: Nil = Enum(3)
   val state = RegInit(sIdle)
@@ -125,6 +126,7 @@ class Compute(debug: Boolean = false)(implicit p: Parameters) extends Module {
   tensorAcc.io.tensor.rd.idx <> Mux(dec.io.isGemm, tensorGemm.io.acc.rd.idx, tensorAlu.io.acc.rd.idx)
   tensorAcc.io.tensor.wr <> Mux(dec.io.isGemm, tensorGemm.io.acc.wr, tensorAlu.io.acc.wr)
   io.vme_rd(1) <> tensorAcc.io.vme_rd
+  io.acc_wr_event := tensorAcc.io.tensor.wr.valid
 
   // gemm
   tensorGemm.io.start := state === sIdle & start & dec.io.isGemm

diff --git a/vta/hardware/chisel/src/main/scala/core/Core.scala b/vta/hardware/chisel/src/main/scala/core/Core.scala
@@ -111,6 +111,8 @@ class Core(implicit p: Parameters) extends Module {
   ecounters.io.launch := io.vcr.launch
   ecounters.io.finish := compute.io.finish
   io.vcr.ecnt <> ecounters.io.ecnt
+  io.vcr.ucnt <> ecounters.io.ucnt
+  ecounters.io.acc_wr_event := compute.io.acc_wr_event
 
   // Finish instruction is executed and asserts the VCR finish flag
   val finish = RegNext(compute.io.finish)

diff --git a/vta/hardware/chisel/src/main/scala/core/EventCounters.scala b/vta/hardware/chisel/src/main/scala/core/EventCounters.scala
@@ -44,6 +44,8 @@ class EventCounters(debug: Boolean = false)(implicit p: Parameters) extends Modu
     val launch = Input(Bool())
     val finish = Input(Bool())
     val ecnt = Vec(vp.nECnt, ValidIO(UInt(vp.regBits.W)))
+    val ucnt = Vec(vp.nUCnt, ValidIO(UInt(vp.regBits.W)))
+    val acc_wr_event = Input(Bool())
   })
   val cycle_cnt = RegInit(0.U(vp.regBits.W))
   when(io.launch && !io.finish) {
@@ -53,4 +55,13 @@ class EventCounters(debug: Boolean = false)(implicit p: Parameters) extends Modu
   }
   io.ecnt(0).valid := io.finish
   io.ecnt(0).bits := cycle_cnt
+
+  val acc_wr_count = Reg(UInt(vp.regBits.W)) // no reset value (Reg, not RegInit); cleared by the branch below
+  when (!io.launch || io.finish) { // launch low, or finish asserted: clear the count
+    acc_wr_count := 0.U
+  }.elsewhen (io.acc_wr_event) { // otherwise add one whenever the event input is high
+    acc_wr_count := acc_wr_count + 1.U
+  }
+  io.ucnt(0).valid := io.finish // flagged valid under the same condition as ecnt(0)
+  io.ucnt(0).bits := acc_wr_count // value held before the finish-triggered clear takes effect
 }
diff --git a/vta/hardware/chisel/src/main/scala/core/LoadUop.scala b/vta/hardware/chisel/src/main/scala/core/LoadUop.scala
@@ -112,14 +112,18 @@ class LoadUop(debug: Boolean = false)(implicit p: Parameters) extends Module {
         when(xcnt === xlen) {
           when(xrem === 0.U) {
             state := sIdle
-          }.elsewhen(xrem < xmax) {
-            state := sReadCmd
-            xlen := xrem
-            xrem := 0.U
           }.otherwise {
-            state := sReadCmd
-            xlen := xmax - 1.U
-            xrem := xrem - xmax
+            raddr := raddr + xmax_bytes
+            when(xrem < xmax) {
+              state := sReadCmd
+              xlen := xrem
+              xrem := 0.U
+            }
+            .otherwise {
+              state := sReadCmd
+              xlen := xmax - 1.U
+              xrem := xrem - xmax
+            }
           }
         }
       }
@@ -134,8 +138,6 @@ class LoadUop(debug: Boolean = false)(implicit p: Parameters) extends Module {
     }.otherwise {
       raddr := (io.baddr | (maskOffset & (dec.dram_offset << log2Ceil(uopBytes)))) - uopBytes.U
     }
-  }.elsewhen(state === sReadData && xcnt === xlen && xrem =/= 0.U) {
-    raddr := raddr + xmax_bytes
   }
 
   io.vme_rd.cmd.valid := state === sReadCmd

diff --git a/vta/hardware/chisel/src/main/scala/core/TensorAlu.scala b/vta/hardware/chisel/src/main/scala/core/TensorAlu.scala
@@ -72,7 +72,6 @@ class AluReg(implicit p: Parameters) extends Module {
 
 /** Vector of pipeline ALUs */
 class AluVector(implicit p: Parameters) extends Module {
-  val aluBits = p(CoreKey).accBits
   val io = IO(new Bundle {
     val opcode = Input(UInt(C_ALU_OP_BITS.W))
     val acc_a = new TensorMasterData(tensorType = "acc")

diff --git a/vta/hardware/chisel/src/main/scala/core/TensorLoad.scala b/vta/hardware/chisel/src/main/scala/core/TensorLoad.scala
@@ -103,8 +103,7 @@ class TensorLoad(tensorType: String = "none", debug: Boolean = false)(
             state := sXPad1
           }.elsewhen(dec.ypad_1 =/= 0.U) {
             state := sYPad1
-          }
-          .otherwise {
+          }.otherwise {
             state := sIdle
           }
         }.elsewhen(dataCtrl.io.stride) {
@@ -198,11 +197,9 @@ class TensorLoad(tensorType: String = "none", debug: Boolean = false)(
     tag := tag + 1.U
   }
 
-  when(
-    state === sIdle || dataCtrlDone || (set === (tp.tensorLength - 1).U && tag === (tp.numMemBlock - 1).U)) {
+  when(state === sIdle || dataCtrlDone || (set === (tp.tensorLength - 1).U && tag === (tp.numMemBlock - 1).U)) {
     set := 0.U
-  }.elsewhen(
-    (io.vme_rd.data.fire() || isZeroPad) && tag === (tp.numMemBlock - 1).U) {
+  }.elsewhen((io.vme_rd.data.fire() || isZeroPad) && tag === (tp.numMemBlock - 1).U) {
     set := set + 1.U
   }
 
@@ -211,10 +208,12 @@ class TensorLoad(tensorType: String = "none", debug: Boolean = false)(
   when(state === sIdle) {
     waddr_cur := dec.sram_offset
     waddr_nxt := dec.sram_offset
-  }.elsewhen((io.vme_rd.data
-    .fire() || isZeroPad) && set === (tp.tensorLength - 1).U && tag === (tp.numMemBlock - 1).U) {
+  }.elsewhen((io.vme_rd.data.fire() || isZeroPad)
+    && set === (tp.tensorLength - 1).U
+    && tag === (tp.numMemBlock - 1).U)
+  {
     waddr_cur := waddr_cur + 1.U
-  }.elsewhen(dataCtrl.io.stride) {
+  }.elsewhen(dataCtrl.io.stride && io.vme_rd.data.fire()) {
     waddr_cur := waddr_nxt + dec.xsize
     waddr_nxt := waddr_nxt + dec.xsize
   }
@@ -261,8 +260,7 @@ class TensorLoad(tensorType: String = "none", debug: Boolean = false)(
   }
 
   // done
-  val done_no_pad = io.vme_rd.data
-    .fire() & dataCtrl.io.done & dec.xpad_1 === 0.U & dec.ypad_1 === 0.U
+  val done_no_pad = io.vme_rd.data.fire() & dataCtrl.io.done & dec.xpad_1 === 0.U & dec.ypad_1 === 0.U
   val done_x_pad = state === sXPad1 & xPadCtrl1.io.done & dataCtrlDone & dec.ypad_1 === 0.U
   val done_y_pad = state === sYPad1 & dataCtrlDone & yPadCtrl1.io.done
   io.done := done_no_pad | done_x_pad | done_y_pad

diff --git a/vta/hardware/chisel/src/main/scala/core/TensorStore.scala b/vta/hardware/chisel/src/main/scala/core/TensorStore.scala
@@ -62,20 +62,38 @@ class TensorStore(tensorType: String = "none", debug: Boolean = false)(
   val tag = Reg(UInt(8.W))
   val set = Reg(UInt(8.W))
 
+  val xfer_bytes = Reg(chiselTypeOf(io.vme_wr.cmd.bits.addr)) // size added to waddr_cur for a split
+  val xstride_bytes = dec.xstride << log2Ceil(tensorLength * tensorWidth) // address step per stride
+  val maskOffset = VecInit(Seq.fill(M_DRAM_OFFSET_BITS)(true.B)).asUInt // all-ones, M_DRAM_OFFSET_BITS wide
+  val elemBytes = (p(CoreKey).batch * p(CoreKey).blockOut * p(CoreKey).outBits) / 8 // bits / 8 = bytes
+  val pulse_bytes_bits = log2Ceil(mp.dataBits >> 3) // log2 of the bytes in one mp.dataBits-wide word
+
+  val xfer_init_addr = io.baddr | (maskOffset & (dec.dram_offset << log2Ceil(elemBytes))) // first address
+  val xfer_split_addr = waddr_cur + xfer_bytes // address right after the current transfer
+  val xfer_stride_addr = waddr_nxt + xstride_bytes // address after one stride step
+
+  val xfer_init_bytes   = xmax_bytes - xfer_init_addr % xmax_bytes // bytes up to next xmax_bytes multiple
+  val xfer_init_pulses  = xfer_init_bytes >> pulse_bytes_bits // same span counted in data words
+  val xfer_split_bytes  = xmax_bytes - xfer_split_addr % xmax_bytes // equals xmax_bytes when aligned
+  val xfer_split_pulses = xfer_split_bytes >> pulse_bytes_bits // same span counted in data words
+  val xfer_stride_bytes = xmax_bytes - xfer_stride_addr % xmax_bytes // bytes up to next xmax_bytes multiple
+  val xfer_stride_pulses= xfer_stride_bytes >> pulse_bytes_bits // same span counted in data words
+
   val sIdle :: sWriteCmd :: sWriteData :: sReadMem :: sWriteAck :: Nil = Enum(5)
   val state = RegInit(sIdle)
 
   // control
   switch(state) {
     is(sIdle) {
-      when(io.start) {
+      xfer_bytes := xfer_init_bytes
+      when (io.start) {
         state := sWriteCmd
-        when(xsize < xmax) {
+        when (xsize < xfer_init_pulses) {
           xlen := xsize
           xrem := 0.U
         }.otherwise {
-          xlen := xmax - 1.U
-          xrem := xsize - xmax
+          xlen := xfer_init_pulses - 1.U
+          xrem := xsize - xfer_init_pulses
         }
       }
     }
@@ -101,24 +119,29 @@ class TensorStore(tensorType: String = "none", debug: Boolean = false)(
         when(xrem === 0.U) {
           when(ycnt === ysize - 1.U) {
             state := sIdle
-          }.otherwise {
+          }.otherwise { // stride
             state := sWriteCmd
-            when(xsize < xmax) {
+            xfer_bytes := xfer_stride_bytes
+            when(xsize < xfer_stride_pulses) {
               xlen := xsize
               xrem := 0.U
             }.otherwise {
-              xlen := xmax - 1.U
-              xrem := xsize - xmax
+              xlen := xfer_stride_pulses - 1.U
+              xrem := xsize - xfer_stride_pulses
             }
           }
-        }.elsewhen(xrem < xmax) {
+        } // split
+        .elsewhen(xrem < xfer_split_pulses) {
           state := sWriteCmd
+          xfer_bytes := xfer_split_bytes
           xlen := xrem
           xrem := 0.U
-        }.otherwise {
+        }
+        .otherwise {
           state := sWriteCmd
-          xlen := xmax - 1.U
-          xrem := xrem - xmax
+          xfer_bytes := xfer_split_bytes
+          xlen := xfer_split_pulses - 1.U
+          xrem := xrem - xfer_split_pulses
         }
       }
     }
@@ -174,8 +197,7 @@ class TensorStore(tensorType: String = "none", debug: Boolean = false)(
   when(state === sIdle) {
     raddr_cur := dec.sram_offset
     raddr_nxt := dec.sram_offset
-  }.elsewhen(io.vme_wr.data
-    .fire() && set === (tensorLength - 1).U && tag === (numMemBlock - 1).U) {
+  }.elsewhen(io.vme_wr.data.fire() && set === (tensorLength - 1).U && tag === (numMemBlock - 1).U) {
     raddr_cur := raddr_cur + 1.U
   }.elsewhen(stride) {
     raddr_cur := raddr_nxt + dec.xsize
@@ -189,18 +211,14 @@ class TensorStore(tensorType: String = "none", debug: Boolean = false)(
   val mdata = MuxLookup(set, 0.U.asTypeOf(chiselTypeOf(wdata_t)), tread)
 
   // write-to-dram
-  val maskOffset = VecInit(Seq.fill(M_DRAM_OFFSET_BITS)(true.B)).asUInt
-  val elemBytes = (p(CoreKey).batch * p(CoreKey).blockOut * p(CoreKey).outBits) / 8
   when(state === sIdle) {
-    waddr_cur := io.baddr | (maskOffset & (dec.dram_offset << log2Ceil(
-      elemBytes)))
-    waddr_nxt := io.baddr | (maskOffset & (dec.dram_offset << log2Ceil(
-      elemBytes)))
+    waddr_cur := xfer_init_addr
+    waddr_nxt := xfer_init_addr
   }.elsewhen(state === sWriteAck && io.vme_wr.ack && xrem =/= 0.U) {
-    waddr_cur := waddr_cur + xmax_bytes
+    waddr_cur := xfer_split_addr
   }.elsewhen(stride) {
-    waddr_cur := waddr_nxt + (dec.xstride << log2Ceil(tensorLength * tensorWidth))
-    waddr_nxt := waddr_nxt + (dec.xstride << log2Ceil(tensorLength * tensorWidth))
+    waddr_cur := xfer_stride_addr
+    waddr_nxt := xfer_stride_addr
   }
 
   io.vme_wr.cmd.valid := state === sWriteCmd