Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion cmake/modules/VTA.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,9 @@ elseif(PYTHON)
${VTA_TARGET} STREQUAL "ultra96")
target_link_libraries(vta ${__cma_lib})
elseif(${VTA_TARGET} STREQUAL "de10nano") # DE10-Nano rules
target_compile_definitions(vta PUBLIC VTA_MAX_XFER=2097152) # (1<<21)
#target_compile_definitions(vta PUBLIC VTA_MAX_XFER=2097152) # (1<<21)
target_include_directories(vta PUBLIC vta/src/de10nano)
target_include_directories(vta PUBLIC 3rdparty)
target_include_directories(vta PUBLIC
"/usr/local/intelFPGA_lite/18.1/embedded/ds-5/sw/gcc/arm-linux-gnueabihf/include")
endif()
Expand Down
4 changes: 2 additions & 2 deletions docs/vta/install.md
Original file line number Diff line number Diff line change
Expand Up @@ -146,8 +146,8 @@ Tips regarding the Pynq RPC Server:
Before running the examples on your development machine, you'll need to configure your host environment as follows:
```bash
# On the Host-side
export VTA_PYNQ_RPC_HOST=192.168.2.99
export VTA_PYNQ_RPC_PORT=9091
export VTA_RPC_HOST=192.168.2.99
export VTA_RPC_PORT=9091
```

In addition, you'll need to edit the `vta_config.json` file on the host to indicate that we are targeting the Pynq platform, by setting the `TARGET` field to `"pynq"`.
Expand Down
2 changes: 1 addition & 1 deletion vta/config/de10nano_sample.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"LOG_BATCH" : 0,
"LOG_BLOCK" : 4,
"LOG_UOP_BUFF_SIZE" : 15,
"LOG_INP_BUFF_SIZE" :15,
"LOG_INP_BUFF_SIZE" : 15,
"LOG_WGT_BUFF_SIZE" : 18,
"LOG_ACC_BUFF_SIZE" : 17
}
2 changes: 1 addition & 1 deletion vta/config/pynq_sample.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"LOG_BATCH" : 0,
"LOG_BLOCK" : 4,
"LOG_UOP_BUFF_SIZE" : 15,
"LOG_INP_BUFF_SIZE" :15,
"LOG_INP_BUFF_SIZE" : 15,
"LOG_WGT_BUFF_SIZE" : 18,
"LOG_ACC_BUFF_SIZE" : 17
}
2 changes: 1 addition & 1 deletion vta/config/ultra96_sample.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"LOG_BATCH" : 0,
"LOG_BLOCK" : 4,
"LOG_UOP_BUFF_SIZE" : 15,
"LOG_INP_BUFF_SIZE" :15,
"LOG_INP_BUFF_SIZE" : 15,
"LOG_WGT_BUFF_SIZE" : 18,
"LOG_ACC_BUFF_SIZE" : 17
}
56 changes: 50 additions & 6 deletions vta/hardware/chisel/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,36 @@ ifeq (, $(VERILATOR_INC_DIR))
endif
endif

CONFIG = DefaultPynqConfig
CONFIG = DefaultDe10Config
TOP = VTA
TOP_TEST = Test
BUILD_NAME = build
# Set USE_TRACE = 1 to generate a trace during simulation.
USE_TRACE = 0
# With USE_TRACE = 1, default trace format is VCD.
# Set USE_TRACE_FST = 1 to use the FST format.
# Note that although FST is around two orders of magnitude smaller than VCD,
# it is also currently much slower to produce (a Verilator limitation). If
# you are low on disk space, it may be your only option.
USE_TRACE_FST = 0
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we need a comment here to let future users know that USE_TRACE defaults to VCD output, and that USE_TRACE_FST has no effect unless USE_TRACE is enabled.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense, although the logic is fairly simple and self-explanatory.
I did not see comments on any of the configuration variables in the Makefile, so I did not want to start adding them.

# With USE_TRACE = 1, USE_TRACE_DETAILED = 1 will generate traces that also
# include non-interface internal signal names starting with an underscore.
# This will significantly increase the trace size and should only be used
# on an as-needed basis for difficult debugging problems.
USE_TRACE_DETAILED = 0
USE_THREADS = $(shell nproc)
VTA_LIBNAME = libvta_hw
UNITTEST_NAME = all
CXX = g++
# A debug build with DEBUG = 1 is useful to trace the simulation with a
# debugger.
DEBUG = 0
# With DEBUG = 1, SANITIZE = 1 turns on address sanitizing to verify that
# the verilator build is sane. Use this only if you know what you are doing.
SANITIZE = 0

CXX_MAJOR := $(shell $(CXX) -dumpversion | sed 's/\..*//')
CXX_HAS_ALIGN_NEW := $(shell [ $(CXX_MAJOR) -ge 7 ] && echo true)

config_test = $(TOP_TEST)$(CONFIG)
vta_dir = $(abspath ../../)
Expand All @@ -61,11 +81,15 @@ verilator_opt += -Mdir ${verilator_build_dir}
verilator_opt += -I$(chisel_build_dir)

ifeq ($(DEBUG), 0)
cxx_flags = -O2 -Wall
cxx_flags = -O2 -Wall -fvisibility=hidden
else
cxx_flags = -O0 -g -Wall
endif
cxx_flags += -fvisibility=hidden -std=c++11

cxx_flags += -std=c++11 -Wno-maybe-uninitialized
ifeq ($(CXX_HAS_ALIGN_NEW),true)
cxx_flags += -faligned-new
endif
cxx_flags += -DVL_TSIM_NAME=V$(TOP_TEST)
cxx_flags += -DVL_PRINTF=printf
cxx_flags += -DVL_USER_FINISH
Expand All @@ -82,13 +106,33 @@ cxx_flags += -I$(tvm_dir)/3rdparty/dlpack/include

ld_flags = -fPIC -shared

ifeq ($(SANITIZE), 1)
ifeq ($(DEBUG), 1)
cxx_flags += -fno-omit-frame-pointer -fsanitize=address -fsanitize-recover=address
ld_flags += -fno-omit-frame-pointer -fsanitize=address -fsanitize-recover=address
endif
endif

cxx_objs = $(verilator_build_dir)/verilated.o $(verilator_build_dir)/verilated_dpi.o $(verilator_build_dir)/tsim_device.o

ifneq ($(USE_TRACE), 0)
verilator_opt += --trace
cxx_flags += -DVM_TRACE=1
cxx_flags += -DTSIM_TRACE_FILE=$(verilator_build_dir)/$(TOP_TEST).vcd
cxx_objs += $(verilator_build_dir)/verilated_vcd_c.o
ifeq ($(USE_TRACE_FST), 1)
cxx_flags += -DVM_TRACE_FST
verilator_opt += --trace-fst
else
verilator_opt += --trace
endif
ifeq ($(USE_TRACE_DETAILED), 1)
verilator_opt += --trace-underscore --trace-structs
endif
ifeq ($(USE_TRACE_FST), 1)
cxx_flags += -DTSIM_TRACE_FILE=$(verilator_build_dir)/$(TOP_TEST).fst
cxx_objs += $(verilator_build_dir)/verilated_fst_c.o
else
cxx_flags += -DTSIM_TRACE_FILE=$(verilator_build_dir)/$(TOP_TEST).vcd
cxx_objs += $(verilator_build_dir)/verilated_vcd_c.o
endif
else
cxx_flags += -DVM_TRACE=0
endif
Expand Down
2 changes: 2 additions & 0 deletions vta/hardware/chisel/src/main/scala/core/Compute.scala
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ class Compute(debug: Boolean = false)(implicit p: Parameters) extends Module {
val wgt = new TensorMaster(tensorType = "wgt")
val out = new TensorMaster(tensorType = "out")
val finish = Output(Bool())
val acc_wr_event = Output(Bool())
})
val sIdle :: sSync :: sExe :: Nil = Enum(3)
val state = RegInit(sIdle)
Expand Down Expand Up @@ -125,6 +126,7 @@ class Compute(debug: Boolean = false)(implicit p: Parameters) extends Module {
tensorAcc.io.tensor.rd.idx <> Mux(dec.io.isGemm, tensorGemm.io.acc.rd.idx, tensorAlu.io.acc.rd.idx)
tensorAcc.io.tensor.wr <> Mux(dec.io.isGemm, tensorGemm.io.acc.wr, tensorAlu.io.acc.wr)
io.vme_rd(1) <> tensorAcc.io.vme_rd
io.acc_wr_event := tensorAcc.io.tensor.wr.valid

// gemm
tensorGemm.io.start := state === sIdle & start & dec.io.isGemm
Expand Down
2 changes: 2 additions & 0 deletions vta/hardware/chisel/src/main/scala/core/Core.scala
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,8 @@ class Core(implicit p: Parameters) extends Module {
ecounters.io.launch := io.vcr.launch
ecounters.io.finish := compute.io.finish
io.vcr.ecnt <> ecounters.io.ecnt
io.vcr.ucnt <> ecounters.io.ucnt
ecounters.io.acc_wr_event := compute.io.acc_wr_event

// Finish instruction is executed and asserts the VCR finish flag
val finish = RegNext(compute.io.finish)
Expand Down
11 changes: 11 additions & 0 deletions vta/hardware/chisel/src/main/scala/core/EventCounters.scala
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ class EventCounters(debug: Boolean = false)(implicit p: Parameters) extends Modu
val launch = Input(Bool())
val finish = Input(Bool())
val ecnt = Vec(vp.nECnt, ValidIO(UInt(vp.regBits.W)))
val ucnt = Vec(vp.nUCnt, ValidIO(UInt(vp.regBits.W)))
val acc_wr_event = Input(Bool())
})
val cycle_cnt = RegInit(0.U(vp.regBits.W))
when(io.launch && !io.finish) {
Expand All @@ -53,4 +55,13 @@ class EventCounters(debug: Boolean = false)(implicit p: Parameters) extends Modu
}
io.ecnt(0).valid := io.finish
io.ecnt(0).bits := cycle_cnt

val acc_wr_count = Reg(UInt(vp.regBits.W))
when (!io.launch || io.finish) {
acc_wr_count := 0.U
}.elsewhen (io.acc_wr_event) {
acc_wr_count := acc_wr_count + 1.U
}
io.ucnt(0).valid := io.finish
io.ucnt(0).bits := acc_wr_count
}
20 changes: 11 additions & 9 deletions vta/hardware/chisel/src/main/scala/core/LoadUop.scala
Original file line number Diff line number Diff line change
Expand Up @@ -112,14 +112,18 @@ class LoadUop(debug: Boolean = false)(implicit p: Parameters) extends Module {
when(xcnt === xlen) {
when(xrem === 0.U) {
state := sIdle
}.elsewhen(xrem < xmax) {
state := sReadCmd
xlen := xrem
xrem := 0.U
}.otherwise {
state := sReadCmd
xlen := xmax - 1.U
xrem := xrem - xmax
raddr := raddr + xmax_bytes
when(xrem < xmax) {
state := sReadCmd
xlen := xrem
xrem := 0.U
}
.otherwise {
state := sReadCmd
xlen := xmax - 1.U
xrem := xrem - xmax
}
}
}
}
Expand All @@ -134,8 +138,6 @@ class LoadUop(debug: Boolean = false)(implicit p: Parameters) extends Module {
}.otherwise {
raddr := (io.baddr | (maskOffset & (dec.dram_offset << log2Ceil(uopBytes)))) - uopBytes.U
}
}.elsewhen(state === sReadData && xcnt === xlen && xrem =/= 0.U) {
raddr := raddr + xmax_bytes
}

io.vme_rd.cmd.valid := state === sReadCmd
Expand Down
1 change: 0 additions & 1 deletion vta/hardware/chisel/src/main/scala/core/TensorAlu.scala
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ class AluReg(implicit p: Parameters) extends Module {

/** Vector of pipeline ALUs */
class AluVector(implicit p: Parameters) extends Module {
val aluBits = p(CoreKey).accBits
val io = IO(new Bundle {
val opcode = Input(UInt(C_ALU_OP_BITS.W))
val acc_a = new TensorMasterData(tensorType = "acc")
Expand Down
20 changes: 9 additions & 11 deletions vta/hardware/chisel/src/main/scala/core/TensorLoad.scala
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,7 @@ class TensorLoad(tensorType: String = "none", debug: Boolean = false)(
state := sXPad1
}.elsewhen(dec.ypad_1 =/= 0.U) {
state := sYPad1
}
.otherwise {
}.otherwise {
state := sIdle
}
}.elsewhen(dataCtrl.io.stride) {
Expand Down Expand Up @@ -198,11 +197,9 @@ class TensorLoad(tensorType: String = "none", debug: Boolean = false)(
tag := tag + 1.U
}

when(
state === sIdle || dataCtrlDone || (set === (tp.tensorLength - 1).U && tag === (tp.numMemBlock - 1).U)) {
when(state === sIdle || dataCtrlDone || (set === (tp.tensorLength - 1).U && tag === (tp.numMemBlock - 1).U)) {
set := 0.U
}.elsewhen(
(io.vme_rd.data.fire() || isZeroPad) && tag === (tp.numMemBlock - 1).U) {
}.elsewhen((io.vme_rd.data.fire() || isZeroPad) && tag === (tp.numMemBlock - 1).U) {
set := set + 1.U
}

Expand All @@ -211,10 +208,12 @@ class TensorLoad(tensorType: String = "none", debug: Boolean = false)(
when(state === sIdle) {
waddr_cur := dec.sram_offset
waddr_nxt := dec.sram_offset
}.elsewhen((io.vme_rd.data
.fire() || isZeroPad) && set === (tp.tensorLength - 1).U && tag === (tp.numMemBlock - 1).U) {
}.elsewhen((io.vme_rd.data.fire() || isZeroPad)
&& set === (tp.tensorLength - 1).U
&& tag === (tp.numMemBlock - 1).U)
{
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the linter may ask you to move the opening brace up to the previous line.

waddr_cur := waddr_cur + 1.U
}.elsewhen(dataCtrl.io.stride) {
}.elsewhen(dataCtrl.io.stride && io.vme_rd.data.fire()) {
waddr_cur := waddr_nxt + dec.xsize
waddr_nxt := waddr_nxt + dec.xsize
}
Expand Down Expand Up @@ -261,8 +260,7 @@ class TensorLoad(tensorType: String = "none", debug: Boolean = false)(
}

// done
val done_no_pad = io.vme_rd.data
.fire() & dataCtrl.io.done & dec.xpad_1 === 0.U & dec.ypad_1 === 0.U
val done_no_pad = io.vme_rd.data.fire() & dataCtrl.io.done & dec.xpad_1 === 0.U & dec.ypad_1 === 0.U
val done_x_pad = state === sXPad1 & xPadCtrl1.io.done & dataCtrlDone & dec.ypad_1 === 0.U
val done_y_pad = state === sYPad1 & dataCtrlDone & yPadCtrl1.io.done
io.done := done_no_pad | done_x_pad | done_y_pad
Expand Down
64 changes: 41 additions & 23 deletions vta/hardware/chisel/src/main/scala/core/TensorStore.scala
Original file line number Diff line number Diff line change
Expand Up @@ -62,20 +62,38 @@ class TensorStore(tensorType: String = "none", debug: Boolean = false)(
val tag = Reg(UInt(8.W))
val set = Reg(UInt(8.W))

val xfer_bytes = Reg(chiselTypeOf(io.vme_wr.cmd.bits.addr))
val xstride_bytes = dec.xstride << log2Ceil(tensorLength * tensorWidth)
val maskOffset = VecInit(Seq.fill(M_DRAM_OFFSET_BITS)(true.B)).asUInt
val elemBytes = (p(CoreKey).batch * p(CoreKey).blockOut * p(CoreKey).outBits) / 8
val pulse_bytes_bits = log2Ceil(mp.dataBits >> 3)

val xfer_init_addr = io.baddr | (maskOffset & (dec.dram_offset << log2Ceil(elemBytes)))
val xfer_split_addr = waddr_cur + xfer_bytes
val xfer_stride_addr = waddr_nxt + xstride_bytes

val xfer_init_bytes = xmax_bytes - xfer_init_addr % xmax_bytes
val xfer_init_pulses = xfer_init_bytes >> pulse_bytes_bits
val xfer_split_bytes = xmax_bytes - xfer_split_addr % xmax_bytes
val xfer_split_pulses = xfer_split_bytes >> pulse_bytes_bits
val xfer_stride_bytes = xmax_bytes - xfer_stride_addr % xmax_bytes
val xfer_stride_pulses= xfer_stride_bytes >> pulse_bytes_bits

val sIdle :: sWriteCmd :: sWriteData :: sReadMem :: sWriteAck :: Nil = Enum(5)
val state = RegInit(sIdle)

// control
switch(state) {
is(sIdle) {
when(io.start) {
xfer_bytes := xfer_init_bytes
when (io.start) {
state := sWriteCmd
when(xsize < xmax) {
when (xsize < xfer_init_pulses) {
xlen := xsize
xrem := 0.U
}.otherwise {
xlen := xmax - 1.U
xrem := xsize - xmax
xlen := xfer_init_pulses - 1.U
xrem := xsize - xfer_init_pulses
}
}
}
Expand All @@ -101,24 +119,29 @@ class TensorStore(tensorType: String = "none", debug: Boolean = false)(
when(xrem === 0.U) {
when(ycnt === ysize - 1.U) {
state := sIdle
}.otherwise {
}.otherwise { // stride
state := sWriteCmd
when(xsize < xmax) {
xfer_bytes := xfer_stride_bytes
when(xsize < xfer_stride_pulses) {
xlen := xsize
xrem := 0.U
}.otherwise {
xlen := xmax - 1.U
xrem := xsize - xmax
xlen := xfer_stride_pulses - 1.U
xrem := xsize - xfer_stride_pulses
}
}
}.elsewhen(xrem < xmax) {
} // split
.elsewhen(xrem < xfer_split_pulses) {
state := sWriteCmd
xfer_bytes := xfer_split_bytes
xlen := xrem
xrem := 0.U
}.otherwise {
}
.otherwise {
state := sWriteCmd
xlen := xmax - 1.U
xrem := xrem - xmax
xfer_bytes := xfer_split_bytes
xlen := xfer_split_pulses - 1.U
xrem := xrem - xfer_split_pulses
}
}
}
Expand Down Expand Up @@ -174,8 +197,7 @@ class TensorStore(tensorType: String = "none", debug: Boolean = false)(
when(state === sIdle) {
raddr_cur := dec.sram_offset
raddr_nxt := dec.sram_offset
}.elsewhen(io.vme_wr.data
.fire() && set === (tensorLength - 1).U && tag === (numMemBlock - 1).U) {
}.elsewhen(io.vme_wr.data.fire() && set === (tensorLength - 1).U && tag === (numMemBlock - 1).U) {
raddr_cur := raddr_cur + 1.U
}.elsewhen(stride) {
raddr_cur := raddr_nxt + dec.xsize
Expand All @@ -189,18 +211,14 @@ class TensorStore(tensorType: String = "none", debug: Boolean = false)(
val mdata = MuxLookup(set, 0.U.asTypeOf(chiselTypeOf(wdata_t)), tread)

// write-to-dram
val maskOffset = VecInit(Seq.fill(M_DRAM_OFFSET_BITS)(true.B)).asUInt
val elemBytes = (p(CoreKey).batch * p(CoreKey).blockOut * p(CoreKey).outBits) / 8
when(state === sIdle) {
waddr_cur := io.baddr | (maskOffset & (dec.dram_offset << log2Ceil(
elemBytes)))
waddr_nxt := io.baddr | (maskOffset & (dec.dram_offset << log2Ceil(
elemBytes)))
waddr_cur := xfer_init_addr
waddr_nxt := xfer_init_addr
}.elsewhen(state === sWriteAck && io.vme_wr.ack && xrem =/= 0.U) {
waddr_cur := waddr_cur + xmax_bytes
waddr_cur := xfer_split_addr
}.elsewhen(stride) {
waddr_cur := waddr_nxt + (dec.xstride << log2Ceil(tensorLength * tensorWidth))
waddr_nxt := waddr_nxt + (dec.xstride << log2Ceil(tensorLength * tensorWidth))
waddr_cur := xfer_stride_addr
waddr_nxt := xfer_stride_addr
}

io.vme_wr.cmd.valid := state === sWriteCmd
Expand Down
Loading