diff --git a/Bender.lock b/Bender.lock index 1d89adb..f25ca40 100644 --- a/Bender.lock +++ b/Bender.lock @@ -79,10 +79,10 @@ packages: - redundancy_cells - register_interface hwpe-ctrl: - revision: 0e95510c0f4d43452d21b7723d766ae92e45c101 + revision: null version: null source: - Git: https://github.com/pulp-platform/hwpe-ctrl.git + Path: working_dir/hwpe-ctrl dependencies: - tech_cells_generic hwpe-stream: diff --git a/Bender.yml b/Bender.yml index 53809e5..87407fe 100644 --- a/Bender.yml +++ b/Bender.yml @@ -16,8 +16,8 @@ dependencies: cv32e40x : { git: "https://github.com/pulp-platform/cv32e40x.git" , rev: "redmule-v1.0" } ibex : { git: "https://github.com/pulp-platform/ibex.git" , rev: pulpissimo-v6.1.2 } hwpe-stream : { git: "https://github.com/pulp-platform/hwpe-stream.git" , version: 1.9.2 } - hwpe-ctrl : { git: "https://github.com/pulp-platform/hwpe-ctrl.git" , rev: 0e95510c0f4d43452d21b7723d766ae92e45c101 } # branch: yt/task-interfaces - hci : { git: "https://github.com/pulp-platform/hci.git" , version: 2.2.0 } + hwpe-ctrl : { git: "https://github.com/pulp-platform/hwpe-ctrl.git" , version: 3.0.0 } + hci : { git: "https://github.com/pulp-platform/hci.git" , version: 2.3.0 } fpnew : { git: "https://github.com/pulp-platform/cvfpu.git" , rev: "pulp-v0.1.3" } common_cells : { git: "https://github.com/pulp-platform/common_cells.git" , version: 1.21.0 } tech_cells_generic: { git: "https://github.com/pulp-platform/tech_cells_generic.git", version: 0.2.11 } @@ -48,6 +48,9 @@ sources: - rtl/redmule_memory_scheduler.sv - rtl/redmule_mux.sv - rtl/redmule_inst_decoder.sv + - rtl/ctrl/regif/redmule_regif_pkg.sv + - rtl/ctrl/regif/redmule_regif.sv + - rtl/ctrl/redmule_target_decoder.sv - target: any(redmule_test_complex, redmule_test_hwpe) files: diff --git a/rtl/ctrl/gen_regif.sh b/rtl/ctrl/gen_regif.sh new file mode 100755 index 0000000..b5824a6 --- /dev/null +++ b/rtl/ctrl/gen_regif.sh @@ -0,0 +1,6 @@ +#!/bin/bash +peakrdl regblock redmule_regif.rdl -o regif/ --cpuif 
obi-flat --default-reset arst_n --hwif-report --addr-width 32 +peakrdl html redmule_regif.rdl -o regif/html/ +peakrdl c-header redmule_regif.rdl -o regif/hwpe_ctrl_target.h +# PeakRDL uses unpacked structs to avoid issues at compile time, which is commendable, but incompatible with FIFOing the output of the job! (use portable sed syntax that works on both Linux and macOS) +sed -E 's/typedef[[:space:]]+struct([[:space:]])/typedef struct packed\1/g' regif/redmule_regif_pkg.sv > regif/redmule_regif_pkg.sv.tmp && mv regif/redmule_regif_pkg.sv.tmp regif/redmule_regif_pkg.sv diff --git a/rtl/ctrl/redmule_regif.rdl b/rtl/ctrl/redmule_regif.rdl new file mode 100644 index 0000000..4c72604 --- /dev/null +++ b/rtl/ctrl/redmule_regif.rdl @@ -0,0 +1,268 @@ +/* + * redmule_regif.rdl + * Francesco Conti + * + * Copyright (C) 2025 ETH Zurich, University of Bologna + * Copyright and related rights are licensed under the Solderpad Hardware + * License, Version 0.51 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law + * or agreed to in writing, software, hardware and materials distributed under + * this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ + + /* + * This file contains the address map for RedMulE, expressed in SystemRDL. 
+ */ + +// Enumeration for GEMM operation types (aligned with redmule_pkg::gemm_op_e) +enum gemm_op_e { + MATMUL = 3'h0 { name = "MATMUL"; desc = "Matrix multiplication"; }; + GEMM = 3'h1 { name = "GEMM"; desc = "General matrix multiply"; }; + ADDMAX = 3'h2 { name = "ADDMAX"; desc = "Addition with maximum"; }; + ADDMIN = 3'h3 { name = "ADDMIN"; desc = "Addition with minimum"; }; + MULMAX = 3'h4 { name = "MULMAX"; desc = "Multiplication with maximum"; }; + MULMIN = 3'h5 { name = "MULMIN"; desc = "Multiplication with minimum"; }; + MAXMIN = 3'h6 { name = "MAXMIN"; desc = "Maximum with minimum"; }; + MINMAX = 3'h7 { name = "MINMAX"; desc = "Minimum with maximum"; }; +}; + +// Enumeration for floating-point formats (aligned with redmule_pkg::gemm_fmt_e) +enum gemm_fmt_e { + Float8 = 2'h0 { name = "Float8"; desc = "FP8 format (E4M3)"; }; + Float16 = 2'h1 { name = "Float16"; desc = "FP16 format (IEEE)"; }; + Float8Alt = 2'h2 { name = "Float8Alt"; desc = "FP8 alternative format (E5M2)"; }; + Float16Alt = 2'h3 { name = "Float16Alt"; desc = "FP16 alternative format (BF16)"; }; +}; + +addrmap redmule_regif { + name = "RedMulE register interface"; + desc = "Control register map for RedMulE, including mandatory control/status registers and job-dependent configuration registers."; + // Mandatory COMMIT_TRIGGER register. Not to be updated inside HWPEs. + reg hwpe_commit_trigger { + field { + name = "reserved"; + desc = "Reserved."; + hw = r; + sw = r; + } r0[31:2] = 0; + field { + name = "commit_trigger"; + desc = "Write 0 to commit job & start execution, unlock controller; write `0x1` value to commit job & unlock controller without starting execution, which will be started when the next job is committed and triggered; write `0x2` value to trigger the current job queue without committing any new job."; + hw = r; + sw = w; + swacc = true; + } commit_trigger[1:0] = 0; + }; + // Mandatory ACQUIRE register. Not to be updated inside HWPEs. 
+ reg hwpe_acquire { + field { + name = "acquire"; + desc = "On read starts a job offload, locks controller. Returns job ID."; + hw = w; + sw = r; + swacc = true; + } acquire[31:0] = 0; + }; + // Mandatory RESERVED register. Not to be updated inside HWPEs. + reg hwpe_reserved { + field { + name = "reserved"; + desc = "Reserved."; + hw = r; + sw = r; + } reserved[31:0] = 0; + }; + // Mandatory STATUS register. Not to be updated inside HWPEs. + reg hwpe_status { + field { + name = "status"; + desc = "Status of currently running job."; + hw = w; + sw = r; + } status0[31:0] = 0; + }; + // Mandatory RUNNING_JOB register. Not to be updated inside HWPEs. + reg hwpe_running_job { + field { + name = "reserved"; + desc = "Reserved."; + hw = r; + sw = r; + } r0[31:8] = 0; + field { + name = "running_job"; + desc = "Returns ID of currently running job if any job is running; otherwise, of the last job that has been run."; + hw = w; + sw = r; + } running_job[7:0] = 0; + }; + // Mandatory SOFT_CLEAR register. Not to be updated inside HWPEs. + reg hwpe_soft_clear { + field { + name = "reserved"; + desc = "Reserved."; + hw = r; + sw = r; + } r0[31:2] = 0; + field { + name = "soft_clear"; + desc = "Write `0x0` to clear the full status of the accelerator IP, including the register file; write `0x1` to clear the status of the accelerator IP, except for the register file; write `0x2` to clear only the register file."; + hw = r; + sw = w; + swacc = true; + } soft_clear[1:0] = 0; + }; + + // "mandatory" set of HWPE registers (CONTROL regs). Not to be updated inside HWPEs. 
+ regfile hwpe_ctrl_mandatory { + hwpe_commit_trigger commit_trigger @ 0x00; + hwpe_acquire acquire @ 0x04; + hwpe_reserved reserved0 @ 0x08; + hwpe_status status @ 0x0c; + hwpe_running_job running_job @ 0x10; + hwpe_soft_clear soft_clear @ 0x14; + hwpe_reserved reserved1 @ 0x18; + hwpe_reserved reserved2 @ 0x1c; + }; + + // RedMulE job-dependent registers + reg mcnfig0 { + name = "MCNFIG0"; + field { + name = "k_size"; + desc = "K dimension (cols of X, rows of W)."; + hw = r; + sw = rw; + } k_size[31:16] = 0; + field { + name = "m_size"; + desc = "M dimension (rows of X/Z)."; + hw = r; + sw = rw; + } m_size[15:0] = 0; + }; + reg mcnfig1 { + name = "MCNFIG1"; + field { // TODO: this register is not aligned with the current XIF interface + name = "gemm_output_fmt"; + desc = "Output format."; + hw = r; + sw = rw; + encode = gemm_fmt_e; + } gemm_output_fmt[26:25] = 0; + field { // TODO: this register is not aligned with the current XIF interface + name = "gemm_input_fmt"; + desc = "Input format."; + hw = r; + sw = rw; + encode = gemm_fmt_e; + } gemm_input_fmt[24:23] = 0; + field { // TODO: this register is not aligned with the current XIF interface + name = "gemm_ops"; + desc = "Operation type."; + hw = r; + sw = rw; + encode = gemm_op_e; + } gemm_ops[22:20] = 0; + field { + name = "send_w"; + desc = "Broadcast W to external stream."; + hw = r; + sw = rw; + } send_w[19:19] = 0; + field { + name = "receive_w"; + desc = "Receive W from external stream."; + hw = r; + sw = rw; + } receive_w[18:18] = 0; + field { + name = "send_x"; + desc = "Broadcast X to external stream."; + hw = r; + sw = rw; + } send_x[17:17] = 0; + field { + name = "receive_x"; + desc = "Receive X from external stream."; + hw = r; + sw = rw; + } receive_x[16:16] = 0; + field { + name = "n_size"; + desc = "N dimension (cols of W/Z)."; + hw = r; + sw = rw; + } n_size[15:0] = 0; + }; + reg mcnfig2 { + name = "MCNFIG2"; + field { + name = "y_offs"; + desc = "Y buffer offset for bias addition."; + hw = r; + 
sw = rw; + } y_offs[31:0] = 0; + }; + reg marith0 { + name = "MARITH0"; + field { + name = "x_addr"; + desc = "X matrix base address."; + hw = r; + sw = rw; + } x_addr[31:0] = 0; + }; + reg marith1 { + name = "MARITH1"; + field { + name = "w_addr"; + desc = "W matrix base address."; + hw = r; + sw = rw; + } w_addr[31:0] = 0; + }; + reg marith2 { + name = "MARITH2"; + field { + name = "z_addr"; + desc = "Z matrix base address."; + hw = r; + sw = rw; + } z_addr[31:0] = 0; + }; + reg mopcnt { + name = "MOPCNT"; + field { + name = "op_id_cnt"; + desc = "Operations complete."; + hw = w; + sw = r; + } op_id_cnt[31:0] = 0; + }; + + // no "job-independent" registers in RedMulE + regfile hwpe_ctrl_job_indep { + hwpe_reserved reserved; + }; + + // "job-dependent" set of HWPE registers. Update inside HWPEs. + regfile hwpe_ctrl_job_dep { + mcnfig0 mcnfig0 @ 0x00; + mcnfig1 mcnfig1 @ 0x04; + mcnfig2 mcnfig2 @ 0x08; + marith0 marith0 @ 0x0c; + marith1 marith1 @ 0x10; + marith2 marith2 @ 0x14; + mopcnt mopcnt @ 0x18; + }; + + // HWPE control address map. Update inside HWPEs + hwpe_ctrl_mandatory hwpe_ctrl @ 0x00; + hwpe_ctrl_job_dep hwpe_job_dep @ 0x20; + hwpe_ctrl_job_indep hwpe_job_indep; + +}; diff --git a/rtl/ctrl/redmule_target_decoder.sv b/rtl/ctrl/redmule_target_decoder.sv new file mode 100644 index 0000000..97c45e2 --- /dev/null +++ b/rtl/ctrl/redmule_target_decoder.sv @@ -0,0 +1,190 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +// Francesco Conti +// + +// +// This file contains a memory-mapped target interface for RedMulE. 
+// + +module redmule_target_decoder + import redmule_pkg::*; + import redmule_regif_pkg::*; +#( + parameter int unsigned OpIdWidth = 4 +)( + input logic clk_i, + input logic rst_ni, + input logic clear_i, + output logic target_clear_o, + input logic config_ready_i, + input logic op_done_i, + output logic config_valid_o, + output redmule_config_t config_o, + // target port + hwpe_ctrl_intf_periph.slave target +); + + // target signals + logic job_trigger; + logic job_done; + logic [31:0] job_status; + redmule_regif__hwpe_ctrl_job_indep__out_t job_indep_regs; + logic job_dep_regs_valid; + redmule_regif__hwpe_ctrl_job_dep__out_t job_dep_regs; + + // OBI plug target <-> regif + logic target_obi_req; + logic target_obi_gnt; + logic [31:0] target_obi_addr; + logic target_obi_we; + logic [3:0] target_obi_be; + logic [31:0] target_obi_wdata; + logic [OpIdWidth-1:0] target_obi_aid; + logic target_obi_rvalid; + logic target_obi_rready; + logic [31:0] target_obi_rdata; + logic target_obi_err; + logic [OpIdWidth-1:0] target_obi_rid; + + redmule_regif__in_t hwif_in; + redmule_regif__in_t hwif_in_target; + redmule_regif__out_t hwif_out; + + /* HWPE controller target port */ + hwpe_ctrl_target #( + .NB_CONTEXT ( 2 ), + .ID_WIDTH ( OpIdWidth ), + .ADDR_WIDTH ( 8 ), + .hwpe_ctrl_regif_in_t ( redmule_regif__in_t ), + .hwpe_ctrl_regif_out_t ( redmule_regif__out_t ), + .hwpe_ctrl_job_indep_t ( redmule_regif__hwpe_ctrl_job_indep__out_t ), + .hwpe_ctrl_job_dep_t ( redmule_regif__hwpe_ctrl_job_dep__out_t ) + ) i_target ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .clear_o ( target_clear_o ), + .target ( target ), + .job_trigger_o ( job_trigger ), + .job_done_i ( job_done ), + .job_status_i ( job_status ), + .job_indep_regs_o ( job_indep_regs ), + .job_dep_regs_valid_o ( job_dep_regs_valid ), + .job_dep_regs_o ( job_dep_regs ), + .target_obi_req_o ( target_obi_req ), + .target_obi_gnt_i ( target_obi_gnt ), + .target_obi_addr_o ( target_obi_addr ), + .target_obi_we_o ( target_obi_we ), + 
.target_obi_be_o ( target_obi_be ), + .target_obi_wdata_o ( target_obi_wdata ), + .target_obi_aid_o ( target_obi_aid ), + .target_obi_rvalid_i ( target_obi_rvalid ), + .target_obi_rready_o ( target_obi_rready ), + .target_obi_rdata_i ( target_obi_rdata ), + .target_obi_err_i ( target_obi_err ), + .target_obi_rid_i ( target_obi_rid ), + .hwif_in ( hwif_in_target ), + .hwif_out ( hwif_out ) + ); + + /* RedMulE SystemRDL-generated register interface */ + redmule_regif #( + .ID_WIDTH ( OpIdWidth ) + ) i_regif ( + .clk ( clk_i ), + .arst_n ( rst_ni ), + .s_obi_req ( target_obi_req ), + .s_obi_gnt ( target_obi_gnt ), + .s_obi_addr ( target_obi_addr ), + .s_obi_we ( target_obi_we ), + .s_obi_be ( target_obi_be ), + .s_obi_wdata ( target_obi_wdata ), + .s_obi_aid ( target_obi_aid ), + .s_obi_rvalid ( target_obi_rvalid ), + .s_obi_rready ( target_obi_rready ), + .s_obi_rdata ( target_obi_rdata ), + .s_obi_err ( target_obi_err ), + .s_obi_rid ( target_obi_rid ), + .hwif_in ( hwif_in ), + .hwif_out ( hwif_out ) + ); + + assign job_done = op_done_i; + assign config_valid_o = job_trigger; + + always_ff @(posedge clk_i or negedge rst_ni) begin : job_status_trigger + if(~rst_ni) begin + job_status <= '0; + end else begin + if(clear_i | target_clear_o) begin // cleared by the external clear or by the target-generated clear
+ job_status <= '0; + end + else if(job_trigger & config_ready_i) begin + job_status <= 32'h1; + end + else if(op_done_i) begin + job_status <= '0; + end + end + end + + // Extract configuration parameters from the job-dependent register file values + assign config_o.m_size = hwif_out.hwpe_job_dep.mcnfig0.m_size.value; + assign config_o.n_size = hwif_out.hwpe_job_dep.mcnfig1.n_size.value; + assign config_o.k_size = hwif_out.hwpe_job_dep.mcnfig0.k_size.value; + assign config_o.receive_x = hwif_out.hwpe_job_dep.mcnfig1.receive_x.value; + assign config_o.send_x = hwif_out.hwpe_job_dep.mcnfig1.send_x.value; + assign config_o.receive_w = hwif_out.hwpe_job_dep.mcnfig1.receive_w.value; + assign config_o.send_w = hwif_out.hwpe_job_dep.mcnfig1.send_w.value; + assign config_o.y_offs = hwif_out.hwpe_job_dep.mcnfig2.y_offs.value; + assign config_o.x_addr = hwif_out.hwpe_job_dep.marith0.x_addr.value; + assign config_o.w_addr = hwif_out.hwpe_job_dep.marith1.w_addr.value; + assign config_o.z_addr = hwif_out.hwpe_job_dep.marith2.z_addr.value; + assign config_o.gemm_ops = gemm_op_e'(hwif_out.hwpe_job_dep.mcnfig1.gemm_ops.value); + assign config_o.gemm_input_fmt = gemm_fmt_e'(hwif_out.hwpe_job_dep.mcnfig1.gemm_input_fmt.value); + assign config_o.gemm_output_fmt = gemm_fmt_e'(hwif_out.hwpe_job_dep.mcnfig1.gemm_output_fmt.value); + + // Operation ID counter: + // op_id_counter_in_q: Increments when operations are issued (tags for tracking) + // op_id_counter_out_q: Increments when operations complete (exposed through the MOPCNT register) + logic [OpIdWidth-1:0] op_id_counter_in_q, op_id_counter_out_q; + + // Input counter: increments on every job_trigger + // NOTE(review): op_id_counter_in_q is not read anywhere in this module; confirm it is still needed + always_ff @(posedge clk_i or negedge rst_ni) begin : op_id_counter_in + if (~rst_ni) begin + op_id_counter_in_q <= 0; + end else begin + if (clear_i | target_clear_o) begin + op_id_counter_in_q <= 0; + end else if (job_trigger) begin + op_id_counter_in_q <= 
op_id_counter_in_q + 1; + end + end + end + + // Output counter: increments when any operation completes + // Returns this value for MOPCNT instruction to check completion status + // Initialized to all 1's to detect first completion (wraps to 0) + always_ff @(posedge clk_i or negedge rst_ni) begin : op_id_counter_out + if (~rst_ni) begin + op_id_counter_out_q <= '1; + end else begin + if (clear_i | target_clear_o) begin + op_id_counter_out_q <= '1; + end else if (op_done_i) begin + op_id_counter_out_q <= op_id_counter_out_q + 1; + end + end + end + + // Combine hwif_in from hwpe_ctrl_target with RedMulE-specific fields + always_comb + begin + hwif_in = hwif_in_target; + hwif_in.hwpe_job_dep.mopcnt.op_id_cnt.next = op_id_counter_out_q; + end + +endmodule: redmule_target_decoder diff --git a/rtl/ctrl/regif/redmule_regif.sv b/rtl/ctrl/regif/redmule_regif.sv new file mode 100644 index 0000000..442f344 --- /dev/null +++ b/rtl/ctrl/regif/redmule_regif.sv @@ -0,0 +1,787 @@ +// Generated by PeakRDL-regblock - A free and open-source SystemVerilog generator +// https://github.com/SystemRDL/PeakRDL-regblock + +module redmule_regif #( + parameter ID_WIDTH = 1 + ) ( + input wire clk, + input wire arst_n, + + input wire s_obi_req, + output logic s_obi_gnt, + input wire [31:0] s_obi_addr, + input wire s_obi_we, + input wire [3:0] s_obi_be, + input wire [31:0] s_obi_wdata, + input wire [ID_WIDTH-1:0] s_obi_aid, + output logic s_obi_rvalid, + input wire s_obi_rready, + output logic [31:0] s_obi_rdata, + output logic s_obi_err, + output logic [ID_WIDTH-1:0] s_obi_rid, + + input redmule_regif_pkg::redmule_regif__in_t hwif_in, + output redmule_regif_pkg::redmule_regif__out_t hwif_out + ); + + //-------------------------------------------------------------------------- + // CPU Bus interface logic + //-------------------------------------------------------------------------- + logic cpuif_req; + logic cpuif_req_is_wr; + logic [31:0] cpuif_addr; + logic [31:0] cpuif_wr_data; + logic 
[31:0] cpuif_wr_biten; + logic cpuif_req_stall_wr; + logic cpuif_req_stall_rd; + + logic cpuif_rd_ack; + logic cpuif_rd_err; + logic [31:0] cpuif_rd_data; + + logic cpuif_wr_ack; + logic cpuif_wr_err; + + // State & holding regs + logic is_active; // A request is being served (not yet fully responded) + logic gnt_q; // one-cycle grant for A-channel + logic rsp_pending; // response ready but not yet accepted by manager + logic [31:0] rsp_rdata_q; + logic rsp_err_q; + logic [$bits(s_obi_rid)-1:0] rid_q; + + // Latch AID on accept to echo back the response + always_ff @(posedge clk or negedge arst_n) begin + if (~arst_n) begin + is_active <= 1'b0; + gnt_q <= 1'b0; + rsp_pending <= 1'b0; + rsp_rdata_q <= '0; + rsp_err_q <= 1'b0; + rid_q <= '0; + + cpuif_req <= '0; + cpuif_req_is_wr <= '0; + cpuif_addr <= '0; + cpuif_wr_data <= '0; + cpuif_wr_biten <= '0; + end else begin + // defaults + cpuif_req <= 1'b0; + gnt_q <= s_obi_req & ~is_active; + + // Accept new request when idle + if (~is_active) begin + if (s_obi_req) begin + is_active <= 1'b1; + cpuif_req <= 1'b1; + cpuif_req_is_wr <= s_obi_we; + cpuif_addr <= {s_obi_addr[31:2], 2'b0}; + cpuif_wr_data <= s_obi_wdata; + rid_q <= s_obi_aid; + for (int i = 0; i < 4; i++) begin + cpuif_wr_biten[i*8 +: 8] <= {8{ s_obi_be[i] }}; + end + end + end + + // Capture response + if (is_active && (cpuif_rd_ack || cpuif_wr_ack)) begin + rsp_pending <= 1'b1; + rsp_rdata_q <= cpuif_rd_data; + rsp_err_q <= cpuif_rd_err | cpuif_wr_err; + // NOTE: Keep 'is_active' asserted until the external R handshake completes + end + + // Complete external R-channel handshake only if manager ready + if (rsp_pending && s_obi_rvalid && s_obi_rready) begin + rsp_pending <= 1'b0; + is_active <= 1'b0; // free to accept the next request + end + end + end + + // R-channel outputs (held stable while rsp_pending=1) + assign s_obi_rvalid = rsp_pending; + assign s_obi_rdata = rsp_rdata_q; + assign s_obi_err = rsp_err_q; + assign s_obi_rid = rid_q; + + // A-channel 
grant (registered one-cycle pulse when we accept a request) + assign s_obi_gnt = gnt_q; + + logic cpuif_req_masked; + + // Read & write latencies are balanced. Stalls not required + assign cpuif_req_stall_rd = '0; + assign cpuif_req_stall_wr = '0; + assign cpuif_req_masked = cpuif_req + & !(!cpuif_req_is_wr & cpuif_req_stall_rd) + & !(cpuif_req_is_wr & cpuif_req_stall_wr); + + //-------------------------------------------------------------------------- + // Address Decode + //-------------------------------------------------------------------------- + typedef struct { + struct { + logic commit_trigger; + logic acquire; + logic reserved0; + logic status; + logic running_job; + logic soft_clear; + logic reserved1; + logic reserved2; + } hwpe_ctrl; + struct { + logic mcnfig0; + logic mcnfig1; + logic mcnfig2; + logic marith0; + logic marith1; + logic marith2; + logic mopcnt; + } hwpe_job_dep; + struct { + logic reserved; + } hwpe_job_indep; + } decoded_reg_strb_t; + decoded_reg_strb_t decoded_reg_strb; + logic decoded_err; + logic decoded_req; + logic decoded_req_is_wr; + logic [31:0] decoded_wr_data; + logic [31:0] decoded_wr_biten; + + always_comb begin + automatic logic is_valid_addr; + automatic logic is_invalid_rw; + is_valid_addr = '1; // No error checking on valid address access + is_invalid_rw = '0; + decoded_reg_strb.hwpe_ctrl.commit_trigger = cpuif_req_masked & (cpuif_addr == 32'h0); + decoded_reg_strb.hwpe_ctrl.acquire = cpuif_req_masked & (cpuif_addr == 32'h4) & !cpuif_req_is_wr; + decoded_reg_strb.hwpe_ctrl.reserved0 = cpuif_req_masked & (cpuif_addr == 32'h8) & !cpuif_req_is_wr; + decoded_reg_strb.hwpe_ctrl.status = cpuif_req_masked & (cpuif_addr == 32'hc) & !cpuif_req_is_wr; + decoded_reg_strb.hwpe_ctrl.running_job = cpuif_req_masked & (cpuif_addr == 32'h10) & !cpuif_req_is_wr; + decoded_reg_strb.hwpe_ctrl.soft_clear = cpuif_req_masked & (cpuif_addr == 32'h14); + decoded_reg_strb.hwpe_ctrl.reserved1 = cpuif_req_masked & (cpuif_addr == 32'h18) & 
!cpuif_req_is_wr; + decoded_reg_strb.hwpe_ctrl.reserved2 = cpuif_req_masked & (cpuif_addr == 32'h1c) & !cpuif_req_is_wr; + decoded_reg_strb.hwpe_job_dep.mcnfig0 = cpuif_req_masked & (cpuif_addr == 32'h20); + decoded_reg_strb.hwpe_job_dep.mcnfig1 = cpuif_req_masked & (cpuif_addr == 32'h24); + decoded_reg_strb.hwpe_job_dep.mcnfig2 = cpuif_req_masked & (cpuif_addr == 32'h28); + decoded_reg_strb.hwpe_job_dep.marith0 = cpuif_req_masked & (cpuif_addr == 32'h2c); + decoded_reg_strb.hwpe_job_dep.marith1 = cpuif_req_masked & (cpuif_addr == 32'h30); + decoded_reg_strb.hwpe_job_dep.marith2 = cpuif_req_masked & (cpuif_addr == 32'h34); + decoded_reg_strb.hwpe_job_dep.mopcnt = cpuif_req_masked & (cpuif_addr == 32'h38) & !cpuif_req_is_wr; + decoded_reg_strb.hwpe_job_indep.reserved = cpuif_req_masked & (cpuif_addr == 32'h3c) & !cpuif_req_is_wr; + decoded_err = (~is_valid_addr | is_invalid_rw) & decoded_req; + end + + // Pass down signals to next stage + assign decoded_req = cpuif_req_masked; + assign decoded_req_is_wr = cpuif_req_is_wr; + assign decoded_wr_data = cpuif_wr_data; + assign decoded_wr_biten = cpuif_wr_biten; + + //-------------------------------------------------------------------------- + // Field logic + //-------------------------------------------------------------------------- + typedef struct { + struct { + struct { + struct { + logic [1:0] next; + logic load_next; + } commit_trigger; + } commit_trigger; + struct { + struct { + logic [1:0] next; + logic load_next; + } soft_clear; + } soft_clear; + } hwpe_ctrl; + struct { + struct { + struct { + logic [15:0] next; + logic load_next; + } m_size; + struct { + logic [15:0] next; + logic load_next; + } k_size; + } mcnfig0; + struct { + struct { + logic [15:0] next; + logic load_next; + } n_size; + struct { + logic next; + logic load_next; + } receive_x; + struct { + logic next; + logic load_next; + } send_x; + struct { + logic next; + logic load_next; + } receive_w; + struct { + logic next; + logic load_next; + } 
send_w; + struct { + logic [2:0] next; + logic load_next; + } gemm_ops; + struct { + logic [1:0] next; + logic load_next; + } gemm_input_fmt; + struct { + logic [1:0] next; + logic load_next; + } gemm_output_fmt; + } mcnfig1; + struct { + struct { + logic [31:0] next; + logic load_next; + } y_offs; + } mcnfig2; + struct { + struct { + logic [31:0] next; + logic load_next; + } x_addr; + } marith0; + struct { + struct { + logic [31:0] next; + logic load_next; + } w_addr; + } marith1; + struct { + struct { + logic [31:0] next; + logic load_next; + } z_addr; + } marith2; + } hwpe_job_dep; + } field_combo_t; + field_combo_t field_combo; + + typedef struct { + struct { + struct { + struct { + logic [1:0] value; + } commit_trigger; + } commit_trigger; + struct { + struct { + logic [1:0] value; + } soft_clear; + } soft_clear; + } hwpe_ctrl; + struct { + struct { + struct { + logic [15:0] value; + } m_size; + struct { + logic [15:0] value; + } k_size; + } mcnfig0; + struct { + struct { + logic [15:0] value; + } n_size; + struct { + logic value; + } receive_x; + struct { + logic value; + } send_x; + struct { + logic value; + } receive_w; + struct { + logic value; + } send_w; + struct { + logic [2:0] value; + } gemm_ops; + struct { + logic [1:0] value; + } gemm_input_fmt; + struct { + logic [1:0] value; + } gemm_output_fmt; + } mcnfig1; + struct { + struct { + logic [31:0] value; + } y_offs; + } mcnfig2; + struct { + struct { + logic [31:0] value; + } x_addr; + } marith0; + struct { + struct { + logic [31:0] value; + } w_addr; + } marith1; + struct { + struct { + logic [31:0] value; + } z_addr; + } marith2; + } hwpe_job_dep; + } field_storage_t; + field_storage_t field_storage; + + // Field: redmule_regif.hwpe_ctrl.commit_trigger.commit_trigger + always_comb begin + automatic logic [1:0] next_c; + automatic logic load_next_c; + next_c = field_storage.hwpe_ctrl.commit_trigger.commit_trigger.value; + load_next_c = '0; + if(decoded_reg_strb.hwpe_ctrl.commit_trigger && 
decoded_req_is_wr) begin // SW write + next_c = (field_storage.hwpe_ctrl.commit_trigger.commit_trigger.value & ~decoded_wr_biten[1:0]) | (decoded_wr_data[1:0] & decoded_wr_biten[1:0]); + load_next_c = '1; + end + field_combo.hwpe_ctrl.commit_trigger.commit_trigger.next = next_c; + field_combo.hwpe_ctrl.commit_trigger.commit_trigger.load_next = load_next_c; + end + always_ff @(posedge clk or negedge arst_n) begin + if(~arst_n) begin + field_storage.hwpe_ctrl.commit_trigger.commit_trigger.value <= 2'h0; + end else begin + if(field_combo.hwpe_ctrl.commit_trigger.commit_trigger.load_next) begin + field_storage.hwpe_ctrl.commit_trigger.commit_trigger.value <= field_combo.hwpe_ctrl.commit_trigger.commit_trigger.next; + end + end + end + assign hwif_out.hwpe_ctrl.commit_trigger.commit_trigger.value = field_storage.hwpe_ctrl.commit_trigger.commit_trigger.value; + assign hwif_out.hwpe_ctrl.commit_trigger.commit_trigger.swacc = decoded_reg_strb.hwpe_ctrl.commit_trigger; + assign hwif_out.hwpe_ctrl.commit_trigger.r0.value = 30'h0; + assign hwif_out.hwpe_ctrl.acquire.acquire.swacc = decoded_reg_strb.hwpe_ctrl.acquire; + assign hwif_out.hwpe_ctrl.reserved0.reserved.value = 32'h0; + assign hwif_out.hwpe_ctrl.running_job.r0.value = 24'h0; + // Field: redmule_regif.hwpe_ctrl.soft_clear.soft_clear + always_comb begin + automatic logic [1:0] next_c; + automatic logic load_next_c; + next_c = field_storage.hwpe_ctrl.soft_clear.soft_clear.value; + load_next_c = '0; + if(decoded_reg_strb.hwpe_ctrl.soft_clear && decoded_req_is_wr) begin // SW write + next_c = (field_storage.hwpe_ctrl.soft_clear.soft_clear.value & ~decoded_wr_biten[1:0]) | (decoded_wr_data[1:0] & decoded_wr_biten[1:0]); + load_next_c = '1; + end + field_combo.hwpe_ctrl.soft_clear.soft_clear.next = next_c; + field_combo.hwpe_ctrl.soft_clear.soft_clear.load_next = load_next_c; + end + always_ff @(posedge clk or negedge arst_n) begin + if(~arst_n) begin + field_storage.hwpe_ctrl.soft_clear.soft_clear.value <= 2'h0; + end 
else begin + if(field_combo.hwpe_ctrl.soft_clear.soft_clear.load_next) begin + field_storage.hwpe_ctrl.soft_clear.soft_clear.value <= field_combo.hwpe_ctrl.soft_clear.soft_clear.next; + end + end + end + assign hwif_out.hwpe_ctrl.soft_clear.soft_clear.value = field_storage.hwpe_ctrl.soft_clear.soft_clear.value; + assign hwif_out.hwpe_ctrl.soft_clear.soft_clear.swacc = decoded_reg_strb.hwpe_ctrl.soft_clear; + assign hwif_out.hwpe_ctrl.soft_clear.r0.value = 30'h0; + assign hwif_out.hwpe_ctrl.reserved1.reserved.value = 32'h0; + assign hwif_out.hwpe_ctrl.reserved2.reserved.value = 32'h0; + // Field: redmule_regif.hwpe_job_dep.mcnfig0.m_size + always_comb begin + automatic logic [15:0] next_c; + automatic logic load_next_c; + next_c = field_storage.hwpe_job_dep.mcnfig0.m_size.value; + load_next_c = '0; + if(decoded_reg_strb.hwpe_job_dep.mcnfig0 && decoded_req_is_wr) begin // SW write + next_c = (field_storage.hwpe_job_dep.mcnfig0.m_size.value & ~decoded_wr_biten[15:0]) | (decoded_wr_data[15:0] & decoded_wr_biten[15:0]); + load_next_c = '1; + end + field_combo.hwpe_job_dep.mcnfig0.m_size.next = next_c; + field_combo.hwpe_job_dep.mcnfig0.m_size.load_next = load_next_c; + end + always_ff @(posedge clk or negedge arst_n) begin + if(~arst_n) begin + field_storage.hwpe_job_dep.mcnfig0.m_size.value <= 16'h0; + end else begin + if(field_combo.hwpe_job_dep.mcnfig0.m_size.load_next) begin + field_storage.hwpe_job_dep.mcnfig0.m_size.value <= field_combo.hwpe_job_dep.mcnfig0.m_size.next; + end + end + end + assign hwif_out.hwpe_job_dep.mcnfig0.m_size.value = field_storage.hwpe_job_dep.mcnfig0.m_size.value; + // Field: redmule_regif.hwpe_job_dep.mcnfig0.k_size + always_comb begin + automatic logic [15:0] next_c; + automatic logic load_next_c; + next_c = field_storage.hwpe_job_dep.mcnfig0.k_size.value; + load_next_c = '0; + if(decoded_reg_strb.hwpe_job_dep.mcnfig0 && decoded_req_is_wr) begin // SW write + next_c = (field_storage.hwpe_job_dep.mcnfig0.k_size.value & 
~decoded_wr_biten[31:16]) | (decoded_wr_data[31:16] & decoded_wr_biten[31:16]); + load_next_c = '1; + end + field_combo.hwpe_job_dep.mcnfig0.k_size.next = next_c; + field_combo.hwpe_job_dep.mcnfig0.k_size.load_next = load_next_c; + end + always_ff @(posedge clk or negedge arst_n) begin + if(~arst_n) begin + field_storage.hwpe_job_dep.mcnfig0.k_size.value <= 16'h0; + end else begin + if(field_combo.hwpe_job_dep.mcnfig0.k_size.load_next) begin + field_storage.hwpe_job_dep.mcnfig0.k_size.value <= field_combo.hwpe_job_dep.mcnfig0.k_size.next; + end + end + end + assign hwif_out.hwpe_job_dep.mcnfig0.k_size.value = field_storage.hwpe_job_dep.mcnfig0.k_size.value; + // Field: redmule_regif.hwpe_job_dep.mcnfig1.n_size + always_comb begin + automatic logic [15:0] next_c; + automatic logic load_next_c; + next_c = field_storage.hwpe_job_dep.mcnfig1.n_size.value; + load_next_c = '0; + if(decoded_reg_strb.hwpe_job_dep.mcnfig1 && decoded_req_is_wr) begin // SW write + next_c = (field_storage.hwpe_job_dep.mcnfig1.n_size.value & ~decoded_wr_biten[15:0]) | (decoded_wr_data[15:0] & decoded_wr_biten[15:0]); + load_next_c = '1; + end + field_combo.hwpe_job_dep.mcnfig1.n_size.next = next_c; + field_combo.hwpe_job_dep.mcnfig1.n_size.load_next = load_next_c; + end + always_ff @(posedge clk or negedge arst_n) begin + if(~arst_n) begin + field_storage.hwpe_job_dep.mcnfig1.n_size.value <= 16'h0; + end else begin + if(field_combo.hwpe_job_dep.mcnfig1.n_size.load_next) begin + field_storage.hwpe_job_dep.mcnfig1.n_size.value <= field_combo.hwpe_job_dep.mcnfig1.n_size.next; + end + end + end + assign hwif_out.hwpe_job_dep.mcnfig1.n_size.value = field_storage.hwpe_job_dep.mcnfig1.n_size.value; + // Field: redmule_regif.hwpe_job_dep.mcnfig1.receive_x + always_comb begin + automatic logic [0:0] next_c; + automatic logic load_next_c; + next_c = field_storage.hwpe_job_dep.mcnfig1.receive_x.value; + load_next_c = '0; + if(decoded_reg_strb.hwpe_job_dep.mcnfig1 && decoded_req_is_wr) begin // SW write + 
next_c = (field_storage.hwpe_job_dep.mcnfig1.receive_x.value & ~decoded_wr_biten[16:16]) | (decoded_wr_data[16:16] & decoded_wr_biten[16:16]); + load_next_c = '1; + end + field_combo.hwpe_job_dep.mcnfig1.receive_x.next = next_c; + field_combo.hwpe_job_dep.mcnfig1.receive_x.load_next = load_next_c; + end + always_ff @(posedge clk or negedge arst_n) begin + if(~arst_n) begin + field_storage.hwpe_job_dep.mcnfig1.receive_x.value <= 1'h0; + end else begin + if(field_combo.hwpe_job_dep.mcnfig1.receive_x.load_next) begin + field_storage.hwpe_job_dep.mcnfig1.receive_x.value <= field_combo.hwpe_job_dep.mcnfig1.receive_x.next; + end + end + end + assign hwif_out.hwpe_job_dep.mcnfig1.receive_x.value = field_storage.hwpe_job_dep.mcnfig1.receive_x.value; + // Field: redmule_regif.hwpe_job_dep.mcnfig1.send_x + always_comb begin + automatic logic [0:0] next_c; + automatic logic load_next_c; + next_c = field_storage.hwpe_job_dep.mcnfig1.send_x.value; + load_next_c = '0; + if(decoded_reg_strb.hwpe_job_dep.mcnfig1 && decoded_req_is_wr) begin // SW write + next_c = (field_storage.hwpe_job_dep.mcnfig1.send_x.value & ~decoded_wr_biten[17:17]) | (decoded_wr_data[17:17] & decoded_wr_biten[17:17]); + load_next_c = '1; + end + field_combo.hwpe_job_dep.mcnfig1.send_x.next = next_c; + field_combo.hwpe_job_dep.mcnfig1.send_x.load_next = load_next_c; + end + always_ff @(posedge clk or negedge arst_n) begin + if(~arst_n) begin + field_storage.hwpe_job_dep.mcnfig1.send_x.value <= 1'h0; + end else begin + if(field_combo.hwpe_job_dep.mcnfig1.send_x.load_next) begin + field_storage.hwpe_job_dep.mcnfig1.send_x.value <= field_combo.hwpe_job_dep.mcnfig1.send_x.next; + end + end + end + assign hwif_out.hwpe_job_dep.mcnfig1.send_x.value = field_storage.hwpe_job_dep.mcnfig1.send_x.value; + // Field: redmule_regif.hwpe_job_dep.mcnfig1.receive_w + always_comb begin + automatic logic [0:0] next_c; + automatic logic load_next_c; + next_c = field_storage.hwpe_job_dep.mcnfig1.receive_w.value; + load_next_c = 
'0; + if(decoded_reg_strb.hwpe_job_dep.mcnfig1 && decoded_req_is_wr) begin // SW write + next_c = (field_storage.hwpe_job_dep.mcnfig1.receive_w.value & ~decoded_wr_biten[18:18]) | (decoded_wr_data[18:18] & decoded_wr_biten[18:18]); + load_next_c = '1; + end + field_combo.hwpe_job_dep.mcnfig1.receive_w.next = next_c; + field_combo.hwpe_job_dep.mcnfig1.receive_w.load_next = load_next_c; + end + always_ff @(posedge clk or negedge arst_n) begin + if(~arst_n) begin + field_storage.hwpe_job_dep.mcnfig1.receive_w.value <= 1'h0; + end else begin + if(field_combo.hwpe_job_dep.mcnfig1.receive_w.load_next) begin + field_storage.hwpe_job_dep.mcnfig1.receive_w.value <= field_combo.hwpe_job_dep.mcnfig1.receive_w.next; + end + end + end + assign hwif_out.hwpe_job_dep.mcnfig1.receive_w.value = field_storage.hwpe_job_dep.mcnfig1.receive_w.value; + // Field: redmule_regif.hwpe_job_dep.mcnfig1.send_w + always_comb begin + automatic logic [0:0] next_c; + automatic logic load_next_c; + next_c = field_storage.hwpe_job_dep.mcnfig1.send_w.value; + load_next_c = '0; + if(decoded_reg_strb.hwpe_job_dep.mcnfig1 && decoded_req_is_wr) begin // SW write + next_c = (field_storage.hwpe_job_dep.mcnfig1.send_w.value & ~decoded_wr_biten[19:19]) | (decoded_wr_data[19:19] & decoded_wr_biten[19:19]); + load_next_c = '1; + end + field_combo.hwpe_job_dep.mcnfig1.send_w.next = next_c; + field_combo.hwpe_job_dep.mcnfig1.send_w.load_next = load_next_c; + end + always_ff @(posedge clk or negedge arst_n) begin + if(~arst_n) begin + field_storage.hwpe_job_dep.mcnfig1.send_w.value <= 1'h0; + end else begin + if(field_combo.hwpe_job_dep.mcnfig1.send_w.load_next) begin + field_storage.hwpe_job_dep.mcnfig1.send_w.value <= field_combo.hwpe_job_dep.mcnfig1.send_w.next; + end + end + end + assign hwif_out.hwpe_job_dep.mcnfig1.send_w.value = field_storage.hwpe_job_dep.mcnfig1.send_w.value; + // Field: redmule_regif.hwpe_job_dep.mcnfig1.gemm_ops + always_comb begin + automatic logic [2:0] next_c; + automatic logic 
load_next_c; + next_c = field_storage.hwpe_job_dep.mcnfig1.gemm_ops.value; + load_next_c = '0; + if(decoded_reg_strb.hwpe_job_dep.mcnfig1 && decoded_req_is_wr) begin // SW write + next_c = (field_storage.hwpe_job_dep.mcnfig1.gemm_ops.value & ~decoded_wr_biten[22:20]) | (decoded_wr_data[22:20] & decoded_wr_biten[22:20]); + load_next_c = '1; + end + field_combo.hwpe_job_dep.mcnfig1.gemm_ops.next = next_c; + field_combo.hwpe_job_dep.mcnfig1.gemm_ops.load_next = load_next_c; + end + always_ff @(posedge clk or negedge arst_n) begin + if(~arst_n) begin + field_storage.hwpe_job_dep.mcnfig1.gemm_ops.value <= 3'h0; + end else begin + if(field_combo.hwpe_job_dep.mcnfig1.gemm_ops.load_next) begin + field_storage.hwpe_job_dep.mcnfig1.gemm_ops.value <= field_combo.hwpe_job_dep.mcnfig1.gemm_ops.next; + end + end + end + assign hwif_out.hwpe_job_dep.mcnfig1.gemm_ops.value = field_storage.hwpe_job_dep.mcnfig1.gemm_ops.value; + // Field: redmule_regif.hwpe_job_dep.mcnfig1.gemm_input_fmt + always_comb begin + automatic logic [1:0] next_c; + automatic logic load_next_c; + next_c = field_storage.hwpe_job_dep.mcnfig1.gemm_input_fmt.value; + load_next_c = '0; + if(decoded_reg_strb.hwpe_job_dep.mcnfig1 && decoded_req_is_wr) begin // SW write + next_c = (field_storage.hwpe_job_dep.mcnfig1.gemm_input_fmt.value & ~decoded_wr_biten[24:23]) | (decoded_wr_data[24:23] & decoded_wr_biten[24:23]); + load_next_c = '1; + end + field_combo.hwpe_job_dep.mcnfig1.gemm_input_fmt.next = next_c; + field_combo.hwpe_job_dep.mcnfig1.gemm_input_fmt.load_next = load_next_c; + end + always_ff @(posedge clk or negedge arst_n) begin + if(~arst_n) begin + field_storage.hwpe_job_dep.mcnfig1.gemm_input_fmt.value <= 2'h0; + end else begin + if(field_combo.hwpe_job_dep.mcnfig1.gemm_input_fmt.load_next) begin + field_storage.hwpe_job_dep.mcnfig1.gemm_input_fmt.value <= field_combo.hwpe_job_dep.mcnfig1.gemm_input_fmt.next; + end + end + end + assign hwif_out.hwpe_job_dep.mcnfig1.gemm_input_fmt.value = 
field_storage.hwpe_job_dep.mcnfig1.gemm_input_fmt.value; + // Field: redmule_regif.hwpe_job_dep.mcnfig1.gemm_output_fmt + always_comb begin + automatic logic [1:0] next_c; + automatic logic load_next_c; + next_c = field_storage.hwpe_job_dep.mcnfig1.gemm_output_fmt.value; + load_next_c = '0; + if(decoded_reg_strb.hwpe_job_dep.mcnfig1 && decoded_req_is_wr) begin // SW write + next_c = (field_storage.hwpe_job_dep.mcnfig1.gemm_output_fmt.value & ~decoded_wr_biten[26:25]) | (decoded_wr_data[26:25] & decoded_wr_biten[26:25]); + load_next_c = '1; + end + field_combo.hwpe_job_dep.mcnfig1.gemm_output_fmt.next = next_c; + field_combo.hwpe_job_dep.mcnfig1.gemm_output_fmt.load_next = load_next_c; + end + always_ff @(posedge clk or negedge arst_n) begin + if(~arst_n) begin + field_storage.hwpe_job_dep.mcnfig1.gemm_output_fmt.value <= 2'h0; + end else begin + if(field_combo.hwpe_job_dep.mcnfig1.gemm_output_fmt.load_next) begin + field_storage.hwpe_job_dep.mcnfig1.gemm_output_fmt.value <= field_combo.hwpe_job_dep.mcnfig1.gemm_output_fmt.next; + end + end + end + assign hwif_out.hwpe_job_dep.mcnfig1.gemm_output_fmt.value = field_storage.hwpe_job_dep.mcnfig1.gemm_output_fmt.value; + // Field: redmule_regif.hwpe_job_dep.mcnfig2.y_offs + always_comb begin + automatic logic [31:0] next_c; + automatic logic load_next_c; + next_c = field_storage.hwpe_job_dep.mcnfig2.y_offs.value; + load_next_c = '0; + if(decoded_reg_strb.hwpe_job_dep.mcnfig2 && decoded_req_is_wr) begin // SW write + next_c = (field_storage.hwpe_job_dep.mcnfig2.y_offs.value & ~decoded_wr_biten[31:0]) | (decoded_wr_data[31:0] & decoded_wr_biten[31:0]); + load_next_c = '1; + end + field_combo.hwpe_job_dep.mcnfig2.y_offs.next = next_c; + field_combo.hwpe_job_dep.mcnfig2.y_offs.load_next = load_next_c; + end + always_ff @(posedge clk or negedge arst_n) begin + if(~arst_n) begin + field_storage.hwpe_job_dep.mcnfig2.y_offs.value <= 32'h0; + end else begin + if(field_combo.hwpe_job_dep.mcnfig2.y_offs.load_next) begin + 
field_storage.hwpe_job_dep.mcnfig2.y_offs.value <= field_combo.hwpe_job_dep.mcnfig2.y_offs.next; + end + end + end + assign hwif_out.hwpe_job_dep.mcnfig2.y_offs.value = field_storage.hwpe_job_dep.mcnfig2.y_offs.value; + // Field: redmule_regif.hwpe_job_dep.marith0.x_addr + always_comb begin + automatic logic [31:0] next_c; + automatic logic load_next_c; + next_c = field_storage.hwpe_job_dep.marith0.x_addr.value; + load_next_c = '0; + if(decoded_reg_strb.hwpe_job_dep.marith0 && decoded_req_is_wr) begin // SW write + next_c = (field_storage.hwpe_job_dep.marith0.x_addr.value & ~decoded_wr_biten[31:0]) | (decoded_wr_data[31:0] & decoded_wr_biten[31:0]); + load_next_c = '1; + end + field_combo.hwpe_job_dep.marith0.x_addr.next = next_c; + field_combo.hwpe_job_dep.marith0.x_addr.load_next = load_next_c; + end + always_ff @(posedge clk or negedge arst_n) begin + if(~arst_n) begin + field_storage.hwpe_job_dep.marith0.x_addr.value <= 32'h0; + end else begin + if(field_combo.hwpe_job_dep.marith0.x_addr.load_next) begin + field_storage.hwpe_job_dep.marith0.x_addr.value <= field_combo.hwpe_job_dep.marith0.x_addr.next; + end + end + end + assign hwif_out.hwpe_job_dep.marith0.x_addr.value = field_storage.hwpe_job_dep.marith0.x_addr.value; + // Field: redmule_regif.hwpe_job_dep.marith1.w_addr + always_comb begin + automatic logic [31:0] next_c; + automatic logic load_next_c; + next_c = field_storage.hwpe_job_dep.marith1.w_addr.value; + load_next_c = '0; + if(decoded_reg_strb.hwpe_job_dep.marith1 && decoded_req_is_wr) begin // SW write + next_c = (field_storage.hwpe_job_dep.marith1.w_addr.value & ~decoded_wr_biten[31:0]) | (decoded_wr_data[31:0] & decoded_wr_biten[31:0]); + load_next_c = '1; + end + field_combo.hwpe_job_dep.marith1.w_addr.next = next_c; + field_combo.hwpe_job_dep.marith1.w_addr.load_next = load_next_c; + end + always_ff @(posedge clk or negedge arst_n) begin + if(~arst_n) begin + field_storage.hwpe_job_dep.marith1.w_addr.value <= 32'h0; + end else begin + 
if(field_combo.hwpe_job_dep.marith1.w_addr.load_next) begin + field_storage.hwpe_job_dep.marith1.w_addr.value <= field_combo.hwpe_job_dep.marith1.w_addr.next; + end + end + end + assign hwif_out.hwpe_job_dep.marith1.w_addr.value = field_storage.hwpe_job_dep.marith1.w_addr.value; + // Field: redmule_regif.hwpe_job_dep.marith2.z_addr + always_comb begin + automatic logic [31:0] next_c; + automatic logic load_next_c; + next_c = field_storage.hwpe_job_dep.marith2.z_addr.value; + load_next_c = '0; + if(decoded_reg_strb.hwpe_job_dep.marith2 && decoded_req_is_wr) begin // SW write + next_c = (field_storage.hwpe_job_dep.marith2.z_addr.value & ~decoded_wr_biten[31:0]) | (decoded_wr_data[31:0] & decoded_wr_biten[31:0]); + load_next_c = '1; + end + field_combo.hwpe_job_dep.marith2.z_addr.next = next_c; + field_combo.hwpe_job_dep.marith2.z_addr.load_next = load_next_c; + end + always_ff @(posedge clk or negedge arst_n) begin + if(~arst_n) begin + field_storage.hwpe_job_dep.marith2.z_addr.value <= 32'h0; + end else begin + if(field_combo.hwpe_job_dep.marith2.z_addr.load_next) begin + field_storage.hwpe_job_dep.marith2.z_addr.value <= field_combo.hwpe_job_dep.marith2.z_addr.next; + end + end + end + assign hwif_out.hwpe_job_dep.marith2.z_addr.value = field_storage.hwpe_job_dep.marith2.z_addr.value; + assign hwif_out.hwpe_job_indep.reserved.reserved.value = 32'h0; + + //-------------------------------------------------------------------------- + // Write response + //-------------------------------------------------------------------------- + assign cpuif_wr_ack = decoded_req & decoded_req_is_wr; + // Writes are always granted with no error response + assign cpuif_wr_err = '0; + + //-------------------------------------------------------------------------- + // Readback + //-------------------------------------------------------------------------- + + logic readback_err; + logic readback_done; + logic [31:0] readback_data; + + // Assign readback values to a flattened array + 
logic [31:0] readback_array[16]; + assign readback_array[0][1:0] = '0; + assign readback_array[0][31:2] = (decoded_reg_strb.hwpe_ctrl.commit_trigger && !decoded_req_is_wr) ? 30'h0 : '0; + assign readback_array[1][31:0] = (decoded_reg_strb.hwpe_ctrl.acquire && !decoded_req_is_wr) ? hwif_in.hwpe_ctrl.acquire.acquire.next : '0; + assign readback_array[2][31:0] = (decoded_reg_strb.hwpe_ctrl.reserved0 && !decoded_req_is_wr) ? 32'h0 : '0; + assign readback_array[3][31:0] = (decoded_reg_strb.hwpe_ctrl.status && !decoded_req_is_wr) ? hwif_in.hwpe_ctrl.status.status0.next : '0; + assign readback_array[4][7:0] = (decoded_reg_strb.hwpe_ctrl.running_job && !decoded_req_is_wr) ? hwif_in.hwpe_ctrl.running_job.running_job.next : '0; + assign readback_array[4][31:8] = (decoded_reg_strb.hwpe_ctrl.running_job && !decoded_req_is_wr) ? 24'h0 : '0; + assign readback_array[5][1:0] = '0; + assign readback_array[5][31:2] = (decoded_reg_strb.hwpe_ctrl.soft_clear && !decoded_req_is_wr) ? 30'h0 : '0; + assign readback_array[6][31:0] = (decoded_reg_strb.hwpe_ctrl.reserved1 && !decoded_req_is_wr) ? 32'h0 : '0; + assign readback_array[7][31:0] = (decoded_reg_strb.hwpe_ctrl.reserved2 && !decoded_req_is_wr) ? 32'h0 : '0; + assign readback_array[8][15:0] = (decoded_reg_strb.hwpe_job_dep.mcnfig0 && !decoded_req_is_wr) ? field_storage.hwpe_job_dep.mcnfig0.m_size.value : '0; + assign readback_array[8][31:16] = (decoded_reg_strb.hwpe_job_dep.mcnfig0 && !decoded_req_is_wr) ? field_storage.hwpe_job_dep.mcnfig0.k_size.value : '0; + assign readback_array[9][15:0] = (decoded_reg_strb.hwpe_job_dep.mcnfig1 && !decoded_req_is_wr) ? field_storage.hwpe_job_dep.mcnfig1.n_size.value : '0; + assign readback_array[9][16:16] = (decoded_reg_strb.hwpe_job_dep.mcnfig1 && !decoded_req_is_wr) ? field_storage.hwpe_job_dep.mcnfig1.receive_x.value : '0; + assign readback_array[9][17:17] = (decoded_reg_strb.hwpe_job_dep.mcnfig1 && !decoded_req_is_wr) ? 
field_storage.hwpe_job_dep.mcnfig1.send_x.value : '0; + assign readback_array[9][18:18] = (decoded_reg_strb.hwpe_job_dep.mcnfig1 && !decoded_req_is_wr) ? field_storage.hwpe_job_dep.mcnfig1.receive_w.value : '0; + assign readback_array[9][19:19] = (decoded_reg_strb.hwpe_job_dep.mcnfig1 && !decoded_req_is_wr) ? field_storage.hwpe_job_dep.mcnfig1.send_w.value : '0; + assign readback_array[9][22:20] = (decoded_reg_strb.hwpe_job_dep.mcnfig1 && !decoded_req_is_wr) ? field_storage.hwpe_job_dep.mcnfig1.gemm_ops.value : '0; + assign readback_array[9][24:23] = (decoded_reg_strb.hwpe_job_dep.mcnfig1 && !decoded_req_is_wr) ? field_storage.hwpe_job_dep.mcnfig1.gemm_input_fmt.value : '0; + assign readback_array[9][26:25] = (decoded_reg_strb.hwpe_job_dep.mcnfig1 && !decoded_req_is_wr) ? field_storage.hwpe_job_dep.mcnfig1.gemm_output_fmt.value : '0; + assign readback_array[9][31:27] = '0; + assign readback_array[10][31:0] = (decoded_reg_strb.hwpe_job_dep.mcnfig2 && !decoded_req_is_wr) ? field_storage.hwpe_job_dep.mcnfig2.y_offs.value : '0; + assign readback_array[11][31:0] = (decoded_reg_strb.hwpe_job_dep.marith0 && !decoded_req_is_wr) ? field_storage.hwpe_job_dep.marith0.x_addr.value : '0; + assign readback_array[12][31:0] = (decoded_reg_strb.hwpe_job_dep.marith1 && !decoded_req_is_wr) ? field_storage.hwpe_job_dep.marith1.w_addr.value : '0; + assign readback_array[13][31:0] = (decoded_reg_strb.hwpe_job_dep.marith2 && !decoded_req_is_wr) ? field_storage.hwpe_job_dep.marith2.z_addr.value : '0; + assign readback_array[14][31:0] = (decoded_reg_strb.hwpe_job_dep.mopcnt && !decoded_req_is_wr) ? hwif_in.hwpe_job_dep.mopcnt.op_id_cnt.next : '0; + assign readback_array[15][31:0] = (decoded_reg_strb.hwpe_job_indep.reserved && !decoded_req_is_wr) ? 
32'h0 : '0; + + // Reduce the array + always_comb begin + automatic logic [31:0] readback_data_var; + readback_done = decoded_req & ~decoded_req_is_wr; + readback_err = '0; + readback_data_var = '0; + for(int i=0; i<16; i++) readback_data_var |= readback_array[i]; + readback_data = readback_data_var; + end + + assign cpuif_rd_ack = readback_done; + assign cpuif_rd_data = readback_data; + assign cpuif_rd_err = readback_err; +endmodule diff --git a/rtl/ctrl/regif/redmule_regif_pkg.sv b/rtl/ctrl/regif/redmule_regif_pkg.sv new file mode 100644 index 0000000..24e8058 --- /dev/null +++ b/rtl/ctrl/regif/redmule_regif_pkg.sv @@ -0,0 +1,243 @@ +// Generated by PeakRDL-regblock - A free and open-source SystemVerilog generator +// https://github.com/SystemRDL/PeakRDL-regblock + +package redmule_regif_pkg; + + localparam REDMULE_REGIF_DATA_WIDTH = 32; + localparam REDMULE_REGIF_MIN_ADDR_WIDTH = 32; + localparam REDMULE_REGIF_SIZE = 'h40; + + typedef struct packed { + logic [31:0] next; + } redmule_regif__hwpe_acquire__acquire__in_t; + + typedef struct packed { + redmule_regif__hwpe_acquire__acquire__in_t acquire; + } redmule_regif__hwpe_acquire__in_t; + + typedef struct packed { + logic [31:0] next; + } redmule_regif__hwpe_status__status0__in_t; + + typedef struct packed { + redmule_regif__hwpe_status__status0__in_t status0; + } redmule_regif__hwpe_status__in_t; + + typedef struct packed { + logic [7:0] next; + } redmule_regif__hwpe_running_job__running_job__in_t; + + typedef struct packed { + redmule_regif__hwpe_running_job__running_job__in_t running_job; + } redmule_regif__hwpe_running_job__in_t; + + typedef struct packed { + redmule_regif__hwpe_acquire__in_t acquire; + redmule_regif__hwpe_status__in_t status; + redmule_regif__hwpe_running_job__in_t running_job; + } redmule_regif__hwpe_ctrl_mandatory__in_t; + + typedef struct packed { + logic [31:0] next; + } redmule_regif__mopcnt__op_id_cnt__in_t; + + typedef struct packed { + redmule_regif__mopcnt__op_id_cnt__in_t 
op_id_cnt; + } redmule_regif__mopcnt__in_t; + + typedef struct packed { + redmule_regif__mopcnt__in_t mopcnt; + } redmule_regif__hwpe_ctrl_job_dep__in_t; + + typedef struct packed { + redmule_regif__hwpe_ctrl_mandatory__in_t hwpe_ctrl; + redmule_regif__hwpe_ctrl_job_dep__in_t hwpe_job_dep; + } redmule_regif__in_t; + + typedef struct packed { + logic [1:0] value; + logic swacc; + } redmule_regif__hwpe_commit_trigger__commit_trigger__out_t; + + typedef struct packed { + logic [29:0] value; + } redmule_regif__hwpe_commit_trigger__r0__out_t; + + typedef struct packed { + redmule_regif__hwpe_commit_trigger__commit_trigger__out_t commit_trigger; + redmule_regif__hwpe_commit_trigger__r0__out_t r0; + } redmule_regif__hwpe_commit_trigger__out_t; + + typedef struct packed { + logic swacc; + } redmule_regif__hwpe_acquire__acquire__out_t; + + typedef struct packed { + redmule_regif__hwpe_acquire__acquire__out_t acquire; + } redmule_regif__hwpe_acquire__out_t; + + typedef struct packed { + logic [31:0] value; + } redmule_regif__hwpe_reserved__reserved__out_t; + + typedef struct packed { + redmule_regif__hwpe_reserved__reserved__out_t reserved; + } redmule_regif__hwpe_reserved__out_t; + + typedef struct packed { + logic [23:0] value; + } redmule_regif__hwpe_running_job__r0__out_t; + + typedef struct packed { + redmule_regif__hwpe_running_job__r0__out_t r0; + } redmule_regif__hwpe_running_job__out_t; + + typedef struct packed { + logic [1:0] value; + logic swacc; + } redmule_regif__hwpe_soft_clear__soft_clear__out_t; + + typedef struct packed { + logic [29:0] value; + } redmule_regif__hwpe_soft_clear__r0__out_t; + + typedef struct packed { + redmule_regif__hwpe_soft_clear__soft_clear__out_t soft_clear; + redmule_regif__hwpe_soft_clear__r0__out_t r0; + } redmule_regif__hwpe_soft_clear__out_t; + + typedef struct packed { + redmule_regif__hwpe_commit_trigger__out_t commit_trigger; + redmule_regif__hwpe_acquire__out_t acquire; + redmule_regif__hwpe_reserved__out_t reserved0; + 
redmule_regif__hwpe_running_job__out_t running_job; + redmule_regif__hwpe_soft_clear__out_t soft_clear; + redmule_regif__hwpe_reserved__out_t reserved1; + redmule_regif__hwpe_reserved__out_t reserved2; + } redmule_regif__hwpe_ctrl_mandatory__out_t; + + typedef struct packed { + logic [15:0] value; + } redmule_regif__mcnfig0__m_size__out_t; + + typedef struct packed { + logic [15:0] value; + } redmule_regif__mcnfig0__k_size__out_t; + + typedef struct packed { + redmule_regif__mcnfig0__m_size__out_t m_size; + redmule_regif__mcnfig0__k_size__out_t k_size; + } redmule_regif__mcnfig0__out_t; + + typedef struct packed { + logic [15:0] value; + } redmule_regif__mcnfig1__n_size__out_t; + + typedef struct packed { + logic value; + } redmule_regif__mcnfig1__receive_x__out_t; + + typedef struct packed { + logic value; + } redmule_regif__mcnfig1__send_x__out_t; + + typedef struct packed { + logic value; + } redmule_regif__mcnfig1__receive_w__out_t; + + typedef struct packed { + logic value; + } redmule_regif__mcnfig1__send_w__out_t; + + typedef struct packed { + logic [2:0] value; + } redmule_regif__mcnfig1__gemm_ops__out_t; + + typedef struct packed { + logic [1:0] value; + } redmule_regif__mcnfig1__gemm_input_fmt__out_t; + + typedef struct packed { + logic [1:0] value; + } redmule_regif__mcnfig1__gemm_output_fmt__out_t; + + typedef struct packed { + redmule_regif__mcnfig1__n_size__out_t n_size; + redmule_regif__mcnfig1__receive_x__out_t receive_x; + redmule_regif__mcnfig1__send_x__out_t send_x; + redmule_regif__mcnfig1__receive_w__out_t receive_w; + redmule_regif__mcnfig1__send_w__out_t send_w; + redmule_regif__mcnfig1__gemm_ops__out_t gemm_ops; + redmule_regif__mcnfig1__gemm_input_fmt__out_t gemm_input_fmt; + redmule_regif__mcnfig1__gemm_output_fmt__out_t gemm_output_fmt; + } redmule_regif__mcnfig1__out_t; + + typedef struct packed { + logic [31:0] value; + } redmule_regif__mcnfig2__y_offs__out_t; + + typedef struct packed { + redmule_regif__mcnfig2__y_offs__out_t y_offs; + 
} redmule_regif__mcnfig2__out_t; + + typedef struct packed { + logic [31:0] value; + } redmule_regif__marith0__x_addr__out_t; + + typedef struct packed { + redmule_regif__marith0__x_addr__out_t x_addr; + } redmule_regif__marith0__out_t; + + typedef struct packed { + logic [31:0] value; + } redmule_regif__marith1__w_addr__out_t; + + typedef struct packed { + redmule_regif__marith1__w_addr__out_t w_addr; + } redmule_regif__marith1__out_t; + + typedef struct packed { + logic [31:0] value; + } redmule_regif__marith2__z_addr__out_t; + + typedef struct packed { + redmule_regif__marith2__z_addr__out_t z_addr; + } redmule_regif__marith2__out_t; + + typedef struct packed { + redmule_regif__mcnfig0__out_t mcnfig0; + redmule_regif__mcnfig1__out_t mcnfig1; + redmule_regif__mcnfig2__out_t mcnfig2; + redmule_regif__marith0__out_t marith0; + redmule_regif__marith1__out_t marith1; + redmule_regif__marith2__out_t marith2; + } redmule_regif__hwpe_ctrl_job_dep__out_t; + + typedef struct packed { + redmule_regif__hwpe_reserved__out_t reserved; + } redmule_regif__hwpe_ctrl_job_indep__out_t; + + typedef struct packed { + redmule_regif__hwpe_ctrl_mandatory__out_t hwpe_ctrl; + redmule_regif__hwpe_ctrl_job_dep__out_t hwpe_job_dep; + redmule_regif__hwpe_ctrl_job_indep__out_t hwpe_job_indep; + } redmule_regif__out_t; + + typedef enum logic [2:0] { + gemm_op_e__MATMUL = 'h0, + gemm_op_e__GEMM = 'h1, + gemm_op_e__ADDMAX = 'h2, + gemm_op_e__ADDMIN = 'h3, + gemm_op_e__MULMAX = 'h4, + gemm_op_e__MULMIN = 'h5, + gemm_op_e__MAXMIN = 'h6, + gemm_op_e__MINMAX = 'h7 + } gemm_op_e_e; + + typedef enum logic [1:0] { + gemm_fmt_e__Float8 = 'h0, + gemm_fmt_e__Float16 = 'h1, + gemm_fmt_e__Float8Alt = 'h2, + gemm_fmt_e__Float16Alt = 'h3 + } gemm_fmt_e_e; +endpackage diff --git a/rtl/redmule_ctrl.sv b/rtl/redmule_ctrl.sv index 8268939..81acc10 100644 --- a/rtl/redmule_ctrl.sv +++ b/rtl/redmule_ctrl.sv @@ -21,6 +21,7 @@ module redmule_ctrl input logic rst_ni , input logic test_mode_i , output logic busy_o , + 
input logic target_clear_i , output logic clear_o , output logic evt_o , input redmule_config_t config_i , @@ -156,6 +157,6 @@ module redmule_ctrl /* Other combinational assigmnets */ /*---------------------------------------------------------------------------------------------*/ assign evt_o = flgs_streamer_i.z_stream_sink_flags.done; - assign clear_o = latch_clear || current == REDMULE_FINISHED; + assign clear_o = target_clear_i || latch_clear || current == REDMULE_FINISHED; endmodule : redmule_ctrl diff --git a/rtl/redmule_inst_decoder.sv b/rtl/redmule_inst_decoder.sv index 213a62a..dcadc24 100644 --- a/rtl/redmule_inst_decoder.sv +++ b/rtl/redmule_inst_decoder.sv @@ -49,33 +49,52 @@ module redmule_inst_decoder input logic x_result_ready_i ); + // Calculate the width needed to represent hart IDs (minimum 1 bit) localparam int unsigned HartIdWidth = XifNumHarts > 1 ? $clog2(XifNumHarts) : 1; + // Compose full instruction encoding patterns for the three custom instructions: + // MCNFIG: Matrix configuration (sets dimensions, data flow control) + // MARITH: Matrix arithmetic operation (triggers computation with addresses) + // MOPCNT: Matrix operation count (returns number of completed operations) localparam logic [11:0] MCNFIG = {McnfigFunct2,McnfigFunct3,McnfigOpCode}; localparam logic [11:0] MARITH = {MarithFunct2,MarithFunct3,MarithOpCode}; localparam logic [11:0] MOPCNT = {MopcntFunct2,MopcntFunct3,MopcntOpCode}; + // Per-hart FIFO status flags for instruction and register packets logic [XifNumHarts-1:0] issue_fifo_full, register_fifo_full, issue_fifo_empty, register_fifo_empty; + + // Hart ID of the currently executing operation (tracked through pipeline) logic [HartIdWidth-1:0] current_hartid_d, current_hartid_q; + // Current instruction issue request and register data at head of each hart's FIFO x_issue_req_t [XifNumHarts-1:0] cur_issue; x_register_t [XifNumHarts-1:0] cur_register; + // TODO unused: x_result_t x_result_d, x_result_q; + // Per-hart 
operation ID counters: + // op_id_counter_in_q: Increments when operations are issued (tags for tracking) + // op_id_counter_out_q: Increments when operations complete (for MOPCNT instruction) logic [XifNumHarts-1:0] [OpIdWidth-1:0] op_id_counter_in_q, op_id_counter_out_q; + // Round-robin arbitration state for fair scheduling across harts logic [HartIdWidth-1:0] rr_counter_d, rr_counter_q; logic [XifNumHarts-1:0][HartIdWidth-1:0] rr_priority; logic [HartIdWidth-1:0] winner; + // Flag indicating whether the incoming instruction is a recognized RedMule custom instruction logic legal_inst; + // Per-hart configuration registers holding matrix operation parameters redmule_config_t [XifNumHarts-1:0] config_d, config_q; + // Control signal to enable popping from instruction FIFOs (delayed for MARITH until tiler ready) logic pop_enable; + // Decode incoming instruction to determine if it's a legal RedMule custom instruction + // Checks funct2[26:25], funct3[14:12], and opcode[6:0] fields always_comb begin : legal_inst_assignment legal_inst = 1'b0; @@ -85,21 +104,26 @@ module redmule_inst_decoder endcase end + // Generate XIF issue response indicating whether instruction is accepted and resource needs always_comb begin : x_issue_resp_assignment + // Accept instruction only if it's a legal RedMule custom instruction x_issue_resp_o.accept = legal_inst; unique case ({x_issue_req_i.instr[26:25],x_issue_req_i.instr[14:12],x_issue_req_i.instr[6:0]}) MCNFIG: begin + // MCNFIG: No writeback (configuration only), reads 3 source registers x_issue_resp_o.writeback = 'b0; - x_issue_resp_o.register_read = 'b111; + x_issue_resp_o.register_read = 'b111; // Read rs1, rs2, rs3 end MARITH: begin + // MARITH: Writeback if rd != x0 (returns operation ID), reads 3 source registers x_issue_resp_o.writeback = x_issue_req_i.instr[11:7] != 0; - x_issue_resp_o.register_read = 'b111; + x_issue_resp_o.register_read = 'b111; // Read rs1, rs2, rs3 (addresses) end MOPCNT: begin + // MOPCNT: Writeback if 
rd != x0 (returns completion count), no register reads x_issue_resp_o.writeback = x_issue_req_i.instr[11:7] != 0; - x_issue_resp_o.register_read = 'b0; + x_issue_resp_o.register_read = 'b0; // No source registers needed end default: begin x_issue_resp_o.writeback = 'b0; @@ -109,22 +133,27 @@ module redmule_inst_decoder end + // Construct result packet to write back to CPU register file always_comb begin : x_result_assignment + // Result valid when both instruction and register data available for winning hart x_result_valid_o = ~issue_fifo_empty[winner] && ~register_fifo_empty[winner]; x_result_o.hartid = cur_issue[winner].hartid; x_result_o.id = cur_issue[winner].id; - x_result_o.rd = cur_issue[winner].instr[11:7]; + x_result_o.rd = cur_issue[winner].instr[11:7]; // Destination register unique case ({cur_issue[winner].instr[26:25],cur_issue[winner].instr[14:12],cur_issue[winner].instr[6:0]}) MCNFIG: begin + // MCNFIG: No writeback, configuration stored internally x_result_o.we = 'b0; x_result_o.data = 'b0; end MARITH: begin + // MARITH: Write operation ID to rd (for tracking/synchronization) x_result_o.we = cur_issue[winner].instr[11:7] != 0; x_result_o.data = op_id_counter_in_q[winner]; end MOPCNT: begin + // MOPCNT: Write completion counter to rd (number of finished operations) x_result_o.we = cur_issue[winner].instr[11:7] != 0; x_result_o.data = op_id_counter_out_q[winner]; end @@ -135,12 +164,18 @@ module redmule_inst_decoder endcase end + // Output configuration from the winning hart to the RedMule tiler/controller assign config_o = config_d[winner]; + + // Configuration valid only for MARITH instructions when both FIFOs have data and CPU is ready + // (MCNFIG updates config but doesn't trigger execution) assign config_valid_o = ~issue_fifo_empty[winner] && ~register_fifo_empty[winner] && x_result_ready_i && {cur_issue[winner].instr[26:25],cur_issue[winner].instr[14:12],cur_issue[winner].instr[6:0]} == MARITH; + // Signal readiness to accept new instruction 
issue based on target hart's FIFO availability always_comb begin : x_issue_ready_assignment x_issue_ready_o = 1'b0; + // Find the hart matching the incoming request and check its issue FIFO status for (int unsigned i = 0; i < XifNumHarts; i++) begin if (x_issue_req_i.hartid == i) begin x_issue_ready_o = ~issue_fifo_full[i]; @@ -148,9 +183,11 @@ module redmule_inst_decoder end end + // Signal readiness to accept new register packet based on target hart's FIFO availability always_comb begin : x_register_ready_assignment x_register_ready_o = 1'b0; + // Find the hart matching the incoming register data and check its register FIFO status for (int unsigned i = 0; i < XifNumHarts; i++) begin if (x_register_i.hartid == i) begin x_register_ready_o = ~register_fifo_full[i]; @@ -158,6 +195,8 @@ module redmule_inst_decoder end end + // Round-robin counter for fair arbitration across multiple harts + // Advances each time a configuration is successfully accepted by downstream logic always_ff @(posedge clk_i, negedge rst_ni) begin : round_robin_counter if(~rst_ni) begin rr_counter_q <= '0; @@ -170,17 +209,23 @@ module redmule_inst_decoder end end + // Wrap counter to 0 after reaching the last hart assign rr_counter_d = rr_counter_q == XifNumHarts-1 ? 0 : rr_counter_q + 1; + // Calculate priority order for round-robin arbitration + // Creates a rotated sequence starting from current counter position always_comb begin : round_robin_priority for(int i = 0; i < XifNumHarts; i++) begin rr_priority[i] = (rr_counter_q + i < XifNumHarts) ? 
rr_counter_q + i : rr_counter_q + i - XifNumHarts; end end + // Select winning hart using round-robin priority among harts with ready instructions + // Scans in priority order and selects first hart with both issue and register data available always_comb begin : winner_assignment - winner = rr_counter_q; + winner = rr_counter_q; // Default to current counter position + // Override with first ready hart in priority order for(int i = 0; i < XifNumHarts; i++) begin if (~issue_fifo_empty[rr_priority[i]] && ~register_fifo_empty[rr_priority[i]]) begin winner = rr_priority[i]; @@ -188,6 +233,9 @@ module redmule_inst_decoder end end + // FIFO tracking which hart each in-flight operation belongs to + // Pushed when operation starts, popped when operation completes + // Used to correctly increment the completion counter for MOPCNT instruction fifo_v3 #( .FALL_THROUGH ( 0 ), .DEPTH ( InstFifoDepth * XifNumHarts ), @@ -200,13 +248,16 @@ module redmule_inst_decoder .full_o ( ), .empty_o ( ), .usage_o ( ), - .data_i ( winner ), - .push_i ( config_ready_i && config_valid_o ), - .data_o ( current_hartid_q ), - .pop_i ( op_done_i ) + .data_i ( winner ), // Push winning hart ID + .push_i ( config_ready_i && config_valid_o ), // On operation issue + .data_o ( current_hartid_q ), // Hart of completing op + .pop_i ( op_done_i ) // On operation completion ); + // Per-hart operation ID counters for tracking issued and completed operations for (genvar i = 0; i < XifNumHarts; i++) begin : gen_op_id_counters + // Input counter: increments when MARITH instruction is issued to this hart + // Returns this value to CPU as operation ID for software tracking always_ff @(posedge clk_i or negedge rst_ni) begin : op_id_counter_in if (~rst_ni) begin op_id_counter_in_q[i] <= 0; @@ -219,6 +270,9 @@ module redmule_inst_decoder end end + // Output counter: increments when any operation from this hart completes + // Returns this value for MOPCNT instruction to check completion status + // Initialized to 
all 1's to detect first completion (wraps to 0) always_ff @(posedge clk_i or negedge rst_ni) begin : op_id_counter_out if (~rst_ni) begin op_id_counter_out_q[i] <= '1; @@ -232,7 +286,9 @@ module redmule_inst_decoder end end - // Pop the fifos the first cycle the tiler is no longer busy if we detect a MARITH instruction + // Control when to pop instruction/register FIFOs: + // - MARITH: delay pop until config accepted by tiler (config_ready_i && config_valid_o) + // - Others: pop immediately since they don't require tiler resources assign pop_enable = ({cur_issue[winner].instr[26:25],cur_issue[winner].instr[14:12],cur_issue[winner].instr[6:0]} == MARITH ? config_ready_i && config_valid_o : 1'b1); for (genvar i = 0; i < XifNumHarts; i++) begin : gen_instruction_fifos @@ -250,6 +306,8 @@ module redmule_inst_decoder logic issue_push, register_push, issue_pop, register_pop; + // Register holding the most recent committed (non-killed) instruction ID for this hart + // Used to track successful instruction commits from the CPU always_ff @(posedge clk_i or negedge rst_ni) begin : commit_id_register if (~rst_ni) begin commit_id_q <= '0; @@ -262,8 +320,11 @@ module redmule_inst_decoder end end + // Capture commit ID when a valid, non-killed commit occurs for this hart assign commit_id_d = (x_commit_valid_i && ~x_commit_i.commit_kill && x_commit_i.hartid == i) ? x_commit_i.id : commit_id_q; + // Valid flag for commit_id, indicates whether we have a pending committed instruction + // Cleared when the matching instruction is popped from FIFO always_ff @(posedge clk_i or negedge rst_ni) begin : commid_id_valid_register if (~rst_ni) begin commit_id_valid_q <= 1'b0; @@ -276,9 +337,13 @@ module redmule_inst_decoder end end + // Set valid when commit arrives, hold until instruction processed assign commit_id_valid_d = (x_commit_valid_i && ~x_commit_i.commit_kill && x_commit_i.hartid == i) ? 
1'b1 : commit_id_valid_q; + // Clear valid flag when the committed instruction is popped from FIFO assign commit_id_valid_flush = issue_pop && cur_issue[i].id == commit_id_d && ~issue_fifo_empty[i]; + // Register holding the most recent killed instruction ID for this hart + // CPU sends kill signal for speculative instructions that should be discarded always_ff @(posedge clk_i or negedge rst_ni) begin : kill_id_register if (~rst_ni) begin kill_id_q <= '0; @@ -291,8 +356,11 @@ module redmule_inst_decoder end end + // Capture kill ID when a commit with kill flag occurs for this hart assign kill_id_d = (x_commit_valid_i && x_commit_i.commit_kill && x_commit_i.hartid == i) ? x_commit_i.id : kill_id_q; + // Valid flag for kill_id, indicates whether we have a pending kill request + // Cleared after FIFO flush completes always_ff @(posedge clk_i or negedge rst_ni) begin : kill_id_valid_register if (~rst_ni) begin kill_id_valid_q <= 1'b0; @@ -305,13 +373,19 @@ module redmule_inst_decoder end end + // Set valid when kill arrives, hold until FIFO flushed assign kill_id_valid_d = (x_commit_valid_i && x_commit_i.commit_kill && x_commit_i.hartid == i) ? 
1'b1 : kill_id_valid_q; + // Clear valid flag after FIFO has been flushed assign kill_id_valid_flush = fifo_flush; + // Trigger FIFO flush when head instruction matches a killed instruction ID assign fifo_flush = cur_issue[i].id == kill_id_d && kill_id_valid_d && ~issue_fifo_empty[i]; + // Push to issue FIFO when: legal instruction, FIFO not full, matches this hart assign issue_push = x_issue_valid_i && legal_inst && ~issue_fifo_full[i] && x_commit_i.hartid == i; + // Pop from issue FIFO when: this hart wins arbitration, pop enabled, CPU ready, both FIFOs have data assign issue_pop = winner == i && pop_enable && x_result_ready_i && ~issue_fifo_empty[i] && ~register_fifo_empty[i]; + // Register FIFO pops in sync with issue FIFO assign register_pop = issue_pop; fifo_v3 #( @@ -332,7 +406,9 @@ module redmule_inst_decoder .pop_i ( issue_pop ) ); - if (XifIssueRegisterSplit == 0) begin : gen_register_fifo // Register packets are guaranteed to arrive at the same time as the issue signal + // Non-split mode: register packets arrive synchronously with issue + if (XifIssueRegisterSplit == 0) begin : gen_register_fifo + // Push to register FIFO in sync with valid register packet for legal instruction assign register_push = x_register_valid_i & legal_inst & x_commit_i.hartid == i; fifo_v3 #( @@ -353,14 +429,15 @@ module redmule_inst_decoder .pop_i ( register_pop ) ); - end else begin : gen_register_buffer // If register split is enabled, we could receive register packets out of order + end else begin : gen_register_buffer + // Split mode: register packets may arrive out-of-order relative to issue // When an instruction is marked as valid, reserve a slot for the instruction in the buffer - // The buffer has a number of slots equal to InstFifoDepth - - // TODO: implement + // TODO: implement out-of-order register packet buffering end + // Configuration register for this hart, holds accumulated matrix operation parameters + // Updated when instructions are popped (MCNFIG sets 
params, MARITH uses them) always_ff @(posedge clk_i or negedge rst_ni) begin : config_register if (~rst_ni) begin config_q[i] <= '0; @@ -373,30 +450,33 @@ module redmule_inst_decoder end end + // Decode instruction and extract configuration parameters from register file values always_comb begin : config_assignment - config_d[i] = config_q[i]; + config_d[i] = config_q[i]; // Default: retain previous configuration unique case ({cur_issue[i].instr[26:25],cur_issue[i].instr[14:12],cur_issue[i].instr[6:0]}) MCNFIG: begin - config_d[i].m_size = cur_register[i].rs[0][15:0]; - config_d[i].n_size = cur_register[i].rs[1][15:0]; - config_d[i].k_size = cur_register[i].rs[0][31:16]; - config_d[i].receive_x = cur_register[i].rs[1][16]; - config_d[i].send_x = cur_register[i].rs[1][17]; - config_d[i].receive_w = cur_register[i].rs[1][18]; - config_d[i].send_w = cur_register[i].rs[1][19]; - config_d[i].y_offs = cur_register[i].rs[2][31:0]; + // Matrix configuration: extract dimensions and data flow control from rs1, rs2, rs3 + config_d[i].m_size = cur_register[i].rs[0][15:0]; // M dimension (rows of X/Z) + config_d[i].n_size = cur_register[i].rs[1][15:0]; // N dimension (cols of W/Z) + config_d[i].k_size = cur_register[i].rs[0][31:16]; // K dimension (cols of X, rows of W) + config_d[i].receive_x = cur_register[i].rs[1][16]; // Receive X from external stream + config_d[i].send_x = cur_register[i].rs[1][17]; // Broadcast X to external stream + config_d[i].receive_w = cur_register[i].rs[1][18]; // Receive W from external stream + config_d[i].send_w = cur_register[i].rs[1][19]; // Broadcast W to external stream + config_d[i].y_offs = cur_register[i].rs[2][31:0]; // Y buffer offset for bias addition end MARITH: begin - config_d[i].x_addr = cur_register[i].rs[0][31:0]; - config_d[i].w_addr = cur_register[i].rs[1][31:0]; - config_d[i].z_addr = cur_register[i].rs[2][31:0]; - // TODO: These are fixed for now - config_d[i].gemm_ops = GEMM; - config_d[i].gemm_input_fmt = 
redmule_pkg::Float16; - config_d[i].gemm_output_fmt = redmule_pkg::Float16; + // Matrix arithmetic: extract memory addresses from rs1, rs2, rs3 + config_d[i].x_addr = cur_register[i].rs[0][31:0]; // X matrix base address + config_d[i].w_addr = cur_register[i].rs[1][31:0]; // W matrix base address + config_d[i].z_addr = cur_register[i].rs[2][31:0]; // Z matrix base address (output) + // TODO: These operation parameters are fixed for now, could be made configurable + config_d[i].gemm_ops = GEMM; // Operation type: GEMM + config_d[i].gemm_input_fmt = redmule_pkg::Float16; // Input format: FP16 + config_d[i].gemm_output_fmt = redmule_pkg::Float16; // Output format: FP16 end - default: config_d[i] = config_q[i]; + default: config_d[i] = config_q[i]; // Other instructions don't modify config endcase end end diff --git a/rtl/redmule_pkg.sv b/rtl/redmule_pkg.sv index 10afbbe..b13d517 100644 --- a/rtl/redmule_pkg.sv +++ b/rtl/redmule_pkg.sv @@ -22,6 +22,8 @@ package redmule_pkg; parameter int unsigned WsourceStreamId = 1; parameter int unsigned YsourceStreamId = 2; + typedef enum logic { HWPE_TARGET, XIF } ctrl_intf_e; + typedef enum logic { LD_IN_FMP, LD_WEIGHT } source_sel_e; typedef enum logic { LOAD, STORE } ld_st_sel_e; diff --git a/rtl/redmule_top.sv b/rtl/redmule_top.sv index f211ac9..9ce0ee1 100644 --- a/rtl/redmule_top.sv +++ b/rtl/redmule_top.sv @@ -24,7 +24,9 @@ module redmule_top parameter bit LatchBuffers = 0, parameter fpnew_pkg::fmt_logic_t FpFmtConfig = 6'b001101, parameter fpnew_pkg::ifmt_logic_t IntFmtConfig = 4'b1000, - // Custom instrunctions + // Choose interface + parameter ctrl_intf_e CtrlIntfConfig = XIF, + // Custom instructions parameter logic [6:0] McnfigOpCode = 7'b0001011, parameter logic [6:0] MarithOpCode = 7'b0001011, parameter logic [6:0] MopcntOpCode = 7'b0001011, @@ -59,7 +61,7 @@ module redmule_top hwpe_stream_intf_stream.source w_stream_o , // Broadcasted X stream hwpe_stream_intf_stream.source x_stream_o , - // XIF ports + // XIF ports 
(unused if CtrlIntfConfig = HWPE_TARGET) input x_issue_req_t x_issue_req_i, output x_issue_resp_t x_issue_resp_o, input logic x_issue_valid_i, @@ -73,7 +75,9 @@ module redmule_top output logic x_result_valid_o, input logic x_result_ready_i, // TCDM master ports for the memory side - hci_core_intf.initiator tcdm + hci_core_intf.initiator tcdm, + // HWPE-ctrl target port (unused if CtrlIntfConfig = XIF) + hwpe_ctrl_intf_periph.slave target ); localparam int unsigned FpWidth = fp_width(FpFormat); @@ -83,6 +87,7 @@ logic clk_acc; logic fsm_z_clk_en, ctrl_z_clk_en; logic enable, clear; +logic target_clear; logic y_buffer_depth_count, y_buffer_load, z_buffer_fill, @@ -495,7 +500,7 @@ redmule_memory_scheduler #( ) i_memory_scheduler ( .clk_i ( clk_acc ), .rst_ni ( rst_ni ), - .clear_i ( '0 ), + .clear_i ( target_clear ), .z_priority_i ( z_priority ), .config_i ( redmule_config ), .config_valid_i ( cfg_complete ), @@ -509,52 +514,79 @@ redmule_memory_scheduler #( ); /*---------------------------------------------------------------*/ -/* | Instruction Decoder | */ +/* | Instruction Decoder (XIF) or Target Decoder (HWPE_TARGET) | */ /*---------------------------------------------------------------*/ logic tiler_busy; redmule_config_t dec_config_q; -redmule_inst_decoder #( - .InstFifoDepth ( 4 ), - .McnfigOpCode ( McnfigOpCode ), - .MarithOpCode ( MarithOpCode ), - .MopcntOpCode ( MopcntOpCode ), - .McnfigFunct3 ( McnfigFunct3 ), - .MarithFunct3 ( MarithFunct3 ), - .MopcntFunct3 ( MopcntFunct3 ), - .McnfigFunct2 ( McnfigFunct2 ), - .MarithFunct2 ( MarithFunct2 ), - .MopcntFunct2 ( MopcntFunct2 ), - .XifIdWidth ( XifIdWidth ), - .XifNumHarts ( XifNumHarts ), - .XifIssueRegisterSplit ( XifIssueRegisterSplit ), - .x_issue_req_t ( x_issue_req_t ), - .x_issue_resp_t ( x_issue_resp_t ), - .x_register_t ( x_register_t ), - .x_commit_t ( x_commit_t ), - .x_result_t ( x_result_t ) -) i_inst_decoder ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), - .clear_i ( '0 ), - .config_ready_i ( 
~config_fifo_full ), - .op_done_i ( flgs_streamer.z_stream_sink_flags.done ), - .config_valid_o ( dec_config_valid ), - .config_o ( dec_config ), - .x_issue_req_i ( x_issue_req_i ), - .x_issue_resp_o ( x_issue_resp_o ), - .x_issue_valid_i ( x_issue_valid_i ), - .x_issue_ready_o ( x_issue_ready_o ), - .x_register_i ( x_register_i ), - .x_register_valid_i ( x_register_valid_i ), - .x_register_ready_o ( x_register_ready_o ), - .x_commit_i ( x_commit_i ), - .x_commit_valid_i ( x_commit_valid_i ), - .x_result_o ( x_result_o ), - .x_result_valid_o ( x_result_valid_o ), - .x_result_ready_i ( x_result_ready_i ) -); +if(CtrlIntfConfig == XIF) begin : xif_ctrl_intf_gen + redmule_inst_decoder #( + .InstFifoDepth ( 4 ), + .McnfigOpCode ( McnfigOpCode ), + .MarithOpCode ( MarithOpCode ), + .MopcntOpCode ( MopcntOpCode ), + .McnfigFunct3 ( McnfigFunct3 ), + .MarithFunct3 ( MarithFunct3 ), + .MopcntFunct3 ( MopcntFunct3 ), + .McnfigFunct2 ( McnfigFunct2 ), + .MarithFunct2 ( MarithFunct2 ), + .MopcntFunct2 ( MopcntFunct2 ), + .XifIdWidth ( XifIdWidth ), + .XifNumHarts ( XifNumHarts ), + .XifIssueRegisterSplit ( XifIssueRegisterSplit ), + .x_issue_req_t ( x_issue_req_t ), + .x_issue_resp_t ( x_issue_resp_t ), + .x_register_t ( x_register_t ), + .x_commit_t ( x_commit_t ), + .x_result_t ( x_result_t ) + ) i_inst_decoder ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .clear_i ( '0 ), // TODO: fixme, not having a software-based clear mechanism is a bad idea. 
+ .config_ready_i ( ~config_fifo_full ), + .op_done_i ( flgs_streamer.z_stream_sink_flags.done ), + .config_valid_o ( dec_config_valid ), + .config_o ( dec_config ), + .x_issue_req_i ( x_issue_req_i ), + .x_issue_resp_o ( x_issue_resp_o ), + .x_issue_valid_i ( x_issue_valid_i ), + .x_issue_ready_o ( x_issue_ready_o ), + .x_register_i ( x_register_i ), + .x_register_valid_i ( x_register_valid_i ), + .x_register_ready_o ( x_register_ready_o ), + .x_commit_i ( x_commit_i ), + .x_commit_valid_i ( x_commit_valid_i ), + .x_result_o ( x_result_o ), + .x_result_valid_o ( x_result_valid_o ), + .x_result_ready_i ( x_result_ready_i ) + ); + // bind unused HWPE_TARGET signals + assign target_clear = '0; // TODO: a software-accessible clear should be added also to the XIF interface + assign target.gnt = '1; + assign target.r_data = '0; + assign target.r_valid = '0; + assign target.r_id = '0; +end +else begin : mm_ctrl_intf_gen + redmule_target_decoder i_target_decoder ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .clear_i ( '0 ), // ORed internally with target_clear + .target_clear_o ( target_clear ), + .config_ready_i ( ~config_fifo_full ), + .op_done_i ( flgs_streamer.z_stream_sink_flags.done ), + .config_valid_o ( dec_config_valid ), + .config_o ( dec_config ), + .target ( target ) + ); + // bind unused XIF signals + assign x_issue_resp_o = '0; + assign x_issue_ready_o = '0; + assign x_register_ready_o = '0; + assign x_result_o = '0; + assign x_result_valid_o = '0; +end fifo_v3 #( .FALL_THROUGH ( 0 ), @@ -591,6 +623,7 @@ redmule_ctrl #( .flgs_streamer_i ( flgs_streamer ), .busy_o ( busy_o ), .tiler_busy_o ( tiler_busy ), + .target_clear_i ( target_clear ), .clear_o ( clear ), .evt_o ( evt_o ), .config_i ( dec_config_q ), @@ -620,7 +653,7 @@ redmule_scheduler #( .clk_i ( clk_acc ), .rst_ni ( rst_ni ), .test_mode_i ( test_mode_i ), - .clear_i ( '0 ), + .clear_i ( target_clear ), .x_valid_i ( x_buffer_fifo.valid ), .w_valid_i ( w_buffer_fifo.valid ), .y_valid_i ( 
y_buffer_fifo.valid ),