UoB-HPC · JosephMoore25 · Feb 9, 2024 · Jan 16, 2024 · Jan 18, 2024 · Jan 18, 2024
diff --git a/src/include/simeng/arch/Architecture.hh b/src/include/simeng/arch/Architecture.hh
@@ -65,7 +65,7 @@ class Architecture {
    * Writes into the supplied macro-op vector, and returns the number of bytes
    * consumed to produce it; a value of 0 indicates too few bytes were present
    * for a valid decoding. */
-  virtual uint8_t predecode(const void* ptr, uint8_t bytesAvailable,
+  virtual uint8_t predecode(const void* ptr, uint16_t bytesAvailable,
                             uint64_t instructionAddress,
                             MacroOp& output) const = 0;
 

diff --git a/src/include/simeng/arch/aarch64/Architecture.hh b/src/include/simeng/arch/aarch64/Architecture.hh
@@ -24,7 +24,7 @@ class Architecture : public arch::Architecture {
   /** Pre-decode instruction memory into a macro-op of `Instruction`
    * instances. Returns the number of bytes consumed to produce it (always 4),
    * and writes into the supplied macro-op vector. */
-  uint8_t predecode(const void* ptr, uint8_t bytesAvailable,
+  uint8_t predecode(const void* ptr, uint16_t bytesAvailable,
                     uint64_t instructionAddress,
                     MacroOp& output) const override;
 

diff --git a/src/include/simeng/arch/riscv/Architecture.hh b/src/include/simeng/arch/riscv/Architecture.hh
@@ -23,7 +23,7 @@ class Architecture : public arch::Architecture {
   /** Pre-decode instruction memory into a macro-op of `Instruction`
    * instances. Returns the number of bytes consumed to produce it (always 4),
    * and writes into the supplied macro-op vector. */
-  uint8_t predecode(const void* ptr, uint8_t bytesAvailable,
+  uint8_t predecode(const void* ptr, uint16_t bytesAvailable,
                     uint64_t instructionAddress,
                     MacroOp& output) const override;
 

diff --git a/src/include/simeng/pipeline/FetchUnit.hh b/src/include/simeng/pipeline/FetchUnit.hh
@@ -115,7 +115,7 @@ class FetchUnit {
   uint8_t* fetchBuffer_;
 
   /** The amount of data currently in the fetch buffer. */
-  uint8_t bufferedBytes_ = 0;
+  uint16_t bufferedBytes_ = 0;
 };
 
 }  // namespace pipeline

diff --git a/src/lib/arch/aarch64/Architecture.cc b/src/lib/arch/aarch64/Architecture.cc
@@ -144,7 +144,7 @@ Architecture::~Architecture() {
   SVCRval_ = 0;
 }
 
-uint8_t Architecture::predecode(const void* ptr, uint8_t bytesAvailable,
+uint8_t Architecture::predecode(const void* ptr, uint16_t bytesAvailable,
                                 uint64_t instructionAddress,
                                 MacroOp& output) const {
   // Check that instruction address is 4-byte aligned as required by Armv9.2-a

diff --git a/src/lib/arch/riscv/Architecture.cc b/src/lib/arch/riscv/Architecture.cc
@@ -144,7 +144,7 @@ Architecture::~Architecture() {
   groupExecutionInfo_.clear();
 }
 
-uint8_t Architecture::predecode(const void* ptr, uint8_t bytesAvailable,
+uint8_t Architecture::predecode(const void* ptr, uint16_t bytesAvailable,
                                 uint64_t instructionAddress,
                                 MacroOp& output) const {
   // Check that instruction address is 4-byte aligned as required by RISC-V

diff --git a/src/lib/config/ModelConfig.cc b/src/lib/config/ModelConfig.cc
@@ -318,7 +318,7 @@ void ModelConfig::setExpectations(bool isDefault) {
   expectations_["Fetch"].addChild(
       ExpectationNode::createExpectation<uint16_t>(32, "Fetch-Block-Size"));
   expectations_["Fetch"]["Fetch-Block-Size"].setValueSet(std::vector<uint16_t>{
-      4, 8, 16, 32, 64, 128, 256, 512, 1024, 4096, 8192, 16384, 32768});
+      4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768});
 
   expectations_["Fetch"].addChild(
       ExpectationNode::createExpectation<uint16_t>(32, "Loop-Buffer-Size"));
@@ -346,17 +346,19 @@ void ModelConfig::setExpectations(bool isDefault) {
   // Register-Set
   expectations_.addChild(ExpectationNode::createExpectation("Register-Set"));
   if (isa_ == ISA::AArch64) {
+    // TODO: Reduce to 32 once renaming issue has been sorted
     expectations_["Register-Set"].addChild(
-        ExpectationNode::createExpectation<uint16_t>(32,
+        ExpectationNode::createExpectation<uint16_t>(38,
                                                      "GeneralPurpose-Count"));
     expectations_["Register-Set"]["GeneralPurpose-Count"]
-        .setValueBounds<uint16_t>(32, UINT16_MAX);
+        .setValueBounds<uint16_t>(38, UINT16_MAX);
 
+    // TODO: Reduce to 32 once renaming issue has been sorted
     expectations_["Register-Set"].addChild(
         ExpectationNode::createExpectation<uint16_t>(
-            32, "FloatingPoint/SVE-Count"));
+            38, "FloatingPoint/SVE-Count"));
     expectations_["Register-Set"]["FloatingPoint/SVE-Count"]
-        .setValueBounds<uint16_t>(32, UINT16_MAX);
+        .setValueBounds<uint16_t>(38, UINT16_MAX);
 
     expectations_["Register-Set"].addChild(
         ExpectationNode::createExpectation<uint16_t>(17, "Predicate-Count",
@@ -377,14 +379,16 @@ void ModelConfig::setExpectations(bool isDefault) {
     expectations_["Register-Set"].addChild(
         ExpectationNode::createExpectation<uint16_t>(32,
                                                      "GeneralPurpose-Count"));
+    // TODO: Reduce to 32 once renaming issue has been sorted
     expectations_["Register-Set"]["GeneralPurpose-Count"]
-        .setValueBounds<uint16_t>(32, UINT16_MAX);
+        .setValueBounds<uint16_t>(38, UINT16_MAX);
 
     expectations_["Register-Set"].addChild(
         ExpectationNode::createExpectation<uint16_t>(32,
                                                      "FloatingPoint-Count"));
+    // TODO: Reduce to 32 once renaming issue has been sorted
     expectations_["Register-Set"]["FloatingPoint-Count"]
-        .setValueBounds<uint16_t>(32, UINT16_MAX);
+        .setValueBounds<uint16_t>(38, UINT16_MAX);
   }
 
   // Pipeline-Widths
@@ -485,13 +489,24 @@ void ModelConfig::setExpectations(bool isDefault) {
 
   expectations_["LSQ-L1-Interface"].addChild(
       ExpectationNode::createExpectation<uint16_t>(32, "Load-Bandwidth"));
-  expectations_["LSQ-L1-Interface"]["Load-Bandwidth"].setValueBounds<uint16_t>(
-      1, UINT16_MAX);
 
   expectations_["LSQ-L1-Interface"].addChild(
       ExpectationNode::createExpectation<uint16_t>(32, "Store-Bandwidth"));
-  expectations_["LSQ-L1-Interface"]["Store-Bandwidth"].setValueBounds<uint16_t>(
-      1, UINT16_MAX);
+
+  // AArch64 requires a vector length of at least 128, requiring a minimum of 16
+  // byte load/store bandwidths
+  // For RV64, the minimum required load/store bandwidth is 8 bytes
+  if (isa_ == ISA::AArch64) {
+    expectations_["LSQ-L1-Interface"]["Load-Bandwidth"]
+        .setValueBounds<uint16_t>(16, UINT16_MAX);
+    expectations_["LSQ-L1-Interface"]["Store-Bandwidth"]
+        .setValueBounds<uint16_t>(16, UINT16_MAX);
+  } else if (isa_ == ISA::RV64) {
+    expectations_["LSQ-L1-Interface"]["Store-Bandwidth"]
+        .setValueBounds<uint16_t>(8, UINT16_MAX);
+    expectations_["LSQ-L1-Interface"]["Load-Bandwidth"]
+        .setValueBounds<uint16_t>(8, UINT16_MAX);
+  }
 
   expectations_["LSQ-L1-Interface"].addChild(
       ExpectationNode::createExpectation<uint16_t>(
@@ -926,6 +941,56 @@ void ModelConfig::postValidation() {
     invalid_ << "\t- Only a 'Flat' L1-Instruction-Memory Interface-Type is "
                 "supported. Interface-Type used is "
              << l1iType << "\n";
+
+  if (isa_ == ISA::AArch64) {
+    // Ensure LSQ-L1-Interface Load/Store Bandwidth is large enough to
+    // accommodate a full vector load of the specified Vector-Length parameter
+    if (configTree_["Core"]["Vector-Length"].as<uint16_t>() / 8 >
+        configTree_["LSQ-L1-Interface"]["Load-Bandwidth"].as<uint16_t>()) {
+      invalid_
+          << "\t- Load-Bandwidth (bytes) must be greater than Vector-Length "
+             "(bits). "
+             "The current Load-Bandwidth is set to "
+          << configTree_["LSQ-L1-Interface"]["Load-Bandwidth"].as<uint16_t>()
+          << " bytes, when it must be at least "
+          << configTree_["Core"]["Vector-Length"].as<uint16_t>() / 8 << "\n";
+    }
+    if (configTree_["Core"]["Vector-Length"].as<uint16_t>() / 8 >
+        configTree_["LSQ-L1-Interface"]["Store-Bandwidth"].as<uint16_t>()) {
+      invalid_
+          << "\t- Store-Bandwidth (bytes) must be greater than Vector-Length "
+             "(bits). "
+             "The current Store-Bandwidth is set to "
+          << configTree_["LSQ-L1-Interface"]["Store-Bandwidth"].as<uint16_t>()
+          << " bytes, when it must be at least "
+          << configTree_["Core"]["Vector-Length"].as<uint16_t>() / 8 << "\n";
+    }
+    // Ensure LSQ-L1-Interface Load/Store Bandwidth is also large enough to
+    // accommodate a full vector load of the specified Streaming-Vector-Length
+    // parameter when streaming mode is enabled
+    if (configTree_["Core"]["Streaming-Vector-Length"].as<uint16_t>() / 8 >
+        configTree_["LSQ-L1-Interface"]["Load-Bandwidth"].as<uint16_t>()) {
+      invalid_
+          << "\t- Load-Bandwidth (bytes) must be greater than "
+             "Streaming-Vector-Length (bits). "
+             "The current Load-Bandwidth is set to "
+          << configTree_["LSQ-L1-Interface"]["Load-Bandwidth"].as<uint16_t>()
+          << " bytes, when it must be at least "
+          << configTree_["Core"]["Streaming-Vector-Length"].as<uint16_t>() / 8
+          << "\n";
+    }
+    if (configTree_["Core"]["Streaming-Vector-Length"].as<uint16_t>() / 8 >
+        configTree_["LSQ-L1-Interface"]["Store-Bandwidth"].as<uint16_t>()) {
+      invalid_
+          << "\t- Store-Bandwidth (bytes) must be greater than "
+             "Streaming-Vector-Length (bits). "
+             "The current Store-Bandwidth is set to "
+          << configTree_["LSQ-L1-Interface"]["Store-Bandwidth"].as<uint16_t>()
+          << " bytes, when it must be at least "
+          << configTree_["Core"]["Streaming-Vector-Length"].as<uint16_t>() / 8
+          << "\n";
+    }
+  }
 }
 
 ryml::Tree ModelConfig::getConfig() { return configTree_; }

diff --git a/src/lib/models/emulation/Core.cc b/src/lib/models/emulation/Core.cc
@@ -8,7 +8,7 @@ namespace emulation {
 
 // TODO: Expose as config option
 /** The number of bytes fetched each cycle. */
-const uint8_t FETCH_SIZE = 4;
+const uint16_t FETCH_SIZE = 4;
 const unsigned int clockFrequency = 2.5 * 1e9;
 
 Core::Core(MemoryInterface& instructionMemory, MemoryInterface& dataMemory,

diff --git a/src/lib/pipeline/FetchUnit.cc b/src/lib/pipeline/FetchUnit.cc
@@ -41,7 +41,6 @@ void FetchUnit::tick() {
       auto bytesRead = isa_.predecode(&(loopBuffer_.front().encoding),
                                       loopBuffer_.front().instructionSize,
                                       loopBuffer_.front().address, macroOp);
-
       assert(bytesRead != 0 && "predecode failure for loop buffer entry");
 
       // Set prediction to recorded value during loop buffer filling
@@ -58,7 +57,7 @@ void FetchUnit::tick() {
 
   // Pointer to the instruction data to decode from
   const uint8_t* buffer;
-  uint8_t bufferOffset;
+  uint16_t bufferOffset;
 
   // Check if more instruction data is required
   if (bufferedBytes_ < isa_.getMaxInstructionSize()) {

diff --git a/test/regression/aarch64/Exception.cc b/test/regression/aarch64/Exception.cc
@@ -237,13 +237,16 @@ INSTANTIATE_TEST_SUITE_P(
     ::testing::Values(
         std::make_tuple(
             EMULATION,
-            "{Core: {Vector-Length: 512, Streaming-Vector-Length: 1024}}"),
+            "{Core: {Vector-Length: 512, Streaming-Vector-Length: 1024}, "
+            "LSQ-L1-Interface: {Load-Bandwidth: 256, Store-Bandwidth: 256}}"),
         std::make_tuple(
             INORDER,
-            "{Core: {Vector-Length: 512, Streaming-Vector-Length: 1024}}"),
+            "{Core: {Vector-Length: 512, Streaming-Vector-Length: 1024}, "
+            "LSQ-L1-Interface: {Load-Bandwidth: 256, Store-Bandwidth: 256}}"),
         std::make_tuple(
             OUTOFORDER,
-            "{Core: {Vector-Length: 512, Streaming-Vector-Length: 1024}}")),
+            "{Core: {Vector-Length: 512, Streaming-Vector-Length: 1024}, "
+            "LSQ-L1-Interface: {Load-Bandwidth: 256, Store-Bandwidth: 256}}")),
     paramToString);
 
 }  // namespace
diff --git a/test/unit/MockArchitecture.hh b/test/unit/MockArchitecture.hh
@@ -9,7 +9,7 @@ namespace simeng {
 class MockArchitecture : public arch::Architecture {
  public:
   MOCK_CONST_METHOD4(predecode,
-                     uint8_t(const void* ptr, uint8_t bytesAvailable,
+                     uint8_t(const void* ptr, uint16_t bytesAvailable,
                              uint64_t instructionAddress, MacroOp& output));
   MOCK_CONST_METHOD1(canRename, bool(Register reg));
   MOCK_CONST_METHOD1(getSystemRegisterTag, int32_t(uint16_t reg));

diff --git a/test/unit/aarch64/ArchitectureTest.cc b/test/unit/aarch64/ArchitectureTest.cc
@@ -32,6 +32,10 @@ class AArch64ArchitectureTest : public testing::Test {
       Vector-Length: 512,
       Streaming-Vector-Length: 128
     },
+    LSQ-L1-Interface: {
+      Load-Bandwidth: 64,
+      Store-Bandwidth: 64
+    },
     Ports: { 
       '0': {Portname: Port 0, Instruction-Group-Support: [FP, SVE]},
       '1': {Portname: Port 1, Instruction-Group-Support: [PREDICATE]},