diff --git a/configs/a64fx.yaml b/configs/a64fx.yaml index 1ace725d60..0c3ac7e0df 100644 --- a/configs/a64fx.yaml +++ b/configs/a64fx.yaml @@ -113,17 +113,17 @@ Reservation-Stations: - EXB 2: Size: 10 - Dispatch-Rate: 2 + Dispatch-Rate: 1 Ports: - EAGA 3: Size: 10 - Dispatch-Rate: 2 + Dispatch-Rate: 1 Ports: - EAGB 4: Size: 19 - Dispatch-Rate: 2 + Dispatch-Rate: 1 Ports: - BR Execution-Units: @@ -245,6 +245,125 @@ Latencies: - STORE_ADDRESS_SVE Execution-Latency: 6 Execution-Throughput: 1 +# Indexed FMLA instructions split into 2 dependent µops. Latency increased to 15 to mimic such behaviour +# NOTE: Any changes to the capstone opcode list could invalidate the mapping between ARM instructions and the values below + 11: + Instruction-Groups: + - ~1922 + - ~1924 + - ~1926 + - ~2359 + - ~2360 + - ~2361 + - ~2364 + - ~2365 + - ~2368 + - ~2369 + - ~2371 + - ~2390 + - ~2391 + - ~2392 + - ~2395 + - ~2396 + - ~2399 + - ~2400 + - ~2402 + - ~2445 + - ~2446 + - ~2447 + - ~2450 + - ~2451 + - ~2454 + - ~2455 + - ~2457 + - ~2470 + - ~2471 + - ~2472 + - ~2475 + - ~2476 + - ~2479 + - ~2480 + - ~2482 + - ~3627 + - ~3629 + - ~3631 + - ~3633 + - ~3644 + - ~3646 + - ~3648 + - ~3650 + - ~3709 + - ~3711 + - ~3713 + - ~3715 + - ~4306 + - ~4308 + - ~4310 + - ~4312 + - ~4326 + - ~4328 + - ~4330 + - ~4332 + - ~4372 + - ~4374 + - ~4376 + - ~4378 + - ~4468 + - ~4469 + - ~4470 + - ~4472 + - ~4474 + - ~4476 + - ~4493 + - ~4494 + - ~4495 + - ~4497 + - ~4499 + - ~4501 + - ~4511 + - ~4513 + - ~4515 + - ~4517 + - ~4519 + - ~4521 + - ~4534 + - ~4535 + - ~4536 + - ~4538 + - ~4540 + - ~4542 + - ~4594 + - ~4595 + - ~4599 + - ~4601 + - ~4603 + - ~4605 + - ~4613 + - ~4614 + - ~4618 + - ~4620 + - ~4622 + - ~4624 + - ~4633 + - ~4635 + - ~4637 + - ~4639 + - ~4641 + - ~4643 + - ~5760 + - ~5762 + - ~5764 + - ~5766 + - ~5780 + - ~5782 + - ~5784 + - ~5786 + - ~5824 + - ~5826 + - ~5828 + - ~5830 + Execution-Latency: 15 + Execution-Throughput: 1 # CPU-Info mainly used to generate a replica of the special (or 
system) file directory # structure CPU-Info: diff --git a/configs/a64fx_SME.yaml b/configs/a64fx_SME.yaml index f304234ee8..9a7c29f94c 100644 --- a/configs/a64fx_SME.yaml +++ b/configs/a64fx_SME.yaml @@ -81,29 +81,17 @@ Ports: - INT_DIV_OR_SQRT 5: Portname: EAGA - Instruction-Support: - - LOAD_INT - - LOAD_SCALAR - - LOAD_VECTOR - - LOAD_SVE - - STORE_INT - - STORE_SCALAR - - STORE_VECTOR - - STORE_SVE + Instruction-Support: + - LOAD + - STORE_ADDRESS - INT_SIMPLE_ARTH_NOSHIFT - INT_SIMPLE_LOGICAL_NOSHIFT - INT_SIMPLE_CMP 6: Portname: EAGB Instruction-Support: - - LOAD_INT - - LOAD_SCALAR - - LOAD_VECTOR - - LOAD_SVE - - STORE_INT - - STORE_SCALAR - - STORE_VECTOR - - STORE_SVE + - LOAD + - STORE_ADDRESS - INT_SIMPLE_ARTH_NOSHIFT - INT_SIMPLE_LOGICAL_NOSHIFT - INT_SIMPLE_CMP @@ -115,11 +103,6 @@ Ports: Portname: SME Instruction-Support: - SME - 9: - Portname: SME_LD_STR - Instruction-Support: - - LOAD_SME - - STORE_SME Reservation-Stations: 0: Size: 20 @@ -136,29 +119,24 @@ Reservation-Stations: - EXB 2: Size: 10 - Dispatch-Rate: 2 + Dispatch-Rate: 1 Ports: - EAGA 3: Size: 10 - Dispatch-Rate: 2 + Dispatch-Rate: 1 Ports: - EAGB 4: Size: 19 - Dispatch-Rate: 2 + Dispatch-Rate: 1 Ports: - BR 5: Size: 20 - Dispatch-Rate: 2 + Dispatch-Rate: 1 Ports: - SME - 6: - Size: 10 - Dispatch-Rate: 2 - Ports: - - SME_LD_STR Execution-Units: 0: Pipelined: True @@ -214,12 +192,6 @@ Execution-Units: - INT_DIV_OR_SQRT - FP_DIV_OR_SQRT - SVE_DIV_OR_SQRT - 9: - Pipelined: True - Blocking-Groups: - - INT_DIV_OR_SQRT - - FP_DIV_OR_SQRT - - SVE_DIV_OR_SQRT Latencies: 0: Instruction-Groups: @@ -248,8 +220,10 @@ Latencies: - SCALAR_SIMPLE - VECTOR_SIMPLE_LOGICAL - SVE_SIMPLE_LOGICAL + - SME_SIMPLE_LOGICAL - VECTOR_SIMPLE_CMP - SVE_SIMPLE_CMP + - SME_SIMPLE_CMP Execution-Latency: 4 Execution-Throughput: 1 5: @@ -258,18 +232,20 @@ Latencies: Execution-Latency: 29 Execution-Throughput: 29 6: - Instruction-Groups: + Instruction-Groups: + - SCALAR_SIMPLE_CVT - VECTOR_SIMPLE - SVE_SIMPLE - - 
SCALAR_SIMPLE_CVT + - SME_SIMPLE - FP_MUL - SVE_MUL - - SME + - SME_MUL Execution-Latency: 9 Execution-Throughput: 1 7: Instruction-Groups: - SVE_DIV_OR_SQRT + - SME_DIV_OR_SQRT Execution-Latency: 98 Execution-Throughput: 98 8: @@ -288,34 +264,129 @@ Latencies: 10: Instruction-Groups: - LOAD_SVE - - STORE_ADDRESS_SVE - LOAD_SME + - STORE_ADDRESS_SVE - STORE_ADDRESS_SME Execution-Latency: 6 Execution-Throughput: 1 +# Indexed FMLA instructions split into 2 dependent µops. Latency increased to 15 to mimic such behaviour +# NOTE: Any changes to the capstone opcode list could invalidate the mapping between ARM instructions and the values below 11: Instruction-Groups: - - SME_SIMPLE_LOGICAL - - SME_SIMPLE_CMP - # Same as SVE - Execution-Latency: 4 - Execution-Throughput: 1 - 12: - Instruction-Groups: - - SME_SIMPLE - - SME_DIV_OR_SQRT - - SME_MUL - # SME_MUL Used only by outer-product instructions - # Same as SVE. No SME DIV or SQRT so classification to this group should be impossible. - # Kept to catch edge cases. 
- Execution-Latency: 9 - Execution-Throughput: 1 - 13: - Instruction-Groups: - - LOAD_SME - - STORE_ADDRESS_SME - # Same as SVE LD/STR - Execution-Latency: 6 + - ~1922 + - ~1924 + - ~1926 + - ~2359 + - ~2360 + - ~2361 + - ~2364 + - ~2365 + - ~2368 + - ~2369 + - ~2371 + - ~2390 + - ~2391 + - ~2392 + - ~2395 + - ~2396 + - ~2399 + - ~2400 + - ~2402 + - ~2445 + - ~2446 + - ~2447 + - ~2450 + - ~2451 + - ~2454 + - ~2455 + - ~2457 + - ~2470 + - ~2471 + - ~2472 + - ~2475 + - ~2476 + - ~2479 + - ~2480 + - ~2482 + - ~3627 + - ~3629 + - ~3631 + - ~3633 + - ~3644 + - ~3646 + - ~3648 + - ~3650 + - ~3709 + - ~3711 + - ~3713 + - ~3715 + - ~4306 + - ~4308 + - ~4310 + - ~4312 + - ~4326 + - ~4328 + - ~4330 + - ~4332 + - ~4372 + - ~4374 + - ~4376 + - ~4378 + - ~4468 + - ~4469 + - ~4470 + - ~4472 + - ~4474 + - ~4476 + - ~4493 + - ~4494 + - ~4495 + - ~4497 + - ~4499 + - ~4501 + - ~4511 + - ~4513 + - ~4515 + - ~4517 + - ~4519 + - ~4521 + - ~4534 + - ~4535 + - ~4536 + - ~4538 + - ~4540 + - ~4542 + - ~4594 + - ~4595 + - ~4599 + - ~4601 + - ~4603 + - ~4605 + - ~4613 + - ~4614 + - ~4618 + - ~4620 + - ~4622 + - ~4624 + - ~4633 + - ~4635 + - ~4637 + - ~4639 + - ~4641 + - ~4643 + - ~5760 + - ~5762 + - ~5764 + - ~5766 + - ~5780 + - ~5782 + - ~5784 + - ~5786 + - ~5824 + - ~5826 + - ~5828 + - ~5830 + Execution-Latency: 15 Execution-Throughput: 1 # CPU-Info mainly used to generate a replica of the special (or system) file directory # structure diff --git a/src/lib/ModelConfig.cc b/src/lib/ModelConfig.cc index 84e71ced5b..19c07d0213 100644 --- a/src/lib/ModelConfig.cc +++ b/src/lib/ModelConfig.cc @@ -513,7 +513,10 @@ void ModelConfig::validate() { // AArch64_INSTRUCTION_LIST_END boundChecker(configFile_[root][i]["Instruction-Opcode"][opcodeIndex], (std::string(latNum) + std::string(grpNum)), - std::make_pair(0, 4516), ExpectedValue::UInteger); + std::make_pair( + 0, static_cast<int>( + AARCH64Opcode::AArch64_INSTRUCTION_LIST_END)), + ExpectedValue::UInteger); opcodeIndex++; } else if (nodeChecker(
grpNode[j], (std::string(latNum) + std::string(grpNum)), diff --git a/src/lib/pipeline/LoadStoreQueue.cc b/src/lib/pipeline/LoadStoreQueue.cc index 7844e465cd..c7a93ba633 100644 --- a/src/lib/pipeline/LoadStoreQueue.cc +++ b/src/lib/pipeline/LoadStoreQueue.cc @@ -433,55 +433,57 @@ void LoadStoreQueue::tick() { // Iterate over requests ready this cycle while (itInsn != itReq->second.end()) { - // Speculatively increment count of this request type - reqCounts[isStore]++; - - // Ensure the limit on the number of permitted operations is adhered - // to - if (reqCounts[isStore] + reqCounts[!isStore] > totalLimit_) { - // No more requests can be scheduled this cycle - exceededLimits = {true, true}; - break; - } else if (reqCounts[isStore] > reqLimits_[isStore]) { - // No more requests of this type can be scheduled this cycle - exceededLimits[isStore] = true; - // Remove speculative increment to ensure it doesn't count for - // comparisons aginast the totalLimit_ - reqCounts[isStore]--; - break; - } else { - // Schedule requests from the queue of addresses in - // request[Load|Store]Queue_ entry - auto& addressQueue = itInsn->reqAddresses; - while (addressQueue.size()) { - const simeng::MemoryAccessTarget req = addressQueue.front(); - - // Ensure the limit on the data transfered per cycle is adhered to - assert(req.size <= bandwidth && - "Individual memory request from LoadStoreQueue exceeds L1 " - "bandwidth set and thus will never be submitted"); - dataTransfered[isStore] += req.size; - if (dataTransfered[isStore] > bandwidth) { - // No more requests can be scheduled this cycle - exceededLimits[isStore] = true; - itInsn = itReq->second.end(); - break; - } - - // Request a read from the memory interface if the requestQueue_ - // entry represents a read - if (!isStore) { - memory_.requestRead(req, itInsn->insn->getSequenceId()); - } + // Schedule requests from the queue of addresses in + // request[Load|Store]Queue_ entry + auto& addressQueue = itInsn->reqAddresses; + 
while (addressQueue.size()) { + const simeng::MemoryAccessTarget req = addressQueue.front(); + + // Speculatively increment count of this request type + reqCounts[isStore]++; + + // Ensure the limit on the number of permitted operations is adhered + // to + if (reqCounts[isStore] + reqCounts[!isStore] > totalLimit_) { + // No more requests can be scheduled this cycle + exceededLimits = {true, true}; + itInsn = itReq->second.end(); + break; + } else if (reqCounts[isStore] > reqLimits_[isStore]) { + // No more requests of this type can be scheduled this cycle + exceededLimits[isStore] = true; + // Remove speculative increment to ensure it doesn't count for + // comparisons against the totalLimit_ + reqCounts[isStore]--; + itInsn = itReq->second.end(); + break; + } - // Remove processed address from queue - addressQueue.pop(); + // Ensure the limit on the data transferred per cycle is adhered to + assert(req.size <= bandwidth && + "Individual memory request from LoadStoreQueue exceeds L1 " + "bandwidth set and thus will never be submitted"); + dataTransfered[isStore] += req.size; + if (dataTransfered[isStore] > bandwidth) { + // No more requests can be scheduled this cycle + exceededLimits[isStore] = true; + itInsn = itReq->second.end(); + break; } - // Remove entry from vector iff all of its requests have been - // scheduled - if (addressQueue.size() == 0) { - itInsn = itReq->second.erase(itInsn); + + // Request a read from the memory interface if the requestQueue_ + // entry represents a read + if (!isStore) { + memory_.requestRead(req, itInsn->insn->getSequenceId()); } + + // Remove processed address from queue + addressQueue.pop(); + } + // Remove entry from vector if all of its requests have been + // scheduled + if (addressQueue.size() == 0) { + itInsn = itReq->second.erase(itInsn); } }