diff --git a/config/amd_ras_gpio_config0.json b/config/amd_ras_gpio_config0.json index 4753449..11af1c8 100644 --- a/config/amd_ras_gpio_config0.json +++ b/config/amd_ras_gpio_config0.json @@ -1,3 +1,14 @@ { - "GPIO_ALERT_LINES": ["P0_I3C_APML_ALERT_L", "P1_I3C_APML_ALERT_L", "P2_I3C_APML_ALERT_L", "P3_I3C_APML_ALERT_L"] + "Alert_Config": [ + { + "AlertHandle": { + "Description": "Modes to handle RAS alert", + "Value": "UEVENT", + "ValidOptions": ["UEVENT", "GPIO"] + } + }, + { + "GPIO_ALERT_LINES": ["P0_I3C_APML_ALERT_L", "P1_I3C_APML_ALERT_L", "P2_I3C_APML_ALERT_L", "P3_I3C_APML_ALERT_L"] + } + ] } diff --git a/config/amd_ras_gpio_config1.json b/config/amd_ras_gpio_config1.json index 52a825a..f07a3e9 100644 --- a/config/amd_ras_gpio_config1.json +++ b/config/amd_ras_gpio_config1.json @@ -1,3 +1,15 @@ { - "GPIO_ALERT_LINES": ["P0_I3C_APML_ALERT_L"] + "Alert_Config": [ + { + "AlertHandle": { + "Description": "Modes to handle RAS alert", + "Value": "UEVENT", + "ValidOptions": ["UEVENT", "GPIO"] + } + }, + { + "GPIO_ALERT_LINES": ["P0_I3C_APML_ALERT_L"] + } + ] } + diff --git a/config/amd_ras_gpio_config2.json b/config/amd_ras_gpio_config2.json index 62a2a86..d0d71f7 100644 --- a/config/amd_ras_gpio_config2.json +++ b/config/amd_ras_gpio_config2.json @@ -1,3 +1,14 @@ { - "GPIO_ALERT_LINES": ["P1_I3C_APML_ALERT_L"] + "Alert_Config": [ + { + "AlertHandle": { + "Description": "Modes to handle RAS alert", + "Value": "UEVENT", + "ValidOptions": ["UEVENT", "GPIO"] + } + }, + { + "GPIO_ALERT_LINES": ["P1_I3C_APML_ALERT_L"] + } + ] } diff --git a/include/apml_manager.hpp b/include/apml_manager.hpp index ca3d93d..4209347 100644 --- a/include/apml_manager.hpp +++ b/include/apml_manager.hpp @@ -4,7 +4,9 @@ extern "C" { +#include "apml_alertl_uevent.h" #include "esmi_mailbox.h" +#include "linux/amd-apml.h" } #include @@ -54,16 +56,36 @@ class Manager : public amd::ras::Manager */ virtual void init(); - /** @brief Request GPIO events for APML alert handling. 
+ /** @brief Register udev or request GPIO event for APML alert handling. * - * @details This function sets up GPIO event handling for APML alerts. It - * requests GPIO events for alert handling by binding the alert - * event handler to the specified GPIO line and event. The number of GPIO - * lines to be monitored is read from the amd_ras_gpio_config.json file + * @details This function registers for udev events from Alert_L driver on + * RAS alerts or sets up GPIO event handling for APML alerts based on + * alertHandleMode. If APML Alert_L is enabled, it uses the APML API to + * register for udev events. Otherwise, it requests GPIO events for alert + * handling by binding the alert event handler to the respective GPIO lines. + * The number of GPIO lines to be monitored and the alertHandleMode value + * is read from the amd_ras_gpio_config.json file. */ virtual void configure(); + /** @brief Unregister the APML Alert_L udev events. + * + * @details This function invokes APML API to unregister for udev events on + * RAS alerts. + */ + void releaseUdevReSrc(); + + /** @brief alertHandleMode getter function + * + * @details This function returns the alertHandleMode. The value + * is read from the amd_ras_gpio_config.json file + */ + const std::string& getAlertHandleMode() const + { + return alertHandleMode; + } + private: sdbusplus::asio::object_server& objectServer; std::shared_ptr& systemBus; @@ -83,11 +105,25 @@ class Manager : public amd::ras::Manager boost::asio::deadline_timer* McaErrorPollingEvent; boost::asio::deadline_timer* DramCeccErrorPollingEvent; boost::asio::deadline_timer* PcieAerErrorPollingEvent; + boost::asio::deadline_timer* ApmlAlertEvent; std::mutex harvestMutex; std::mutex mcaErrorHarvestMtx; std::mutex dramErrorHarvestMtx; std::mutex pcieErrorHarvestMtx; std::vector gpioLines; + std::vector ud; + std::string alertHandleMode; + + /** + * @brief Handler for alert events. 
+ * + * @details This function is invoked when an alert event occurs on P0 or P1. + * The function handles the event by processing the necessary response. + * + * @param[in] udev_mon - Udev monitor for monitoring event source from APML + * Alert_L driver. + */ + void alertSrcHandler(struct apml_udev_monitor* udev_mon, uint8_t socket); /** * @brief Requests GPIO events for hardware alert handling. @@ -232,7 +268,9 @@ class Manager : public amd::ras::Manager * actions. * * @param[in] socNum - Socket number of the processor. + * @param[in] src - Source of the APML Alert_L events. */ + bool decodeInterrupt(uint8_t, uint32_t); bool decodeInterrupt(uint8_t); /** @brief Check the validity of MCA banks. diff --git a/src/apml_manager.cpp b/src/apml_manager.cpp index dfcb3c0..ebb86fe 100644 --- a/src/apml_manager.cpp +++ b/src/apml_manager.cpp @@ -7,11 +7,13 @@ extern "C" { +#include "apml_alertl_uevent.h" #include "esmi_cpuid_msr.h" #ifdef APML_NDA #include "esmi_mailbox_nda.h" #endif #include "esmi_rmi.h" +#include "linux/amd-apml.h" } #include @@ -114,7 +116,8 @@ Manager::Manager(amd::ras::config::Manager& manager, io(io), apmlInitialized(false), platformInitialized(false), runtimeErrPollingSupported(false), McaErrorPollingEvent(nullptr), DramCeccErrorPollingEvent(nullptr), PcieAerErrorPollingEvent(nullptr), - mcaErrorHarvestMtx(), dramErrorHarvestMtx(), pcieErrorHarvestMtx() + ApmlAlertEvent(nullptr), mcaErrorHarvestMtx(), dramErrorHarvestMtx(), + pcieErrorHarvestMtx() {} void Manager::currentHostStateMonitor() @@ -519,6 +522,7 @@ void Manager::init() void Manager::configure() { + std::vector socketNames; amd::ras::util::cper::createRecord(objectServer, systemBus, node); std::string gpioConfigFile = @@ -534,31 +538,91 @@ void Manager::configure() jsonFile >> config; jsonFile.close(); - std::vector socketNames; - gpioLines.resize(cpuCount); - - if (config.contains("GPIO_ALERT_LINES")) + if (config.contains("Alert_Config") && config["Alert_Config"].is_array()) { - 
socketNames = - config["GPIO_ALERT_LINES"].get>(); + for (const auto& entry : config["Alert_Config"]) + { + if (entry.contains("AlertHandle")) + { + const auto& apmlAlertl = entry["AlertHandle"]; + + if (apmlAlertl.contains("Value")) + { + alertHandleMode = apmlAlertl["Value"].get(); + lg2::error("alertHandleMode {APMLALERTLFLAG}\n", + "APMLALERTLFLAG", alertHandleMode); + } + } + if (alertHandleMode == "GPIO") + { + if (entry.contains("GPIO_ALERT_LINES") && + (entry.contains("GPIO_ALERT_LINES"))) + { + lg2::error("Reading GPIO_ALERT_LINES\n"); + socketNames = entry["GPIO_ALERT_LINES"] + .get>(); + } + } + } } - else + if (socketNames.empty() && (alertHandleMode == "GPIO")) { throw std::runtime_error( "Failed to read GPIO_ALERT_LINES from gpio_config.json file"); } - gpioEventDescriptors.reserve(cpuCount); + if (alertHandleMode != "UEVENT" && alertHandleMode != "GPIO") + { + throw std::runtime_error("Invalid mode of Alert handling"); + } - for (size_t i = 0; i < cpuCount; ++i) + if (alertHandleMode == "UEVENT") + { + ud.resize(cpuCount); + for (size_t i = 0; i < cpuCount; ++i) + { + // Register for RAS alerts via APML API + apml_register_udev_monitor(&ud[i]); + if (!ud[i].udev) + { + lg2::error("Invalid udev device\n"); + return; + } + if (!ud[i].mon) + { + lg2::error("Invalid udev monitor\n"); + apml_unregister_udev_monitor(&ud[i]); + return; + } + lg2::debug("Register to udev event is successful {CPU}\n", "CPU", + i); + + alertSrcHandler(&ud[i], i); + } + } + else { - gpioEventDescriptors.emplace_back(io); + gpioLines.resize(cpuCount); + gpioEventDescriptors.reserve(cpuCount); - requestGPIOEvents(socketNames[i], - std::bind(&ras::apml::Manager::alertEventHandler, - this, std::ref(gpioEventDescriptors[i]), - std::ref(gpioLines[i]), socIndex[i]), - gpioLines[i], gpioEventDescriptors[i]); + for (size_t i = 0; i < cpuCount; ++i) + { + gpioEventDescriptors.emplace_back(io); + + requestGPIOEvents(socketNames[i], + std::bind(&ras::apml::Manager::alertEventHandler, + 
this, std::ref(gpioEventDescriptors[i]), + std::ref(gpioLines[i]), i), + gpioLines[i], gpioEventDescriptors[i]); + } + } +} + +void Manager::releaseUdevReSrc() +{ + for (size_t i = 0; i < cpuCount; ++i) + { + apml_unregister_udev_monitor(&ud[i]); } } @@ -635,6 +699,47 @@ void Manager::clearSbrmiAlertMask(uint8_t socNum) } } +void Manager::alertSrcHandler(struct apml_udev_monitor* udev_mon, + uint8_t socket) +{ + uint8_t soc_num = 0; + uint32_t src = 0; + bool block = false; + oob_status_t ret; + + ret = monitor_ras_alert(udev_mon->mon, block, &soc_num, &src); + if (ret == OOB_SUCCESS) + { + if (rcd == nullptr) + { + rcd = std::make_shared(); + } + if (socket == soc_num) + { + decodeInterrupt(soc_num, src); + } + } + else if (ret == OOB_FILE_ERROR || ret == OOB_INTERRUPTED) + { + lg2::error("Error monitoring the alertl udev events Err: {ERRNO}", + "ERRNO", ret); + return; + } + + ApmlAlertEvent = + new boost::asio::deadline_timer(io, boost::posix_time::seconds(1)); + ApmlAlertEvent->async_wait( + [this, udev_mon, socket](const boost::system::error_code ec) { + if (ec) + { + lg2::error("APML alert handler error: {ERROR}", "ERROR", + ec.message().c_str()); + return; + } + alertSrcHandler(udev_mon, socket); + }); +} + void Manager::requestGPIOEvents( const std::string& name, const std::function& handler, gpiod::line& gpioLine, @@ -1676,6 +1781,307 @@ bool Manager::checkIfCPUAlertsProcessed() return false; } +bool Manager::decodeInterrupt(uint8_t socNum, uint32_t src) +{ + std::unique_lock lock(harvestMutex); + struct ras_df_err_chk errorCheck; + uint8_t buf; + bool fchHangError = false; + bool controlFabricError = false; + bool resetReady = false; + bool runtimeError = false; + bool nonMcaShutdownError = false; + + // check RAS Status Register + if (src & 0xFF) + { + lg2::error("The alert signaled is due to a RAS fatal error"); + + if (src & sysMgmtCtrlErr) + { + /*if RasStatus[reset_ctrl_err] is set in any of the processors, + proceed to cold reset, regardless of the 
status of the other P + */ + + std::string rasErrMsg = + "Fatal error detected in the control fabric. " + "BMC may trigger a reset based on policy set. "; + + sd_journal_send("MESSAGE=%s", rasErrMsg.c_str(), "PRIORITY=%i", + LOG_ERR, "REDFISH_MESSAGE_ID=%s", + "OpenBMC.0.1.CPUError", "REDFISH_MESSAGE_ARGS=%s", + rasErrMsg.c_str(), NULL); + + controlFabricError = true; + } + else if (src & resetHangErr) + { + std::string rasErrMsg = + "System hang while resetting in syncflood." + "Suggested next step is to do an additional manual " + "immediate reset"; + sd_journal_send("MESSAGE=%s", rasErrMsg.c_str(), "PRIORITY=%i", + LOG_ERR, "REDFISH_MESSAGE_ID=%s", + "OpenBMC.0.1.CPUError", "REDFISH_MESSAGE_ARGS=%s", + rasErrMsg.c_str(), NULL); + + fchHangError = true; + } + else if (src & fatalError) + { + std::string rasErrMsg; + + if (src & shutdownError) + { + rasErrMsg = + "MCA CPU shutdown error detected." + "System may reset after harvesting MCA data based on policy set."; + + contextType = shutdown; + } + else + { + rasErrMsg = "RAS FATAL Error detected. " + "System may reset after harvesting " + "MCA data based on policy set. "; + contextType = crashdump; + } + + sd_journal_send("MESSAGE=%s", rasErrMsg.c_str(), "PRIORITY=%i", + LOG_ERR, "REDFISH_MESSAGE_ID=%s", + "OpenBMC.0.1.CPUError", "REDFISH_MESSAGE_ARGS=%s", + rasErrMsg.c_str(), NULL); + + if (false == harvestMcaValidityCheck(socNum, &errorCheck)) + { + lg2::info( + "No valid mca banks found. 
Harvesting additional debug log ID dumps"); + } + harvestMcaDataBanks(socNum, errorCheck); + } + else if (src & shutdownError) + { + std::string rasErrMsg = + "Non MCA Shutdown error detected in the system"; + + sd_journal_send("MESSAGE=%s", rasErrMsg.c_str(), "PRIORITY=%i", + LOG_ERR, "REDFISH_MESSAGE_ID=%s", + "OpenBMC.0.1.CPUError", "REDFISH_MESSAGE_ARGS=%s", + rasErrMsg.c_str(), NULL); + + nonMcaShutdownError = true; + } + else if (src & mcaErrOverflow) + { + runTimeErrorInfoCheck(mcaErr, interruptMode); + + std::string mcaErrOverflowMsg = + "MCA runtime error counter overflow occured"; + + sd_journal_send("MESSAGE=%s", mcaErrOverflowMsg.c_str(), + "PRIORITY=%i", LOG_ERR, "REDFISH_MESSAGE_ID=%s", + "OpenBMC.0.1.CPUError", "REDFISH_MESSAGE_ARGS=%s", + mcaErrOverflowMsg.c_str(), NULL); + + runtimeError = true; + } + else if (src & dramCeccErrOverflow) + { + runTimeErrorInfoCheck(dramCeccErr, interruptMode); + + std::string dramErrOverlowMsg = + "DRAM CECC runtime error counter overflow occured"; + + sd_journal_send("MESSAGE=%s", dramErrOverlowMsg.c_str(), + "PRIORITY=%i", LOG_ERR, "REDFISH_MESSAGE_ID=%s", + "OpenBMC.0.1.CPUError", "REDFISH_MESSAGE_ARGS=%s", + dramErrOverlowMsg.c_str(), NULL); + + runtimeError = true; + } + else if (src & pcieErrOverflow) + { + runTimeErrorInfoCheck(pcieErr, interruptMode); + + std::string pcieErrOverlowMsg = + "PCIE runtime error counter overflow occured"; + + sd_journal_send("MESSAGE=%s", pcieErrOverlowMsg.c_str(), + "PRIORITY=%i", LOG_ERR, "REDFISH_MESSAGE_ID=%s", + "OpenBMC.0.1.CPUError", "REDFISH_MESSAGE_ARGS=%s", + pcieErrOverlowMsg.c_str(), NULL); + + runtimeError = true; + } + + cpuAlertProcessed[socNum] = true; + + if (fchHangError == true || runtimeError == true || + nonMcaShutdownError == true) + { + return true; + } + + if (cpuCount == 2) + { + if (checkIfCPUAlertsProcessed() == true) + { + resetReady = true; + } + } + else + { + resetReady = true; + } + if (resetReady == true) + { + if (controlFabricError == false) + 
{ + amd::ras::util::cper::createFile(rcd, fatalErr, 2, errCount, + node); + amd::ras::util::cper::exportToDBus( + errCount - 1, rcd->Header.TimeStamp, objectServer, + systemBus, node); + } + + bool recoveryAction = true; + + amd::ras::config::Manager::AttributeValue aifsArmed = + configMgr.getAttribute("AifsArmed"); + bool* aifsArmedFlag = std::get_if(&aifsArmed); + + amd::ras::config::Manager::AttributeValue configSigId = + configMgr.getAttribute("AifsSignatureIdList"); + std::map* configSigIdList = + std::get_if>(&configSigId); + + if ((*aifsArmedFlag == true) && + (amd::ras::util::cper::checkSignatureIdMatch(configSigIdList, + rcd) == true)) + { + lg2::info("AIFS armed for the system"); + + std::ifstream inputFile( + "/home/root/bmcweb_persistent_data.json"); + + /*Check if there is any active subscriptions for + the local AIFS flow*/ + if (inputFile.is_open()) + { + nlohmann::json jsonData; + inputFile >> jsonData; + + if (jsonData.find("subscriptions") != jsonData.end()) + { + lg2::info("Subscriptions found"); + const auto& subscriptionsArray = + jsonData["subscriptions"]; + if (subscriptionsArray.is_array()) + { + for (const auto& subscription : subscriptionsArray) + { + const auto& messageIds = + subscription["MessageIds"]; + if (messageIds.is_array()) + { + bool messageIdFound = std::any_of( + messageIds.begin(), messageIds.end(), + [](const std::string& messageId) { + return messageId == + "AmdAifsFailureMatch"; + }); + if (messageIdFound) + { + recoveryAction = false; + + struct ras_override_delay dataIn = { + 0, 0, 0}; + bool ackResp; + dataIn.stop_delay_counter = 1; + oob_status_t ret; + + amd::ras::config::Manager:: + AttributeValue disableResetCounter = + configMgr.getAttribute( + "DisableAifsResetOnSyncfloodCounter"); + bool* disableResetCntr = + std::get_if( + &disableResetCounter); + + if (*disableResetCntr == true) + { + lg2::info( + "Disable Aifs Delay Reset on Syncflood counter is true. 
Sending Delay Reset on Syncflood override APML command"); + ret = + override_delay_reset_on_sync_flood( + socNum, dataIn, &ackResp); + + if (ret) + { + lg2::error( + "Failed to override delay value reset on syncflood Err:{ERRNO}", + "ERRNO", ret); + } + else + { + lg2::info( + "Successfully sent Reset delay on Syncflood command"); + } + } + + sd_journal_send( + "PRIORITY=%i", LOG_INFO, + "REDFISH_MESSAGE_ID=%s", + "OpenBMC.0.1.AmdAifsFailureMatch", + NULL); + + break; + } + } + } + } + } + inputFile.close(); + } + } + if (recoveryAction == true) + { + amd::ras::config::Manager::AttributeValue ResetSignalVal = + configMgr.getAttribute("ResetSignalType"); + std::string* resetSignal = + std::get_if(&ResetSignalVal); + + amd::ras::config::Manager::AttributeValue SystemRecoveryVal = + configMgr.getAttribute("SystemRecoveryMode"); + std::string* systemRecovery = + std::get_if(&SystemRecoveryVal); + amd::ras::util::rasRecoveryAction(node, buf, systemRecovery, + resetSignal); + } + + if (rcd->SectionDescriptor != nullptr) + { + delete[] rcd->SectionDescriptor; + rcd->SectionDescriptor = nullptr; + } + if (rcd->ErrorRecord != nullptr) + { + delete[] rcd->ErrorRecord; + rcd->ErrorRecord = nullptr; + } + + rcd = nullptr; + + cpuAlertProcessed.assign(cpuCount, false); + } + } + else + { + lg2::debug("Nothing to Harvest. Not RAS Error"); + } + return true; +} + bool Manager::decodeInterrupt(uint8_t socNum) { std::unique_lock lock(harvestMutex); @@ -1897,7 +2303,6 @@ bool Manager::decodeInterrupt(uint8_t socNum) { return true; } - if (checkIfCPUAlertsProcessed() == true) { resetReady = true; diff --git a/src/main.cpp b/src/main.cpp index ac590e6..fa10163 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -55,6 +55,13 @@ int main(int argc, char* argv[]) #endif io.run(); +#ifdef APML + auto* apmlMgr = dynamic_cast(errorMgr); + if (apmlMgr && apmlMgr->getAlertHandleMode() == "UEVENT") + { + apmlMgr->releaseUdevReSrc(); + } +#endif return 0; }