diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 180b1cbfcc4e1..0ff5a053dd131 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -78,6 +78,7 @@ #define MCACOD_L3WB 0x017A /* L3 Explicit Writeback */ #define MCACOD_DATA 0x0134 /* Data Load */ #define MCACOD_INSTR 0x0150 /* Instruction Fetch */ +#define MCACOD_IOERR 0x0e0b /* Generic I/O error */ /* MCi_MISC register defines */ #define MCI_MISC_ADDR_LSB(m) ((m) & 0x3f) @@ -91,6 +92,11 @@ /* MCi_ADDR register defines */ #define MCI_ADDR_PHYSADDR GENMASK_ULL(boot_cpu_data.x86_phys_bits - 1, 0) +#define MCI_MISC_PCISEG_MASK GENMASK_ULL(39, 32) +#define MCI_MISC_PCISEG(m) (((m) & MCI_MISC_PCISEG_MASK) >> 32) +#define MCI_MISC_PCIRID_MASK GENMASK_ULL(31, 16) +#define MCI_MISC_PCIRID(m) (((m) & MCI_MISC_PCIRID_MASK) >> 16) + /* CTL2 register defines */ #define MCI_CTL2_CMCI_EN BIT_ULL(30) #define MCI_CTL2_CMCI_THRESHOLD_MASK 0x7fffULL diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index 110e99b86a66a..1679ff3eab2e8 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -281,6 +281,16 @@ config EDAC_IGEN6 This In-Band ECC is first used on the Elkhart Lake SoC but may appear on others in the future. +config EDAC_IEH + tristate "Intel Integrated Error Handler" + depends on PCI && X86_64 + help + Support for error detection and correction on the Intel + CPU using I/O IEH (Integrated Error Handler). IEHs are PCIe + devices which aggregate and report error events of different + severities from various I/O devices, e.g., PCIe devices and + legacy PCI devices. 
+ config EDAC_MPC85XX bool "Freescale MPC83xx / MPC85xx" depends on FSL_SOC && EDAC=y diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index 61945d3113cc3..da6478131a32a 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -33,6 +33,7 @@ obj-$(CONFIG_EDAC_I7CORE) += i7core_edac.o obj-$(CONFIG_EDAC_SBRIDGE) += sb_edac.o obj-$(CONFIG_EDAC_PND2) += pnd2_edac.o obj-$(CONFIG_EDAC_IGEN6) += igen6_edac.o +obj-$(CONFIG_EDAC_IEH) += ieh_edac.o obj-$(CONFIG_EDAC_E7XXX) += e7xxx_edac.o obj-$(CONFIG_EDAC_E752X) += e752x_edac.o obj-$(CONFIG_EDAC_I82443BXGX) += i82443bxgx_edac.o diff --git a/drivers/edac/ieh_edac.c b/drivers/edac/ieh_edac.c new file mode 100644 index 0000000000000..6c92352091b15 --- /dev/null +++ b/drivers/edac/ieh_edac.c @@ -0,0 +1,793 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Driver for Intel Integrated Error Handler (IEH) + * + * Copyright (C) 2020 Intel Corporation + * + * IEH centralizes and standardizes how I/O device errors are reported. + * They are PCIe devices which aggregate and report error events of different + * severities (correctable, non-fatal uncorrectable, and fatal uncorrectable) + * from various I/O devices, e.g., PCIe devices, legacy PCI devices. + * + * There is a global IEH and optional north/south satellite IEH(s) logically + * connected to global IEH. The global IEH is the root to process all incoming + * error messages from satellite IEH(s) and local devices (if some devices + * are connected directly to the global IEH) and generate interrupts(SMI/NMI/MCE + * configured by BIOS/platform firmware). The first IEH-supported platform is + * Tiger Lake-U. This driver reads/prints the error severity and error source + * (bus/device/function) logged in the IEH(s) and reboots the system on fatal + * IEH errors. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "edac_mc.h" + +#define IEH_REVISION "v1.8" + +#define EDAC_MOD_STR "ieh_edac" +#define IEH_NMI_NAME "ieh" + +#define GET_BITFIELD(v, lo, hi) (((v) & GENMASK_ULL(hi, lo)) >> (lo)) + +/* Global correctable error status */ +#define GCOERRSTS_OFFSET 0x200 +/* Global non-fatal error status */ +#define GNFERRSTS_OFFSET 0x210 +/* Global fatal error status */ +#define GFAERRSTS_OFFSET 0x220 + +/* Global correctable error mask */ +#define GCOERRMSK_OFFSET 0x230 +#define GCOERRMSK 0xffffffff +/* Global nonfatal error mask */ +#define GNFERRMSK_OFFSET 0x234 +#define GNFERRMSK 0xffffffff +/* Global fatal error mask */ +#define GFAERRMSK_OFFSET 0x238 +#define GFAERRMSK 0xffffffff + +/* Global system event status */ +#define GSYSEVTSTS_OFFSET 0x260 + +/* Global system event mask */ +#define GSYSEVTMSK_OFFSET 0x264 +#define GSYSEVTMSK 0x7 +#define GSYSEVTMSK_CORR BIT(0) +#define GSYSEVTMSK_NONFATAL BIT(1) +#define GSYSEVTMSK_FATAL BIT(2) + +/* Global system event map */ +#define GSYSEVTMAP_OFFSET 0x268 +#define GSYSEVTMAP_CORR(m) GET_BITFIELD(m, 0, 1) +#define GSYSEVTMAP_NONFATAL(m) GET_BITFIELD(m, 2, 3) +#define GSYSEVTMAP_FATAL(m) GET_BITFIELD(m, 4, 5) +#define GSYSEVTMAP_MCE 0x3f + +/* IEH type and version */ +#define IEHTYPEVER_OFFSET 0x26c +#define IEHTYPEVER_TYPE(t) GET_BITFIELD(t, 0, 3) +#define IEHTYPEVER_VER(t) GET_BITFIELD(t, 4, 7) +#define IEHTYPEVER_BUS(t) GET_BITFIELD(t, 8, 15) + +/* Bitmap field of satellite IEH */ +#define BITMAP_OFFSET 0x27c +#define BITMAP(m) GET_BITFIELD(m, 0, 4) + +/* Local uncorrectable error mask */ +#define LERRUNCMSK_OFFSET 0x298 +#define LERRUNCMSK 0xffffffff +/* Local correctable error mask */ +#define LERRCORMSK_OFFSET 0x2c0 +#define LERRCORMSK 0xffffffff + +/* Device number and function number of the device reporting to IEH */ +#define DEVFUN_OFFSET 0x300 +#define DEVFUN_FUN(d) 
GET_BITFIELD(d, 0, 2) +#define DEVFUN_DEV(d) GET_BITFIELD(d, 3, 7) + +#define ieh_printk(level, fmt, arg...) \ + edac_printk(level, "ieh", fmt, ##arg) + +/*#define PCI_ADDR(sbdf) (sbdf)->seg, (sbdf)->bus, (sbdf)->dev, (sbdf)->fun*/ + +/* Error notification methods */ +enum evt_map { + IEH_IGN, + IEH_SMI, + IEH_NMI, + IEH_MCE, +}; + +enum severity_level { + IEH_CORR_ERR, + IEH_NONFATAL_ERR, + IEH_FATAL_ERR, +}; + +enum ieh_type { + /* Global IEH */ + IEH_GLOBAL, + /* North satellite IEH logically connected to global IEH */ + IEH_NORTH, + /* South satellite IEH logically connected to north IEH */ + IEH_SOUTH, + /* + * Superset south satellite IEH with physical ERR[2:0] signals output. + * It's used as a global IEH (when it present, system has only one IEH). + */ + IEH_SUPERSET, +}; + +enum action_on_fatal_err { + NOP, + RESTART, + POWER_OFF, +}; + +struct pci_sbdf { + u32 seg : 16; + u32 bus : 8; + u32 dev : 5; + u32 fun : 3; +}; + +struct ieh_dev { + struct list_head list; + struct pci_dev *pdev; + struct pci_sbdf sbdf; + enum ieh_type type; + u8 ver; + /* Global IEH fields */ + enum evt_map corr_map; + enum evt_map nonfatal_map; + enum evt_map fatal_map; +}; + +static struct ieh_config { + u16 did; + enum action_on_fatal_err action; +} *ieh_cfg; + +struct decoded_res { + enum severity_level sev; + struct pci_sbdf sbdf; +}; + +static LIST_HEAD(global_ieh_list); +static LIST_HEAD(north_ieh_list); +static LIST_HEAD(south_ieh_list); + +/* Tiger Lake-U SoC */ +#define IEH_DID_TGL_U 0xa0af + +static struct ieh_config tgl_u_cfg = { + .did = IEH_DID_TGL_U, + .action = RESTART, +}; + +/* Tiger Lake-H SoC */ +#define IEH_DID_TGL_H 0x43af + +static struct ieh_config tgl_h_cfg = { + .did = IEH_DID_TGL_H, + .action = RESTART, +}; + +static const char * const severities[] = { + [IEH_CORR_ERR] = "correctable", + [IEH_NONFATAL_ERR] = "non-fatal uncorrectable", + [IEH_FATAL_ERR] = "fatal uncorrectable", +}; + +static struct irq_work ieh_irq_work; + +static int dev_idx(u32 status, 
int start)
+{
+	int i;
+
+	/*
+	 * Scan bits [start, 31] of @status from low to high and return the
+	 * index of the first set bit, or -1 when none is set.
+	 */
+	for (i = start; i < 32; i++) {
+		/* BIT() is unsigned; "1 << 31" would overflow a signed int */
+		if (status & BIT(i))
+			return i;
+	}
+
+	return -1;
+}
+
+/* Tell whether any global IEH reports at least one severity through @map */
+static inline bool has_notification_by(enum evt_map map)
+{
+	struct ieh_dev *ieh;
+
+	list_for_each_entry(ieh, &global_ieh_list, list) {
+		if (ieh->corr_map == map || ieh->nonfatal_map == map ||
+		    ieh->fatal_map == map)
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Log the error source address and severity. For a fatal error, additionally
+ * apply the action configured for this platform in ieh_cfg (restart, power
+ * off, or nothing).
+ */
+static void ieh_output_error(struct decoded_res *res)
+{
+	struct pci_sbdf *p = &res->sbdf;
+
+	ieh_printk(KERN_ERR, "Device %04x:%02x:%02x.%x - %s error\n",
+		   p->seg, p->bus, p->dev,
+		   p->fun, severities[res->sev]);
+
+	if (res->sev != IEH_FATAL_ERR)
+		return;
+
+	switch (ieh_cfg->action) {
+	case RESTART:
+		ieh_printk(KERN_EMERG, "Restart system on device fatal error!\n");
+		kernel_restart(NULL);
+		break;
+
+	case POWER_OFF:
+		ieh_printk(KERN_EMERG, "Power off system on device fatal error!\n");
+		kernel_power_off();
+		break;
+	default:
+		break;
+	}
+
+	/* TODO: Further report error information from the error source */
+}
+
+/* Compare two segment/bus/device/function tuples field by field */
+static bool is_same_pdev(struct pci_sbdf *p, struct pci_sbdf *q)
+{
+	return (p->seg == q->seg && p->bus == q->bus &&
+		p->dev == q->dev && p->fun == q->fun);
+}
+
+/* Return the IEH on @ieh_list whose address equals @sbdf, or NULL */
+static struct ieh_dev *__get_ieh(struct list_head *ieh_list,
+				 struct pci_sbdf *sbdf)
+{
+	struct ieh_dev *ieh;
+
+	list_for_each_entry(ieh, ieh_list, list) {
+		if (is_same_pdev(sbdf, &ieh->sbdf))
+			return ieh;
+	}
+
+	return NULL;
+}
+
+static struct ieh_dev *get_global_ieh(struct pci_sbdf *sbdf)
+{
+	return __get_ieh(&global_ieh_list, sbdf);
+}
+
+static inline struct ieh_dev *get_north_sat_ieh(struct pci_sbdf *sbdf)
+{
+	return __get_ieh(&north_ieh_list, sbdf);
+}
+
+static inline struct ieh_dev *get_south_sat_ieh(struct pci_sbdf *sbdf)
+{
+	return __get_ieh(&south_ieh_list, sbdf);
+}
+
+/*
+ * Read the config register at @offset into @val, then write the same value
+ * back. Returns 0, or -ENODEV if either config access fails.
+ */
+static int read_and_clear(struct pci_dev *pdev, int offset, u32 *val)
+{
+	if (pci_read_config_dword(pdev, offset, val)) {
+		ieh_printk(KERN_ERR, "Failed to read 0x%x\n", offset);
+		return -ENODEV;
+	}
+
+	/* Write 1s
to clear status */
+	if (pci_write_config_dword(pdev, offset, *val)) {
+		ieh_printk(KERN_ERR, "Failed to write 0x%x\n", offset);
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+/*
+ * Clear the name##MSK bits in the name##MSK_OFFSET register of @ieh.
+ * NOTE: on a config access failure this macro returns -ENODEV from the
+ * *calling* function.
+ */
+#define UNMASK_ERR_EVENT(ieh, name)					\
+	do {								\
+		u32 val;						\
+		if (pci_read_config_dword(ieh->pdev, name##MSK_OFFSET, &val))	\
+			return -ENODEV;					\
+		val &= ~name##MSK;					\
+		if (pci_write_config_dword(ieh->pdev, name##MSK_OFFSET, val))	\
+			return -ENODEV;					\
+	} while (0)
+
+/*
+ * Unmask the global, local and system-event error masks on every global IEH.
+ * Stops at the first failing config access and returns -ENODEV, which can
+ * leave earlier registers already unmasked.
+ */
+static int unmask_all_err_events(void)
+{
+	struct ieh_dev *ieh;
+
+	list_for_each_entry(ieh, &global_ieh_list, list) {
+		UNMASK_ERR_EVENT(ieh, GFAERR);
+		UNMASK_ERR_EVENT(ieh, GNFERR);
+		UNMASK_ERR_EVENT(ieh, GCOERR);
+		UNMASK_ERR_EVENT(ieh, LERRUNC);
+		UNMASK_ERR_EVENT(ieh, LERRCOR);
+		UNMASK_ERR_EVENT(ieh, GSYSEVT);
+	}
+
+	return 0;
+}
+
+/*
+ * Set the name##MSK bits in the name##MSK_OFFSET register of @ieh.
+ * NOTE: on a config access failure this macro returns -ENODEV from the
+ * *calling* function.
+ */
+#define MASK_ERR_EVENT(ieh, name)					\
+	do {								\
+		u32 val;						\
+		if (pci_read_config_dword(ieh->pdev, name##MSK_OFFSET, &val))	\
+			return -ENODEV;					\
+		val |= name##MSK;					\
+		if (pci_write_config_dword(ieh->pdev, name##MSK_OFFSET, val))	\
+			return -ENODEV;					\
+	} while (0)
+
+/* Counterpart of unmask_all_err_events(): mask the same six registers */
+static int mask_all_err_events(void)
+{
+	struct ieh_dev *ieh;
+
+	list_for_each_entry(ieh, &global_ieh_list, list) {
+		MASK_ERR_EVENT(ieh, GFAERR);
+		MASK_ERR_EVENT(ieh, GNFERR);
+		MASK_ERR_EVENT(ieh, GCOERR);
+		MASK_ERR_EVENT(ieh, LERRUNC);
+		MASK_ERR_EVENT(ieh, LERRCOR);
+		MASK_ERR_EVENT(ieh, GSYSEVT);
+	}
+
+	return 0;
+}
+
+/*
+ * Handle the errors of severity @sev logged in IEH @d.
+ *
+ * The global status register matching @sev is read and cleared. For every
+ * set bit i, DEVFUN register i supplies the device/function of the error
+ * source; segment and bus are taken from @d itself. A source that is a known
+ * satellite IEH one level down (north below global, south below north) is
+ * handled recursively, anything else is reported via ieh_output_error().
+ *
+ * Returns 0, or -ENODEV if a config access on @d fails. Results of the
+ * recursive calls are not propagated.
+ */
+static int ieh_handle_error(struct ieh_dev *d, enum severity_level sev)
+{
+	struct decoded_res res;
+	struct pci_sbdf *sbdf = &res.sbdf;
+	struct ieh_dev *ieh;
+	int i, start = 0;
+	u32 sts, reg;
+
+	switch (sev) {
+	case IEH_CORR_ERR:
+		if (read_and_clear(d->pdev, GCOERRSTS_OFFSET, &sts))
+			return -ENODEV;
+		ieh_printk(KERN_DEBUG, "Read %04x:%02x:%02x.%x GCOERRSTS: 0x%x\n",
+			   (&d->sbdf)->seg, (&d->sbdf)->bus, (&d->sbdf)->dev, (&d->sbdf)->fun, sts);
+		break;
+	case IEH_NONFATAL_ERR:
+		if (read_and_clear(d->pdev, GNFERRSTS_OFFSET, &sts))
+			return -ENODEV;
+		ieh_printk(KERN_DEBUG, "Read %04x:%02x:%02x.%x GNFERRSTS: 0x%x\n",
+			   (&d->sbdf)->seg, (&d->sbdf)->bus, (&d->sbdf)->dev, (&d->sbdf)->fun, sts);
+		break;
+	case IEH_FATAL_ERR:
+		if (read_and_clear(d->pdev, GFAERRSTS_OFFSET, &sts))
+			return -ENODEV;
+		ieh_printk(KERN_DEBUG, "Read %04x:%02x:%02x.%x GFAERRSTS: 0x%x\n",
+			   (&d->sbdf)->seg, (&d->sbdf)->bus, (&d->sbdf)->dev, (&d->sbdf)->fun, sts);
+		break;
+	}
+
+	/* Walk the set bits of the status word from low to high */
+	while ((i = dev_idx(sts, start)) != -1) {
+		/* One 32-bit DEVFUN register per status bit */
+		if (pci_read_config_dword(d->pdev, DEVFUN_OFFSET + i * 4, &reg)) {
+			ieh_printk(KERN_ERR, "Failed to read DEVFUN %d\n", i);
+			return -ENODEV;
+		}
+		ieh_printk(KERN_DEBUG, "Read %04x:%02x:%02x.%x DEVFUN %d: 0x%x\n",
+			   (&d->sbdf)->seg, (&d->sbdf)->bus, (&d->sbdf)->dev, (&d->sbdf)->fun, i, reg);
+
+		memset(&res, 0, sizeof(res));
+		res.sev = sev;
+		sbdf->seg = d->sbdf.seg;
+		sbdf->bus = d->sbdf.bus;
+		sbdf->dev = DEVFUN_DEV(reg);
+		sbdf->fun = DEVFUN_FUN(reg);
+
+		switch (d->type) {
+		case IEH_GLOBAL:
+			ieh = get_north_sat_ieh(sbdf);
+			if (!ieh)
+				ieh_output_error(&res);
+			else if (ieh->type == IEH_NORTH)
+				ieh_handle_error(ieh, sev);
+			else
+				ieh_printk(KERN_ERR, "Invalid global IEH\n");
+			break;
+		case IEH_NORTH:
+			ieh = get_south_sat_ieh(sbdf);
+			if (!ieh)
+				ieh_output_error(&res);
+			else if (ieh->type == IEH_SOUTH)
+				ieh_handle_error(ieh, sev);
+			else
+				ieh_printk(KERN_ERR, "Invalid north IEH\n");
+			break;
+		case IEH_SOUTH:
+		case IEH_SUPERSET:
+			ieh_output_error(&res);
+			break;
+		}
+
+		start = i + 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Read GSYSEVTSTS of a global IEH and handle each pending severity whose
+ * notification method is NMI, most severe first.
+ */
+static void __ieh_check_error(struct ieh_dev *ieh)
+{
+	struct pci_dev *pdev = ieh->pdev;
+	u32 sts;
+
+	if (pci_read_config_dword(pdev, GSYSEVTSTS_OFFSET, &sts)) {
+		ieh_printk(KERN_ERR, "Failed to read GSYSEVTSTS\n");
+		return;
+	}
+
+	ieh_printk(KERN_DEBUG, "Read %04x:%02x:%02x.%x GSYSEVTSTS: 0x%x\n",
+		   (&ieh->sbdf)->seg, (&ieh->sbdf)->bus, (&ieh->sbdf)->dev, (&ieh->sbdf)->fun, sts);
+
+	if ((sts & (1 << IEH_FATAL_ERR)) && ieh->fatal_map == IEH_NMI)
+		ieh_handle_error(ieh, IEH_FATAL_ERR);
+
+	if ((sts & (1 << IEH_NONFATAL_ERR)) &&
ieh->nonfatal_map == IEH_NMI) + ieh_handle_error(ieh, IEH_NONFATAL_ERR); + + if ((sts & (1 << IEH_CORR_ERR)) && ieh->corr_map == IEH_NMI) + ieh_handle_error(ieh, IEH_CORR_ERR); +} + +static void ieh_check_error(void) +{ + struct ieh_dev *ieh; + + list_for_each_entry(ieh, &global_ieh_list, list) { + __ieh_check_error(ieh); + } +} + +static void ieh_irq_work_cb(struct irq_work *irq_work) +{ + ieh_check_error(); +} + +static int ieh_nmi_handler(unsigned int cmd, struct pt_regs *regs) +{ + irq_work_queue(&ieh_irq_work); + return 0; +} + +static int mce_check_error(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *mce = (struct mce *)data; + struct decoded_res res; + struct pci_sbdf *sbdf = &res.sbdf; + struct ieh_dev *ieh; + u64 rid; + + /* TODO: For debug only. Remove them later. */ + ieh_printk(KERN_DEBUG, "MCi_STATUS 0x%llx\n", mce->status); + ieh_printk(KERN_DEBUG, "MCi_MISC 0x%llx\n", mce->misc); + ieh_printk(KERN_DEBUG, "MCi_ADDR 0x%llx\n", mce->addr); + ieh_printk(KERN_DEBUG, "MCGSTATUS 0x%llx\n", mce->mcgstatus); + ieh_printk(KERN_DEBUG, "MCGSCAP 0x%llx\n", mce->mcgcap); + ieh_printk(KERN_DEBUG, "IP 0x%llx\n", mce->ip); + ieh_printk(KERN_DEBUG, "MC bank 0x%x\n", mce->bank); + + if ((mce->status & MCACOD) != MCACOD_IOERR) + return NOTIFY_DONE; + + if (!(mce->status & MCI_STATUS_MISCV)) + return NOTIFY_DONE; + + memset(&res, 0, sizeof(res)); + rid = MCI_MISC_PCIRID(mce->misc); + sbdf->seg = MCI_MISC_PCISEG(mce->misc); + sbdf->bus = GET_BITFIELD(rid, 8, 15); + sbdf->dev = GET_BITFIELD(rid, 3, 7); + sbdf->fun = GET_BITFIELD(rid, 0, 2); + + if (mce->status & MCI_STATUS_PCC) + res.sev = IEH_FATAL_ERR; + else if (mce->status & MCI_STATUS_UC) + res.sev = IEH_NONFATAL_ERR; + else + res.sev = IEH_CORR_ERR; + + ieh = get_global_ieh(sbdf); + if (ieh) + goto handle; + + ieh = get_north_sat_ieh(sbdf); + if (ieh) + goto handle; + + ieh = get_south_sat_ieh(sbdf); + if (ieh) + goto handle; + + goto output; + +handle: + ieh_handle_error(ieh, res.sev); 
+	mce->kflags |= MCE_HANDLED_EDAC;
+	return NOTIFY_DONE;
+
+output:
+	ieh_output_error(&res);
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block ieh_mce_dec = {
+	.notifier_call	= mce_check_error,
+	.priority	= MCE_PRIO_EDAC,
+};
+
+static const struct x86_cpu_id ieh_cpuids[] = {
+	X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L,	&tgl_u_cfg),
+	X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE,	&tgl_h_cfg),
+	{}
+};
+MODULE_DEVICE_TABLE(x86cpu, ieh_cpuids);
+
+/* Release one IEH: disable and drop its PCI device (if set), then free it */
+static void __put_ieh(struct ieh_dev *ieh)
+{
+	if (!ieh)
+		return;
+	if (ieh->pdev) {
+		pci_disable_device(ieh->pdev);
+		pci_dev_put(ieh->pdev);
+	}
+	kfree(ieh);
+}
+
+/* Unlink and release every IEH on @ieh_list */
+static void __put_iehs(struct list_head *ieh_list)
+{
+	struct ieh_dev *ieh, *tmp;
+
+	edac_dbg(0, "\n");
+
+	list_for_each_entry_safe(ieh, tmp, ieh_list, list) {
+		list_del(&ieh->list);
+		__put_ieh(ieh);
+	}
+}
+
+static void put_all_iehs(void)
+{
+	__put_iehs(&global_ieh_list);
+	__put_iehs(&north_ieh_list);
+	__put_iehs(&south_ieh_list);
+}
+
+/*
+ * Enumerate all Intel PCI devices with device ID @did, enable them and sort
+ * them into the global/north/south IEH lists according to IEHTYPEVER. For
+ * global and superset IEHs the system event map is switched to MCE and the
+ * resulting per-severity notification methods are cached.
+ *
+ * Every listed IEH holds its own reference on the PCI device (taken with
+ * pci_dev_get() and dropped in __put_ieh()); the reference returned by
+ * pci_get_device() is consumed by the next iteration.
+ *
+ * Returns the number of IEHs listed, or a negative errno. On error every
+ * resource acquired so far, including already listed IEHs, is released.
+ */
+static int __get_all_iehs(u16 did)
+{
+	struct pci_dev *pdev, *prev = NULL;
+	int rc = -ENODEV, n = 0;
+	struct pci_sbdf *sbdf;
+	struct ieh_dev *ieh;
+	u32 reg;
+
+	edac_dbg(0, "\n");
+
+	for (;;) {
+		pdev = pci_get_device(PCI_VENDOR_ID_INTEL, did, prev);
+		if (!pdev)
+			break;
+
+		if (pci_enable_device(pdev)) {
+			ieh_printk(KERN_ERR, "Failed to enable %04x:%04x\n",
+				   pdev->vendor, pdev->device);
+			goto fail;
+		}
+
+		ieh = kzalloc(sizeof(*ieh), GFP_KERNEL);
+		if (!ieh) {
+			rc = -ENOMEM;
+			goto fail2;
+		}
+
+		if (pci_read_config_dword(pdev, IEHTYPEVER_OFFSET, &reg)) {
+			ieh_printk(KERN_ERR, "Failed to read IEHTYPEVER\n");
+			/* rc is still -ENODEV; unwind instead of returning */
+			goto fail3;
+		}
+
+		ieh->pdev = pdev;
+		ieh->ver = IEHTYPEVER_VER(reg);
+		ieh->type = IEHTYPEVER_TYPE(reg);
+		sbdf = &ieh->sbdf;
+		sbdf->seg = pci_domain_nr(pdev->bus);
+		sbdf->bus = IEHTYPEVER_BUS(reg);
+		sbdf->dev = PCI_SLOT(pdev->devfn);
+		sbdf->fun = PCI_FUNC(pdev->devfn);
+		ieh_printk(KERN_DEBUG, "Read %04x:%02x:%02x.%x IEHTYPEVER: 0x%x\n",
+			   (sbdf)->seg, (sbdf)->bus, (sbdf)->dev, (sbdf)->fun, reg);
+
+		if (sbdf->bus != pdev->bus->number) {
+			ieh_printk(KERN_ERR, "Mismatched IEH bus\n");
+			rc = -EINVAL;
+			goto fail3;
+		}
+
+		switch (ieh->type) {
+		case IEH_SUPERSET:
+		case IEH_GLOBAL:
+			/* Set notification to MCE */
+			if (pci_read_config_dword(pdev, GSYSEVTMAP_OFFSET, &reg)) {
+				ieh_printk(KERN_ERR, "Failed to read old GSYSEVTMAP\n");
+				goto fail3;
+			}
+
+			reg |= GSYSEVTMAP_MCE;
+			if (pci_write_config_dword(pdev, GSYSEVTMAP_OFFSET, reg)) {
+				ieh_printk(KERN_ERR, "Failed to write GSYSEVTMAP\n");
+				goto fail3;
+			}
+
+			/* Read back: cache what the hardware actually accepted */
+			if (pci_read_config_dword(pdev, GSYSEVTMAP_OFFSET, &reg)) {
+				ieh_printk(KERN_ERR, "Failed to read new GSYSEVTMAP\n");
+				goto fail3;
+			}
+			ieh_printk(KERN_DEBUG, "Read %04x:%02x:%02x.%x GSYSEVTMAP: 0x%x\n",
+				   (sbdf)->seg, (sbdf)->bus, (sbdf)->dev, (sbdf)->fun, reg);
+
+			ieh->corr_map = GSYSEVTMAP_CORR(reg);
+			ieh->nonfatal_map = GSYSEVTMAP_NONFATAL(reg);
+			ieh->fatal_map = GSYSEVTMAP_FATAL(reg);
+			list_add_tail(&ieh->list, &global_ieh_list);
+			ieh_printk(KERN_DEBUG, "Global/Superset IEH %04x:%02x:%02x.%x\n",
+				   (sbdf)->seg, (sbdf)->bus, (sbdf)->dev, (sbdf)->fun);
+			break;
+		case IEH_NORTH:
+			list_add_tail(&ieh->list, &north_ieh_list);
+			ieh_printk(KERN_DEBUG, "North IEH %04x:%02x:%02x.%x\n",
+				   (sbdf)->seg, (sbdf)->bus, (sbdf)->dev, (sbdf)->fun);
+			break;
+		case IEH_SOUTH:
+			list_add_tail(&ieh->list, &south_ieh_list);
+			ieh_printk(KERN_DEBUG, "South IEH %04x:%02x:%02x.%x\n",
+				   (sbdf)->seg, (sbdf)->bus, (sbdf)->dev, (sbdf)->fun);
+			break;
+		default:
+			/*
+			 * The type field is 4 bits wide but only the four
+			 * types above are known. An IEH of any other type
+			 * lands on no list, so release it here rather than
+			 * leaking it, and move on to the next device.
+			 */
+			ieh_printk(KERN_WARNING, "Ignoring IEH %04x:%02x:%02x.%x of unknown type %d\n",
+				   (sbdf)->seg, (sbdf)->bus, (sbdf)->dev, (sbdf)->fun, ieh->type);
+			kfree(ieh);
+			pci_disable_device(pdev);
+			prev = pdev;
+			continue;
+		}
+
+		pci_dev_get(pdev);
+		prev = pdev;
+		n++;
+	}
+
+	return n;
+fail3:
+	/* ieh is not on any list yet, so free it directly */
+	kfree(ieh);
+fail2:
+	pci_disable_device(pdev);
+fail:
+	pci_dev_put(pdev);
+	put_all_iehs();
+	return rc;
+}
+
+/*
+ * Discover all IEHs for device ID @did. Succeeds only if at least one IEH,
+ * and among them at least one global IEH, was found.
+ */
+static int get_all_iehs(u16 did)
+{
+	int rc;
+
+	rc = __get_all_iehs(did);
+	if (rc < 0)
+		return rc;
+
+	if (rc == 0) {
+		ieh_printk(KERN_DEBUG, "No IEHs found\n");
+		return -ENODEV;
+	}
+
+	if (list_empty(&global_ieh_list)) {
+		ieh_printk(KERN_ERR, "No global IEH found\n");
+		put_all_iehs();
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+/*
+ * Hook up the handlers matching the notification methods the global IEHs
+ * use: an NMI handler (deferring to irq_work) and/or the MCE decode chain.
+ * Returns -ENODEV if neither method is in use.
+ */
+static int register_err_handler(void)
+{
+	bool os_visible = false;
+	int rc;
+
+	if (has_notification_by(IEH_NMI)) {
+		init_irq_work(&ieh_irq_work, ieh_irq_work_cb);
+		rc = register_nmi_handler(NMI_SERR, ieh_nmi_handler,
+					  0, IEH_NMI_NAME);
+		if (rc) {
+			ieh_printk(KERN_ERR, "Can't register NMI handler\n");
+			return rc;
+		}
+
+		os_visible = true;
+	}
+
+	if (has_notification_by(IEH_MCE)) {
+		mce_register_decode_chain(&ieh_mce_dec);
+		os_visible = true;
+	}
+
+	if (!os_visible) {
+		ieh_printk(KERN_INFO, "No OS-visible IEH events\n");
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+/* Undo register_err_handler() for whichever methods are in use */
+static void unregister_err_handler(void)
+{
+	if (has_notification_by(IEH_NMI)) {
+		unregister_nmi_handler(NMI_SERR, IEH_NMI_NAME);
+		irq_work_sync(&ieh_irq_work);
+	}
+
+	if (has_notification_by(IEH_MCE))
+		mce_unregister_decode_chain(&ieh_mce_dec);
+}
+
+static int __init ieh_init(void)
+{
+	const struct x86_cpu_id *id;
+	struct ieh_dev *ieh;
+	int rc;
+
+	edac_dbg(2, "\n");
+
+	id = x86_match_cpu(ieh_cpuids);
+	if (!id)
+		return -ENODEV;
+	ieh_cfg = (struct ieh_config *)id->driver_data;
+
+	rc = get_all_iehs(ieh_cfg->did);
+	if (rc)
+		return rc;
+
+	rc = register_err_handler();
+	if (rc)
+		goto fail;
+
+	rc = unmask_all_err_events();
+	if (rc)
+		goto fail2;
+
+	ieh = list_first_entry(&global_ieh_list, struct ieh_dev, list);
+	ieh_printk(KERN_INFO, "hw v%d, drv %s\n", ieh->ver, IEH_REVISION);
+
+	return 0;
+fail2:
+	/*
+	 * unmask_all_err_events() stops at the first failing access, so some
+	 * events may already be unmasked. Mask them again (best effort, as in
+	 * ieh_exit()) before the handlers go away.
+	 */
+	mask_all_err_events();
+	unregister_err_handler();
+fail:
+	put_all_iehs();
+	return rc;
+}
+
+static void __exit ieh_exit(void)
+{
+	edac_dbg(2, "\n");
+	mask_all_err_events();
+	unregister_err_handler();
+	put_all_iehs();
+}
+
+module_init(ieh_init);
+module_exit(ieh_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Qiuxu Zhuo");
+MODULE_DESCRIPTION("IEH Driver for Intel CPU using I/O IEH");
diff --git a/drivers/edac/igen6_edac.c b/drivers/edac/igen6_edac.c
index 1a18693294db4..a6825ed9ddf90 100644
--- a/drivers/edac/igen6_edac.c
+++ b/drivers/edac/igen6_edac.c
@@
-26,6 +26,7 @@ #include "edac_mc.h" #include "edac_module.h" +#include "igen6_edac.h" #define IGEN6_REVISION "v2.5.1" @@ -58,6 +59,7 @@ /* Capability register E */ #define CAPID_E_OFFSET 0xf0 #define CAPID_E_IBECC BIT(12) +#define CAPID_E_IBECC_BIT18 BIT(18) /* Error Status */ #define ERRSTS_OFFSET 0xc8 @@ -80,6 +82,7 @@ #define ECC_ERROR_LOG_UE BIT_ULL(63) #define ECC_ERROR_LOG_ADDR_SHIFT 5 #define ECC_ERROR_LOG_ADDR(v) GET_BITFIELD(v, 5, 38) +#define ECC_ERROR_LOG_ADDR45(v) GET_BITFIELD(v, 5, 45) #define ECC_ERROR_LOG_SYND(v) GET_BITFIELD(v, 46, 61) /* Host MMIO base address */ @@ -133,6 +136,8 @@ static struct res_config { u32 ibecc_base; u32 ibecc_error_log_offset; bool (*ibecc_available)(struct pci_dev *pdev); + /* Extract error address logged in IBECC */ + u64 (*err_addr)(u64 ecclog); /* Convert error address logged in IBECC to system physical address */ u64 (*err_addr_to_sys_addr)(u64 eaddr, int mc); /* Convert error address logged in IBECC to integrated memory controller address */ @@ -222,6 +227,67 @@ static struct work_struct ecclog_work; #define DID_ADL_SKU3 0x4621 #define DID_ADL_SKU4 0x4641 +/* Compute die IDs for Alder Lake-N with IBECC */ +#define DID_ADL_N_SKU1 0x4614 +#define DID_ADL_N_SKU2 0x4617 +#define DID_ADL_N_SKU3 0x461b +#define DID_ADL_N_SKU4 0x461c +#define DID_ADL_N_SKU5 0x4673 +#define DID_ADL_N_SKU6 0x4674 +#define DID_ADL_N_SKU7 0x4675 +#define DID_ADL_N_SKU8 0x4677 +#define DID_ADL_N_SKU9 0x4678 +#define DID_ADL_N_SKU10 0x4679 +#define DID_ADL_N_SKU11 0x467c + +/* Compute die IDs for Raptor Lake-P with IBECC */ +#define DID_RPL_P_SKU1 0xa706 +#define DID_RPL_P_SKU2 0xa707 +#define DID_RPL_P_SKU3 0xa708 +#define DID_RPL_P_SKU4 0xa716 +#define DID_RPL_P_SKU5 0xa718 + +/* Compute die IDs for Meteor Lake-PS with IBECC */ +#define DID_MTL_PS_SKU1 0x7d21 +#define DID_MTL_PS_SKU2 0x7d22 +#define DID_MTL_PS_SKU3 0x7d23 +#define DID_MTL_PS_SKU4 0x7d24 + +/* Compute die IDs for Meteor Lake-P with IBECC */ +#define DID_MTL_P_SKU1 0x7d01 
+#define DID_MTL_P_SKU2 0x7d02 +#define DID_MTL_P_SKU3 0x7d14 + +static int get_mchbar(struct pci_dev *pdev, u64 *mchbar) +{ + union { + u64 v; + struct { + u32 v_lo; + u32 v_hi; + }; + } u; + + if (pci_read_config_dword(pdev, MCHBAR_OFFSET, &u.v_lo)) { + igen6_printk(KERN_ERR, "Failed to read lower MCHBAR\n"); + return -ENODEV; + } + + if (pci_read_config_dword(pdev, MCHBAR_OFFSET + 4, &u.v_hi)) { + igen6_printk(KERN_ERR, "Failed to read upper MCHBAR\n"); + return -ENODEV; + } + + if (!(u.v & MCHBAR_EN)) { + igen6_printk(KERN_ERR, "MCHBAR is disabled\n"); + return -ENODEV; + } + + *mchbar = MCHBAR_BASE(u.v); + + return 0; +} + static bool ehl_ibecc_available(struct pci_dev *pdev) { u32 v; @@ -272,6 +338,38 @@ static bool tgl_ibecc_available(struct pci_dev *pdev) return !(CAPID_E_IBECC & v); } +static bool mtl_p_ibecc_available(struct pci_dev *pdev) +{ + u32 v; + + if (pci_read_config_dword(pdev, CAPID_E_OFFSET, &v)) + return false; + + return !(CAPID_E_IBECC_BIT18 & v); +} + +static bool mtl_ps_ibecc_available(struct pci_dev *pdev) +{ + void __iomem *window; + u64 mchbar; + u32 val; + + if (get_mchbar(pdev, &mchbar)) + return false; + + window = ioremap(mchbar, MCHBAR_SIZE * 2); + if (!window) { + igen6_printk(KERN_ERR, "Failed to ioremap 0x%llx\n", mchbar); + return false; + } + + val = readl(window + 0x13c00); + iounmap(window); + + /* Bit6: 1 - IBECC is disabled, 0 - IBECC isn't disabled */ + return !GET_BITFIELD(val, 6, 6); +} + static u64 mem_addr_to_sys_addr(u64 maddr) { if (maddr < igen6_tolud) @@ -358,6 +456,11 @@ static u64 adl_err_addr_to_imc_addr(u64 eaddr, int mc) return imc_addr; } +static u64 rpl_p_err_addr(u64 ecclog) +{ + return ECC_ERROR_LOG_ADDR45(ecclog); +} + static struct res_config ehl_cfg = { .num_imc = 1, .imc_base = 0x5000, @@ -403,6 +506,51 @@ static struct res_config adl_cfg = { .err_addr_to_imc_addr = adl_err_addr_to_imc_addr, }; +static struct res_config adl_n_cfg = { + .machine_check = true, + .num_imc = 1, + .imc_base = 0xd800, + 
.ibecc_base = 0xd400, + .ibecc_error_log_offset = 0x68, + .ibecc_available = tgl_ibecc_available, + .err_addr_to_sys_addr = adl_err_addr_to_sys_addr, + .err_addr_to_imc_addr = adl_err_addr_to_imc_addr, +}; + +static struct res_config rpl_p_cfg = { + .machine_check = true, + .num_imc = 2, + .imc_base = 0xd800, + .ibecc_base = 0xd400, + .ibecc_error_log_offset = 0x68, + .ibecc_available = tgl_ibecc_available, + .err_addr = rpl_p_err_addr, + .err_addr_to_sys_addr = adl_err_addr_to_sys_addr, + .err_addr_to_imc_addr = adl_err_addr_to_imc_addr, +}; + +static struct res_config mtl_ps_cfg = { + .machine_check = true, + .num_imc = 2, + .imc_base = 0xd800, + .ibecc_base = 0xd400, + .ibecc_error_log_offset = 0x170, + .ibecc_available = mtl_ps_ibecc_available, + .err_addr_to_sys_addr = adl_err_addr_to_sys_addr, + .err_addr_to_imc_addr = adl_err_addr_to_imc_addr, +}; + +static struct res_config mtl_p_cfg = { + .machine_check = true, + .num_imc = 2, + .imc_base = 0xd800, + .ibecc_base = 0xd400, + .ibecc_error_log_offset = 0x170, + .ibecc_available = mtl_p_ibecc_available, + .err_addr_to_sys_addr = adl_err_addr_to_sys_addr, + .err_addr_to_imc_addr = adl_err_addr_to_imc_addr, +}; + static const struct pci_device_id igen6_pci_tbl[] = { { PCI_VDEVICE(INTEL, DID_EHL_SKU5), (kernel_ulong_t)&ehl_cfg }, { PCI_VDEVICE(INTEL, DID_EHL_SKU6), (kernel_ulong_t)&ehl_cfg }, @@ -424,10 +572,47 @@ static const struct pci_device_id igen6_pci_tbl[] = { { PCI_VDEVICE(INTEL, DID_ADL_SKU2), (kernel_ulong_t)&adl_cfg }, { PCI_VDEVICE(INTEL, DID_ADL_SKU3), (kernel_ulong_t)&adl_cfg }, { PCI_VDEVICE(INTEL, DID_ADL_SKU4), (kernel_ulong_t)&adl_cfg }, + { PCI_VDEVICE(INTEL, DID_ADL_N_SKU1), (kernel_ulong_t)&adl_n_cfg }, + { PCI_VDEVICE(INTEL, DID_ADL_N_SKU2), (kernel_ulong_t)&adl_n_cfg }, + { PCI_VDEVICE(INTEL, DID_ADL_N_SKU3), (kernel_ulong_t)&adl_n_cfg }, + { PCI_VDEVICE(INTEL, DID_ADL_N_SKU4), (kernel_ulong_t)&adl_n_cfg }, + { PCI_VDEVICE(INTEL, DID_ADL_N_SKU5), (kernel_ulong_t)&adl_n_cfg }, + { 
PCI_VDEVICE(INTEL, DID_ADL_N_SKU6), (kernel_ulong_t)&adl_n_cfg }, + { PCI_VDEVICE(INTEL, DID_ADL_N_SKU7), (kernel_ulong_t)&adl_n_cfg }, + { PCI_VDEVICE(INTEL, DID_ADL_N_SKU8), (kernel_ulong_t)&adl_n_cfg }, + { PCI_VDEVICE(INTEL, DID_ADL_N_SKU9), (kernel_ulong_t)&adl_n_cfg }, + { PCI_VDEVICE(INTEL, DID_ADL_N_SKU10), (kernel_ulong_t)&adl_n_cfg }, + { PCI_VDEVICE(INTEL, DID_ADL_N_SKU11), (kernel_ulong_t)&adl_n_cfg }, + { PCI_VDEVICE(INTEL, DID_RPL_P_SKU1), (kernel_ulong_t)&rpl_p_cfg }, + { PCI_VDEVICE(INTEL, DID_RPL_P_SKU2), (kernel_ulong_t)&rpl_p_cfg }, + { PCI_VDEVICE(INTEL, DID_RPL_P_SKU3), (kernel_ulong_t)&rpl_p_cfg }, + { PCI_VDEVICE(INTEL, DID_RPL_P_SKU4), (kernel_ulong_t)&rpl_p_cfg }, + { PCI_VDEVICE(INTEL, DID_RPL_P_SKU5), (kernel_ulong_t)&rpl_p_cfg }, + { PCI_VDEVICE(INTEL, DID_MTL_PS_SKU1), (kernel_ulong_t)&mtl_ps_cfg }, + { PCI_VDEVICE(INTEL, DID_MTL_PS_SKU2), (kernel_ulong_t)&mtl_ps_cfg }, + { PCI_VDEVICE(INTEL, DID_MTL_PS_SKU3), (kernel_ulong_t)&mtl_ps_cfg }, + { PCI_VDEVICE(INTEL, DID_MTL_PS_SKU4), (kernel_ulong_t)&mtl_ps_cfg }, + { PCI_VDEVICE(INTEL, DID_MTL_P_SKU1), (kernel_ulong_t)&mtl_p_cfg }, + { PCI_VDEVICE(INTEL, DID_MTL_P_SKU2), (kernel_ulong_t)&mtl_p_cfg }, + { PCI_VDEVICE(INTEL, DID_MTL_P_SKU3), (kernel_ulong_t)&mtl_p_cfg }, { }, }; MODULE_DEVICE_TABLE(pci, igen6_pci_tbl); +static BLOCKING_NOTIFIER_HEAD(ibecc_err_handler_chain); + +int ibecc_err_register_notifer(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&ibecc_err_handler_chain, nb); +} +EXPORT_SYMBOL_GPL(ibecc_err_register_notifer); + +int ibecc_err_unregister_notifer(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&ibecc_err_handler_chain, nb); +} +EXPORT_SYMBOL_GPL(ibecc_err_unregister_notifer); + static enum dev_type get_width(int dimm_l, u32 mad_dimm) { u32 w = dimm_l ? MAD_DIMM_CH_DLW(mad_dimm) : @@ -545,6 +730,7 @@ static void igen6_output_error(struct decoded_addr *res, enum hw_event_mc_err_type type = ecclog & ECC_ERROR_LOG_UE ? 
HW_EVENT_ERR_UNCORRECTED : HW_EVENT_ERR_CORRECTED; + struct ibecc_err_info e; edac_mc_handle_error(type, mci, 1, res->sys_addr >> PAGE_SHIFT, @@ -552,6 +738,13 @@ static void igen6_output_error(struct decoded_addr *res, ECC_ERROR_LOG_SYND(ecclog), res->channel_idx, res->sub_channel_idx, -1, "", ""); + + /* Notify other handlers for further IBECC error handling */ + memset(&e, 0, sizeof(e)); + e.type = type; + e.sys_addr = res->sys_addr; + e.ecc_log = ecclog; + blocking_notifier_call_chain(&ibecc_err_handler_chain, 0, &e); } static struct gen_pool *ecclog_gen_pool_create(void) @@ -679,8 +872,11 @@ static void ecclog_work_cb(struct work_struct *work) llist_for_each_entry_safe(node, tmp, head, llnode) { memset(&res, 0, sizeof(res)); - eaddr = ECC_ERROR_LOG_ADDR(node->ecclog) << - ECC_ERROR_LOG_ADDR_SHIFT; + if (res_cfg->err_addr) + eaddr = res_cfg->err_addr(node->ecclog); + else + eaddr = ECC_ERROR_LOG_ADDR(node->ecclog) << + ECC_ERROR_LOG_ADDR_SHIFT; res.mc = node->mc; res.sys_addr = res_cfg->err_addr_to_sys_addr(eaddr, res.mc); res.imc_addr = res_cfg->err_addr_to_imc_addr(eaddr, res.mc); @@ -969,22 +1165,8 @@ static int igen6_pci_setup(struct pci_dev *pdev, u64 *mchbar) igen6_tom = u.v & GENMASK_ULL(38, 20); - if (pci_read_config_dword(pdev, MCHBAR_OFFSET, &u.v_lo)) { - igen6_printk(KERN_ERR, "Failed to read lower MCHBAR\n"); - goto fail; - } - - if (pci_read_config_dword(pdev, MCHBAR_OFFSET + 4, &u.v_hi)) { - igen6_printk(KERN_ERR, "Failed to read upper MCHBAR\n"); - goto fail; - } - - if (!(u.v & MCHBAR_EN)) { - igen6_printk(KERN_ERR, "MCHBAR is disabled\n"); + if (get_mchbar(pdev, mchbar)) goto fail; - } - - *mchbar = MCHBAR_BASE(u.v); #ifdef CONFIG_EDAC_DEBUG if (pci_read_config_dword(pdev, TOUUD_OFFSET, &u.v_lo)) diff --git a/drivers/edac/igen6_edac.h b/drivers/edac/igen6_edac.h new file mode 100644 index 0000000000000..ca447593bdf8a --- /dev/null +++ b/drivers/edac/igen6_edac.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Registration for 
IBECC error notification + * Copyright (C) 2020 Intel Corporation + */ + +#ifndef _IGEN6_EDAC_H +#define _IGEN6_EDAC_H + +#include +#include + +struct ibecc_err_info { + enum hw_event_mc_err_type type; + u64 sys_addr; + u64 ecc_log; +}; + +int ibecc_err_register_notifer(struct notifier_block *nb); +int ibecc_err_unregister_notifer(struct notifier_block *nb); + +#endif /* _IGEN6_EDAC_H */