From 838cda3697073982acd276ac43387b2a0aed04b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FChristian=3D20K=3DC3=3DB6nig=3F=3D?= Date: Tue, 16 Jan 2018 10:43:17 +0100 Subject: [PATCH 01/62] x86/PCI: Enable AMD 64-bit window on resume MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reenable the 64-bit window during resume. Fixes: fa564ad96366 ("x86/PCI: Enable a 64bit BAR on AMD Family 15h (Models 00-1f, 30-3f, 60-7f)") Reported-by: Tom St Denis Signed-off-by: Christian König Signed-off-by: Bjorn Helgaas --- arch/x86/pci/fixup.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index f6a26e3cb47633..54ef19e90705aa 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -662,11 +662,11 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2033, quirk_no_aersid); */ static void pci_amd_enable_64bit_bar(struct pci_dev *dev) { + static const char *name = "PCI Bus 0000:00"; + struct resource *res, *conflict; u32 base, limit, high; struct pci_dev *other; - struct resource *res; unsigned i; - int r; if (!(pci_probe & PCI_BIG_ROOT_WINDOW)) return; @@ -707,21 +707,26 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev) * Allocate a 256GB window directly below the 0xfd00000000 hardware * limit (see AMD Family 15h Models 30h-3Fh BKDG, sec 2.4.6). */ - res->name = "PCI Bus 0000:00"; + res->name = name; res->flags = IORESOURCE_PREFETCH | IORESOURCE_MEM | IORESOURCE_MEM_64 | IORESOURCE_WINDOW; res->start = 0xbd00000000ull; res->end = 0xfd00000000ull - 1; - r = request_resource(&iomem_resource, res); - if (r) { + conflict = request_resource_conflict(&iomem_resource, res); + if (conflict) { kfree(res); - return; - } + if (conflict->name != name) + return; - dev_info(&dev->dev, "adding root bus resource %pR (tainting kernel)\n", - res); - add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); + /* We are resuming from suspend; just reenable the window */ + res = conflict; + } else { + dev_info(&dev->dev, "adding root bus resource %pR (tainting kernel)\n", + res); + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); + pci_bus_add_resource(dev->bus, res, 0); + } base = ((res->start >> 8) & AMD_141b_MMIO_BASE_MMIOBASE_MASK) | AMD_141b_MMIO_BASE_RE_MASK | AMD_141b_MMIO_BASE_WE_MASK; @@ -733,13 +738,16 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev) pci_write_config_dword(dev, AMD_141b_MMIO_HIGH(i), high); pci_write_config_dword(dev, AMD_141b_MMIO_LIMIT(i), limit); pci_write_config_dword(dev, AMD_141b_MMIO_BASE(i), base); - - pci_bus_add_resource(dev->bus, res, 0); } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x1401, pci_amd_enable_64bit_bar); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x141b, pci_amd_enable_64bit_bar); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x1571, pci_amd_enable_64bit_bar); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x15b1, pci_amd_enable_64bit_bar); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x1601, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x1401, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x141b, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x1571, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x15b1, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x1601, pci_amd_enable_64bit_bar); #endif From cc01572e2fb080e279ca625f239aca61f435ebf3 Mon Sep 17 00:00:00 2001 From: Yossi Kuperman Date: Wed, 17 Jan 2018 15:52:41 +0200 Subject: [PATCH 02/62] xfrm: Add SA to hardware at the end of xfrm_state_construct() Current code configures the hardware with a new SA before the state has been fully initialized. During this time interval, an incoming ESP packet can cause a crash due to a NULL dereference. More specifically, xfrm_input() considers the packet as valid, and yet, anti-replay mechanism is not initialized. Move hardware configuration to the end of xfrm_state_construct(), and mark the state as valid once the SA is fully initialized. Fixes: d77e38e612a0 ("xfrm: Add an IPsec hardware offloading API") Signed-off-by: Aviad Yehezkel Signed-off-by: Aviv Heller Signed-off-by: Yossi Kuperman Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 10 +++++++--- net/xfrm/xfrm_user.c | 18 +++++++++++------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 42995741263386..2d486492acdb26 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2272,8 +2272,6 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload) goto error; } - x->km.state = XFRM_STATE_VALID; - error: return err; } @@ -2282,7 +2280,13 @@ EXPORT_SYMBOL(__xfrm_init_state); int xfrm_init_state(struct xfrm_state *x) { - return __xfrm_init_state(x, true, false); + int err; + + err = __xfrm_init_state(x, true, false); + if (!err) + x->km.state = XFRM_STATE_VALID; + + return err; } EXPORT_SYMBOL(xfrm_init_state); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index bdb48e5dba0480..7f52b8eb177db4 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -598,13 +598,6 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, goto error; } - if (attrs[XFRMA_OFFLOAD_DEV]) { - err = xfrm_dev_state_add(net, x, - nla_data(attrs[XFRMA_OFFLOAD_DEV])); - if (err) - goto error; - } - if ((err = xfrm_alloc_replay_state_esn(&x->replay_esn, &x->preplay_esn, attrs[XFRMA_REPLAY_ESN_VAL]))) goto error; @@ -620,6 +613,14 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, /* override default values from above */ xfrm_update_ae_params(x, attrs, 0); + /* configure the hardware if offload is requested */ + if (attrs[XFRMA_OFFLOAD_DEV]) { + err = xfrm_dev_state_add(net, x, + nla_data(attrs[XFRMA_OFFLOAD_DEV])); + if (err) + goto error; + } + return x; error: @@ -662,6 +663,9 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } + if (x->km.state == XFRM_STATE_VOID) + x->km.state = XFRM_STATE_VALID; + c.seq = nlh->nlmsg_seq; c.portid = nlh->nlmsg_pid; c.event = nlh->nlmsg_type; From f61145f1a4bd7966aa0b15c5cd3950835b284f55 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Thu, 21 Dec 2017 14:17:22 -0800 Subject: [PATCH 03/62] drm/vc4: Flush the caches before the bin jobs, as well. If the frame samples from a render target that was just written, its cache flush during the binning step may have occurred before the previous frame's RCL was completed. Flush the texture caches again before starting each RCL job to make sure that the sampling of the previous RCL's output is correct. Fixes flickering in the top left of 3DMMES Taiji. Signed-off-by: Eric Anholt Fixes: ca26d28bbaa3 ("drm/vc4: improve throughput by pipelining binning and rendering jobs") Link: https://patchwork.freedesktop.org/patch/msgid/20171221221722.23809-1-eric@anholt.net Reviewed-by: Boris Brezillon --- drivers/gpu/drm/vc4/vc4_gem.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c index 638540943c61a5..e3e868cdee7943 100644 --- a/drivers/gpu/drm/vc4/vc4_gem.c +++ b/drivers/gpu/drm/vc4/vc4_gem.c @@ -436,6 +436,19 @@ vc4_flush_caches(struct drm_device *dev) VC4_SET_FIELD(0xf, V3D_SLCACTL_ICC)); } +static void +vc4_flush_texture_caches(struct drm_device *dev) +{ + struct vc4_dev *vc4 = to_vc4_dev(dev); + + V3D_WRITE(V3D_L2CACTL, + V3D_L2CACTL_L2CCLR); + + V3D_WRITE(V3D_SLCACTL, + VC4_SET_FIELD(0xf, V3D_SLCACTL_T1CC) | + VC4_SET_FIELD(0xf, V3D_SLCACTL_T0CC)); +} + /* Sets the registers for the next job to be actually be executed in * the hardware. * @@ -474,6 +487,14 @@ vc4_submit_next_render_job(struct drm_device *dev) if (!exec) return; + /* A previous RCL may have written to one of our textures, and + * our full cache flush at bin time may have occurred before + * that RCL completed. Flush the texture cache now, but not + * the instructions or uniforms (since we don't write those + * from an RCL). + */ + vc4_flush_texture_caches(dev); + submit_cl(dev, 1, exec->ct1ca, exec->ct1ea); } From 17b11b76b87afe9f8be199d7a5f442497133e2b0 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Thu, 18 Jan 2018 15:58:21 +0100 Subject: [PATCH 04/62] drm/vc4: Fix NULL pointer dereference in vc4_save_hang_state() When saving BOs in the hang state we skip one entry of the kernel_state->bo[] array, thus leaving it to NULL. This leads to a NULL pointer dereference when, later in this function, we iterate over all BOs to check their ->madv state. Fixes: ca26d28bbaa3 ("drm/vc4: improve throughput by pipelining binning and rendering jobs") Cc: Signed-off-by: Boris Brezillon Signed-off-by: Eric Anholt Reviewed-by: Eric Anholt Link: https://patchwork.freedesktop.org/patch/msgid/20180118145821.22344-1-boris.brezillon@free-electrons.com --- drivers/gpu/drm/vc4/vc4_gem.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c index e3e868cdee7943..c94cce96544c8b 100644 --- a/drivers/gpu/drm/vc4/vc4_gem.c +++ b/drivers/gpu/drm/vc4/vc4_gem.c @@ -146,7 +146,7 @@ vc4_save_hang_state(struct drm_device *dev) struct vc4_exec_info *exec[2]; struct vc4_bo *bo; unsigned long irqflags; - unsigned int i, j, unref_list_count, prev_idx; + unsigned int i, j, k, unref_list_count; kernel_state = kcalloc(1, sizeof(*kernel_state), GFP_KERNEL); if (!kernel_state) @@ -182,7 +182,7 @@ vc4_save_hang_state(struct drm_device *dev) return; } - prev_idx = 0; + k = 0; for (i = 0; i < 2; i++) { if (!exec[i]) continue; @@ -197,7 +197,7 @@ vc4_save_hang_state(struct drm_device *dev) WARN_ON(!refcount_read(&bo->usecnt)); refcount_inc(&bo->usecnt); drm_gem_object_get(&exec[i]->bo[j]->base); - kernel_state->bo[j + prev_idx] = &exec[i]->bo[j]->base; + kernel_state->bo[k++] = &exec[i]->bo[j]->base; } list_for_each_entry(bo, &exec[i]->unref_list, unref_head) { @@ -205,12 +205,12 @@ vc4_save_hang_state(struct drm_device *dev) * because they are naturally unpurgeable. */ drm_gem_object_get(&bo->base.base); - kernel_state->bo[j + prev_idx] = &bo->base.base; - j++; + kernel_state->bo[k++] = &bo->base.base; } - prev_idx = j + 1; } + WARN_ON_ONCE(k != state->bo_count); + if (exec[0]) state->start_bin = exec[0]->ct0ca; if (exec[1]) From aa5dd6fa6f5d4bdc82a67e952bba8ad2e98d77e2 Mon Sep 17 00:00:00 2001 From: Aviad Yehezkel Date: Thu, 18 Jan 2018 15:41:51 +0200 Subject: [PATCH 05/62] xfrm: fix error flow in case of add state fails If add state fails in case of device offload, netdev refcount will be negative since gc task is attempting to dev_free this state. This is fixed by putting NULL in state dev field. Signed-off-by: Aviad Yehezkel Signed-off-by: Boris Pismeny Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_device.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 30e5746085b8fc..ac9477189d1cc2 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -102,6 +102,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, err = dev->xfrmdev_ops->xdo_dev_state_add(x); if (err) { + xso->dev = NULL; dev_put(dev); return err; } From f30fefd8949290f505b3af2ba3c7c4c831f199b3 Mon Sep 17 00:00:00 2001 From: Andi Shyti Date: Mon, 22 Jan 2018 11:23:27 -0800 Subject: [PATCH 06/62] Input: stmfts,s6sy671 - add SPDX identifier Replace the original license statement with the SPDX identifier. Update also the copyright owner adding myself as co-owner of the copyright. Signed-off-by: Andi Shyti Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/s6sy761.c | 15 +++++---------- drivers/input/touchscreen/stmfts.c | 15 +++++---------- 2 files changed, 10 insertions(+), 20 deletions(-) diff --git a/drivers/input/touchscreen/s6sy761.c b/drivers/input/touchscreen/s6sy761.c index 26b1cb8a88ece3..675efa93d4448b 100644 --- a/drivers/input/touchscreen/s6sy761.c +++ b/drivers/input/touchscreen/s6sy761.c @@ -1,13 +1,8 @@ -/* - * Copyright (c) 2017 Samsung Electronics Co., Ltd. - * Author: Andi Shyti - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Samsung S6SY761 Touchscreen device driver - */ +// SPDX-License-Identifier: GPL-2.0 +// Samsung S6SY761 Touchscreen device driver +// +// Copyright (c) 2017 Samsung Electronics Co., Ltd. +// Copyright (c) 2017 Andi Shyti #include #include diff --git a/drivers/input/touchscreen/stmfts.c b/drivers/input/touchscreen/stmfts.c index c12d0189993965..2a123e20a42e39 100644 --- a/drivers/input/touchscreen/stmfts.c +++ b/drivers/input/touchscreen/stmfts.c @@ -1,13 +1,8 @@ -/* - * Copyright (c) 2017 Samsung Electronics Co., Ltd. - * Author: Andi Shyti - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * STMicroelectronics FTS Touchscreen device driver - */ +// SPDX-License-Identifier: GPL-2.0 +// STMicroelectronics FTS Touchscreen device driver +// +// Copyright (c) 2017 Samsung Electronics Co., Ltd. +// Copyright (c) 2017 Andi Shyti #include #include From e5c9c6a885fad00aa559b49d8fc23a60e290824e Mon Sep 17 00:00:00 2001 From: Mark Furneaux Date: Mon, 22 Jan 2018 11:24:17 -0800 Subject: [PATCH 07/62] Input: xpad - add support for PDP Xbox One controllers Adds support for the current lineup of Xbox One controllers from PDP (Performance Designed Products). These controllers are very picky with their initialization sequence and require an additional 2 packets before they send any input reports. Signed-off-by: Mark Furneaux Reviewed-by: Cameron Gutman Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov --- drivers/input/joystick/xpad.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index d86e59515b9c9e..d88d3e0f59fb83 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -229,6 +229,7 @@ static const struct xpad_device { { 0x0e6f, 0x0213, "Afterglow Gamepad for Xbox 360", 0, XTYPE_XBOX360 }, { 0x0e6f, 0x021f, "Rock Candy Gamepad for Xbox 360", 0, XTYPE_XBOX360 }, { 0x0e6f, 0x0246, "Rock Candy Gamepad for Xbox One 2015", 0, XTYPE_XBOXONE }, + { 0x0e6f, 0x02ab, "PDP Controller for Xbox One", 0, XTYPE_XBOXONE }, { 0x0e6f, 0x0301, "Logic3 Controller", 0, XTYPE_XBOX360 }, { 0x0e6f, 0x0346, "Rock Candy Gamepad for Xbox One 2016", 0, XTYPE_XBOXONE }, { 0x0e6f, 0x0401, "Logic3 Controller", 0, XTYPE_XBOX360 }, @@ -475,6 +476,22 @@ static const u8 xboxone_hori_init[] = { 0x00, 0x00, 0x00, 0x80, 0x00 }; +/* + * This packet is required for some of the PDP pads to start + * sending input reports. One of those pads is (0x0e6f:0x02ab). + */ +static const u8 xboxone_pdp_init1[] = { + 0x0a, 0x20, 0x00, 0x03, 0x00, 0x01, 0x14 +}; + +/* + * This packet is required for some of the PDP pads to start + * sending input reports. One of those pads is (0x0e6f:0x02ab). + */ +static const u8 xboxone_pdp_init2[] = { + 0x06, 0x20, 0x00, 0x02, 0x01, 0x00 +}; + /* * A specific rumble packet is required for some PowerA pads to start * sending input reports. One of those pads is (0x24c6:0x543a). @@ -505,6 +522,8 @@ static const struct xboxone_init_packet xboxone_init_packets[] = { XBOXONE_INIT_PKT(0x0e6f, 0x0165, xboxone_hori_init), XBOXONE_INIT_PKT(0x0f0d, 0x0067, xboxone_hori_init), XBOXONE_INIT_PKT(0x0000, 0x0000, xboxone_fw2015_init), + XBOXONE_INIT_PKT(0x0e6f, 0x02ab, xboxone_pdp_init1), + XBOXONE_INIT_PKT(0x0e6f, 0x02ab, xboxone_pdp_init2), XBOXONE_INIT_PKT(0x24c6, 0x541a, xboxone_rumblebegin_init), XBOXONE_INIT_PKT(0x24c6, 0x542a, xboxone_rumblebegin_init), XBOXONE_INIT_PKT(0x24c6, 0x543a, xboxone_rumblebegin_init), From f5d07b9e98022d50720e38aa936fc11c67868ece Mon Sep 17 00:00:00 2001 From: Aaron Ma Date: Fri, 19 Jan 2018 09:43:39 -0800 Subject: [PATCH 08/62] Input: trackpoint - force 3 buttons if 0 button is reported Lenovo introduced trackpoint compatible sticks with minimum PS/2 commands. They supposed to reply with 0x02, 0x03, or 0x04 in response to the "Read Extended ID" command, so we would know not to try certain extended commands. Unfortunately even some trackpoints reporting the original IBM version (0x01 firmware 0x0e) now respond with incorrect data to the "Get Extended Buttons" command: thinkpad_acpi: ThinkPad BIOS R0DET87W (1.87 ), EC unknown thinkpad_acpi: Lenovo ThinkPad E470, model 20H1004SGE psmouse serio2: trackpoint: IBM TrackPoint firmware: 0x0e, buttons: 0/0 Since there are no trackpoints without buttons, let's assume the trackpoint has 3 buttons when we get 0 response to the extended buttons query. Signed-off-by: Aaron Ma Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=196253 Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/trackpoint.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/input/mouse/trackpoint.c b/drivers/input/mouse/trackpoint.c index 0871010f18d5f4..92a8898682a6d8 100644 --- a/drivers/input/mouse/trackpoint.c +++ b/drivers/input/mouse/trackpoint.c @@ -383,6 +383,9 @@ int trackpoint_detect(struct psmouse *psmouse, bool set_properties) if (trackpoint_read(ps2dev, TP_EXT_BTN, &button_info)) { psmouse_warn(psmouse, "failed to get extended button data, assuming 3 buttons\n"); button_info = 0x33; + } else if (!button_info) { + psmouse_warn(psmouse, "got 0 in extended button data, assuming 3 buttons\n"); + button_info = 0x33; } psmouse->private = kzalloc(sizeof(struct trackpoint_data), GFP_KERNEL); From 2a924d71794c530e55e73d0ce2cc77233307eaa9 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 5 Jan 2018 13:28:47 -0800 Subject: [PATCH 09/62] Input: trackpoint - only expose supported controls for Elan, ALPS and NXP The newer trackpoints from ALPS, Elan and NXP implement a very limited subset of extended commands and controls that the original trackpoints implemented, so we should not be exposing not working controls in sysfs. The newer trackpoints also do not implement "Power On Reset" or "Read Extended Button Status", so we should not be using these commands during initialization. While we are at it, let's change "unsigned char" to u8 for byte data or bool for booleans and use better suited error codes instead of -1. Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/trackpoint.c | 248 +++++++++++++++++++------------ drivers/input/mouse/trackpoint.h | 34 +++-- 2 files changed, 172 insertions(+), 110 deletions(-) diff --git a/drivers/input/mouse/trackpoint.c b/drivers/input/mouse/trackpoint.c index 92a8898682a6d8..bbd29220dbe998 100644 --- a/drivers/input/mouse/trackpoint.c +++ b/drivers/input/mouse/trackpoint.c @@ -19,6 +19,13 @@ #include "psmouse.h" #include "trackpoint.h" +static const char * const trackpoint_variants[] = { + [TP_VARIANT_IBM] = "IBM", + [TP_VARIANT_ALPS] = "ALPS", + [TP_VARIANT_ELAN] = "Elan", + [TP_VARIANT_NXP] = "NXP", +}; + /* * Power-on Reset: Resets all trackpoint parameters, including RAM values, * to defaults. @@ -26,7 +33,7 @@ */ static int trackpoint_power_on_reset(struct ps2dev *ps2dev) { - unsigned char results[2]; + u8 results[2]; int tries = 0; /* Issue POR command, and repeat up to once if 0xFC00 received */ @@ -38,7 +45,7 @@ static int trackpoint_power_on_reset(struct ps2dev *ps2dev) /* Check for success response -- 0xAA00 */ if (results[0] != 0xAA || results[1] != 0x00) - return -1; + return -ENODEV; return 0; } @@ -46,8 +53,7 @@ static int trackpoint_power_on_reset(struct ps2dev *ps2dev) /* * Device IO: read, write and toggle bit */ -static int trackpoint_read(struct ps2dev *ps2dev, - unsigned char loc, unsigned char *results) +static int trackpoint_read(struct ps2dev *ps2dev, u8 loc, u8 *results) { if (ps2_command(ps2dev, NULL, MAKE_PS2_CMD(0, 0, TP_COMMAND)) || ps2_command(ps2dev, results, MAKE_PS2_CMD(0, 1, loc))) { @@ -57,8 +63,7 @@ static int trackpoint_read(struct ps2dev *ps2dev, return 0; } -static int trackpoint_write(struct ps2dev *ps2dev, - unsigned char loc, unsigned char val) +static int trackpoint_write(struct ps2dev *ps2dev, u8 loc, u8 val) { if (ps2_command(ps2dev, NULL, MAKE_PS2_CMD(0, 0, TP_COMMAND)) || ps2_command(ps2dev, NULL, MAKE_PS2_CMD(0, 0, TP_WRITE_MEM)) || @@ -70,8 +75,7 @@ static int trackpoint_write(struct ps2dev *ps2dev, return 0; } -static int trackpoint_toggle_bit(struct ps2dev *ps2dev, - unsigned char loc, unsigned char mask) +static int trackpoint_toggle_bit(struct ps2dev *ps2dev, u8 loc, u8 mask) { /* Bad things will happen if the loc param isn't in this range */ if (loc < 0x20 || loc >= 0x2F) @@ -87,11 +91,11 @@ static int trackpoint_toggle_bit(struct ps2dev *ps2dev, return 0; } -static int trackpoint_update_bit(struct ps2dev *ps2dev, unsigned char loc, - unsigned char mask, unsigned char value) +static int trackpoint_update_bit(struct ps2dev *ps2dev, + u8 loc, u8 mask, u8 value) { int retval = 0; - unsigned char data; + u8 data; trackpoint_read(ps2dev, loc, &data); if (((data & mask) == mask) != !!value) @@ -105,17 +109,18 @@ static int trackpoint_update_bit(struct ps2dev *ps2dev, unsigned char loc, */ struct trackpoint_attr_data { size_t field_offset; - unsigned char command; - unsigned char mask; - unsigned char inverted; - unsigned char power_on_default; + u8 command; + u8 mask; + bool inverted; + u8 power_on_default; }; -static ssize_t trackpoint_show_int_attr(struct psmouse *psmouse, void *data, char *buf) +static ssize_t trackpoint_show_int_attr(struct psmouse *psmouse, + void *data, char *buf) { struct trackpoint_data *tp = psmouse->private; struct trackpoint_attr_data *attr = data; - unsigned char value = *(unsigned char *)((char *)tp + attr->field_offset); + u8 value = *(u8 *)((void *)tp + attr->field_offset); if (attr->inverted) value = !value; @@ -128,8 +133,8 @@ static ssize_t trackpoint_set_int_attr(struct psmouse *psmouse, void *data, { struct trackpoint_data *tp = psmouse->private; struct trackpoint_attr_data *attr = data; - unsigned char *field = (unsigned char *)((char *)tp + attr->field_offset); - unsigned char value; + u8 *field = (void *)tp + attr->field_offset; + u8 value; int err; err = kstrtou8(buf, 10, &value); @@ -157,17 +162,14 @@ static ssize_t trackpoint_set_bit_attr(struct psmouse *psmouse, void *data, { struct trackpoint_data *tp = psmouse->private; struct trackpoint_attr_data *attr = data; - unsigned char *field = (unsigned char *)((char *)tp + attr->field_offset); - unsigned int value; + bool *field = (void *)tp + attr->field_offset; + bool value; int err; - err = kstrtouint(buf, 10, &value); + err = kstrtobool(buf, &value); if (err) return err; - if (value > 1) - return -EINVAL; - if (attr->inverted) value = !value; @@ -193,30 +195,6 @@ PSMOUSE_DEFINE_ATTR(_name, S_IWUSR | S_IRUGO, \ &trackpoint_attr_##_name, \ trackpoint_show_int_attr, trackpoint_set_bit_attr) -#define TRACKPOINT_UPDATE_BIT(_psmouse, _tp, _name) \ -do { \ - struct trackpoint_attr_data *_attr = &trackpoint_attr_##_name; \ - \ - trackpoint_update_bit(&_psmouse->ps2dev, \ - _attr->command, _attr->mask, _tp->_name); \ -} while (0) - -#define TRACKPOINT_UPDATE(_power_on, _psmouse, _tp, _name) \ -do { \ - if (!_power_on || \ - _tp->_name != trackpoint_attr_##_name.power_on_default) { \ - if (!trackpoint_attr_##_name.mask) \ - trackpoint_write(&_psmouse->ps2dev, \ - trackpoint_attr_##_name.command, \ - _tp->_name); \ - else \ - TRACKPOINT_UPDATE_BIT(_psmouse, _tp, _name); \ - } \ -} while (0) - -#define TRACKPOINT_SET_POWER_ON_DEFAULT(_tp, _name) \ - (_tp->_name = trackpoint_attr_##_name.power_on_default) - TRACKPOINT_INT_ATTR(sensitivity, TP_SENS, TP_DEF_SENS); TRACKPOINT_INT_ATTR(speed, TP_SPEED, TP_DEF_SPEED); TRACKPOINT_INT_ATTR(inertia, TP_INERTIA, TP_DEF_INERTIA); @@ -229,13 +207,33 @@ TRACKPOINT_INT_ATTR(ztime, TP_Z_TIME, TP_DEF_Z_TIME); TRACKPOINT_INT_ATTR(jenks, TP_JENKS_CURV, TP_DEF_JENKS_CURV); TRACKPOINT_INT_ATTR(drift_time, TP_DRIFT_TIME, TP_DEF_DRIFT_TIME); -TRACKPOINT_BIT_ATTR(press_to_select, TP_TOGGLE_PTSON, TP_MASK_PTSON, 0, +TRACKPOINT_BIT_ATTR(press_to_select, TP_TOGGLE_PTSON, TP_MASK_PTSON, false, TP_DEF_PTSON); -TRACKPOINT_BIT_ATTR(skipback, TP_TOGGLE_SKIPBACK, TP_MASK_SKIPBACK, 0, +TRACKPOINT_BIT_ATTR(skipback, TP_TOGGLE_SKIPBACK, TP_MASK_SKIPBACK, false, TP_DEF_SKIPBACK); -TRACKPOINT_BIT_ATTR(ext_dev, TP_TOGGLE_EXT_DEV, TP_MASK_EXT_DEV, 1, +TRACKPOINT_BIT_ATTR(ext_dev, TP_TOGGLE_EXT_DEV, TP_MASK_EXT_DEV, true, TP_DEF_EXT_DEV); +static bool trackpoint_is_attr_available(struct psmouse *psmouse, + struct attribute *attr) +{ + struct trackpoint_data *tp = psmouse->private; + + return tp->variant_id == TP_VARIANT_IBM || + attr == &psmouse_attr_sensitivity.dattr.attr || + attr == &psmouse_attr_press_to_select.dattr.attr; +} + +static umode_t trackpoint_is_attr_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct serio *serio = to_serio_port(dev); + struct psmouse *psmouse = serio_get_drvdata(serio); + + return trackpoint_is_attr_available(psmouse, attr) ? attr->mode : 0; +} + static struct attribute *trackpoint_attrs[] = { &psmouse_attr_sensitivity.dattr.attr, &psmouse_attr_speed.dattr.attr, @@ -255,24 +253,56 @@ static struct attribute *trackpoint_attrs[] = { }; static struct attribute_group trackpoint_attr_group = { - .attrs = trackpoint_attrs, + .is_visible = trackpoint_is_attr_visible, + .attrs = trackpoint_attrs, }; -static int trackpoint_start_protocol(struct psmouse *psmouse, unsigned char *firmware_id) -{ - unsigned char param[2] = { 0 }; +#define TRACKPOINT_UPDATE(_power_on, _psmouse, _tp, _name) \ +do { \ + struct trackpoint_attr_data *_attr = &trackpoint_attr_##_name; \ + \ + if ((!_power_on || _tp->_name != _attr->power_on_default) && \ + trackpoint_is_attr_available(_psmouse, \ + &psmouse_attr_##_name.dattr.attr)) { \ + if (!_attr->mask) \ + trackpoint_write(&_psmouse->ps2dev, \ + _attr->command, _tp->_name); \ + else \ + trackpoint_update_bit(&_psmouse->ps2dev, \ + _attr->command, _attr->mask, \ + _tp->_name); \ + } \ +} while (0) - if (ps2_command(&psmouse->ps2dev, param, MAKE_PS2_CMD(0, 2, TP_READ_ID))) - return -1; +#define TRACKPOINT_SET_POWER_ON_DEFAULT(_tp, _name) \ +do { \ + _tp->_name = trackpoint_attr_##_name.power_on_default; \ +} while (0) - /* add new TP ID. */ - if (!(param[0] & TP_MAGIC_IDENT)) - return -1; +static int trackpoint_start_protocol(struct psmouse *psmouse, + u8 *variant_id, u8 *firmware_id) +{ + u8 param[2] = { 0 }; + int error; - if (firmware_id) - *firmware_id = param[1]; + error = ps2_command(&psmouse->ps2dev, + param, MAKE_PS2_CMD(0, 2, TP_READ_ID)); + if (error) + return error; + + switch (param[0]) { + case TP_VARIANT_IBM: + case TP_VARIANT_ALPS: + case TP_VARIANT_ELAN: + case TP_VARIANT_NXP: + if (variant_id) + *variant_id = param[0]; + if (firmware_id) + *firmware_id = param[1]; + return 0; + } - return 0; + return -ENODEV; } /* @@ -285,7 +315,7 @@ static int trackpoint_sync(struct psmouse *psmouse, bool in_power_on_state) { struct trackpoint_data *tp = psmouse->private; - if (!in_power_on_state) { + if (!in_power_on_state && tp->variant_id == TP_VARIANT_IBM) { /* * Disable features that may make device unusable * with this driver. @@ -347,7 +377,8 @@ static void trackpoint_defaults(struct trackpoint_data *tp) static void trackpoint_disconnect(struct psmouse *psmouse) { - sysfs_remove_group(&psmouse->ps2dev.serio->dev.kobj, &trackpoint_attr_group); + device_remove_group(&psmouse->ps2dev.serio->dev, + &trackpoint_attr_group); kfree(psmouse->private); psmouse->private = NULL; @@ -355,14 +386,20 @@ static void trackpoint_disconnect(struct psmouse *psmouse) static int trackpoint_reconnect(struct psmouse *psmouse) { - int reset_fail; + struct trackpoint_data *tp = psmouse->private; + int error; + bool was_reset; - if (trackpoint_start_protocol(psmouse, NULL)) - return -1; + error = trackpoint_start_protocol(psmouse, NULL, NULL); + if (error) + return error; - reset_fail = trackpoint_power_on_reset(&psmouse->ps2dev); - if (trackpoint_sync(psmouse, !reset_fail)) - return -1; + was_reset = tp->variant_id == TP_VARIANT_IBM && + trackpoint_power_on_reset(&psmouse->ps2dev) == 0; + + error = trackpoint_sync(psmouse, was_reset); + if (error) + return error; return 0; } @@ -370,49 +407,66 @@ static int trackpoint_reconnect(struct psmouse *psmouse) int trackpoint_detect(struct psmouse *psmouse, bool set_properties) { struct ps2dev *ps2dev = &psmouse->ps2dev; - unsigned char firmware_id; - unsigned char button_info; + struct trackpoint_data *tp; + u8 variant_id; + u8 firmware_id; + u8 button_info; int error; - if (trackpoint_start_protocol(psmouse, &firmware_id)) - return -1; + error = trackpoint_start_protocol(psmouse, &variant_id, &firmware_id); + if (error) + return error; if (!set_properties) return 0; - if (trackpoint_read(ps2dev, TP_EXT_BTN, &button_info)) { - psmouse_warn(psmouse, "failed to get extended button data, assuming 3 buttons\n"); - button_info = 0x33; - } else if (!button_info) { - psmouse_warn(psmouse, "got 0 in extended button data, assuming 3 buttons\n"); - button_info = 0x33; - } - - psmouse->private = kzalloc(sizeof(struct trackpoint_data), GFP_KERNEL); - if (!psmouse->private) + tp = kzalloc(sizeof(*tp), GFP_KERNEL); + if (!tp) return -ENOMEM; - psmouse->vendor = "IBM"; + trackpoint_defaults(tp); + tp->variant_id = variant_id; + tp->firmware_id = firmware_id; + + psmouse->private = tp; + + psmouse->vendor = trackpoint_variants[variant_id]; psmouse->name = "TrackPoint"; psmouse->reconnect = trackpoint_reconnect; psmouse->disconnect = trackpoint_disconnect; + if (variant_id != TP_VARIANT_IBM) { + /* Newer variants do not support extended button query. */ + button_info = 0x33; + } else { + error = trackpoint_read(ps2dev, TP_EXT_BTN, &button_info); + if (error) { + psmouse_warn(psmouse, + "failed to get extended button data, assuming 3 buttons\n"); + button_info = 0x33; + } else if (!button_info) { + psmouse_warn(psmouse, + "got 0 in extended button data, assuming 3 buttons\n"); + button_info = 0x33; + } + } + if ((button_info & 0x0f) >= 3) - __set_bit(BTN_MIDDLE, psmouse->dev->keybit); + input_set_capability(psmouse->dev, EV_KEY, BTN_MIDDLE); __set_bit(INPUT_PROP_POINTER, psmouse->dev->propbit); __set_bit(INPUT_PROP_POINTING_STICK, psmouse->dev->propbit); - trackpoint_defaults(psmouse->private); - - error = trackpoint_power_on_reset(ps2dev); - - /* Write defaults to TP only if reset fails. */ - if (error) + if (variant_id != TP_VARIANT_IBM || + trackpoint_power_on_reset(ps2dev) != 0) { + /* + * Write defaults to TP if we did not reset the trackpoint. + */ trackpoint_sync(psmouse, false); + } - error = sysfs_create_group(&ps2dev->serio->dev.kobj, &trackpoint_attr_group); + error = device_add_group(&ps2dev->serio->dev, &trackpoint_attr_group); if (error) { psmouse_err(psmouse, "failed to create sysfs attributes, error: %d\n", @@ -423,8 +477,8 @@ int trackpoint_detect(struct psmouse *psmouse, bool set_properties) } psmouse_info(psmouse, - "IBM TrackPoint firmware: 0x%02x, buttons: %d/%d\n", - firmware_id, + "%s TrackPoint firmware: 0x%02x, buttons: %d/%d\n", + psmouse->vendor, firmware_id, (button_info & 0xf0) >> 4, button_info & 0x0f); return 0; diff --git a/drivers/input/mouse/trackpoint.h b/drivers/input/mouse/trackpoint.h index 88055755f82e23..10a0391482343d 100644 --- a/drivers/input/mouse/trackpoint.h +++ b/drivers/input/mouse/trackpoint.h @@ -21,10 +21,16 @@ #define TP_COMMAND 0xE2 /* Commands start with this */ #define TP_READ_ID 0xE1 /* Sent for device identification */ -#define TP_MAGIC_IDENT 0x03 /* Sent after a TP_READ_ID followed */ - /* by the firmware ID */ - /* Firmware ID includes 0x1, 0x2, 0x3 */ +/* + * Valid first byte responses to the "Read Secondary ID" (0xE1) command. + * 0x01 was the original IBM trackpoint, others implement very limited + * subset of trackpoint features. + */ +#define TP_VARIANT_IBM 0x01 +#define TP_VARIANT_ALPS 0x02 +#define TP_VARIANT_ELAN 0x03 +#define TP_VARIANT_NXP 0x04 /* * Commands @@ -136,18 +142,20 @@ #define MAKE_PS2_CMD(params, results, cmd) ((params<<12) | (results<<8) | (cmd)) -struct trackpoint_data -{ - unsigned char sensitivity, speed, inertia, reach; - unsigned char draghys, mindrag; - unsigned char thresh, upthresh; - unsigned char ztime, jenks; - unsigned char drift_time; +struct trackpoint_data { + u8 variant_id; + u8 firmware_id; + + u8 sensitivity, speed, inertia, reach; + u8 draghys, mindrag; + u8 thresh, upthresh; + u8 ztime, jenks; + u8 drift_time; /* toggles */ - unsigned char press_to_select; - unsigned char skipback; - unsigned char ext_dev; + bool press_to_select; + bool skipback; + bool ext_dev; }; #ifdef CONFIG_MOUSE_PS2_TRACKPOINT From 5efec5c655dd31944af440ff2b0a93facc4a7762 Mon Sep 17 00:00:00 2001 From: Yossi Kuperman Date: Tue, 23 Jan 2018 00:16:21 +0200 Subject: [PATCH 10/62] xfrm: Fix eth_hdr(skb)->h_proto to reflect inner IP version IPSec tunnel mode supports encapsulation of IPv4 over IPv6 and vice-versa. The outer IP header is stripped and the inner IP inherits the original Ethernet header. Tcpdump fails to properly decode the inner packet in case that h_proto is different than the inner IP version. Fix h_proto to reflect the inner IP version. Signed-off-by: Yossi Kuperman Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_mode_tunnel.c | 1 + net/ipv6/xfrm6_mode_tunnel.c | 1 + 2 files changed, 2 insertions(+) diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index e6265e2c274e48..20ca486b3cadf3 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -92,6 +92,7 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) skb_reset_network_header(skb); skb_mac_header_rebuild(skb); + eth_hdr(skb)->h_proto = skb->protocol; err = 0; diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c index 02556e356f87e9..dc93002ff9d1b8 100644 --- a/net/ipv6/xfrm6_mode_tunnel.c +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -92,6 +92,7 @@ static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) skb_reset_network_header(skb); skb_mac_header_rebuild(skb); + eth_hdr(skb)->h_proto = skb->protocol; err = 0; From 545d8ae7affff7fb4f8bfd327c7c7790056535c4 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 22 Jan 2018 16:34:09 -0600 Subject: [PATCH 11/62] xfrm: fix boolean assignment in xfrm_get_type_offload Assign true or false to boolean variables instead of an integer value. This issue was detected with the help of Coccinelle. Fixes: ffdb5211da1c ("xfrm: Auto-load xfrm offload modules") Signed-off-by: Gustavo A. R. Silva Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 2d486492acdb26..a3785f538018dc 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -317,7 +317,7 @@ xfrm_get_type_offload(u8 proto, unsigned short family, bool try_load) if (!type && try_load) { request_module("xfrm-offload-%d-%d", family, proto); - try_load = 0; + try_load = false; goto retry; } From e2ac83d74a4d753cea88407e65136c84a0cb60b2 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 22 Jan 2018 22:07:46 -0600 Subject: [PATCH 12/62] x86/ftrace: Fix ORC unwinding from ftrace handlers Steven Rostedt discovered that the ftrace stack tracer is broken when it's used with the ORC unwinder. The problem is that objtool is instructed by the Makefile to ignore the ftrace_64.S code, so it doesn't generate any ORC data for it. Fix it by making the asm code objtool-friendly: - Objtool doesn't like the fact that save_mcount_regs pushes RBP at the beginning, but it's never restored (directly, at least). So just skip the original RBP push, which is only needed for frame pointers anyway. - Annotate some functions as normal callable functions with ENTRY/ENDPROC. - Add an empty unwind hint to return_to_handler(). The return address isn't on the stack, so there's nothing ORC can do there. It will just punt in the unlikely case it tries to unwind from that code. With all that fixed, remove the OBJECT_FILES_NON_STANDARD Makefile annotation so objtool can read the file. Link: http://lkml.kernel.org/r/20180123040746.ih4ep3tk4pbjvg7c@treble Reported-by: Steven Rostedt Signed-off-by: Josh Poimboeuf Signed-off-by: Steven Rostedt (VMware) --- arch/x86/kernel/Makefile | 5 ++++- arch/x86/kernel/ftrace_64.S | 24 +++++++++++++++--------- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 81bb565f449740..7e2baf7304ae49 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -29,10 +29,13 @@ KASAN_SANITIZE_stacktrace.o := n KASAN_SANITIZE_paravirt.o := n OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y -OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y OBJECT_FILES_NON_STANDARD_test_nx.o := y OBJECT_FILES_NON_STANDARD_paravirt_patch_$(BITS).o := y +ifdef CONFIG_FRAME_POINTER +OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y +endif + # If instrumentation of this dir is enabled, boot hangs during first second. # Probably could be more selective here, but note that files related to irqs, # boot, dumpstack/stacktrace, etc are either non-interesting or can lead to diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 7cb8ba08beb997..ef61f540cf0af6 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -8,6 +8,7 @@ #include #include #include +#include .code64 .section .entry.text, "ax" @@ -20,7 +21,6 @@ EXPORT_SYMBOL(__fentry__) EXPORT_SYMBOL(mcount) #endif -/* All cases save the original rbp (8 bytes) */ #ifdef CONFIG_FRAME_POINTER # ifdef CC_USING_FENTRY /* Save parent and function stack frames (rip and rbp) */ @@ -31,7 +31,7 @@ EXPORT_SYMBOL(mcount) # endif #else /* No need to save a stack frame */ -# define MCOUNT_FRAME_SIZE 8 +# define MCOUNT_FRAME_SIZE 0 #endif /* CONFIG_FRAME_POINTER */ /* Size of stack used to save mcount regs in save_mcount_regs */ @@ -64,10 +64,10 @@ EXPORT_SYMBOL(mcount) */ .macro save_mcount_regs added=0 - /* Always save the original rbp */ +#ifdef CONFIG_FRAME_POINTER + /* Save the original rbp */ pushq %rbp -#ifdef CONFIG_FRAME_POINTER /* * Stack traces will stop at the ftrace trampoline if the frame pointer * is not set up properly. If fentry is used, we need to save a frame @@ -105,7 +105,11 @@ EXPORT_SYMBOL(mcount) * Save the original RBP. Even though the mcount ABI does not * require this, it helps out callers. */ +#ifdef CONFIG_FRAME_POINTER movq MCOUNT_REG_SIZE-8(%rsp), %rdx +#else + movq %rbp, %rdx +#endif movq %rdx, RBP(%rsp) /* Copy the parent address into %rsi (second parameter) */ @@ -148,7 +152,7 @@ EXPORT_SYMBOL(mcount) ENTRY(function_hook) retq -END(function_hook) +ENDPROC(function_hook) ENTRY(ftrace_caller) /* save_mcount_regs fills in first two parameters */ @@ -184,7 +188,7 @@ GLOBAL(ftrace_graph_call) /* This is weak to keep gas from relaxing the jumps */ WEAK(ftrace_stub) retq -END(ftrace_caller) +ENDPROC(ftrace_caller) ENTRY(ftrace_regs_caller) /* Save the current flags before any operations that can change them */ @@ -255,7 +259,7 @@ GLOBAL(ftrace_regs_caller_end) jmp ftrace_epilogue -END(ftrace_regs_caller) +ENDPROC(ftrace_regs_caller) #else /* ! CONFIG_DYNAMIC_FTRACE */ @@ -313,9 +317,10 @@ ENTRY(ftrace_graph_caller) restore_mcount_regs retq -END(ftrace_graph_caller) +ENDPROC(ftrace_graph_caller) -GLOBAL(return_to_handler) +ENTRY(return_to_handler) + UNWIND_HINT_EMPTY subq $24, %rsp /* Save the return values */ @@ -330,4 +335,5 @@ GLOBAL(return_to_handler) movq (%rsp), %rax addq $24, %rsp JMP_NOSPEC %rdi +END(return_to_handler) #endif From 6be7fa3c74d1e0cd50f2157b5c1524f152bf641e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 22 Jan 2018 22:32:51 -0500 Subject: [PATCH 13/62] ftrace, orc, x86: Handle ftrace dynamically allocated trampolines The function tracer can create a dynamically allocated trampoline that is called by the function mcount or fentry hook that is used to call the function callback that is registered. The problem is that the orc undwinder will bail if it encounters one of these trampolines. This breaks the stack trace of function callbacks, which include the stack tracer and setting the stack trace for individual functions. Since these dynamic trampolines are basically copies of the static ftrace trampolines defined in ftrace_*.S, we do not need to create new orc entries for the dynamic trampolines. Finding the return address on the stack will be identical as the functions that were copied to create the dynamic trampolines. When encountering a ftrace dynamic trampoline, we can just use the orc entry of the ftrace static function that was copied for that trampoline. Signed-off-by: Steven Rostedt (VMware) --- arch/x86/kernel/unwind_orc.c | 48 +++++++++++++++++++++++++++++++++++- include/linux/ftrace.h | 2 ++ kernel/trace/ftrace.c | 29 +++++++++++++--------- 3 files changed, 66 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index be86a865087a6b..1f9188f5357cb3 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -74,8 +74,50 @@ static struct orc_entry *orc_module_find(unsigned long ip) } #endif +#ifdef CONFIG_DYNAMIC_FTRACE +static struct orc_entry *orc_find(unsigned long ip); + +/* + * Ftrace dynamic trampolines do not have orc entries of their own. + * But they are copies of the ftrace entries that are static and + * defined in ftrace_*.S, which do have orc entries. + * + * If the undwinder comes across a ftrace trampoline, then find the + * ftrace function that was used to create it, and use that ftrace + * function's orc entrie, as the placement of the return code in + * the stack will be identical. + */ +static struct orc_entry *orc_ftrace_find(unsigned long ip) +{ + struct ftrace_ops *ops; + unsigned long caller; + + ops = ftrace_ops_trampoline(ip); + if (!ops) + return NULL; + + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) + caller = (unsigned long)ftrace_regs_call; + else + caller = (unsigned long)ftrace_call; + + /* Prevent unlikely recursion */ + if (ip == caller) + return NULL; + + return orc_find(caller); +} +#else +static struct orc_entry *orc_ftrace_find(unsigned long ip) +{ + return NULL; +} +#endif + static struct orc_entry *orc_find(unsigned long ip) { + static struct orc_entry *orc; + if (!orc_init) return NULL; @@ -111,7 +153,11 @@ static struct orc_entry *orc_find(unsigned long ip) __stop_orc_unwind_ip - __start_orc_unwind_ip, ip); /* Module lookup: */ - return orc_module_find(ip); + orc = orc_module_find(ip); + if (orc) + return orc; + + return orc_ftrace_find(ip); } static void orc_sort_swap(void *_a, void *_b, int size) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 2bab81951ced73..3319df9727aa23 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -332,6 +332,8 @@ extern int ftrace_text_reserved(const void *start, const void *end); extern int ftrace_nr_registered_ops(void); +struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr); + bool is_ftrace_trampoline(unsigned long addr); /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ccdf3664e4a9a7..554b517c61a04d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1119,15 +1119,11 @@ static struct ftrace_ops global_ops = { }; /* - * This is used by __kernel_text_address() to return true if the - * address is on a dynamically allocated trampoline that would - * not return true for either core_kernel_text() or - * is_module_text_address(). + * Used by the stack undwinder to know about dynamic ftrace trampolines. */ -bool is_ftrace_trampoline(unsigned long addr) +struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr) { - struct ftrace_ops *op; - bool ret = false; + struct ftrace_ops *op = NULL; /* * Some of the ops may be dynamically allocated, @@ -1144,15 +1140,24 @@ bool is_ftrace_trampoline(unsigned long addr) if (op->trampoline && op->trampoline_size) if (addr >= op->trampoline && addr < op->trampoline + op->trampoline_size) { - ret = true; - goto out; + preempt_enable_notrace(); + return op; } } while_for_each_ftrace_op(op); - - out: preempt_enable_notrace(); - return ret; + return NULL; +} + +/* + * This is used by __kernel_text_address() to return true if the + * address is on a dynamically allocated trampoline that would + * not return true for either core_kernel_text() or + * is_module_text_address(). + */ +bool is_ftrace_trampoline(unsigned long addr) +{ + return ftrace_ops_trampoline(addr) != NULL; } struct ftrace_page { From 2ee5b92a2598d9e403337185fdf88f661dee8616 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 23 Jan 2018 13:25:04 -0500 Subject: [PATCH 14/62] tracing: Update stack trace skipping for ORC unwinder With the addition of ORC unwinder and FRAME POINTER unwinder, the stack trace skipping requirements have changed. I went through the tracing stack trace dumps with ORC and with frame pointers and recalculated the proper values. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 34 +++++++++++--------- kernel/trace/trace_events_trigger.c | 13 ++++++-- kernel/trace/trace_functions.c | 49 +++++++++++++++++++++-------- 3 files changed, 67 insertions(+), 29 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2a8d8a294345a2..8e3f20a18a06da 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2374,6 +2374,15 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) } EXPORT_SYMBOL_GPL(trace_event_buffer_commit); +/* + * Skip 3: + * + * trace_buffer_unlock_commit_regs() + * trace_event_buffer_commit() + * trace_event_raw_event_xxx() +*/ +# define STACK_SKIP 3 + void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct ring_buffer *buffer, struct ring_buffer_event *event, @@ -2383,16 +2392,12 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, __buffer_unlock_commit(buffer, event); /* - * If regs is not set, then skip the following callers: - * trace_buffer_unlock_commit_regs - * event_trigger_unlock_commit - * trace_event_buffer_commit - * trace_event_raw_event_sched_switch + * If regs is not set, then skip the necessary functions. * Note, we can still get here via blktrace, wakeup tracer * and mmiotrace, but that's ok if they lose a function or - * two. They are that meaningful. + * two. They are not that meaningful. */ - ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs); + ftrace_trace_stack(tr, buffer, flags, regs ? 0 : STACK_SKIP, pc, regs); ftrace_trace_userstack(buffer, flags, pc); } @@ -2579,11 +2584,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, trace.skip = skip; /* - * Add two, for this function and the call to save_stack_trace() + * Add one, for this function and the call to save_stack_trace() * If regs is set, then these functions will not be in the way. */ +#ifndef CONFIG_UNWINDER_ORC if (!regs) - trace.skip += 2; + trace.skip++; +#endif /* * Since events can happen in NMIs there's no safe way to @@ -2711,11 +2718,10 @@ void trace_dump_stack(int skip) local_save_flags(flags); - /* - * Skip 3 more, seems to get us at the caller of - * this function. - */ - skip += 3; +#ifndef CONFIG_UNWINDER_ORC + /* Skip 1 to skip this function. */ + skip++; +#endif __ftrace_trace_stack(global_trace.trace_buffer.buffer, flags, skip, preempt_count(), NULL); } diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index f2ac9d44f6c4b1..87411482a46f27 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1123,13 +1123,22 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; } #endif /* CONFIG_TRACER_SNAPSHOT */ #ifdef CONFIG_STACKTRACE +#ifdef CONFIG_UNWINDER_ORC +/* Skip 2: + * event_triggers_post_call() + * trace_event_raw_event_xxx() + */ +# define STACK_SKIP 2 +#else /* - * Skip 3: + * Skip 4: * stacktrace_trigger() * event_triggers_post_call() + * trace_event_buffer_commit() * trace_event_raw_event_xxx() */ -#define STACK_SKIP 3 +#define STACK_SKIP 4 +#endif static void stacktrace_trigger(struct event_trigger_data *data, void *rec) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 27f7ad12c4b1b1..b611cd36e22db8 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -154,6 +154,24 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, preempt_enable_notrace(); } +#ifdef CONFIG_UNWINDER_ORC +/* + * Skip 2: + * + * function_stack_trace_call() + * ftrace_call() + */ +#define STACK_SKIP 2 +#else +/* + * Skip 3: + * __trace_stack() + * function_stack_trace_call() + * ftrace_call() + */ +#define STACK_SKIP 3 +#endif + static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) @@ -180,15 +198,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, if (likely(disabled == 1)) { pc = preempt_count(); trace_function(tr, ip, parent_ip, flags, pc); - /* - * skip over 5 funcs: - * __ftrace_trace_stack, - * __trace_stack, - * function_stack_trace_call - * ftrace_list_func - * ftrace_call - */ - __trace_stack(tr, flags, 5, pc); + __trace_stack(tr, flags, STACK_SKIP, pc); } atomic_dec(&data->disabled); @@ -367,14 +377,27 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, tracer_tracing_off(tr); } +#ifdef CONFIG_UNWINDER_ORC /* - * Skip 4: + * Skip 3: + * + * function_trace_probe_call() + * ftrace_ops_assist_func() + * ftrace_call() + */ +#define FTRACE_STACK_SKIP 3 +#else +/* + * Skip 5: + * + * __trace_stack() * ftrace_stacktrace() * function_trace_probe_call() - * ftrace_ops_list_func() + * ftrace_ops_assist_func() * ftrace_call() */ -#define STACK_SKIP 4 +#define FTRACE_STACK_SKIP 5 +#endif static __always_inline void trace_stack(struct trace_array *tr) { @@ -384,7 +407,7 @@ static __always_inline void trace_stack(struct trace_array *tr) local_save_flags(flags); pc = preempt_count(); - __trace_stack(tr, flags, STACK_SKIP, pc); + __trace_stack(tr, flags, FTRACE_STACK_SKIP, pc); } static void From 02612bb05e51df8489db5e94d0cf8d1c81f87b0c Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 22 Jan 2018 18:06:37 +0100 Subject: [PATCH 15/62] pppoe: take ->needed_headroom of lower device into account on xmit In pppoe_sendmsg(), reserving dev->hard_header_len bytes of headroom was probably fine before the introduction of ->needed_headroom in commit f5184d267c1a ("net: Allow netdevices to specify needed head/tailroom"). But now, virtual devices typically advertise the size of their overhead in dev->needed_headroom, so we must also take it into account in skb_reserve(). Allocation size of skb is also updated to take dev->needed_tailroom into account and replace the arbitrary 32 bytes with the real size of a PPPoE header. This issue was discovered by syzbot, who connected a pppoe socket to a gre device which had dev->header_ops->create == ipgre_header and dev->hard_header_len == 0. Therefore, PPPoE didn't reserve any headroom, and dev_hard_header() crashed when ipgre_header() tried to prepend its header to skb->data. skbuff: skb_under_panic: text:000000001d390b3a len:31 put:24 head:00000000d8ed776f data:000000008150e823 tail:0x7 end:0xc0 dev:gre0 ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:104! invalid opcode: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 3670 Comm: syzkaller801466 Not tainted 4.15.0-rc7-next-20180115+ #97 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:skb_panic+0x162/0x1f0 net/core/skbuff.c:100 RSP: 0018:ffff8801d9bd7840 EFLAGS: 00010282 RAX: 0000000000000083 RBX: ffff8801d4f083c0 RCX: 0000000000000000 RDX: 0000000000000083 RSI: 1ffff1003b37ae92 RDI: ffffed003b37aefc RBP: ffff8801d9bd78a8 R08: 1ffff1003b37ae8a R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000000 R12: ffffffff86200de0 R13: ffffffff84a981ad R14: 0000000000000018 R15: ffff8801d2d34180 FS: 00000000019c4880(0000) GS:ffff8801db300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000208bc000 CR3: 00000001d9111001 CR4: 00000000001606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: skb_under_panic net/core/skbuff.c:114 [inline] skb_push+0xce/0xf0 net/core/skbuff.c:1714 ipgre_header+0x6d/0x4e0 net/ipv4/ip_gre.c:879 dev_hard_header include/linux/netdevice.h:2723 [inline] pppoe_sendmsg+0x58e/0x8b0 drivers/net/ppp/pppoe.c:890 sock_sendmsg_nosec net/socket.c:630 [inline] sock_sendmsg+0xca/0x110 net/socket.c:640 sock_write_iter+0x31a/0x5d0 net/socket.c:909 call_write_iter include/linux/fs.h:1775 [inline] do_iter_readv_writev+0x525/0x7f0 fs/read_write.c:653 do_iter_write+0x154/0x540 fs/read_write.c:932 vfs_writev+0x18a/0x340 fs/read_write.c:977 do_writev+0xfc/0x2a0 fs/read_write.c:1012 SYSC_writev fs/read_write.c:1085 [inline] SyS_writev+0x27/0x30 fs/read_write.c:1082 entry_SYSCALL_64_fastpath+0x29/0xa0 Admittedly PPPoE shouldn't be allowed to run on non Ethernet-like interfaces, but reserving space for ->needed_headroom is a more fundamental issue that needs to be addressed first. Same problem exists for __pppoe_xmit(), which also needs to take dev->needed_headroom into account in skb_cow_head(). Fixes: f5184d267c1a ("net: Allow netdevices to specify needed head/tailroom") Reported-by: syzbot+ed0838d0fa4c4f2b528e20286e6dc63effc7c14d@syzkaller.appspotmail.com Signed-off-by: Guillaume Nault Reviewed-by: Xin Long Signed-off-by: David S. Miller --- drivers/net/ppp/pppoe.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index 4e1da1645b154d..5aa59f41bf8c39 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -842,6 +842,7 @@ static int pppoe_sendmsg(struct socket *sock, struct msghdr *m, struct pppoe_hdr *ph; struct net_device *dev; char *start; + int hlen; lock_sock(sk); if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED)) { @@ -860,16 +861,16 @@ static int pppoe_sendmsg(struct socket *sock, struct msghdr *m, if (total_len > (dev->mtu + dev->hard_header_len)) goto end; - - skb = sock_wmalloc(sk, total_len + dev->hard_header_len + 32, - 0, GFP_KERNEL); + hlen = LL_RESERVED_SPACE(dev); + skb = sock_wmalloc(sk, hlen + sizeof(*ph) + total_len + + dev->needed_tailroom, 0, GFP_KERNEL); if (!skb) { error = -ENOMEM; goto end; } /* Reserve space for headers. */ - skb_reserve(skb, dev->hard_header_len); + skb_reserve(skb, hlen); skb_reset_network_header(skb); skb->dev = dev; @@ -930,7 +931,7 @@ static int __pppoe_xmit(struct sock *sk, struct sk_buff *skb) /* Copy the data if there is no space for the header or if it's * read-only. */ - if (skb_cow_head(skb, sizeof(*ph) + dev->hard_header_len)) + if (skb_cow_head(skb, LL_RESERVED_SPACE(dev) + sizeof(*ph))) goto abort; __skb_push(skb, sizeof(*ph)); From e9191ffb65d8e159680ce0ad2224e1acbde6985c Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 22 Jan 2018 20:06:42 +0000 Subject: [PATCH 16/62] ipv6: Fix getsockopt() for sockets with default IPV6_AUTOFLOWLABEL Commit 513674b5a2c9 ("net: reevalulate autoflowlabel setting after sysctl setting") removed the initialisation of ipv6_pinfo::autoflowlabel and added a second flag to indicate whether this field or the net namespace default should be used. The getsockopt() handling for this case was not updated, so it currently returns 0 for all sockets for which IPV6_AUTOFLOWLABEL is not explicitly enabled. Fix it to return the effective value, whether that has been set at the socket or net namespace level. Fixes: 513674b5a2c9 ("net: reevalulate autoflowlabel setting after sysctl ...") Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- include/net/ipv6.h | 1 + net/ipv6/ip6_output.c | 2 +- net/ipv6/ipv6_sockglue.c | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index f73797e2fa60c5..221238254eb783 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -331,6 +331,7 @@ int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, int flags); int ip6_flowlabel_init(void); void ip6_flowlabel_cleanup(void); +bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np); static inline void fl6_sock_release(struct ip6_flowlabel *fl) { diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 4f7d8de5661147..3763dc01e37477 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -166,7 +166,7 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } -static bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np) +bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np) { if (!np->autoflowlabel_set) return ip6_default_np_autolabel(net); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 2d4680e0376f41..e8ffb5b5d84e62 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -1336,7 +1336,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, break; case IPV6_AUTOFLOWLABEL: - val = np->autoflowlabel; + val = ip6_autoflowlabel(sock_net(sk), np); break; case IPV6_RECVFRAGSIZE: From 848b159835ddef99cc4193083f7e786c3992f580 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 22 Jan 2018 16:06:37 -0500 Subject: [PATCH 17/62] vmxnet3: repair memory leak with the introduction of commit b0eb57cb97e7837ebb746404c2c58c6f536f23fa, it appears that rq->buf_info is improperly handled. While it is heap allocated when an rx queue is setup, and freed when torn down, an old line of code in vmxnet3_rq_destroy was not properly removed, leading to rq->buf_info[0] being set to NULL prior to its being freed, causing a memory leak, which eventually exhausts the system on repeated create/destroy operations (for example, when the mtu of a vmxnet3 interface is changed frequently. Fix is pretty straight forward, just move the NULL set to after the free. Tested by myself with successful results Applies to net, and should likely be queued for stable, please Signed-off-by: Neil Horman Reported-By: boyang@redhat.com CC: boyang@redhat.com CC: Shrikrishna Khare CC: "VMware, Inc." CC: David S. Miller Acked-by: Shrikrishna Khare Signed-off-by: David S. Miller --- drivers/net/vmxnet3/vmxnet3_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index d1c7029ded7cef..cf95290b160c56 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -1616,7 +1616,6 @@ static void vmxnet3_rq_destroy(struct vmxnet3_rx_queue *rq, rq->rx_ring[i].basePA); rq->rx_ring[i].base = NULL; } - rq->buf_info[i] = NULL; } if (rq->data_ring.base) { @@ -1638,6 +1637,7 @@ static void vmxnet3_rq_destroy(struct vmxnet3_rx_queue *rq, (rq->rx_ring[0].size + rq->rx_ring[1].size); dma_free_coherent(&adapter->pdev->dev, sz, rq->buf_info[0], rq->buf_info_pa); + rq->buf_info[0] = rq->buf_info[1] = NULL; } } From a97cb0e7b3f4c6297fd857055ae8e895f402f501 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jan 2018 11:39:47 +0100 Subject: [PATCH 18/62] futex: Fix OWNER_DEAD fixup Both Geert and DaveJ reported that the recent futex commit: c1e2f0eaf015 ("futex: Avoid violating the 10th rule of futex") introduced a problem with setting OWNER_DEAD. We set the bit on an uninitialized variable and then entirely optimize it away as a dead-store. Move the setting of the bit to where it is more useful. Reported-by: Geert Uytterhoeven Reported-by: Dave Jones Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: c1e2f0eaf015 ("futex: Avoid violating the 10th rule of futex") Link: http://lkml.kernel.org/r/20180122103947.GD2228@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/futex.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 8c5424dd59244f..7f719d11090810 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2311,9 +2311,6 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); oldowner = pi_state->owner; - /* Owner died? */ - if (!pi_state->owner) - newtid |= FUTEX_OWNER_DIED; /* * We are here because either: @@ -2374,6 +2371,9 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, } newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; + /* Owner died? */ + if (!pi_state->owner) + newtid |= FUTEX_OWNER_DIED; if (get_futex_value_locked(&uval, uaddr)) goto handle_fault; From 88f1c87de11a86d839f4ce5313e552d96709b990 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jan 2018 14:00:55 -0800 Subject: [PATCH 19/62] locking/lockdep: Avoid triggering hardlockup from debug_show_all_locks() debug_show_all_locks() iterates all tasks and print held locks whole holding tasklist_lock. This can take a while on a slow console device and may end up triggering NMI hardlockup detector if someone else ends up waiting for tasklist_lock. Touch the NMI watchdog while printing the held locks to avoid spuriously triggering the hardlockup detector. Signed-off-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-team@fb.com Link: http://lkml.kernel.org/r/20180122220055.GB1771050@devbig577.frc2.facebook.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 5fa1324a4f29a5..52165904471963 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -49,6 +49,7 @@ #include #include #include +#include #include @@ -4490,6 +4491,7 @@ void debug_show_all_locks(void) if (!unlock) if (read_trylock(&tasklist_lock)) unlock = 1; + touch_nmi_watchdog(); } while_each_thread(g, p); pr_warn("\n"); From ce48c146495a1a50e48cdbfbfaba3e708be7c07c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jan 2018 22:53:28 +0100 Subject: [PATCH 20/62] sched/core: Fix cpu.max vs. cpuhotplug deadlock Tejun reported the following cpu-hotplug lock (percpu-rwsem) read recursion: tg_set_cfs_bandwidth() get_online_cpus() cpus_read_lock() cfs_bandwidth_usage_inc() static_key_slow_inc() cpus_read_lock() Reported-by: Tejun Heo Tested-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180122215328.GP3397@worktop Signed-off-by: Ingo Molnar --- include/linux/jump_label.h | 7 +++++++ kernel/jump_label.c | 12 +++++++++--- kernel/sched/fair.c | 4 ++-- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index c7b368c734af34..e0340ca08d9897 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -160,6 +160,8 @@ extern void arch_jump_label_transform_static(struct jump_entry *entry, extern int jump_label_text_reserved(void *start, void *end); extern void static_key_slow_inc(struct static_key *key); extern void static_key_slow_dec(struct static_key *key); +extern void static_key_slow_inc_cpuslocked(struct static_key *key); +extern void static_key_slow_dec_cpuslocked(struct static_key *key); extern void jump_label_apply_nops(struct module *mod); extern int static_key_count(struct static_key *key); extern void static_key_enable(struct static_key *key); @@ -222,6 +224,9 @@ static inline void static_key_slow_dec(struct static_key *key) atomic_dec(&key->enabled); } +#define static_key_slow_inc_cpuslocked(key) static_key_slow_inc(key) +#define static_key_slow_dec_cpuslocked(key) static_key_slow_dec(key) + static inline int jump_label_text_reserved(void *start, void *end) { return 0; @@ -416,6 +421,8 @@ extern bool ____wrong_branch_error(void); #define static_branch_inc(x) static_key_slow_inc(&(x)->key) #define static_branch_dec(x) static_key_slow_dec(&(x)->key) +#define static_branch_inc_cpuslocked(x) static_key_slow_inc_cpuslocked(&(x)->key) +#define static_branch_dec_cpuslocked(x) static_key_slow_dec_cpuslocked(&(x)->key) /* * Normal usage; boolean enable/disable. diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 8594d24e4adc22..b4517095db6af2 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -79,7 +79,7 @@ int static_key_count(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_count); -static void static_key_slow_inc_cpuslocked(struct static_key *key) +void static_key_slow_inc_cpuslocked(struct static_key *key) { int v, v1; @@ -180,7 +180,7 @@ void static_key_disable(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_disable); -static void static_key_slow_dec_cpuslocked(struct static_key *key, +static void __static_key_slow_dec_cpuslocked(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { @@ -211,7 +211,7 @@ static void __static_key_slow_dec(struct static_key *key, struct delayed_work *work) { cpus_read_lock(); - static_key_slow_dec_cpuslocked(key, rate_limit, work); + __static_key_slow_dec_cpuslocked(key, rate_limit, work); cpus_read_unlock(); } @@ -229,6 +229,12 @@ void static_key_slow_dec(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_slow_dec); +void static_key_slow_dec_cpuslocked(struct static_key *key) +{ + STATIC_KEY_CHECK_USE(key); + __static_key_slow_dec_cpuslocked(key, 0, NULL); +} + void static_key_slow_dec_deferred(struct static_key_deferred *key) { STATIC_KEY_CHECK_USE(key); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2fe3aa853e4dba..26a71ebcd3c2ed 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4365,12 +4365,12 @@ static inline bool cfs_bandwidth_used(void) void cfs_bandwidth_usage_inc(void) { - static_key_slow_inc(&__cfs_bandwidth_used); + static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used); } void cfs_bandwidth_usage_dec(void) { - static_key_slow_dec(&__cfs_bandwidth_used); + static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used); } #else /* HAVE_JUMP_LABEL */ static bool cfs_bandwidth_used(void) From 1df37383a8aeabb9b418698f0bcdffea01f4b1b2 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 22 Jan 2018 17:09:34 -0500 Subject: [PATCH 21/62] x86/retpoline: Remove the esp/rsp thunk It doesn't make sense to have an indirect call thunk with esp/rsp as retpoline code won't work correctly with the stack pointer register. Removing it will help compiler writers to catch error in case such a thunk call is emitted incorrectly. Fixes: 76b043848fd2 ("x86/retpoline: Add initial retpoline support") Suggested-by: Jeff Law Signed-off-by: Waiman Long Signed-off-by: Thomas Gleixner Acked-by: David Woodhouse Cc: Tom Lendacky Cc: Kees Cook Cc: Andi Kleen Cc: Tim Chen Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Arjan van de Ven Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1516658974-27852-1-git-send-email-longman@redhat.com --- arch/x86/include/asm/asm-prototypes.h | 1 - arch/x86/lib/retpoline.S | 1 - 2 files changed, 2 deletions(-) diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index 0927cdc4f94601..1908214b91257f 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -38,5 +38,4 @@ INDIRECT_THUNK(dx) INDIRECT_THUNK(si) INDIRECT_THUNK(di) INDIRECT_THUNK(bp) -INDIRECT_THUNK(sp) #endif /* CONFIG_RETPOLINE */ diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index dfb2ba91b670dc..c909961e678a59 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -36,7 +36,6 @@ GENERATE_THUNK(_ASM_DX) GENERATE_THUNK(_ASM_SI) GENERATE_THUNK(_ASM_DI) GENERATE_THUNK(_ASM_BP) -GENERATE_THUNK(_ASM_SP) #ifdef CONFIG_64BIT GENERATE_THUNK(r8) GENERATE_THUNK(r9) From 40d4071ce2d20840d224b4a77b5dc6f752c9ab15 Mon Sep 17 00:00:00 2001 From: Xiao Liang Date: Mon, 22 Jan 2018 14:12:52 +0800 Subject: [PATCH 22/62] perf/x86/amd/power: Do not load AMD power module on !AMD platforms The AMD power module can be loaded on non AMD platforms, but unload fails with the following Oops: BUG: unable to handle kernel NULL pointer dereference at (null) IP: __list_del_entry_valid+0x29/0x90 Call Trace: perf_pmu_unregister+0x25/0xf0 amd_power_pmu_exit+0x1c/0xd23 [power] SyS_delete_module+0x1a8/0x2b0 ? exit_to_usermode_loop+0x8f/0xb0 entry_SYSCALL_64_fastpath+0x20/0x83 Return -ENODEV instead of 0 from the module init function if the CPU does not match. Fixes: c7ab62bfbe0e ("perf/x86/amd/power: Add AMD accumulated power reporting mechanism") Signed-off-by: Xiao Liang Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180122061252.6394-1-xiliang@redhat.com --- arch/x86/events/amd/power.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/amd/power.c b/arch/x86/events/amd/power.c index a6eee5ac4f581d..2aefacf5c5b2ab 100644 --- a/arch/x86/events/amd/power.c +++ b/arch/x86/events/amd/power.c @@ -277,7 +277,7 @@ static int __init amd_power_pmu_init(void) int ret; if (!x86_match_cpu(cpu_match)) - return 0; + return -ENODEV; if (!boot_cpu_has(X86_FEATURE_ACC_POWER)) return -ENODEV; From 7e702d17ed138cf4ae7c00e8c00681ed464587c7 Mon Sep 17 00:00:00 2001 From: Jia Zhang Date: Tue, 23 Jan 2018 11:41:32 +0100 Subject: [PATCH 23/62] x86/microcode/intel: Extend BDW late-loading further with LLC size check Commit b94b73733171 ("x86/microcode/intel: Extend BDW late-loading with a revision check") reduced the impact of erratum BDF90 for Broadwell model 79. The impact can be reduced further by checking the size of the last level cache portion per core. Tony: "The erratum says the problem only occurs on the large-cache SKUs. So we only need to avoid the update if we are on a big cache SKU that is also running old microcode." For more details, see erratum BDF90 in document #334165 (Intel Xeon Processor E7-8800/4800 v4 Product Family Specification Update) from September 2017. Fixes: b94b73733171 ("x86/microcode/intel: Extend BDW late-loading with a revision check") Signed-off-by: Jia Zhang Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Acked-by: Tony Luck Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1516321542-31161-1-git-send-email-zhang.jia@linux.alibaba.com --- arch/x86/kernel/cpu/microcode/intel.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index d9e460fc7a3b30..f7c55b0e753ad0 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -45,6 +45,9 @@ static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin"; /* Current microcode patch used in early patching on the APs. */ static struct microcode_intel *intel_ucode_patch; +/* last level cache size per core */ +static int llc_size_per_core; + static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1, unsigned int s2, unsigned int p2) { @@ -912,12 +915,14 @@ static bool is_blacklisted(unsigned int cpu) /* * Late loading on model 79 with microcode revision less than 0x0b000021 - * may result in a system hang. This behavior is documented in item - * BDF90, #334165 (Intel Xeon Processor E7-8800/4800 v4 Product Family). + * and LLC size per core bigger than 2.5MB may result in a system hang. + * This behavior is documented in item BDF90, #334165 (Intel Xeon + * Processor E7-8800/4800 v4 Product Family). */ if (c->x86 == 6 && c->x86_model == INTEL_FAM6_BROADWELL_X && c->x86_mask == 0x01 && + llc_size_per_core > 2621440 && c->microcode < 0x0b000021) { pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode); pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); @@ -975,6 +980,15 @@ static struct microcode_ops microcode_intel_ops = { .apply_microcode = apply_microcode_intel, }; +static int __init calc_llc_size_per_core(struct cpuinfo_x86 *c) +{ + u64 llc_size = c->x86_cache_size * 1024; + + do_div(llc_size, c->x86_max_cores); + + return (int)llc_size; +} + struct microcode_ops * __init init_intel_microcode(void) { struct cpuinfo_x86 *c = &boot_cpu_data; @@ -985,5 +999,7 @@ struct microcode_ops * __init init_intel_microcode(void) return NULL; } + llc_size_per_core = calc_llc_size_per_core(c); + return µcode_intel_ops; } From 1d080f096fe33f031d26e19b3ef0146f66b8b0f1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 23 Jan 2018 11:41:33 +0100 Subject: [PATCH 24/62] x86/microcode: Fix again accessing initrd after having been freed Commit 24c2503255d3 ("x86/microcode: Do not access the initrd after it has been freed") fixed attempts to access initrd from the microcode loader after it has been freed. However, a similar KASAN warning was reported (stack trace edited): smpboot: Booting Node 0 Processor 1 APIC 0x11 ================================================================== BUG: KASAN: use-after-free in find_cpio_data+0x9b5/0xa50 Read of size 1 at addr ffff880035ffd000 by task swapper/1/0 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.14.8-slack #7 Hardware name: System manufacturer System Product Name/A88X-PLUS, BIOS 3003 03/10/2016 Call Trace: dump_stack print_address_description kasan_report ? find_cpio_data __asan_report_load1_noabort find_cpio_data find_microcode_in_initrd __load_ucode_amd load_ucode_amd_ap load_ucode_ap After some investigation, it turned out that a merge was done using the wrong side to resolve, leading to picking up the previous state, before the 24c2503255d3 fix. Therefore the Fixes tag below contains a merge commit. Revert the mismerge by catching the save_microcode_in_initrd_amd() retval and thus letting the function exit with the last return statement so that initrd_gone can be set to true. Fixes: f26483eaedec ("Merge branch 'x86/urgent' into x86/microcode, to resolve conflicts") Reported-by: Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://bugzilla.kernel.org/show_bug.cgi?id=198295 Link: https://lkml.kernel.org/r/20180123104133.918-2-bp@alien8.de --- arch/x86/kernel/cpu/microcode/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index c4fa4a85d4cb6f..e4fc595cd6eaa6 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -239,7 +239,7 @@ static int __init save_microcode_in_initrd(void) break; case X86_VENDOR_AMD: if (c->x86 >= 0x10) - return save_microcode_in_initrd_amd(cpuid_eax(1)); + ret = save_microcode_in_initrd_amd(cpuid_eax(1)); break; default: break; From 1de1ea7efeb9e8543212210e34518b4049ccd285 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Fri, 22 Dec 2017 10:54:20 +0100 Subject: [PATCH 25/62] KVM: s390: add proper locking for CMMA migration bitmap Some parts of the cmma migration bitmap is already protected with the kvm->lock (e.g. the migration start). On the other hand the read of the cmma bits is not protected against a concurrent free, neither is the emulation of the ESSA instruction. Let's extend the locking to all related ioctls by using the slots lock for - kvm_s390_vm_start_migration - kvm_s390_vm_stop_migration - kvm_s390_set_cmma_bits - kvm_s390_get_cmma_bits In addition to that, we use synchronize_srcu before freeing the migration structure as all users hold kvm->srcu for read. (e.g. the ESSA handler). Reported-by: David Hildenbrand Signed-off-by: Christian Borntraeger Cc: stable@vger.kernel.org # 4.13+ Fixes: 190df4a212a7 (KVM: s390: CMMA tracking, ESSA emulation, migration mode) Reviewed-by: Claudio Imbrenda Reviewed-by: David Hildenbrand Reviewed-by: Cornelia Huck --- arch/s390/kvm/kvm-s390.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index abcd24fdde3fc9..52880e980a336a 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -766,7 +766,7 @@ static void kvm_s390_sync_request_broadcast(struct kvm *kvm, int req) /* * Must be called with kvm->srcu held to avoid races on memslots, and with - * kvm->lock to avoid races with ourselves and kvm_s390_vm_stop_migration. + * kvm->slots_lock to avoid races with ourselves and kvm_s390_vm_stop_migration. */ static int kvm_s390_vm_start_migration(struct kvm *kvm) { @@ -822,7 +822,7 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm) } /* - * Must be called with kvm->lock to avoid races with ourselves and + * Must be called with kvm->slots_lock to avoid races with ourselves and * kvm_s390_vm_start_migration. */ static int kvm_s390_vm_stop_migration(struct kvm *kvm) @@ -837,6 +837,8 @@ static int kvm_s390_vm_stop_migration(struct kvm *kvm) if (kvm->arch.use_cmma) { kvm_s390_sync_request_broadcast(kvm, KVM_REQ_STOP_MIGRATION); + /* We have to wait for the essa emulation to finish */ + synchronize_srcu(&kvm->srcu); vfree(mgs->pgste_bitmap); } kfree(mgs); @@ -846,14 +848,12 @@ static int kvm_s390_vm_stop_migration(struct kvm *kvm) static int kvm_s390_vm_set_migration(struct kvm *kvm, struct kvm_device_attr *attr) { - int idx, res = -ENXIO; + int res = -ENXIO; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->slots_lock); switch (attr->attr) { case KVM_S390_VM_MIGRATION_START: - idx = srcu_read_lock(&kvm->srcu); res = kvm_s390_vm_start_migration(kvm); - srcu_read_unlock(&kvm->srcu, idx); break; case KVM_S390_VM_MIGRATION_STOP: res = kvm_s390_vm_stop_migration(kvm); @@ -861,7 +861,7 @@ static int kvm_s390_vm_set_migration(struct kvm *kvm, default: break; } - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->slots_lock); return res; } @@ -1751,7 +1751,9 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&args, argp, sizeof(args))) break; + mutex_lock(&kvm->slots_lock); r = kvm_s390_get_cmma_bits(kvm, &args); + mutex_unlock(&kvm->slots_lock); if (!r) { r = copy_to_user(argp, &args, sizeof(args)); if (r) @@ -1765,7 +1767,9 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&args, argp, sizeof(args))) break; + mutex_lock(&kvm->slots_lock); r = kvm_s390_set_cmma_bits(kvm, &args); + mutex_unlock(&kvm->slots_lock); break; } default: From 1ecdaea02ca6bfacf2ecda500dc1af51e9780c42 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Wed, 24 Jan 2018 10:02:09 +0100 Subject: [PATCH 26/62] mlxsw: spectrum_router: Don't log an error on missing neighbor Driver periodically samples all neighbors configured in device in order to update the kernel regarding their state. When finding an entry configured in HW that doesn't show in neigh_lookup() driver logs an error message. This introduces a race when removing multiple neighbors - it's possible that a given entry would still be configured in HW as its removal is still being processed but is already removed from the kernel's neighbor tables. Simply remove the error message and gracefully accept such events. Fixes: c723c735fa6b ("mlxsw: spectrum_router: Periodically update the kernel's neigh table") Fixes: 60f040ca11b9 ("mlxsw: spectrum_router: Periodically dump active IPv6 neighbours") Signed-off-by: Yuval Mintz Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 6c0391c13fe002..7042c855a5d6b2 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -1942,11 +1942,8 @@ static void mlxsw_sp_router_neigh_ent_ipv4_process(struct mlxsw_sp *mlxsw_sp, dipn = htonl(dip); dev = mlxsw_sp->router->rifs[rif]->dev; n = neigh_lookup(&arp_tbl, &dipn, dev); - if (!n) { - netdev_err(dev, "Failed to find matching neighbour for IP=%pI4h\n", - &dip); + if (!n) return; - } netdev_dbg(dev, "Updating neighbour with IP=%pI4h\n", &dip); neigh_event_send(n, NULL); @@ -1973,11 +1970,8 @@ static void mlxsw_sp_router_neigh_ent_ipv6_process(struct mlxsw_sp *mlxsw_sp, dev = mlxsw_sp->router->rifs[rif]->dev; n = neigh_lookup(&nd_tbl, &dip, dev); - if (!n) { - netdev_err(dev, "Failed to find matching neighbour for IP=%pI6c\n", - &dip); + if (!n) return; - } netdev_dbg(dev, "Updating neighbour with IP=%pI6c\n", &dip); neigh_event_send(n, NULL); From 5132ede0fe8092b043dae09a7cc32b8ae7272baa Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 24 Jan 2018 15:28:17 +0100 Subject: [PATCH 27/62] Revert "module: Add retpoline tag to VERMAGIC" This reverts commit 6cfb521ac0d5b97470883ff9b7facae264b7ab12. Turns out distros do not want to make retpoline as part of their "ABI", so this patch should not have been merged. Sorry Andi, this was my fault, I suggested it when your original patch was the "correct" way of doing this instead. Reported-by: Jiri Kosina Fixes: 6cfb521ac0d5 ("module: Add retpoline tag to VERMAGIC") Acked-by: Andi Kleen Cc: Thomas Gleixner Cc: David Woodhouse Cc: rusty@rustcorp.com.au Cc: arjan.van.de.ven@intel.com Cc: jeyu@kernel.org Cc: stable Signed-off-by: Greg Kroah-Hartman Signed-off-by: Linus Torvalds --- include/linux/vermagic.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/include/linux/vermagic.h b/include/linux/vermagic.h index 853291714ae0b9..bae807eb2933f9 100644 --- a/include/linux/vermagic.h +++ b/include/linux/vermagic.h @@ -31,17 +31,11 @@ #else #define MODULE_RANDSTRUCT_PLUGIN #endif -#ifdef RETPOLINE -#define MODULE_VERMAGIC_RETPOLINE "retpoline " -#else -#define MODULE_VERMAGIC_RETPOLINE "" -#endif #define VERMAGIC_STRING \ UTS_RELEASE " " \ MODULE_VERMAGIC_SMP MODULE_VERMAGIC_PREEMPT \ MODULE_VERMAGIC_MODULE_UNLOAD MODULE_VERMAGIC_MODVERSIONS \ MODULE_ARCH_VERMAGIC \ - MODULE_RANDSTRUCT_PLUGIN \ - MODULE_VERMAGIC_RETPOLINE + MODULE_RANDSTRUCT_PLUGIN From ce30f264b33d9e3d27e34638976c52b578648b92 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Thu, 4 Jan 2018 14:31:25 +0100 Subject: [PATCH 28/62] MAINTAINERS: clarify that only verified bugs should be submitted to security@ We're seeing a raise of automated reports from testing tools and reports about address leaks that are not really exploitable as-is, many of which do not represent an immediate risk justifying to work in closed places. Signed-off-by: Willy Tarreau Acked-by: Greg Kroah-Hartman Signed-off-by: Linus Torvalds --- MAINTAINERS | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index e3581413420c61..fec88c5ccedf30 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -62,7 +62,15 @@ trivial patch so apply some common sense. 7. When sending security related changes or reports to a maintainer please Cc: security@kernel.org, especially if the maintainer - does not respond. + does not respond. Please keep in mind that the security team is + a small set of people who can be efficient only when working on + verified bugs. Please only Cc: this list when you have identified + that the bug would present a short-term risk to other users if it + were publicly disclosed. For example, reports of address leaks do + not represent an immediate threat and are better handled publicly, + and ideally, should come with a patch proposal. Please do not send + automated reports to this list either. Such bugs will be handled + better and faster in the usual public places. 8. Happy hacking. From e4fd493c0541d36953f7b9d3bfced67a1321792f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 23 Jan 2018 15:17:05 -0500 Subject: [PATCH 29/62] Btrfs: fix stale entries in readdir In fixing the readdir+pagefault deadlock I accidentally introduced a stale entry regression in readdir. If we get close to full for the temporary buffer, and then skip a few delayed deletions, and then try to add another entry that won't fit, we will emit the entries we found and retry. Unfortunately we delete entries from our del_list as we find them, assuming we won't need them. However our pos will be with whatever our last entry was, which could be before the delayed deletions we skipped, so the next search will add the deleted entries back into our readdir buffer. So instead don't delete entries we find in our del_list so we can make sure we always find our delayed deletions. This is a slight perf hit for readdir with lots of pending deletions, but hopefully this isn't a common occurrence. If it is we can revist this and optimize it. cc: stable@vger.kernel.org Fixes: 23b5ec74943f ("btrfs: fix readdir deadlock with pagefault") Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 056276101c63a7..a6226cd6063c78 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1633,28 +1633,18 @@ void btrfs_readdir_put_delayed_items(struct inode *inode, int btrfs_should_delete_dir_index(struct list_head *del_list, u64 index) { - struct btrfs_delayed_item *curr, *next; - int ret; - - if (list_empty(del_list)) - return 0; + struct btrfs_delayed_item *curr; + int ret = 0; - list_for_each_entry_safe(curr, next, del_list, readdir_list) { + list_for_each_entry(curr, del_list, readdir_list) { if (curr->key.offset > index) break; - - list_del(&curr->readdir_list); - ret = (curr->key.offset == index); - - if (refcount_dec_and_test(&curr->refs)) - kfree(curr); - - if (ret) - return 1; - else - continue; + if (curr->key.offset == index) { + ret = 1; + break; + } } - return 0; + return ret; } /* From 560a66075d694e6ec24c60967b4d93d97cbb33d1 Mon Sep 17 00:00:00 2001 From: Wolfgang Bumiller Date: Thu, 18 Jan 2018 11:32:35 +0100 Subject: [PATCH 30/62] net: sched: em_nbyte: don't add the data offset twice 'ptr' is shifted by the offset and then validated, the memcmp should not add it a second time. Signed-off-by: Wolfgang Bumiller Signed-off-by: David S. Miller --- net/sched/em_nbyte.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c index df3110d695857e..07c10bac06a079 100644 --- a/net/sched/em_nbyte.c +++ b/net/sched/em_nbyte.c @@ -51,7 +51,7 @@ static int em_nbyte_match(struct sk_buff *skb, struct tcf_ematch *em, if (!tcf_valid_offset(skb, ptr, nbyte->hdr.len)) return 0; - return !memcmp(ptr + nbyte->hdr.off, nbyte->pattern, nbyte->hdr.len); + return !memcmp(ptr, nbyte->pattern, nbyte->hdr.len); } static struct tcf_ematch_ops em_nbyte_ops = { From d3303a65a00c94372ddab831570647488e6c06e2 Mon Sep 17 00:00:00 2001 From: Wolfgang Bumiller Date: Thu, 18 Jan 2018 11:32:36 +0100 Subject: [PATCH 31/62] net: sched: fix TCF_LAYER_LINK case in tcf_get_base_ptr TCF_LAYER_LINK and TCF_LAYER_NETWORK returned the same pointer as skb->data points to the network header. Use skb_mac_header instead. Signed-off-by: Wolfgang Bumiller Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 8e08b6da72f325..753ac9361154be 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -522,7 +522,7 @@ static inline unsigned char * tcf_get_base_ptr(struct sk_buff *skb, int layer) { switch (layer) { case TCF_LAYER_LINK: - return skb->data; + return skb_mac_header(skb); case TCF_LAYER_NETWORK: return skb_network_header(skb); case TCF_LAYER_TRANSPORT: From 3eab2ad9162e7467c988b91f50395eac51a1e650 Mon Sep 17 00:00:00 2001 From: James Morris Date: Thu, 25 Jan 2018 07:53:57 +1100 Subject: [PATCH 32/62] MAINTAINERS: update email address for James Morris Update my email address. Signed-off-by: James Morris --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index fec88c5ccedf30..810d5d990f4af7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12241,7 +12241,7 @@ M: Security Officers S: Supported SECURITY SUBSYSTEM -M: James Morris +M: James Morris M: "Serge E. Hallyn" L: linux-security-module@vger.kernel.org (suggested Cc:) T: git git://git.kernel.org/pub/scm/linux/kernel/git/jmorris/linux-security.git From 581e7226a5d43f629eb6399a121f85f6a15f81be Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 24 Jan 2018 12:35:40 -0800 Subject: [PATCH 33/62] kcm: Only allow TCP sockets to be attached to a KCM mux TCP sockets for IPv4 and IPv6 that are not listeners or in closed stated are allowed to be attached to a KCM mux. Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module") Reported-by: syzbot+8865eaff7f9acd593945@syzkaller.appspotmail.com Signed-off-by: Tom Herbert Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/kcm/kcmsock.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index d4e98f20fc2ac1..7632797fb68ea3 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1387,8 +1387,13 @@ static int kcm_attach(struct socket *sock, struct socket *csock, if (!csk) return -EINVAL; - /* We must prevent loops or risk deadlock ! */ - if (csk->sk_family == PF_KCM) + /* Only allow TCP sockets to be attached for now */ + if ((csk->sk_family != AF_INET && csk->sk_family != AF_INET6) || + csk->sk_protocol != IPPROTO_TCP) + return -EOPNOTSUPP; + + /* Don't allow listeners or closed sockets */ + if (csk->sk_state == TCP_LISTEN || csk->sk_state == TCP_CLOSE) return -EOPNOTSUPP; psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL); From e5571240236c5652f3e079b1d5866716a7ad819c Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 24 Jan 2018 12:35:41 -0800 Subject: [PATCH 34/62] kcm: Check if sk_user_data already set in kcm_attach This is needed to prevent sk_user_data being overwritten. The check is done under the callback lock. This should prevent a socket from being attached twice to a KCM mux. It also prevents a socket from being attached for other use cases of sk_user_data as long as the other cases set sk_user_data under the lock. Followup work is needed to unify all the use cases of sk_user_data to use the same locking. Reported-by: syzbot+114b15f2be420a8886c3@syzkaller.appspotmail.com Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module") Signed-off-by: Tom Herbert Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/kcm/kcmsock.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 7632797fb68ea3..4a8d407f890221 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1410,9 +1410,18 @@ static int kcm_attach(struct socket *sock, struct socket *csock, return err; } - sock_hold(csk); - write_lock_bh(&csk->sk_callback_lock); + + /* Check if sk_user_data is aready by KCM or someone else. + * Must be done under lock to prevent race conditions. + */ + if (csk->sk_user_data) { + write_unlock_bh(&csk->sk_callback_lock); + strp_done(&psock->strp); + kmem_cache_free(kcm_psockp, psock); + return -EALREADY; + } + psock->save_data_ready = csk->sk_data_ready; psock->save_write_space = csk->sk_write_space; psock->save_state_change = csk->sk_state_change; @@ -1420,8 +1429,11 @@ static int kcm_attach(struct socket *sock, struct socket *csock, csk->sk_data_ready = psock_data_ready; csk->sk_write_space = psock_write_space; csk->sk_state_change = psock_state_change; + write_unlock_bh(&csk->sk_callback_lock); + sock_hold(csk); + /* Finished initialization, now add the psock to the MUX. */ spin_lock_bh(&mux->lock); head = &mux->psocks; From 4de49474b18936d62797d1dd451c6c4db1a7b119 Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Tue, 23 Jan 2018 11:33:46 +0200 Subject: [PATCH 35/62] qed: Remove reserveration of dpi for kernel Double reservation for kernel dedicated dpi was performed. Once in the core module and once in qedr. Remove the reservation from core. Signed-off-by: Michal Kalderon Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_rdma.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.c b/drivers/net/ethernet/qlogic/qed/qed_rdma.c index c8c4b3940564db..9d6e2d43d4dee0 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_rdma.c +++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.c @@ -615,9 +615,6 @@ static int qed_rdma_reserve_lkey(struct qed_hwfn *p_hwfn) { struct qed_rdma_device *dev = p_hwfn->p_rdma_info->dev; - /* The first DPI is reserved for the Kernel */ - __set_bit(0, p_hwfn->p_rdma_info->dpi_map.bitmap); - /* Tid 0 will be used as the key for "reserved MR". * The driver should allocate memory for it so it can be loaded but no * ramrod should be passed on it. From 1fe280a056dff50774bd59c3e61187cf8c0ccf10 Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Tue, 23 Jan 2018 11:33:47 +0200 Subject: [PATCH 36/62] qed: Free reserved MR tid A tid was allocated for reserved MR during initialization but not freed. This lead to an annoying output message during rdma unload flow. Signed-off-by: Michal Kalderon Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_rdma.c | 28 +++++++++++++--------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.c b/drivers/net/ethernet/qlogic/qed/qed_rdma.c index 9d6e2d43d4dee0..b7abb8205d3a99 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_rdma.c +++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.c @@ -358,10 +358,27 @@ static void qed_rdma_resc_free(struct qed_hwfn *p_hwfn) kfree(p_rdma_info); } +static void qed_rdma_free_tid(void *rdma_cxt, u32 itid) +{ + struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "itid = %08x\n", itid); + + spin_lock_bh(&p_hwfn->p_rdma_info->lock); + qed_bmap_release_id(p_hwfn, &p_hwfn->p_rdma_info->tid_map, itid); + spin_unlock_bh(&p_hwfn->p_rdma_info->lock); +} + +static void qed_rdma_free_reserved_lkey(struct qed_hwfn *p_hwfn) +{ + qed_rdma_free_tid(p_hwfn, p_hwfn->p_rdma_info->dev->reserved_lkey); +} + static void qed_rdma_free(struct qed_hwfn *p_hwfn) { DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Freeing RDMA\n"); + qed_rdma_free_reserved_lkey(p_hwfn); qed_rdma_resc_free(p_hwfn); } @@ -794,17 +811,6 @@ static struct qed_rdma_device *qed_rdma_query_device(void *rdma_cxt) return p_hwfn->p_rdma_info->dev; } -static void qed_rdma_free_tid(void *rdma_cxt, u32 itid) -{ - struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt; - - DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "itid = %08x\n", itid); - - spin_lock_bh(&p_hwfn->p_rdma_info->lock); - qed_bmap_release_id(p_hwfn, &p_hwfn->p_rdma_info->tid_map, itid); - spin_unlock_bh(&p_hwfn->p_rdma_info->lock); -} - static void qed_rdma_cnq_prod_update(void *rdma_cxt, u8 qz_offset, u16 prod) { struct qed_hwfn *p_hwfn; From aebb48f5e465772576359c1705c4a84abea92027 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 23 Jan 2018 14:33:14 +0000 Subject: [PATCH 37/62] sparc64: fix typo in CONFIG_CRYPTO_DES_SPARC64 => CONFIG_CRYPTO_CAMELLIA_SPARC64 This patch fixes the typo CONFIG_CRYPTO_DES_SPARC64 => CONFIG_CRYPTO_CAMELLIA_SPARC64 Fixes: 81658ad0d923 ("sparc64: Add CAMELLIA driver making use of the new camellia opcodes.") Signed-off-by: Corentin Labbe Signed-off-by: David S. Miller --- arch/sparc/crypto/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sparc/crypto/Makefile b/arch/sparc/crypto/Makefile index 818d3aa5172e68..d257186c27d12d 100644 --- a/arch/sparc/crypto/Makefile +++ b/arch/sparc/crypto/Makefile @@ -10,7 +10,7 @@ obj-$(CONFIG_CRYPTO_MD5_SPARC64) += md5-sparc64.o obj-$(CONFIG_CRYPTO_AES_SPARC64) += aes-sparc64.o obj-$(CONFIG_CRYPTO_DES_SPARC64) += des-sparc64.o -obj-$(CONFIG_CRYPTO_DES_SPARC64) += camellia-sparc64.o +obj-$(CONFIG_CRYPTO_CAMELLIA_SPARC64) += camellia-sparc64.o obj-$(CONFIG_CRYPTO_CRC32C_SPARC64) += crc32c-sparc64.o From b7051cb8dadd69f85da5989017af2bb35b418950 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 23 Jan 2018 00:08:40 -0800 Subject: [PATCH 38/62] i40e: flower: check if TC offload is enabled on a netdev Since TC block changes drivers are required to check if the TC hw offload flag is set on the interface themselves. Fixes: 2f4b411a3d67 ("i40e: Enable cloud filters via tc-flower") Fixes: 44ae12a768b7 ("net: sched: move the can_offload check from binding phase to rule insertion phase") Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Jiri Pirko Acked-by: Amritha Nambiar Acked-by: Jeff Kirsher Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/i40e/i40e_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 42dcaefc4c1942..af792112a2d3da 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -7505,6 +7505,8 @@ static int i40e_setup_tc_cls_flower(struct i40e_netdev_priv *np, { struct i40e_vsi *vsi = np->vsi; + if (!tc_can_offload(vsi->netdev)) + return -EOPNOTSUPP; if (cls_flower->common.chain_index) return -EOPNOTSUPP; From e9cb4239134c860e5f92c75bf5321bd377bb505b Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 23 Jan 2018 17:27:25 +0800 Subject: [PATCH 39/62] vhost: use mutex_lock_nested() in vhost_dev_lock_vqs() We used to call mutex_lock() in vhost_dev_lock_vqs() which tries to hold mutexes of all virtqueues. This may confuse lockdep to report a possible deadlock because of trying to hold locks belong to same class. Switch to use mutex_lock_nested() to avoid false positive. Fixes: 6b1e6cc7855b0 ("vhost: new device IOTLB API") Reported-by: syzbot+dbb7c1161485e61b0241@syzkaller.appspotmail.com Signed-off-by: Jason Wang Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- drivers/vhost/vhost.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 33ac2b186b85eb..549771a0cd8b90 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -904,7 +904,7 @@ static void vhost_dev_lock_vqs(struct vhost_dev *d) { int i = 0; for (i = 0; i < d->nvqs; ++i) - mutex_lock(&d->vqs[i]->mutex); + mutex_lock_nested(&d->vqs[i]->mutex, i); } static void vhost_dev_unlock_vqs(struct vhost_dev *d) From 6f3180afbb22106d96a1320e175562f36a4d3506 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 23 Jan 2018 17:27:26 +0800 Subject: [PATCH 40/62] vhost: do not try to access device IOTLB when not initialized The code will try to access dev->iotlb when processing VHOST_IOTLB_INVALIDATE even if it was not initialized which may lead to NULL pointer dereference. Fixes this by check dev->iotlb before. Fixes: 6b1e6cc7855b0 ("vhost: new device IOTLB API") Signed-off-by: Jason Wang Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- drivers/vhost/vhost.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 549771a0cd8b90..5727b186b3ca56 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1015,6 +1015,10 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev, vhost_iotlb_notify_vq(dev, msg); break; case VHOST_IOTLB_INVALIDATE: + if (!dev->iotlb) { + ret = -EFAULT; + break; + } vhost_vq_meta_reset(dev); vhost_del_umem_range(dev->iotlb, msg->iova, msg->iova + msg->size - 1); From 060403f34008af90e310d7e0e7531ebb3acf9557 Mon Sep 17 00:00:00 2001 From: Nick Dyer Date: Wed, 24 Jan 2018 13:46:04 -0800 Subject: [PATCH 41/62] Revert "Input: synaptics_rmi4 - use devm_device_add_group() for attributes in F01" Since the sysfs attribute hangs off the RMI bus, which doesn't go away during firmware flash, it needs to be explicitly removed, otherwise we would try and register the same attribute twice. This reverts commit 36a44af5c176d619552d99697433261141dd1296. Signed-off-by: Nick Dyer Signed-off-by: Dmitry Torokhov --- drivers/input/rmi4/rmi_f01.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/input/rmi4/rmi_f01.c b/drivers/input/rmi4/rmi_f01.c index ae966e333a2f47..8a07ae147df690 100644 --- a/drivers/input/rmi4/rmi_f01.c +++ b/drivers/input/rmi4/rmi_f01.c @@ -570,14 +570,19 @@ static int rmi_f01_probe(struct rmi_function *fn) dev_set_drvdata(&fn->dev, f01); - error = devm_device_add_group(&fn->rmi_dev->dev, &rmi_f01_attr_group); + error = sysfs_create_group(&fn->rmi_dev->dev.kobj, &rmi_f01_attr_group); if (error) - dev_warn(&fn->dev, - "Failed to create attribute group: %d\n", error); + dev_warn(&fn->dev, "Failed to create sysfs group: %d\n", error); return 0; } +static void rmi_f01_remove(struct rmi_function *fn) +{ + /* Note that the bus device is used, not the F01 device */ + sysfs_remove_group(&fn->rmi_dev->dev.kobj, &rmi_f01_attr_group); +} + static int rmi_f01_config(struct rmi_function *fn) { struct f01_data *f01 = dev_get_drvdata(&fn->dev); @@ -717,6 +722,7 @@ struct rmi_function_handler rmi_f01_handler = { }, .func = 0x01, .probe = rmi_f01_probe, + .remove = rmi_f01_remove, .config = rmi_f01_config, .attention = rmi_f01_attention, .suspend = rmi_f01_suspend, From 45d6e545505fd32edb812f085be7de45b6a5c0af Mon Sep 17 00:00:00 2001 From: Ivan Mikhaylov Date: Wed, 24 Jan 2018 15:53:24 +0300 Subject: [PATCH 42/62] net/ibm/emac: add 8192 rx/tx fifo size emac4syn chips has availability to use 8192 rx/tx fifo buffer sizes, in current state if we set it up in dts 8192 as example, we will get only 2048 which may impact on network speed. Signed-off-by: Ivan Mikhaylov Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/emac/core.c | 6 ++++++ drivers/net/ethernet/ibm/emac/emac.h | 2 ++ 2 files changed, 8 insertions(+) diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c index 7feff2450ed687..241db3199b88cd 100644 --- a/drivers/net/ethernet/ibm/emac/core.c +++ b/drivers/net/ethernet/ibm/emac/core.c @@ -494,6 +494,9 @@ static u32 __emac_calc_base_mr1(struct emac_instance *dev, int tx_size, int rx_s case 16384: ret |= EMAC_MR1_RFS_16K; break; + case 8192: + ret |= EMAC4_MR1_RFS_8K; + break; case 4096: ret |= EMAC_MR1_RFS_4K; break; @@ -516,6 +519,9 @@ static u32 __emac4_calc_base_mr1(struct emac_instance *dev, int tx_size, int rx_ case 16384: ret |= EMAC4_MR1_TFS_16K; break; + case 8192: + ret |= EMAC4_MR1_TFS_8K; + break; case 4096: ret |= EMAC4_MR1_TFS_4K; break; diff --git a/drivers/net/ethernet/ibm/emac/emac.h b/drivers/net/ethernet/ibm/emac/emac.h index 5afcc27ceebb68..d0a0e3b3f28376 100644 --- a/drivers/net/ethernet/ibm/emac/emac.h +++ b/drivers/net/ethernet/ibm/emac/emac.h @@ -151,9 +151,11 @@ struct emac_regs { #define EMAC4_MR1_RFS_2K 0x00100000 #define EMAC4_MR1_RFS_4K 0x00180000 +#define EMAC4_MR1_RFS_8K 0x00200000 #define EMAC4_MR1_RFS_16K 0x00280000 #define EMAC4_MR1_TFS_2K 0x00020000 #define EMAC4_MR1_TFS_4K 0x00030000 +#define EMAC4_MR1_TFS_8K 0x00040000 #define EMAC4_MR1_TFS_16K 0x00050000 #define EMAC4_MR1_TR 0x00008000 #define EMAC4_MR1_MWSW_001 0x00001000 From 624ca9c33c8a853a4a589836e310d776620f4ab9 Mon Sep 17 00:00:00 2001 From: Ivan Mikhaylov Date: Wed, 24 Jan 2018 15:53:25 +0300 Subject: [PATCH 43/62] net/ibm/emac: wrong bit is used for STA control register write STA control register has areas of mode and opcodes for opeations. 18 bit is using for mode selection, where 0 is old MIO/MDIO access method and 1 is indirect access mode. 19-20 bits are using for setting up read/write operation(STA opcodes). In current state 'read' is set into old MIO/MDIO mode with 19 bit and write operation is set into 18 bit which is mode selection, not a write operation. To correlate write with read we set it into 20 bit. All those bit operations are MSB 0 based. Signed-off-by: Ivan Mikhaylov Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/emac/emac.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/emac/emac.h b/drivers/net/ethernet/ibm/emac/emac.h index d0a0e3b3f28376..c26d2631ca30e5 100644 --- a/drivers/net/ethernet/ibm/emac/emac.h +++ b/drivers/net/ethernet/ibm/emac/emac.h @@ -244,7 +244,7 @@ struct emac_regs { #define EMAC_STACR_PHYE 0x00004000 #define EMAC_STACR_STAC_MASK 0x00003000 #define EMAC_STACR_STAC_READ 0x00001000 -#define EMAC_STACR_STAC_WRITE 0x00002000 +#define EMAC_STACR_STAC_WRITE 0x00000800 #define EMAC_STACR_OPBC_MASK 0x00000C00 #define EMAC_STACR_OPBC_50 0x00000000 #define EMAC_STACR_OPBC_66 0x00000400 From 82d94856fa221b5173eefd56bcd1057c037e9b07 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 Jan 2018 13:10:30 +0100 Subject: [PATCH 44/62] perf/core: Fix lock inversion between perf,trace,cpuhp Lockdep gifted us with noticing the following 4-way lockup scenario: perf_trace_init() #0 mutex_lock(&event_mutex) perf_trace_event_init() perf_trace_event_reg() tp_event->class->reg() := tracepoint_probe_register #1 mutex_lock(&tracepoints_mutex) trace_point_add_func() #2 static_key_enable() #2 do_cpu_up() perf_event_init_cpu() #3 mutex_lock(&pmus_lock) #4 mutex_lock(&ctx->mutex) perf_event_task_disable() mutex_lock(¤t->perf_event_mutex) #4 ctx = perf_event_ctx_lock() #5 perf_event_for_each_child() do_exit() task_work_run() __fput() perf_release() perf_event_release_kernel() #4 mutex_lock(&ctx->mutex) #5 mutex_lock(&event->child_mutex) free_event() _free_event() event->destroy() := perf_trace_destroy #0 mutex_lock(&event_mutex); Fix that by moving the free_event() out from under the locks. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Steven Rostedt (VMware) Cc: Thomas Gleixner Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 4df5b695bf0db1..2d80824298a719 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1231,6 +1231,10 @@ static void put_ctx(struct perf_event_context *ctx) * perf_event_context::lock * perf_event::mmap_mutex * mmap_sem + * + * cpu_hotplug_lock + * pmus_lock + * cpuctx->mutex / perf_event_context::mutex */ static struct perf_event_context * perf_event_ctx_lock_nested(struct perf_event *event, int nesting) @@ -4196,6 +4200,7 @@ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct perf_event *child, *tmp; + LIST_HEAD(free_list); /* * If we got here through err_file: fput(event_file); we will not have @@ -4268,8 +4273,7 @@ int perf_event_release_kernel(struct perf_event *event) struct perf_event, child_list); if (tmp == child) { perf_remove_from_context(child, DETACH_GROUP); - list_del(&child->child_list); - free_event(child); + list_move(&child->child_list, &free_list); /* * This matches the refcount bump in inherit_event(); * this can't be the last reference. @@ -4284,6 +4288,11 @@ int perf_event_release_kernel(struct perf_event *event) } mutex_unlock(&event->child_mutex); + list_for_each_entry_safe(child, tmp, &free_list, child_list) { + list_del(&child->child_list); + free_event(child); + } + no_ctx: put_event(event); /* Must be the 'last' reference */ return 0; From 43fa87f7deed52e8c8420182e0c133bc4cf395f6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 Jan 2018 17:07:59 +0100 Subject: [PATCH 45/62] perf/core: Fix another perf,trace,cpuhp lock inversion Lockdep noticed the following 3-way lockup race: perf_trace_init() #0 mutex_lock(&event_mutex) perf_trace_event_init() perf_trace_event_reg() tp_event->class->reg() := tracepoint_probe_register #1 mutex_lock(&tracepoints_mutex) trace_point_add_func() #2 static_key_enable() #2 do_cpu_up() perf_event_init_cpu() #3 mutex_lock(&pmus_lock) #4 mutex_lock(&ctx->mutex) perf_ioctl() #4 ctx = perf_event_ctx_lock() _perf_iotcl() ftrace_profile_set_filter() #0 mutex_lock(&event_mutex) Fudge it for now by noting that the tracepoint state does not depend on the event <-> context relation. Ugly though :/ Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/events/core.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 2d80824298a719..816f83d70fc65f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8525,6 +8525,29 @@ perf_event_set_addr_filter(struct perf_event *event, char *filter_str) return ret; } +static int +perf_tracepoint_set_filter(struct perf_event *event, char *filter_str) +{ + struct perf_event_context *ctx = event->ctx; + int ret; + + /* + * Beware, here be dragons!! + * + * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint + * stuff does not actually need it. So temporarily drop ctx->mutex. As per + * perf_event_ctx_lock() we already have a reference on ctx. + * + * This can result in event getting moved to a different ctx, but that + * does not affect the tracepoint state. + */ + mutex_unlock(&ctx->mutex); + ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); + mutex_lock(&ctx->mutex); + + return ret; +} + static int perf_event_set_filter(struct perf_event *event, void __user *arg) { char *filter_str; @@ -8541,8 +8564,7 @@ static int perf_event_set_filter(struct perf_event *event, void __user *arg) if (IS_ENABLED(CONFIG_EVENT_TRACING) && event->attr.type == PERF_TYPE_TRACEPOINT) - ret = ftrace_profile_set_filter(event, event->attr.config, - filter_str); + ret = perf_tracepoint_set_filter(event, filter_str); else if (has_addr_filter(event)) ret = perf_event_set_addr_filter(event, filter_str); From 0c7296cad651a3a40286d70ff37e73bd6fa4e4da Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 Jan 2018 21:23:02 +0100 Subject: [PATCH 46/62] perf/core: Fix ctx::mutex deadlock Lockdep noticed the following 3-way lockup scenario: sys_perf_event_open() perf_event_alloc() perf_try_init_event() #0 ctx = perf_event_ctx_lock_nested(1) perf_swevent_init() swevent_hlist_get() #1 mutex_lock(&pmus_lock) perf_event_init_cpu() #1 mutex_lock(&pmus_lock) #2 mutex_lock(&ctx->mutex) sys_perf_event_open() mutex_lock_double() #2 mutex_lock() #0 mutex_lock_nested() And while we need that perf_event_ctx_lock_nested() for HW PMUs such that they can iterate the sibling list, trying to match it to the available counters, the software PMUs need do no such thing. Exclude them. In particular the swevent triggers the above invertion, while the tpevent PMU triggers a more elaborate one through their event_mutex. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 816f83d70fc65f..5d8f4031f8d589 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9199,7 +9199,13 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) if (!try_module_get(pmu->module)) return -ENODEV; - if (event->group_leader != event) { + /* + * A number of pmu->event_init() methods iterate the sibling_list to, + * for example, validate if the group fits on the PMU. Therefore, + * if this is a sibling event, acquire the ctx->mutex to protect + * the sibling_list. + */ + if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) { /* * This ctx->mutex can nest when we're called through * inheritance. See the perf_event_ctx_lock_nested() comment. From efe951d3de9141626a494bcb1efb0650eaef6491 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Jan 2018 19:23:08 +0100 Subject: [PATCH 47/62] perf/x86: Fix perf,x86,cpuhp deadlock More lockdep gifts, a 5-way lockup race: perf_event_create_kernel_counter() perf_event_alloc() perf_try_init_event() x86_pmu_event_init() __x86_pmu_event_init() x86_reserve_hardware() #0 mutex_lock(&pmc_reserve_mutex); reserve_ds_buffer() #1 get_online_cpus() perf_event_release_kernel() _free_event() hw_perf_event_destroy() x86_release_hardware() #0 mutex_lock(&pmc_reserve_mutex) release_ds_buffer() #1 get_online_cpus() #1 do_cpu_up() perf_event_init_cpu() #2 mutex_lock(&pmus_lock) #3 mutex_lock(&ctx->mutex) sys_perf_event_open() mutex_lock_double() #3 mutex_lock(ctx->mutex) #4 mutex_lock_nested(ctx->mutex, 1); perf_try_init_event() #4 mutex_lock_nested(ctx->mutex, 1) x86_pmu_event_init() intel_pmu_hw_config() x86_add_exclusive() #0 mutex_lock(&pmc_reserve_mutex) Fix it by using ordering constructs instead of locking. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/events/intel/ds.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 8156e47da7ba4c..18c25ab2855744 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -372,10 +372,9 @@ static int alloc_pebs_buffer(int cpu) static void release_pebs_buffer(int cpu) { struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); - struct debug_store *ds = hwev->ds; void *cea; - if (!ds || !x86_pmu.pebs) + if (!x86_pmu.pebs) return; kfree(per_cpu(insn_buffer, cpu)); @@ -384,7 +383,6 @@ static void release_pebs_buffer(int cpu) /* Clear the fixmap */ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; ds_clear_cea(cea, x86_pmu.pebs_buffer_size); - ds->pebs_buffer_base = 0; dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size); hwev->ds_pebs_vaddr = NULL; } @@ -419,16 +417,14 @@ static int alloc_bts_buffer(int cpu) static void release_bts_buffer(int cpu) { struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); - struct debug_store *ds = hwev->ds; void *cea; - if (!ds || !x86_pmu.bts) + if (!x86_pmu.bts) return; /* Clear the fixmap */ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer; ds_clear_cea(cea, BTS_BUFFER_SIZE); - ds->bts_buffer_base = 0; dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE); hwev->ds_bts_vaddr = NULL; } @@ -454,16 +450,22 @@ void release_ds_buffers(void) if (!x86_pmu.bts && !x86_pmu.pebs) return; - get_online_cpus(); - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) + release_ds_buffer(cpu); + + for_each_possible_cpu(cpu) { + /* + * Again, ignore errors from offline CPUs, they will no longer + * observe cpu_hw_events.ds and not program the DS_AREA when + * they come up. + */ fini_debug_store_on_cpu(cpu); + } for_each_possible_cpu(cpu) { release_pebs_buffer(cpu); release_bts_buffer(cpu); - release_ds_buffer(cpu); } - put_online_cpus(); } void reserve_ds_buffers(void) @@ -483,8 +485,6 @@ void reserve_ds_buffers(void) if (!x86_pmu.pebs) pebs_err = 1; - get_online_cpus(); - for_each_possible_cpu(cpu) { if (alloc_ds_buffer(cpu)) { bts_err = 1; @@ -521,11 +521,14 @@ void reserve_ds_buffers(void) if (x86_pmu.pebs && !pebs_err) x86_pmu.pebs_active = 1; - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) { + /* + * Ignores wrmsr_on_cpu() errors for offline CPUs they + * will get this call through intel_pmu_cpu_starting(). + */ init_debug_store_on_cpu(cpu); + } } - - put_online_cpus(); } /* From 4ee806d51176ba7b8ff1efd81f271d7252e03a1d Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Thu, 18 Jan 2018 16:14:26 -0500 Subject: [PATCH 48/62] net: tcp: close sock if net namespace is exiting When a tcp socket is closed, if it detects that its net namespace is exiting, close immediately and do not wait for FIN sequence. For normal sockets, a reference is taken to their net namespace, so it will never exit while the socket is open. However, kernel sockets do not take a reference to their net namespace, so it may begin exiting while the kernel socket is still open. In this case if the kernel socket is a tcp socket, it will stay open trying to complete its close sequence. The sock's dst(s) hold a reference to their interface, which are all transferred to the namespace's loopback interface when the real interfaces are taken down. When the namespace tries to take down its loopback interface, it hangs waiting for all references to the loopback interface to release, which results in messages like: unregister_netdevice: waiting for lo to become free. Usage count = 1 These messages continue until the socket finally times out and closes. Since the net namespace cleanup holds the net_mutex while calling its registered pernet callbacks, any new net namespace initialization is blocked until the current net namespace finishes exiting. After this change, the tcp socket notices the exiting net namespace, and closes immediately, releasing its dst(s) and their reference to the loopback interface, which lets the net namespace continue exiting. Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1711407 Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=97811 Signed-off-by: Dan Streetman Signed-off-by: David S. Miller --- include/net/net_namespace.h | 10 ++++++++++ net/ipv4/tcp.c | 3 +++ net/ipv4/tcp_timer.c | 15 +++++++++++++++ 3 files changed, 28 insertions(+) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 10f99dafd5acb1..049008493faf67 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -223,6 +223,11 @@ int net_eq(const struct net *net1, const struct net *net2) return net1 == net2; } +static inline int check_net(const struct net *net) +{ + return atomic_read(&net->count) != 0; +} + void net_drop_ns(void *); #else @@ -247,6 +252,11 @@ int net_eq(const struct net *net1, const struct net *net2) return 1; } +static inline int check_net(const struct net *net) +{ + return 1; +} + #define net_drop_ns NULL #endif diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f08eebe60446e2..8e053ad7cae260 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2298,6 +2298,9 @@ void tcp_close(struct sock *sk, long timeout) tcp_send_active_reset(sk, GFP_ATOMIC); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); + } else if (!check_net(sock_net(sk))) { + /* Not possible to send reset; just close */ + tcp_set_state(sk, TCP_CLOSE); } } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 968fda1983762e..388158c9d9f6b3 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -48,11 +48,19 @@ static void tcp_write_err(struct sock *sk) * to prevent DoS attacks. It is called when a retransmission timeout * or zero probe timeout occurs on orphaned socket. * + * Also close if our net namespace is exiting; in that case there is no + * hope of ever communicating again since all netns interfaces are already + * down (or about to be down), and we need to release our dst references, + * which have been moved to the netns loopback interface, so the namespace + * can finish exiting. This condition is only possible if we are a kernel + * socket, as those do not hold references to the namespace. + * * Criteria is still not confirmed experimentally and may change. * We kill the socket, if: * 1. If number of orphaned sockets exceeds an administratively configured * limit. * 2. If we have strong memory pressure. + * 3. If our net namespace is exiting. */ static int tcp_out_of_resources(struct sock *sk, bool do_reset) { @@ -81,6 +89,13 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); return 1; } + + if (!check_net(sock_net(sk))) { + /* Not possible to send reset; just close */ + tcp_done(sk); + return 1; + } + return 0; } From f15ca723c1ebe6c1a06bc95fda6b62cd87b44559 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 25 Jan 2018 19:03:03 +0100 Subject: [PATCH 49/62] net: don't call update_pmtu unconditionally Some dst_ops (e.g. md_dst_ops)) doesn't set this handler. It may result to: "BUG: unable to handle kernel NULL pointer dereference at (null)" Let's add a helper to check if update_pmtu is available before calling it. Fixes: 52a589d51f10 ("geneve: update skb dst pmtu on tx path") Fixes: a93bf0ff4490 ("vxlan: update skb dst pmtu on tx path") CC: Roman Kapl CC: Xin Long Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 3 +-- drivers/net/geneve.c | 4 ++-- drivers/net/vxlan.c | 6 ++---- include/net/dst.h | 8 ++++++++ net/ipv4/ip_tunnel.c | 3 +-- net/ipv4/ip_vti.c | 2 +- net/ipv6/ip6_tunnel.c | 6 ++---- net/ipv6/ip6_vti.c | 2 +- net/ipv6/sit.c | 4 ++-- 9 files changed, 20 insertions(+), 18 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 2c13123bfd6949..71ea9e26666cd2 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -1456,8 +1456,7 @@ void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, struct ipoib_dev_priv *priv = ipoib_priv(dev); int e = skb_queue_empty(&priv->cm.skb_queue); - if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + skb_dst_update_pmtu(skb, mtu); skb_queue_tail(&priv->cm.skb_queue, skb); if (e) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 0a48b3073d3d36..64fda2e1040eb6 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -829,7 +829,7 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, int mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr) - GENEVE_BASE_HLEN - info->options_len - 14; - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + skb_dst_update_pmtu(skb, mtu); } sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); @@ -875,7 +875,7 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, int mtu = dst_mtu(dst) - sizeof(struct ipv6hdr) - GENEVE_BASE_HLEN - info->options_len - 14; - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + skb_dst_update_pmtu(skb, mtu); } sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 31f4b7911ef84c..c3e34e3c82a7a9 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2158,8 +2158,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, if (skb_dst(skb)) { int mtu = dst_mtu(ndst) - VXLAN_HEADROOM; - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, - skb, mtu); + skb_dst_update_pmtu(skb, mtu); } tos = ip_tunnel_ecn_encap(tos, old_iph, skb); @@ -2200,8 +2199,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, if (skb_dst(skb)) { int mtu = dst_mtu(ndst) - VXLAN6_HEADROOM; - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, - skb, mtu); + skb_dst_update_pmtu(skb, mtu); } tos = ip_tunnel_ecn_encap(tos, old_iph, skb); diff --git a/include/net/dst.h b/include/net/dst.h index b091fd53609891..d49d607dd2b3c5 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -521,4 +521,12 @@ static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst) } #endif +static inline void skb_dst_update_pmtu(struct sk_buff *skb, u32 mtu) +{ + struct dst_entry *dst = skb_dst(skb); + + if (dst && dst->ops->update_pmtu) + dst->ops->update_pmtu(dst, NULL, skb, mtu); +} + #endif /* _NET_DST_H */ diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 5ddb1cb52bd405..6d21068f9b5531 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -520,8 +520,7 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, else mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; - if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + skb_dst_update_pmtu(skb, mtu); if (skb->protocol == htons(ETH_P_IP)) { if (!skb_is_gso(skb) && diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 949f432a5f04b5..51b1669334fe6b 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -200,7 +200,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, mtu = dst_mtu(dst); if (skb->len > mtu) { - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + skb_dst_update_pmtu(skb, mtu); if (skb->protocol == htons(ETH_P_IP)) { icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 9a7cf355bc8c8f..1ee5584c3555b4 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -642,8 +642,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (rel_info > dst_mtu(skb_dst(skb2))) goto out; - skb_dst(skb2)->ops->update_pmtu(skb_dst(skb2), NULL, skb2, - rel_info); + skb_dst_update_pmtu(skb2, rel_info); } icmp_send(skb2, rel_type, rel_code, htonl(rel_info)); @@ -1131,8 +1130,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, mtu = 576; } - if (skb_dst(skb) && !t->parms.collect_md) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + skb_dst_update_pmtu(skb, mtu); if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { *pmtu = mtu; err = -EMSGSIZE; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index dbb74f3c57a77f..8c184f84f35334 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -483,7 +483,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) mtu = dst_mtu(dst); if (!skb->ignore_df && skb->len > mtu) { - skb_dst(skb)->ops->update_pmtu(dst, NULL, skb, mtu); + skb_dst_update_pmtu(skb, mtu); if (skb->protocol == htons(ETH_P_IPV6)) { if (mtu < IPV6_MIN_MTU) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index d7dc23c1b2ca32..3873d387713575 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -934,8 +934,8 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, df = 0; } - if (tunnel->parms.iph.daddr && skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + if (tunnel->parms.iph.daddr) + skb_dst_update_pmtu(skb, mtu); if (skb->len > mtu && !skb_is_gso(skb)) { icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); From 0fd189a95fdbc631737df5f27a0fc0a3dd31b75e Mon Sep 17 00:00:00 2001 From: Lyude Paul Date: Thu, 25 Jan 2018 18:29:53 -0500 Subject: [PATCH 50/62] drm/nouveau: Move irq setup/teardown to pci ctor/dtor For a while we've been having issues with seemingly random interrupts coming from nvidia cards when resuming them. Originally the fix for this was thought to be just re-arming the MSI interrupt registers right after re-allocating our IRQs, however it seems a lot of what we do is both wrong and not even nessecary. This was made apparent by what appeared to be a regression in the mainline kernel that started introducing suspend/resume issues for nouveau: a0c9259dc4e1 (irq/matrix: Spread interrupts on allocation) After this commit was introduced, we started getting interrupts from the GPU before we actually re-allocated our own IRQ (see references below) and assigned the IRQ handler. Investigating this turned out that the problem was not with the commit, but the fact that nouveau even free/allocates it's irqs before and after suspend/resume. For starters: drivers in the linux kernel haven't had to handle freeing/re-allocating their IRQs during suspend/resume cycles for quite a while now. Nouveau seems to be one of the few drivers left that still does this, despite the fact there's no reason we actually need to since disabling interrupts from the device side should be enough, as the kernel is already smart enough to know to disable host-side interrupts for us before going into suspend. Since we were tearing down our IRQs by hand however, that means there was a short period during resume where interrupts could be received before we re-allocated our IRQ which would lead to us getting an unhandled IRQ. Since we never handle said IRQ and re-arm the interrupt registers, this would cause us to miss all of the interrupts from the GPU and cause our init process to start timing out on anything requiring interrupts. So, since this whole setup/teardown every suspend/resume cycle is useless anyway, move irq setup/teardown into the pci subdev's ctor/dtor functions instead so they're only called at driver load and driver unload. This should fix most of the issues with pending interrupts on resume, along with getting suspend/resume for nouveau to work again. As well, this probably means we can also just remove the msi rearm call inside nvkm_pci_init(). But since our main focus here is to fix suspend/resume before 4.15, we'll save that for a later patch. Signed-off-by: Lyude Paul Cc: Karol Herbst Cc: Thomas Gleixner Cc: Mike Galbraith Cc: stable@vger.kernel.org Signed-off-by: Ben Skeggs --- .../gpu/drm/nouveau/nvkm/subdev/pci/base.c | 46 +++++++++++++------ 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/pci/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/pci/base.c index deb96de54b0030..ee2431a7804ec8 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/pci/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/pci/base.c @@ -71,6 +71,10 @@ nvkm_pci_intr(int irq, void *arg) struct nvkm_pci *pci = arg; struct nvkm_device *device = pci->subdev.device; bool handled = false; + + if (pci->irq < 0) + return IRQ_HANDLED; + nvkm_mc_intr_unarm(device); if (pci->msi) pci->func->msi_rearm(pci); @@ -84,11 +88,6 @@ nvkm_pci_fini(struct nvkm_subdev *subdev, bool suspend) { struct nvkm_pci *pci = nvkm_pci(subdev); - if (pci->irq >= 0) { - free_irq(pci->irq, pci); - pci->irq = -1; - } - if (pci->agp.bridge) nvkm_agp_fini(pci); @@ -108,8 +107,20 @@ static int nvkm_pci_oneinit(struct nvkm_subdev *subdev) { struct nvkm_pci *pci = nvkm_pci(subdev); - if (pci_is_pcie(pci->pdev)) - return nvkm_pcie_oneinit(pci); + struct pci_dev *pdev = pci->pdev; + int ret; + + if (pci_is_pcie(pci->pdev)) { + ret = nvkm_pcie_oneinit(pci); + if (ret) + return ret; + } + + ret = request_irq(pdev->irq, nvkm_pci_intr, IRQF_SHARED, "nvkm", pci); + if (ret) + return ret; + + pci->irq = pdev->irq; return 0; } @@ -117,7 +128,6 @@ static int nvkm_pci_init(struct nvkm_subdev *subdev) { struct nvkm_pci *pci = nvkm_pci(subdev); - struct pci_dev *pdev = pci->pdev; int ret; if (pci->agp.bridge) { @@ -131,28 +141,34 @@ nvkm_pci_init(struct nvkm_subdev *subdev) if (pci->func->init) pci->func->init(pci); - ret = request_irq(pdev->irq, nvkm_pci_intr, IRQF_SHARED, "nvkm", pci); - if (ret) - return ret; - - pci->irq = pdev->irq; - /* Ensure MSI interrupts are armed, for the case where there are * already interrupts pending (for whatever reason) at load time. */ if (pci->msi) pci->func->msi_rearm(pci); - return ret; + return 0; } static void * nvkm_pci_dtor(struct nvkm_subdev *subdev) { struct nvkm_pci *pci = nvkm_pci(subdev); + nvkm_agp_dtor(pci); + + if (pci->irq >= 0) { + /* freq_irq() will call the handler, we use pci->irq == -1 + * to signal that it's been torn down and should be a noop. + */ + int irq = pci->irq; + pci->irq = -1; + free_irq(irq, pci); + } + if (pci->msi) pci_disable_msi(pci->pdev); + return nvkm_pci(subdev); } From 6793f1c450b1533a5e9c2493490de771d38b24f9 Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Thu, 25 Jan 2018 19:39:44 -0500 Subject: [PATCH 51/62] orangefs: fix deadlock; do not write i_size in read_iter After do_readv_writev, the inode cache is invalidated anyway, so i_size will never be read. It will be fetched from the server which will also know about updates from other machines. Fixes deadlock on 32-bit SMP. See https://marc.info/?l=linux-fsdevel&m=151268557427760&w=2 Signed-off-by: Martin Brandenburg Cc: Al Viro Cc: Mike Marshall Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- fs/orangefs/file.c | 7 ++----- fs/orangefs/orangefs-kernel.h | 11 ----------- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 1668fd645c4536..0d228cd087e627 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -452,7 +452,7 @@ ssize_t orangefs_inode_read(struct inode *inode, static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; - loff_t pos = *(&iocb->ki_pos); + loff_t pos = iocb->ki_pos; ssize_t rc = 0; BUG_ON(iocb->private); @@ -492,9 +492,6 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite } } - if (file->f_pos > i_size_read(file->f_mapping->host)) - orangefs_i_size_write(file->f_mapping->host, file->f_pos); - rc = generic_write_checks(iocb, iter); if (rc <= 0) { @@ -508,7 +505,7 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite * pos to the end of the file, so we will wait till now to set * pos... */ - pos = *(&iocb->ki_pos); + pos = iocb->ki_pos; rc = do_readv_writev(ORANGEFS_IO_WRITE, file, diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 97adf7d100b5fe..2595453fe7370a 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -533,17 +533,6 @@ do { \ sys_attr.mask = ORANGEFS_ATTR_SYS_ALL_SETABLE; \ } while (0) -static inline void orangefs_i_size_write(struct inode *inode, loff_t i_size) -{ -#if BITS_PER_LONG == 32 && defined(CONFIG_SMP) - inode_lock(inode); -#endif - i_size_write(inode, i_size); -#if BITS_PER_LONG == 32 && defined(CONFIG_SMP) - inode_unlock(inode); -#endif -} - static inline void orangefs_set_timeout(struct dentry *dentry) { unsigned long time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000; From a78e93661c5fd30b9e1dee464b2f62f966883ef7 Mon Sep 17 00:00:00 2001 From: Francois Romieu Date: Fri, 26 Jan 2018 01:53:26 +0100 Subject: [PATCH 52/62] r8169: fix memory corruption on retrieval of hardware statistics. Hardware statistics retrieval hurts in tight invocation loops. Avoid extraneous write and enforce strict ordering of writes targeted to the tally counters dump area address registers. Signed-off-by: Francois Romieu Tested-by: Oliver Freyermuth Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index fc0d5fa65ad4c1..734286ebe5ef55 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -2244,19 +2244,14 @@ static bool rtl8169_do_counters(struct net_device *dev, u32 counter_cmd) void __iomem *ioaddr = tp->mmio_addr; dma_addr_t paddr = tp->counters_phys_addr; u32 cmd; - bool ret; RTL_W32(CounterAddrHigh, (u64)paddr >> 32); + RTL_R32(CounterAddrHigh); cmd = (u64)paddr & DMA_BIT_MASK(32); RTL_W32(CounterAddrLow, cmd); RTL_W32(CounterAddrLow, cmd | counter_cmd); - ret = rtl_udelay_loop_wait_low(tp, &rtl_counters_cond, 10, 1000); - - RTL_W32(CounterAddrLow, 0); - RTL_W32(CounterAddrHigh, 0); - - return ret; + return rtl_udelay_loop_wait_low(tp, &rtl_counters_cond, 10, 1000); } static bool rtl8169_reset_counters(struct net_device *dev) From 1e19c4d689dc1e95bafd23ef68fbc0c6b9e05180 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 24 Jan 2018 19:37:37 -0800 Subject: [PATCH 53/62] net: vrf: Add support for sends to local broadcast address Sukumar reported that sends to the local broadcast address (255.255.255.255) are broken. Check for the address in vrf driver and do not redirect to the VRF device - similar to multicast packets. With this change sockets can use SO_BINDTODEVICE to specify an egress interface and receive responses. Note: the egress interface can not be a VRF device but needs to be the enslaved device. https://bugzilla.kernel.org/show_bug.cgi?id=198521 Reported-by: Sukumar Gopalakrishnan Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index feb1b2e15c2e10..139c61c8244ad9 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -673,8 +673,9 @@ static struct sk_buff *vrf_ip_out(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { - /* don't divert multicast */ - if (ipv4_is_multicast(ip_hdr(skb)->daddr)) + /* don't divert multicast or local broadcast */ + if (ipv4_is_multicast(ip_hdr(skb)->daddr) || + ipv4_is_lbcast(ip_hdr(skb)->daddr)) return skb; if (qdisc_tx_is_default(vrf_dev)) From 5beda7d54eafece4c974cfa9fbb9f60fb18fd20a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 25 Jan 2018 13:12:14 -0800 Subject: [PATCH 54/62] x86/mm/64: Fix vmapped stack syncing on very-large-memory 4-level systems Neil Berrington reported a double-fault on a VM with 768GB of RAM that uses large amounts of vmalloc space with PTI enabled. The cause is that load_new_mm_cr3() was never fixed to take the 5-level pgd folding code into account, so, on a 4-level kernel, the pgd synchronization logic compiles away to exactly nothing. Interestingly, the problem doesn't trigger with nopti. I assume this is because the kernel is mapped with global pages if we boot with nopti. The sequence of operations when we create a new task is that we first load its mm while still running on the old stack (which crashes if the old stack is unmapped in the new mm unless the TLB saves us), then we call prepare_switch_to(), and then we switch to the new stack. prepare_switch_to() pokes the new stack directly, which will populate the mapping through vmalloc_fault(). I assume that we're getting lucky on non-PTI systems -- the old stack's TLB entry stays alive long enough to make it all the way through prepare_switch_to() and switch_to() so that we make it to a valid stack. Fixes: b50858ce3e2a ("x86/mm/vmalloc: Add 5-level paging support") Reported-and-tested-by: Neil Berrington Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Cc: Konstantin Khlebnikov Cc: stable@vger.kernel.org Cc: Dave Hansen Cc: Borislav Petkov Link: https://lkml.kernel.org/r/346541c56caed61abbe693d7d2742b4a380c5001.1516914529.git.luto@kernel.org --- arch/x86/mm/tlb.c | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index a1561957dccbb8..5bfe61a5e8e3c6 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -151,6 +151,34 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, local_irq_restore(flags); } +static void sync_current_stack_to_mm(struct mm_struct *mm) +{ + unsigned long sp = current_stack_pointer; + pgd_t *pgd = pgd_offset(mm, sp); + + if (CONFIG_PGTABLE_LEVELS > 4) { + if (unlikely(pgd_none(*pgd))) { + pgd_t *pgd_ref = pgd_offset_k(sp); + + set_pgd(pgd, *pgd_ref); + } + } else { + /* + * "pgd" is faked. The top level entries are "p4d"s, so sync + * the p4d. This compiles to approximately the same code as + * the 5-level case. + */ + p4d_t *p4d = p4d_offset(pgd, sp); + + if (unlikely(p4d_none(*p4d))) { + pgd_t *pgd_ref = pgd_offset_k(sp); + p4d_t *p4d_ref = p4d_offset(pgd_ref, sp); + + set_p4d(p4d, *p4d_ref); + } + } +} + void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { @@ -226,11 +254,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * mapped in the new pgd, we'll double-fault. Forcibly * map it. */ - unsigned int index = pgd_index(current_stack_pointer); - pgd_t *pgd = next->pgd + index; - - if (unlikely(pgd_none(*pgd))) - set_pgd(pgd, init_mm.pgd[index]); + sync_current_stack_to_mm(next); } /* Stop remote flushes for the previous mm */ From 36b3a7726886f24c4209852a58e64435bde3af98 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 25 Jan 2018 13:12:15 -0800 Subject: [PATCH 55/62] x86/mm/64: Tighten up vmalloc_fault() sanity checks on 5-level kernels On a 5-level kernel, if a non-init mm has a top-level entry, it needs to match init_mm's, but the vmalloc_fault() code skipped over the BUG_ON() that would have checked it. While we're at it, get rid of the rather confusing 4-level folded "pgd" logic. Cleans-up: b50858ce3e2a ("x86/mm/vmalloc: Add 5-level paging support") Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Cc: Konstantin Khlebnikov Cc: Dave Hansen Cc: Borislav Petkov Cc: Neil Berrington Link: https://lkml.kernel.org/r/2ae598f8c279b0a29baf75df207e6f2fdddc0a1b.1516914529.git.luto@kernel.org --- arch/x86/mm/fault.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index b3e40773dce096..800de815519cd1 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -439,18 +439,13 @@ static noinline int vmalloc_fault(unsigned long address) if (pgd_none(*pgd_ref)) return -1; - if (pgd_none(*pgd)) { - set_pgd(pgd, *pgd_ref); - arch_flush_lazy_mmu_mode(); - } else if (CONFIG_PGTABLE_LEVELS > 4) { - /* - * With folded p4d, pgd_none() is always false, so the pgd may - * point to an empty page table entry and pgd_page_vaddr() - * will return garbage. - * - * We will do the correct sanity check on the p4d level. - */ - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + if (CONFIG_PGTABLE_LEVELS > 4) { + if (pgd_none(*pgd)) { + set_pgd(pgd, *pgd_ref); + arch_flush_lazy_mmu_mode(); + } else { + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + } } /* With 4-level paging, copying happens on the p4d level. */ @@ -459,7 +454,7 @@ static noinline int vmalloc_fault(unsigned long address) if (p4d_none(*p4d_ref)) return -1; - if (p4d_none(*p4d)) { + if (p4d_none(*p4d) && CONFIG_PGTABLE_LEVELS == 4) { set_p4d(p4d, *p4d_ref); arch_flush_lazy_mmu_mode(); } else { @@ -470,6 +465,7 @@ static noinline int vmalloc_fault(unsigned long address) * Below here mismatches are bugs because these lower tables * are shared: */ + BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4); pud = pud_offset(p4d, address); pud_ref = pud_offset(p4d_ref, address); From 6572cc2bf2e7b10378eaa5a94a0c717dca1289c9 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Wed, 24 Jan 2018 13:26:11 -0800 Subject: [PATCH 56/62] Update the RISC-V MAINTAINERS file Now that we're upstream in Linux we've been able to make some infrastructure changes so our port works a bit more like other ports. Specifically: * We now have a mailing list specific to the RISC-V Linux port, hosted at lists.infreadead.org. * We now have a kernel.org git tree where work on our port is coordinated. This patch changes the RISC-V maintainers entry to reflect these new bits of infrastructure. Reviewed-by: Christoph Hellwig Signed-off-by: Palmer Dabbelt --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index e3581413420c61..a3c25b18bd2d98 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11651,8 +11651,8 @@ F: drivers/mtd/nand/r852.h RISC-V ARCHITECTURE M: Palmer Dabbelt M: Albert Ou -L: patches@groups.riscv.org -T: git https://github.com/riscv/riscv-linux +L: linux-riscv@lists.infradead.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/palmer/riscv-linux.git S: Supported F: arch/riscv/ K: riscv From dd5684ecae3bd8e44b644f50e2c12c7e57fdfef5 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Fri, 26 Jan 2018 15:14:16 +0300 Subject: [PATCH 57/62] dccp: don't restart ccid2_hc_tx_rto_expire() if sk in closed state ccid2_hc_tx_rto_expire() timer callback always restarts the timer again and can run indefinitely (unless it is stopped outside), and after commit 120e9dabaf55 ("dccp: defer ccid_hc_tx_delete() at dismantle time"), which moved ccid_hc_tx_delete() (also includes sk_stop_timer()) from dccp_destroy_sock() to sk_destruct(), this started to happen quite often. The timer prevents releasing the socket, as a result, sk_destruct() won't be called. Found with LTP/dccp_ipsec tests running on the bonding device, which later couldn't be unloaded after the tests were completed: unregister_netdevice: waiting for bond0 to become free. Usage count = 148 Fixes: 2a91aa396739 ("[DCCP] CCID2: Initial CCID2 (TCP-Like) implementation") Signed-off-by: Alexey Kodanev Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/dccp/ccids/ccid2.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 1c75cd1255f69e..92d016e87816e3 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -140,6 +140,9 @@ static void ccid2_hc_tx_rto_expire(struct timer_list *t) ccid2_pr_debug("RTO_EXPIRE\n"); + if (sk->sk_state == DCCP_CLOSED) + goto out; + /* back-off timer */ hc->tx_rto <<= 1; if (hc->tx_rto > DCCP_RTO_MAX) From ba3169fc7548759be986b168d662e0ba64c2fd88 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Fri, 26 Jan 2018 11:48:25 +0000 Subject: [PATCH 58/62] VSOCK: set POLLOUT | POLLWRNORM for TCP_CLOSING select(2) with wfds but no rfds must return when the socket is shut down by the peer. This way userspace notices socket activity and gets -EPIPE from the next write(2). Currently select(2) does not return for virtio-vsock when a SEND+RCV shutdown packet is received. This is because vsock_poll() only sets POLLOUT | POLLWRNORM for TCP_CLOSE, not the TCP_CLOSING state that the socket is in when the shutdown is received. Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 5d28abf87fbfe7..c9473d698525a3 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -951,7 +951,7 @@ static unsigned int vsock_poll(struct file *file, struct socket *sock, * POLLOUT|POLLWRNORM when peer is closed and nothing to read, * but local send is not shutdown. */ - if (sk->sk_state == TCP_CLOSE) { + if (sk->sk_state == TCP_CLOSE || sk->sk_state == TCP_CLOSING) { if (!(sk->sk_shutdown & SEND_SHUTDOWN)) mask |= POLLOUT | POLLWRNORM; From 8a95b74d50825067fb6c8af7f9db03e711b1cb9d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 25 Jan 2018 11:59:34 -0800 Subject: [PATCH 59/62] x86: Mark hpa as a "Designated Reviewer" for the time being Due to some unfortunate events, I have not been directly involved in the x86 kernel patch flow for a while now. I have also not been able to ramp back up by now like I had hoped to, and after reviewing what I will need to work on both internally at Intel and elsewhere in the near term, it is clear that I am not going to be able to ramp back up until late 2018 at the very earliest. It is not acceptable to not recognize that this load is currently taken by Ingo and Thomas without my direct participation, so I mark myself as R: (designated reviewer) rather than M: (maintainer) until further notice. This is in fact recognizing the de facto situation for the past few years. I have obviously no intention of going away, and I will do everything within my power to improve Linux on x86 and x86 for Linux. This, however, puts credit where it is due and reflects a change of focus. This patch also removes stale entries for portions of the x86 architecture which have not been maintained separately from arch/x86 for a long time. If there is a reason to re-introduce them then that can happen later. Signed-off-by: H. Peter Anvin Signed-off-by: Thomas Gleixner Cc: Bruce Schlobohm Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20180125195934.5253-1-hpa@zytor.com Signed-off-by: Ingo Molnar --- MAINTAINERS | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index e3581413420c61..94976349ff6141 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6609,16 +6609,6 @@ L: linux-i2c@vger.kernel.org S: Maintained F: drivers/i2c/i2c-stub.c -i386 BOOT CODE -M: "H. Peter Anvin" -S: Maintained -F: arch/x86/boot/ - -i386 SETUP CODE / CPU ERRATA WORKAROUNDS -M: "H. Peter Anvin" -T: git git://git.kernel.org/pub/scm/linux/kernel/git/hpa/linux-2.6-x86setup.git -S: Maintained - IA64 (Itanium) PLATFORM M: Tony Luck M: Fenghua Yu @@ -14858,7 +14848,7 @@ F: net/x25/ X86 ARCHITECTURE (32-BIT AND 64-BIT) M: Thomas Gleixner M: Ingo Molnar -M: "H. Peter Anvin" +R: "H. Peter Anvin" M: x86@kernel.org L: linux-kernel@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/core From d5421ea43d30701e03cadc56a38854c36a8b4433 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Jan 2018 14:54:32 +0100 Subject: [PATCH 60/62] hrtimer: Reset hrtimer cpu base proper on CPU hotplug The hrtimer interrupt code contains a hang detection and mitigation mechanism, which prevents that a long delayed hrtimer interrupt causes a continous retriggering of interrupts which prevent the system from making progress. If a hang is detected then the timer hardware is programmed with a certain delay into the future and a flag is set in the hrtimer cpu base which prevents newly enqueued timers from reprogramming the timer hardware prior to the chosen delay. The subsequent hrtimer interrupt after the delay clears the flag and resumes normal operation. If such a hang happens in the last hrtimer interrupt before a CPU is unplugged then the hang_detected flag is set and stays that way when the CPU is plugged in again. At that point the timer hardware is not armed and it cannot be armed because the hang_detected flag is still active, so nothing clears that flag. As a consequence the CPU does not receive hrtimer interrupts and no timers expire on that CPU which results in RCU stalls and other malfunctions. Clear the flag along with some other less critical members of the hrtimer cpu base to ensure starting from a clean state when a CPU is plugged in. Thanks to Paul, Sebastian and Anna-Maria for their help to get down to the root cause of that hard to reproduce heisenbug. Once understood it's trivial and certainly justifies a brown paperbag. Fixes: 41d2e4949377 ("hrtimer: Tune hrtimer_interrupt hang logic") Reported-by: Paul E. McKenney Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Sebastian Sewior Cc: Anna-Maria Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801261447590.2067@nanos --- kernel/time/hrtimer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index d32520840fde9b..aa9d2a2b12109b 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -655,7 +655,9 @@ static void hrtimer_reprogram(struct hrtimer *timer, static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { base->expires_next = KTIME_MAX; + base->hang_detected = 0; base->hres_active = 0; + base->next_timer = NULL; } /* @@ -1589,6 +1591,7 @@ int hrtimers_prepare_cpu(unsigned int cpu) timerqueue_init_head(&cpu_base->clock_base[i].active); } + cpu_base->active_bases = 0; cpu_base->cpu = cpu; hrtimer_init_hres(cpu_base); return 0; From dd085168a74c99c3ebe7f813069e412eb8444243 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sat, 27 Jan 2018 20:21:50 -0600 Subject: [PATCH 61/62] x86/ftrace: Add one more ENDPROC annotation When ORC support was added for the ftrace_64.S code, an ENDPROC for function_hook() was missed. This results in the following warning: arch/x86/kernel/ftrace_64.o: warning: objtool: .entry.text+0x0: unreachable instruction Fixes: e2ac83d74a4d ("x86/ftrace: Fix ORC unwinding from ftrace handlers") Reported-by: Steven Rostedt Reported-by: Borislav Petkov Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: Linus Torvalds Link: https://lkml.kernel.org/r/20180128022150.dqierscqmt3uwwsr@treble --- arch/x86/kernel/ftrace_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 7cb8ba08beb997..8774fd2ed390f7 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -291,7 +291,7 @@ trace: restore_mcount_regs jmp fgraph_trace -END(function_hook) +ENDPROC(function_hook) #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER From d8a5b80568a9cb66810e75b182018e9edb68e8ff Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 28 Jan 2018 13:20:33 -0800 Subject: [PATCH 62/62] Linux 4.15 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 339397b838d3cb..c8b8e902d5a4fd 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 4 PATCHLEVEL = 15 SUBLEVEL = 0 -EXTRAVERSION = -rc9 +EXTRAVERSION = NAME = Fearless Coyote # *DOCUMENTATION*