From 24633d901ea44fe99bc9a2d01a3881fa097b78b3 Mon Sep 17 00:00:00 2001
From: Vaibhav Shankar <vaibhav.shankar@intel.com>
Date: Thu, 13 Aug 2020 19:22:34 -0700
Subject: [PATCH 01/74] perf/x86/intel/uncore: Add BW counters for GT, IA and
 IO breakdown

Linux only has support to read total DDR reads and writes. Here we
add support to enable bandwidth breakdown-GT, IA and IO. Breakdown
of BW is important to debug and optimize memory access. This can also
be used for telemetry and improving the system software.The offsets for
GT, IA and IO are added and these free running counters can be accessed
via MMIO space.

The BW breakdown can be measured using the following cmd:

  perf stat -e uncore_imc/gt_requests/,uncore_imc/ia_requests/,uncore_imc/io_requests/

             30.57 MiB  uncore_imc/gt_requests/
           1346.13 MiB  uncore_imc/ia_requests/
            190.97 MiB  uncore_imc/io_requests/

       5.984572733 seconds time elapsed

     BW/s = <gt,ia,io>_requests/time elapsed

Signed-off-by: Vaibhav Shankar <vaibhav.shankar@intel.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20200814022234.23605-1-vaibhav.shankar@intel.com
---
 arch/x86/events/intel/uncore_snb.c | 52 ++++++++++++++++++++++++++++--
 1 file changed, 49 insertions(+), 3 deletions(-)

diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index cb94ba86efd28f..6a4ca27b2c9e12 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -390,6 +390,18 @@ static struct uncore_event_desc snb_uncore_imc_events[] = {
 	INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"),
 	INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"),
 
+	INTEL_UNCORE_EVENT_DESC(gt_requests, "event=0x03"),
+	INTEL_UNCORE_EVENT_DESC(gt_requests.scale, "6.103515625e-5"),
+	INTEL_UNCORE_EVENT_DESC(gt_requests.unit, "MiB"),
+
+	INTEL_UNCORE_EVENT_DESC(ia_requests, "event=0x04"),
+	INTEL_UNCORE_EVENT_DESC(ia_requests.scale, "6.103515625e-5"),
+	INTEL_UNCORE_EVENT_DESC(ia_requests.unit, "MiB"),
+
+	INTEL_UNCORE_EVENT_DESC(io_requests, "event=0x05"),
+	INTEL_UNCORE_EVENT_DESC(io_requests.scale, "6.103515625e-5"),
+	INTEL_UNCORE_EVENT_DESC(io_requests.unit, "MiB"),
+
 	{ /* end: all zeroes */ },
 };
 
@@ -405,13 +417,35 @@ static struct uncore_event_desc snb_uncore_imc_events[] = {
 #define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE	0x5054
 #define SNB_UNCORE_PCI_IMC_CTR_BASE		SNB_UNCORE_PCI_IMC_DATA_READS_BASE
 
+/* BW break down- legacy counters */
+#define SNB_UNCORE_PCI_IMC_GT_REQUESTS		0x3
+#define SNB_UNCORE_PCI_IMC_GT_REQUESTS_BASE	0x5040
+#define SNB_UNCORE_PCI_IMC_IA_REQUESTS		0x4
+#define SNB_UNCORE_PCI_IMC_IA_REQUESTS_BASE	0x5044
+#define SNB_UNCORE_PCI_IMC_IO_REQUESTS		0x5
+#define SNB_UNCORE_PCI_IMC_IO_REQUESTS_BASE	0x5048
+
 enum perf_snb_uncore_imc_freerunning_types {
-	SNB_PCI_UNCORE_IMC_DATA		= 0,
+	SNB_PCI_UNCORE_IMC_DATA_READS		= 0,
+	SNB_PCI_UNCORE_IMC_DATA_WRITES,
+	SNB_PCI_UNCORE_IMC_GT_REQUESTS,
+	SNB_PCI_UNCORE_IMC_IA_REQUESTS,
+	SNB_PCI_UNCORE_IMC_IO_REQUESTS,
+
 	SNB_PCI_UNCORE_IMC_FREERUNNING_TYPE_MAX,
 };
 
 static struct freerunning_counters snb_uncore_imc_freerunning[] = {
-	[SNB_PCI_UNCORE_IMC_DATA]     = { SNB_UNCORE_PCI_IMC_DATA_READS_BASE, 0x4, 0x0, 2, 32 },
+	[SNB_PCI_UNCORE_IMC_DATA_READS]		= { SNB_UNCORE_PCI_IMC_DATA_READS_BASE,
+							0x0, 0x0, 1, 32 },
+	[SNB_PCI_UNCORE_IMC_DATA_READS]		= { SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE,
+							0x0, 0x0, 1, 32 },
+	[SNB_PCI_UNCORE_IMC_GT_REQUESTS]	= { SNB_UNCORE_PCI_IMC_GT_REQUESTS_BASE,
+							0x0, 0x0, 1, 32 },
+	[SNB_PCI_UNCORE_IMC_IA_REQUESTS]	= { SNB_UNCORE_PCI_IMC_IA_REQUESTS_BASE,
+							0x0, 0x0, 1, 32 },
+	[SNB_PCI_UNCORE_IMC_IO_REQUESTS]	= { SNB_UNCORE_PCI_IMC_IO_REQUESTS_BASE,
+							0x0, 0x0, 1, 32 },
 };
 
 static struct attribute *snb_uncore_imc_formats_attr[] = {
@@ -525,6 +559,18 @@ static int snb_uncore_imc_event_init(struct perf_event *event)
 		base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE;
 		idx = UNCORE_PMC_IDX_FREERUNNING;
 		break;
+	case SNB_UNCORE_PCI_IMC_GT_REQUESTS:
+		base = SNB_UNCORE_PCI_IMC_GT_REQUESTS_BASE;
+		idx = UNCORE_PMC_IDX_FREERUNNING;
+		break;
+	case SNB_UNCORE_PCI_IMC_IA_REQUESTS:
+		base = SNB_UNCORE_PCI_IMC_IA_REQUESTS_BASE;
+		idx = UNCORE_PMC_IDX_FREERUNNING;
+		break;
+	case SNB_UNCORE_PCI_IMC_IO_REQUESTS:
+		base = SNB_UNCORE_PCI_IMC_IO_REQUESTS_BASE;
+		idx = UNCORE_PMC_IDX_FREERUNNING;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -598,7 +644,7 @@ static struct intel_uncore_ops snb_uncore_imc_ops = {
 
 static struct intel_uncore_type snb_uncore_imc = {
 	.name		= "imc",
-	.num_counters   = 2,
+	.num_counters   = 5,
 	.num_boxes	= 1,
 	.num_freerunning_types	= SNB_PCI_UNCORE_IMC_FREERUNNING_TYPE_MAX,
 	.mmio_map_size	= SNB_UNCORE_PCI_IMC_MAP_SIZE,

From 030a2c689fb46e1690f7ded8b194bab7678efb28 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Mon, 3 Aug 2020 13:56:00 +1000
Subject: [PATCH 02/74] powerpc: Fix P10 PVR revision in /proc/cpuinfo for SMT4
 cores

On POWER10 bit 12 in the PVR indicates if the core is SMT4 or SMT8.
Bit 12 is set for SMT4.

Without this patch, /proc/cpuinfo on a SMT4 DD1 POWER10 looks like
this:
  cpu             : POWER10, altivec supported
  revision        : 17.0 (pvr 0080 1100)

Fixes: a3ea40d5c736 ("powerpc: Add POWER10 architected mode")
Cc: stable@vger.kernel.org # v5.8
Signed-off-by: Michael Neuling <mikey@neuling.org>
Reviewed-by: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200803035600.1820371-1-mikey@neuling.org
---
 arch/powerpc/kernel/setup-common.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index b198b0ff25bcf2..808ec9fab6052f 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -311,6 +311,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 				min = pvr & 0xFF;
 				break;
 			case 0x004e: /* POWER9 bits 12-15 give chip type */
+			case 0x0080: /* POWER10 bit 12 gives SMT8/4 */
 				maj = (pvr >> 8) & 0x0F;
 				min = pvr & 0xFF;
 				break;

From 781fa4811d95314c1965c0c3337c9ac36ef26093 Mon Sep 17 00:00:00 2001
From: Anju T Sudhakar <anju@linux.vnet.ibm.com>
Date: Fri, 7 Aug 2020 06:05:00 -0400
Subject: [PATCH 03/74] powerpc/perf: Add support for outputting extended regs
 in perf intr_regs

Add support for perf extended register capability in powerpc. The
capability flag PERF_PMU_CAP_EXTENDED_REGS, is used to indicate the
PMU which support extended registers. The generic code define the mask
of extended registers as 0 for non supported architectures.

Patch adds extended regs support for power9 platform by exposing
MMCR0, MMCR1 and MMCR2 registers.

REG_RESERVED mask needs update to include extended regs.
PERF_REG_EXTENDED_MASK, contains mask value of the supported
registers, is defined at runtime in the kernel based on platform since
the supported registers may differ from one processor version to
another and hence the MASK value.

With the patch:

  available registers: r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11
  r12 r13 r14 r15 r16 r17 r18 r19 r20 r21 r22 r23 r24 r25 r26
  r27 r28 r29 r30 r31 nip msr orig_r3 ctr link xer ccr softe
  trap dar dsisr sier mmcra mmcr0 mmcr1 mmcr2

  PERF_RECORD_SAMPLE(IP, 0x1): 4784/4784: 0 period: 1 addr: 0
  ... intr regs: mask 0xffffffffffff ABI 64-bit
  .... r0    0xc00000000012b77c
  .... r1    0xc000003fe5e03930
  .... r2    0xc000000001b0e000
  .... r3    0xc000003fdcddf800
  .... r4    0xc000003fc7880000
  .... r5    0x9c422724be
  .... r6    0xc000003fe5e03908
  .... r7    0xffffff63bddc8706
  .... r8    0x9e4
  .... r9    0x0
  .... r10   0x1
  .... r11   0x0
  .... r12   0xc0000000001299c0
  .... r13   0xc000003ffffc4800
  .... r14   0x0
  .... r15   0x7fffdd8b8b00
  .... r16   0x0
  .... r17   0x7fffdd8be6b8
  .... r18   0x7e7076607730
  .... r19   0x2f
  .... r20   0xc00000001fc26c68
  .... r21   0xc0002041e4227e00
  .... r22   0xc00000002018fb60
  .... r23   0x1
  .... r24   0xc000003ffec4d900
  .... r25   0x80000000
  .... r26   0x0
  .... r27   0x1
  .... r28   0x1
  .... r29   0xc000000001be1260
  .... r30   0x6008010
  .... r31   0xc000003ffebb7218
  .... nip   0xc00000000012b910
  .... msr   0x9000000000009033
  .... orig_r3 0xc00000000012b86c
  .... ctr   0xc0000000001299c0
  .... link  0xc00000000012b77c
  .... xer   0x0
  .... ccr   0x28002222
  .... softe 0x1
  .... trap  0xf00
  .... dar   0x0
  .... dsisr 0x80000000000
  .... sier  0x0
  .... mmcra 0x80000000000
  .... mmcr0 0x82008090
  .... mmcr1 0x1e000000
  .... mmcr2 0x0
   ... thread: perf:4784

Signed-off-by: Anju T Sudhakar <anju@linux.vnet.ibm.com>
Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Tested-by: Nageswara R Sastry <nasastry@in.ibm.com>
Reviewed-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Reviewed-by: Kajol Jain <kjain@linux.ibm.com>
Reviewed-and-tested-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/1596794701-23530-2-git-send-email-atrajeev@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/perf_event.h        |  3 ++
 arch/powerpc/include/asm/perf_event_server.h |  5 +++
 arch/powerpc/include/uapi/asm/perf_regs.h    | 14 +++++++-
 arch/powerpc/perf/core-book3s.c              |  1 +
 arch/powerpc/perf/perf_regs.c                | 34 ++++++++++++++++++--
 arch/powerpc/perf/power9-pmu.c               |  6 ++++
 6 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/perf_event.h b/arch/powerpc/include/asm/perf_event.h
index 1e8b2e1ec1db6a..daec64d41b4478 100644
--- a/arch/powerpc/include/asm/perf_event.h
+++ b/arch/powerpc/include/asm/perf_event.h
@@ -40,4 +40,7 @@ static inline bool is_sier_available(void) { return false; }
 
 /* To support perf_regs sier update */
 extern bool is_sier_available(void);
+/* To define perf extended regs mask value */
+extern u64 PERF_REG_EXTENDED_MASK;
+#define PERF_REG_EXTENDED_MASK	PERF_REG_EXTENDED_MASK
 #endif
diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h
index 86c9eb064b22b4..f6acabb6c9be80 100644
--- a/arch/powerpc/include/asm/perf_event_server.h
+++ b/arch/powerpc/include/asm/perf_event_server.h
@@ -62,6 +62,11 @@ struct power_pmu {
 	int 		*blacklist_ev;
 	/* BHRB entries in the PMU */
 	int		bhrb_nr;
+	/*
+	 * set this flag with `PERF_PMU_CAP_EXTENDED_REGS` if
+	 * the pmu supports extended perf regs capability
+	 */
+	int		capabilities;
 };
 
 /*
diff --git a/arch/powerpc/include/uapi/asm/perf_regs.h b/arch/powerpc/include/uapi/asm/perf_regs.h
index f599064dd8dc84..225c64c5681358 100644
--- a/arch/powerpc/include/uapi/asm/perf_regs.h
+++ b/arch/powerpc/include/uapi/asm/perf_regs.h
@@ -48,6 +48,18 @@ enum perf_event_powerpc_regs {
 	PERF_REG_POWERPC_DSISR,
 	PERF_REG_POWERPC_SIER,
 	PERF_REG_POWERPC_MMCRA,
-	PERF_REG_POWERPC_MAX,
+	/* Extended registers */
+	PERF_REG_POWERPC_MMCR0,
+	PERF_REG_POWERPC_MMCR1,
+	PERF_REG_POWERPC_MMCR2,
+	/* Max regs without the extended regs */
+	PERF_REG_POWERPC_MAX = PERF_REG_POWERPC_MMCRA + 1,
 };
+
+#define PERF_REG_PMU_MASK	((1ULL << PERF_REG_POWERPC_MAX) - 1)
+
+/* PERF_REG_EXTENDED_MASK value for CPU_FTR_ARCH_300 */
+#define PERF_REG_PMU_MASK_300   (((1ULL << (PERF_REG_POWERPC_MMCR2 + 1)) - 1) - PERF_REG_PMU_MASK)
+
+#define PERF_REG_MAX_ISA_300   (PERF_REG_POWERPC_MMCR2 + 1)
 #endif /* _UAPI_ASM_POWERPC_PERF_REGS_H */
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 78fe3498659427..00038650a007fd 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2323,6 +2323,7 @@ int register_power_pmu(struct power_pmu *pmu)
 		pmu->name);
 
 	power_pmu.attr_groups = ppmu->attr_groups;
+	power_pmu.capabilities |= (ppmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS);
 
 #ifdef MSR_HV
 	/*
diff --git a/arch/powerpc/perf/perf_regs.c b/arch/powerpc/perf/perf_regs.c
index a213a0aa5d25cf..9301e6852d5325 100644
--- a/arch/powerpc/perf/perf_regs.c
+++ b/arch/powerpc/perf/perf_regs.c
@@ -13,9 +13,11 @@
 #include <asm/ptrace.h>
 #include <asm/perf_regs.h>
 
+u64 PERF_REG_EXTENDED_MASK;
+
 #define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r)
 
-#define REG_RESERVED (~((1ULL << PERF_REG_POWERPC_MAX) - 1))
+#define REG_RESERVED (~(PERF_REG_EXTENDED_MASK | PERF_REG_PMU_MASK))
 
 static unsigned int pt_regs_offset[PERF_REG_POWERPC_MAX] = {
 	PT_REGS_OFFSET(PERF_REG_POWERPC_R0,  gpr[0]),
@@ -69,10 +71,26 @@ static unsigned int pt_regs_offset[PERF_REG_POWERPC_MAX] = {
 	PT_REGS_OFFSET(PERF_REG_POWERPC_MMCRA, dsisr),
 };
 
+/* Function to return the extended register values */
+static u64 get_ext_regs_value(int idx)
+{
+	switch (idx) {
+	case PERF_REG_POWERPC_MMCR0:
+		return mfspr(SPRN_MMCR0);
+	case PERF_REG_POWERPC_MMCR1:
+		return mfspr(SPRN_MMCR1);
+	case PERF_REG_POWERPC_MMCR2:
+		return mfspr(SPRN_MMCR2);
+	default: return 0;
+	}
+}
+
 u64 perf_reg_value(struct pt_regs *regs, int idx)
 {
-	if (WARN_ON_ONCE(idx >= PERF_REG_POWERPC_MAX))
-		return 0;
+	u64 perf_reg_extended_max = PERF_REG_POWERPC_MAX;
+
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		perf_reg_extended_max = PERF_REG_MAX_ISA_300;
 
 	if (idx == PERF_REG_POWERPC_SIER &&
 	   (IS_ENABLED(CONFIG_FSL_EMB_PERF_EVENT) ||
@@ -85,6 +103,16 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
 	    IS_ENABLED(CONFIG_PPC32)))
 		return 0;
 
+	if (idx >= PERF_REG_POWERPC_MAX && idx < perf_reg_extended_max)
+		return get_ext_regs_value(idx);
+
+	/*
+	 * If the idx is referring to value beyond the
+	 * supported registers, return 0 with a warning
+	 */
+	if (WARN_ON_ONCE(idx >= perf_reg_extended_max))
+		return 0;
+
 	return regs_get_register(regs, pt_regs_offset[idx]);
 }
 
diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c
index 05dae38b969a5f..2a57e93a79dcf7 100644
--- a/arch/powerpc/perf/power9-pmu.c
+++ b/arch/powerpc/perf/power9-pmu.c
@@ -90,6 +90,8 @@ enum {
 #define POWER9_MMCRA_IFM3		0x00000000C0000000UL
 #define POWER9_MMCRA_BHRB_MASK		0x00000000C0000000UL
 
+extern u64 PERF_REG_EXTENDED_MASK;
+
 /* Nasty Power9 specific hack */
 #define PVR_POWER9_CUMULUS		0x00002000
 
@@ -434,6 +436,7 @@ static struct power_pmu power9_pmu = {
 	.cache_events		= &power9_cache_events,
 	.attr_groups		= power9_pmu_attr_groups,
 	.bhrb_nr		= 32,
+	.capabilities           = PERF_PMU_CAP_EXTENDED_REGS,
 };
 
 int init_power9_pmu(void)
@@ -457,6 +460,9 @@ int init_power9_pmu(void)
 		}
 	}
 
+	/* Set the PERF_REG_EXTENDED_MASK here */
+	PERF_REG_EXTENDED_MASK = PERF_REG_PMU_MASK_300;
+
 	rc = register_power_pmu(&power9_pmu);
 	if (rc)
 		return rc;

From d735599a069f6936c1392e07075c34a19bda949a Mon Sep 17 00:00:00 2001
From: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Date: Fri, 7 Aug 2020 06:05:01 -0400
Subject: [PATCH 04/74] powerpc/perf: Add extended regs support for power10
 platform

Include capability flag PERF_PMU_CAP_EXTENDED_REGS for power10 and
expose MMCR3, SIER2, SIER3 registers as part of extended regs. Also
introduce PERF_REG_PMU_MASK_31 to define extended mask value at
runtime for power10.

Suggested-by: Ryan Grimm <grimm@linux.ibm.com>
Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Tested-by: Nageswara R Sastry <nasastry@in.ibm.com>
Reviewed-by: Kajol Jain <kjain@linux.ibm.com>
Reviewed-and-tested-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/1596794701-23530-3-git-send-email-atrajeev@linux.vnet.ibm.com
---
 arch/powerpc/include/uapi/asm/perf_regs.h |  6 ++++++
 arch/powerpc/perf/perf_regs.c             | 12 +++++++++++-
 arch/powerpc/perf/power10-pmu.c           |  6 ++++++
 3 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/uapi/asm/perf_regs.h b/arch/powerpc/include/uapi/asm/perf_regs.h
index 225c64c5681358..bdf5f10f8b9f50 100644
--- a/arch/powerpc/include/uapi/asm/perf_regs.h
+++ b/arch/powerpc/include/uapi/asm/perf_regs.h
@@ -52,6 +52,9 @@ enum perf_event_powerpc_regs {
 	PERF_REG_POWERPC_MMCR0,
 	PERF_REG_POWERPC_MMCR1,
 	PERF_REG_POWERPC_MMCR2,
+	PERF_REG_POWERPC_MMCR3,
+	PERF_REG_POWERPC_SIER2,
+	PERF_REG_POWERPC_SIER3,
 	/* Max regs without the extended regs */
 	PERF_REG_POWERPC_MAX = PERF_REG_POWERPC_MMCRA + 1,
 };
@@ -60,6 +63,9 @@ enum perf_event_powerpc_regs {
 
 /* PERF_REG_EXTENDED_MASK value for CPU_FTR_ARCH_300 */
 #define PERF_REG_PMU_MASK_300   (((1ULL << (PERF_REG_POWERPC_MMCR2 + 1)) - 1) - PERF_REG_PMU_MASK)
+/* PERF_REG_EXTENDED_MASK value for CPU_FTR_ARCH_31 */
+#define PERF_REG_PMU_MASK_31   (((1ULL << (PERF_REG_POWERPC_SIER3 + 1)) - 1) - PERF_REG_PMU_MASK)
 
 #define PERF_REG_MAX_ISA_300   (PERF_REG_POWERPC_MMCR2 + 1)
+#define PERF_REG_MAX_ISA_31    (PERF_REG_POWERPC_SIER3 + 1)
 #endif /* _UAPI_ASM_POWERPC_PERF_REGS_H */
diff --git a/arch/powerpc/perf/perf_regs.c b/arch/powerpc/perf/perf_regs.c
index 9301e6852d5325..8e53f2fc3fe07f 100644
--- a/arch/powerpc/perf/perf_regs.c
+++ b/arch/powerpc/perf/perf_regs.c
@@ -81,6 +81,14 @@ static u64 get_ext_regs_value(int idx)
 		return mfspr(SPRN_MMCR1);
 	case PERF_REG_POWERPC_MMCR2:
 		return mfspr(SPRN_MMCR2);
+#ifdef CONFIG_PPC64
+	case PERF_REG_POWERPC_MMCR3:
+		return mfspr(SPRN_MMCR3);
+	case PERF_REG_POWERPC_SIER2:
+		return mfspr(SPRN_SIER2);
+	case PERF_REG_POWERPC_SIER3:
+		return mfspr(SPRN_SIER3);
+#endif
 	default: return 0;
 	}
 }
@@ -89,7 +97,9 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
 {
 	u64 perf_reg_extended_max = PERF_REG_POWERPC_MAX;
 
-	if (cpu_has_feature(CPU_FTR_ARCH_300))
+	if (cpu_has_feature(CPU_FTR_ARCH_31))
+		perf_reg_extended_max = PERF_REG_MAX_ISA_31;
+	else if (cpu_has_feature(CPU_FTR_ARCH_300))
 		perf_reg_extended_max = PERF_REG_MAX_ISA_300;
 
 	if (idx == PERF_REG_POWERPC_SIER &&
diff --git a/arch/powerpc/perf/power10-pmu.c b/arch/powerpc/perf/power10-pmu.c
index f7cff7f36a1cd4..83148656b5241e 100644
--- a/arch/powerpc/perf/power10-pmu.c
+++ b/arch/powerpc/perf/power10-pmu.c
@@ -87,6 +87,8 @@
 #define POWER10_MMCRA_IFM3		0x00000000C0000000UL
 #define POWER10_MMCRA_BHRB_MASK		0x00000000C0000000UL
 
+extern u64 PERF_REG_EXTENDED_MASK;
+
 /* Table of alternatives, sorted by column 0 */
 static const unsigned int power10_event_alternatives[][MAX_ALT] = {
 	{ PM_RUN_CYC_ALT,		PM_RUN_CYC },
@@ -397,6 +399,7 @@ static struct power_pmu power10_pmu = {
 	.cache_events		= &power10_cache_events,
 	.attr_groups		= power10_pmu_attr_groups,
 	.bhrb_nr		= 32,
+	.capabilities           = PERF_PMU_CAP_EXTENDED_REGS,
 };
 
 int init_power10_pmu(void)
@@ -408,6 +411,9 @@ int init_power10_pmu(void)
 	    strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power10"))
 		return -ENODEV;
 
+	/* Set the PERF_REG_EXTENDED_MASK here */
+	PERF_REG_EXTENDED_MASK = PERF_REG_PMU_MASK_31;
+
 	rc = register_power_pmu(&power10_pmu);
 	if (rc)
 		return rc;

From 327da008e65a25b8206b36b7fc0c9e4edbb36a58 Mon Sep 17 00:00:00 2001
From: Madhavan Srinivasan <maddy@linux.ibm.com>
Date: Mon, 17 Aug 2020 06:26:18 +0530
Subject: [PATCH 05/74] powerpc: Add POWER10 raw mode cputable entry

Add a raw mode cputable entry for POWER10. Copies most of the fields
from commit a3ea40d5c736 ("powerpc: Add POWER10 architected mode")
except for oprofile_cpu_type, machine_check_early, pvr_mask and
pvr_mask fields. On bare metal systems we use DT CPU features, which
doesn't need a cputable entry. But in VMs we still rely on the raw
cputable entry to set the correct values for the PMU related fields.

Fixes: a3ea40d5c736 ("powerpc: Add POWER10 architected mode")
Signed-off-by: Madhavan Srinivasan <maddy@linux.ibm.com>
[mpe: Reorder vs cleanup patch and add Fixes tag]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200817005618.3305028-2-maddy@linux.ibm.com
---
 arch/powerpc/kernel/cputable.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index 3d406a9626e86e..93b98666066474 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -75,6 +75,7 @@ extern void __restore_cpu_power10(void);
 extern long __machine_check_early_realmode_p7(struct pt_regs *regs);
 extern long __machine_check_early_realmode_p8(struct pt_regs *regs);
 extern long __machine_check_early_realmode_p9(struct pt_regs *regs);
+extern long __machine_check_early_realmode_p10(struct pt_regs *regs);
 #endif /* CONFIG_PPC64 */
 #if defined(CONFIG_E500)
 extern void __setup_cpu_e5500(unsigned long offset, struct cpu_spec* spec);
@@ -542,6 +543,25 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.machine_check_early	= __machine_check_early_realmode_p9,
 		.platform		= "power9",
 	},
+	{	/* Power10 */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x00800000,
+		.cpu_name		= "POWER10 (raw)",
+		.cpu_features		= CPU_FTRS_POWER10,
+		.cpu_user_features	= COMMON_USER_POWER10,
+		.cpu_user_features2	= COMMON_USER2_POWER10,
+		.mmu_features		= MMU_FTRS_POWER10,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.num_pmcs		= 6,
+		.pmc_type		= PPC_PMC_IBM,
+		.oprofile_cpu_type	= "ppc64/power10",
+		.oprofile_type		= PPC_OPROFILE_INVALID,
+		.cpu_setup		= __setup_cpu_power10,
+		.cpu_restore		= __restore_cpu_power10,
+		.machine_check_early	= __machine_check_early_realmode_p10,
+		.platform		= "power10",
+	},
 	{	/* Cell Broadband Engine */
 		.pvr_mask		= 0xffff0000,
 		.pvr_value		= 0x00700000,

From 388692e943a58f28aac0fe83e75f5994da9ff8a1 Mon Sep 17 00:00:00 2001
From: Madhavan Srinivasan <maddy@linux.ibm.com>
Date: Mon, 17 Aug 2020 06:26:17 +0530
Subject: [PATCH 06/74] powerpc/kernel: Cleanup machine check function
 declarations

__machine_check_early_realmode_p*() are currently declared as extern
in cputable.c and because of this when compiled with "C=1" (which
enables semantic checker) produces these warnings.

    CHECK   arch/powerpc/kernel/mce_power.c
  arch/powerpc/kernel/mce_power.c:709:6: warning: symbol '__machine_check_early_realmode_p7' was not declared. Should it be static?
  arch/powerpc/kernel/mce_power.c:717:6: warning: symbol '__machine_check_early_realmode_p8' was not declared. Should it be static?
  arch/powerpc/kernel/mce_power.c:722:6: warning: symbol '__machine_check_early_realmode_p9' was not declared. Should it be static?
  arch/powerpc/kernel/mce_power.c:740:6: warning: symbol '__machine_check_early_realmode_p10' was not declared. Should it be static?

Patch here moves the declaration to asm/mce.h and includes the same in
cputable.c

Fixes: ae744f3432d3 ("powerpc/book3s: Flush SLB/TLBs if we get SLB/TLB machine check errors on power8")
Fixes: 7b9f71f974a1 ("powerpc/64s: POWER9 machine check handler")
Signed-off-by: Madhavan Srinivasan <maddy@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200817005618.3305028-1-maddy@linux.ibm.com
---
 arch/powerpc/include/asm/cputable.h | 5 +++++
 arch/powerpc/include/asm/mce.h      | 7 +++++++
 arch/powerpc/kernel/cputable.c      | 4 ----
 arch/powerpc/kernel/dt_cpu_ftrs.c   | 4 ----
 4 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index fdddb822d564b0..e005b458102327 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -9,6 +9,11 @@
 
 #ifndef __ASSEMBLY__
 
+/*
+ * Added to include __machine_check_early_realmode_* functions
+ */
+#include <asm/mce.h>
+
 /* This structure can grow, it's real size is used by head.S code
  * via the mkdefs mechanism.
  */
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index adf2cda67f9a40..89aa8248a57ddd 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -210,6 +210,9 @@ struct mce_error_info {
 #define MCE_EVENT_RELEASE	true
 #define MCE_EVENT_DONTRELEASE	false
 
+struct pt_regs;
+struct notifier_block;
+
 extern void save_mce_event(struct pt_regs *regs, long handled,
 			   struct mce_error_info *mce_err, uint64_t nip,
 			   uint64_t addr, uint64_t phys_addr);
@@ -225,5 +228,9 @@ int mce_register_notifier(struct notifier_block *nb);
 int mce_unregister_notifier(struct notifier_block *nb);
 #ifdef CONFIG_PPC_BOOK3S_64
 void flush_and_reload_slb(void);
+long __machine_check_early_realmode_p7(struct pt_regs *regs);
+long __machine_check_early_realmode_p8(struct pt_regs *regs);
+long __machine_check_early_realmode_p9(struct pt_regs *regs);
+long __machine_check_early_realmode_p10(struct pt_regs *regs);
 #endif /* CONFIG_PPC_BOOK3S_64 */
 #endif /* __ASM_PPC64_MCE_H__ */
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index 93b98666066474..2aa89c6b289679 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -72,10 +72,6 @@ extern void __setup_cpu_power9(unsigned long offset, struct cpu_spec* spec);
 extern void __restore_cpu_power9(void);
 extern void __setup_cpu_power10(unsigned long offset, struct cpu_spec* spec);
 extern void __restore_cpu_power10(void);
-extern long __machine_check_early_realmode_p7(struct pt_regs *regs);
-extern long __machine_check_early_realmode_p8(struct pt_regs *regs);
-extern long __machine_check_early_realmode_p9(struct pt_regs *regs);
-extern long __machine_check_early_realmode_p10(struct pt_regs *regs);
 #endif /* CONFIG_PPC64 */
 #if defined(CONFIG_E500)
 extern void __setup_cpu_e5500(unsigned long offset, struct cpu_spec* spec);
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c
index 6f8c0c6b937a17..8dc46f38680b25 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -64,10 +64,6 @@ struct dt_cpu_feature {
  * Set up the base CPU
  */
 
-extern long __machine_check_early_realmode_p8(struct pt_regs *regs);
-extern long __machine_check_early_realmode_p9(struct pt_regs *regs);
-extern long __machine_check_early_realmode_p10(struct pt_regs *regs);
-
 static int hv_mode;
 
 static struct {

From 1e4e4bcaf70ec89f8b499c93a83d078c1e5c0ea6 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Date: Mon, 17 Aug 2020 16:03:01 +0530
Subject: [PATCH 07/74] powerpc/pkeys: Fix build error with PPC_MEM_KEYS
 disabled
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

IS_ENABLED() instead of #ifdef still requires variable declaration.
In this specific case, default_uamor is declared in asm/pkeys.h which
is only included if PPC_MEM_KEYS is enabled.

arch/powerpc/mm/book3s64/hash_utils.c: In function ‘hash__early_init_mmu_secondary’:
arch/powerpc/mm/book3s64/hash_utils.c:1119:21: error: ‘default_uamor’ undeclared (first use in this function)
 1119 |   mtspr(SPRN_UAMOR, default_uamor);
      |                     ^~~~~~~~~~~~~

Fixes: 6553fb799f60 ("powerpc/pkeys: Fix boot failures with Nemo board (A-EON AmigaOne X1000)")
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200817103301.158836-1-aneesh.kumar@linux.ibm.com
---
 arch/powerpc/mm/book3s64/hash_utils.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index 1da9dbba92171b..890a71c5293eef 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -1115,8 +1115,10 @@ void hash__early_init_mmu_secondary(void)
 			&& cpu_has_feature(CPU_FTR_HVMODE))
 		tlbiel_all();
 
-	if (IS_ENABLED(CONFIG_PPC_MEM_KEYS) && mmu_has_feature(MMU_FTR_PKEY))
+#ifdef CONFIG_PPC_MEM_KEYS
+	if (mmu_has_feature(MMU_FTR_PKEY))
 		mtspr(SPRN_UAMOR, default_uamor);
+#endif
 }
 #endif /* CONFIG_SMP */
 

From fdc6edbb31fba76fd25d7bd016b675a92908d81e Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Mon, 17 Aug 2020 06:03:26 +0000
Subject: [PATCH 08/74] powerpc/fixmap: Fix the size of the early debug area

Commit ("03fd42d458fb powerpc/fixmap: Fix FIX_EARLY_DEBUG_BASE when
page size is 256k") reworked the setup of the early debug area and
mistakenly replaced 128 * 1024 by SZ_128.

Change to SZ_128K to restore the original 128 kbytes size of the area.

Fixes: 03fd42d458fb ("powerpc/fixmap: Fix FIX_EARLY_DEBUG_BASE when page size is 256k")
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/996184974d674ff984643778cf1cdd7fe58cc065.1597644194.git.christophe.leroy@csgroup.eu
---
 arch/powerpc/include/asm/fixmap.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/fixmap.h b/arch/powerpc/include/asm/fixmap.h
index 925cf89cbf4bac..6bfc87915d5dba 100644
--- a/arch/powerpc/include/asm/fixmap.h
+++ b/arch/powerpc/include/asm/fixmap.h
@@ -52,7 +52,7 @@ enum fixed_addresses {
 	FIX_HOLE,
 	/* reserve the top 128K for early debugging purposes */
 	FIX_EARLY_DEBUG_TOP = FIX_HOLE,
-	FIX_EARLY_DEBUG_BASE = FIX_EARLY_DEBUG_TOP+(ALIGN(SZ_128, PAGE_SIZE)/PAGE_SIZE)-1,
+	FIX_EARLY_DEBUG_BASE = FIX_EARLY_DEBUG_TOP+(ALIGN(SZ_128K, PAGE_SIZE)/PAGE_SIZE)-1,
 #ifdef CONFIG_HIGHMEM
 	FIX_KMAP_BEGIN,	/* reserved pte's for temporary kernel mappings */
 	FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,

From cf28f3bbfca097d956f9021cb710dfad56adcc62 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Mon, 17 Aug 2020 10:42:14 -0700
Subject: [PATCH 09/74] bpf: Use get_file_rcu() instead of get_file() for
 task_file iterator

With latest `bpftool prog` command, we observed the following kernel
panic.
    BUG: kernel NULL pointer dereference, address: 0000000000000000
    #PF: supervisor instruction fetch in kernel mode
    #PF: error_code(0x0010) - not-present page
    PGD dfe894067 P4D dfe894067 PUD deb663067 PMD 0
    Oops: 0010 [#1] SMP
    CPU: 9 PID: 6023 ...
    RIP: 0010:0x0
    Code: Bad RIP value.
    RSP: 0000:ffffc900002b8f18 EFLAGS: 00010286
    RAX: ffff8883a405f400 RBX: ffff888e46a6bf00 RCX: 000000008020000c
    RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff8883a405f400
    RBP: ffff888e46a6bf50 R08: 0000000000000000 R09: ffffffff81129600
    R10: ffff8883a405f300 R11: 0000160000000000 R12: 0000000000002710
    R13: 000000e9494b690c R14: 0000000000000202 R15: 0000000000000009
    FS:  00007fd9187fe700(0000) GS:ffff888e46a40000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    CR2: ffffffffffffffd6 CR3: 0000000de5d33002 CR4: 0000000000360ee0
    DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
    DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
    Call Trace:
     <IRQ>
     rcu_core+0x1a4/0x440
     __do_softirq+0xd3/0x2c8
     irq_exit+0x9d/0xa0
     smp_apic_timer_interrupt+0x68/0x120
     apic_timer_interrupt+0xf/0x20
     </IRQ>
    RIP: 0033:0x47ce80
    Code: Bad RIP value.
    RSP: 002b:00007fd9187fba40 EFLAGS: 00000206 ORIG_RAX: ffffffffffffff13
    RAX: 0000000000000002 RBX: 00007fd931789160 RCX: 000000000000010c
    RDX: 00007fd9308cdfb4 RSI: 00007fd9308cdfb4 RDI: 00007ffedd1ea0a8
    RBP: 00007fd9187fbab0 R08: 000000000000000e R09: 000000000000002a
    R10: 0000000000480210 R11: 00007fd9187fc570 R12: 00007fd9316cc400
    R13: 0000000000000118 R14: 00007fd9308cdfb4 R15: 00007fd9317a9380

After further analysis, the bug is triggered by
Commit eaaacd23910f ("bpf: Add task and task/file iterator targets")
which introduced task_file bpf iterator, which traverses all open file
descriptors for all tasks in the current namespace.
The latest `bpftool prog` calls a task_file bpf program to traverse
all files in the system in order to associate processes with progs/maps, etc.
When traversing files for a given task, rcu read_lock is taken to
access all files in a file_struct. But it used get_file() to grab
a file, which is not right. It is possible file->f_count is 0 and
get_file() will unconditionally increase it.
Later put_file() may cause all kind of issues with the above
as one of sympotoms.

The failure can be reproduced with the following steps in a few seconds:
    $ cat t.c
    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/stat.h>
    #include <fcntl.h>
    #include <unistd.h>

    #define N 10000
    int fd[N];
    int main() {
      int i;

      for (i = 0; i < N; i++) {
        fd[i] = open("./note.txt", 'r');
        if (fd[i] < 0) {
           fprintf(stderr, "failed\n");
           return -1;
        }
      }
      for (i = 0; i < N; i++)
        close(fd[i]);

      return 0;
    }
    $ gcc -O2 t.c
    $ cat run.sh
    #/bin/bash
    for i in {1..100}
    do
      while true; do ./a.out; done &
    done
    $ ./run.sh
    $ while true; do bpftool prog >& /dev/null; done

This patch used get_file_rcu() which only grabs a file if the
file->f_count is not zero. This is to ensure the file pointer
is always valid. The above reproducer did not fail for more
than 30 minutes.

Fixes: eaaacd23910f ("bpf: Add task and task/file iterator targets")
Suggested-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Link: https://lore.kernel.org/bpf/20200817174214.252601-1-yhs@fb.com
---
 kernel/bpf/task_iter.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 232df29793e962..f21b5e1e4540e1 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -178,10 +178,11 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info,
 		f = fcheck_files(curr_files, curr_fd);
 		if (!f)
 			continue;
+		if (!get_file_rcu(f))
+			continue;
 
 		/* set info->fd */
 		info->fd = curr_fd;
-		get_file(f);
 		rcu_read_unlock();
 		return f;
 	}

From 48d2f0407be7a36e8f20be37ec9121e021ef3964 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Wed, 5 Aug 2020 15:27:29 +0000
Subject: [PATCH 10/74] powerpc/kasan: Fix KASAN_SHADOW_START on BOOK3S_32

On BOOK3S_32, when we have modules and strict kernel RWX, modules
are not in vmalloc space but in a dedicated segment that is
below PAGE_OFFSET.

So KASAN_SHADOW_START must take it into account.

MODULES_VADDR can't be used because it is not defined yet
in kasan.h

Fixes: 6ca055322da8 ("powerpc/32s: Use dedicated segment for modules with STRICT_KERNEL_RWX")
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/6eddca2d5611fd57312a88eae31278c87a8fc99d.1596641224.git.christophe.leroy@csgroup.eu
---
 arch/powerpc/include/asm/kasan.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h
index d635b96c7ea69b..7355ed05e65eda 100644
--- a/arch/powerpc/include/asm/kasan.h
+++ b/arch/powerpc/include/asm/kasan.h
@@ -15,11 +15,18 @@
 #ifndef __ASSEMBLY__
 
 #include <asm/page.h>
+#include <linux/sizes.h>
 
 #define KASAN_SHADOW_SCALE_SHIFT	3
 
+#if defined(CONFIG_PPC_BOOK3S_32) && defined(CONFIG_MODULES) && defined(CONFIG_STRICT_KERNEL_RWX)
+#define KASAN_KERN_START	ALIGN_DOWN(PAGE_OFFSET - SZ_256M, SZ_256M)
+#else
+#define KASAN_KERN_START	PAGE_OFFSET
+#endif
+
 #define KASAN_SHADOW_START	(KASAN_SHADOW_OFFSET + \
-				 (PAGE_OFFSET >> KASAN_SHADOW_SCALE_SHIFT))
+				 (KASAN_KERN_START >> KASAN_SHADOW_SCALE_SHIFT))
 
 #define KASAN_SHADOW_OFFSET	ASM_CONST(CONFIG_KASAN_SHADOW_OFFSET)
 

From 7bee31ad8e2f6c276f36993346ac70f4d4c80e45 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Wed, 5 Aug 2020 15:27:28 +0000
Subject: [PATCH 11/74] powerpc/32s: Fix is_module_segment() when MODULES_VADDR
 is defined

When MODULES_VADDR is defined, is_module_segment() shall check the
address against it instead of checking agains VMALLOC_START.

Fixes: 6ca055322da8 ("powerpc/32s: Use dedicated segment for modules with STRICT_KERNEL_RWX")
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/07884ed033c31e074747b7eb8eaa329d15db07ec.1596641219.git.christophe.leroy@csgroup.eu
---
 arch/powerpc/mm/book3s32/mmu.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index c0162911f6cbdc..82ae9e06a77324 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -191,10 +191,17 @@ static bool is_module_segment(unsigned long addr)
 {
 	if (!IS_ENABLED(CONFIG_MODULES))
 		return false;
+#ifdef MODULES_VADDR
+	if (addr < ALIGN_DOWN(MODULES_VADDR, SZ_256M))
+		return false;
+	if (addr >= ALIGN(MODULES_END, SZ_256M))
+		return false;
+#else
 	if (addr < ALIGN_DOWN(VMALLOC_START, SZ_256M))
 		return false;
 	if (addr >= ALIGN(VMALLOC_END, SZ_256M))
 		return false;
+#endif
 	return true;
 }
 

From 801980f6497946048709b9b09771a1729551d705 Mon Sep 17 00:00:00 2001
From: Michael Roth <mdroth@linux.vnet.ibm.com>
Date: Tue, 11 Aug 2020 11:15:44 -0500
Subject: [PATCH 12/74] powerpc/pseries/hotplug-cpu: wait indefinitely for vCPU
 death

For a power9 KVM guest with XIVE enabled, running a test loop
where we hotplug 384 vcpus and then unplug them, the following traces
can be seen (generally within a few loops) either from the unplugged
vcpu:

  cpu 65 (hwid 65) Ready to die...
  Querying DEAD? cpu 66 (66) shows 2
  list_del corruption. next->prev should be c00a000002470208, but was c00a000002470048
  ------------[ cut here ]------------
  kernel BUG at lib/list_debug.c:56!
  Oops: Exception in kernel mode, sig: 5 [#1]
  LE SMP NR_CPUS=2048 NUMA pSeries
  Modules linked in: fuse nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 ...
  CPU: 66 PID: 0 Comm: swapper/66 Kdump: loaded Not tainted 4.18.0-221.el8.ppc64le #1
  NIP:  c0000000007ab50c LR: c0000000007ab508 CTR: 00000000000003ac
  REGS: c0000009e5a17840 TRAP: 0700   Not tainted  (4.18.0-221.el8.ppc64le)
  MSR:  800000000282b033 <SF,VEC,VSX,EE,FP,ME,IR,DR,RI,LE>  CR: 28000842  XER: 20040000
  ...
  NIP __list_del_entry_valid+0xac/0x100
  LR  __list_del_entry_valid+0xa8/0x100
  Call Trace:
    __list_del_entry_valid+0xa8/0x100 (unreliable)
    free_pcppages_bulk+0x1f8/0x940
    free_unref_page+0xd0/0x100
    xive_spapr_cleanup_queue+0x148/0x1b0
    xive_teardown_cpu+0x1bc/0x240
    pseries_mach_cpu_die+0x78/0x2f0
    cpu_die+0x48/0x70
    arch_cpu_idle_dead+0x20/0x40
    do_idle+0x2f4/0x4c0
    cpu_startup_entry+0x38/0x40
    start_secondary+0x7bc/0x8f0
    start_secondary_prolog+0x10/0x14

or on the worker thread handling the unplug:

  pseries-hotplug-cpu: Attempting to remove CPU <NULL>, drc index: 1000013a
  Querying DEAD? cpu 314 (314) shows 2
  BUG: Bad page state in process kworker/u768:3  pfn:95de1
  cpu 314 (hwid 314) Ready to die...
  page:c00a000002577840 refcount:0 mapcount:-128 mapping:0000000000000000 index:0x0
  flags: 0x5ffffc00000000()
  raw: 005ffffc00000000 5deadbeef0000100 5deadbeef0000200 0000000000000000
  raw: 0000000000000000 0000000000000000 00000000ffffff7f 0000000000000000
  page dumped because: nonzero mapcount
  Modules linked in: kvm xt_CHECKSUM ipt_MASQUERADE xt_conntrack ...
  CPU: 0 PID: 548 Comm: kworker/u768:3 Kdump: loaded Not tainted 4.18.0-224.el8.bz1856588.ppc64le #1
  Workqueue: pseries hotplug workque pseries_hp_work_fn
  Call Trace:
    dump_stack+0xb0/0xf4 (unreliable)
    bad_page+0x12c/0x1b0
    free_pcppages_bulk+0x5bc/0x940
    page_alloc_cpu_dead+0x118/0x120
    cpuhp_invoke_callback.constprop.5+0xb8/0x760
    _cpu_down+0x188/0x340
    cpu_down+0x5c/0xa0
    cpu_subsys_offline+0x24/0x40
    device_offline+0xf0/0x130
    dlpar_offline_cpu+0x1c4/0x2a0
    dlpar_cpu_remove+0xb8/0x190
    dlpar_cpu_remove_by_index+0x12c/0x150
    dlpar_cpu+0x94/0x800
    pseries_hp_work_fn+0x128/0x1e0
    process_one_work+0x304/0x5d0
    worker_thread+0xcc/0x7a0
    kthread+0x1ac/0x1c0
    ret_from_kernel_thread+0x5c/0x80

The latter trace is due to the following sequence:

  page_alloc_cpu_dead
    drain_pages
      drain_pages_zone
        free_pcppages_bulk

where drain_pages() in this case is called under the assumption that
the unplugged cpu is no longer executing. To ensure that is the case,
and early call is made to __cpu_die()->pseries_cpu_die(), which runs a
loop that waits for the cpu to reach a halted state by polling its
status via query-cpu-stopped-state RTAS calls. It only polls for 25
iterations before giving up, however, and in the trace above this
results in the following being printed only .1 seconds after the
hotplug worker thread begins processing the unplug request:

  pseries-hotplug-cpu: Attempting to remove CPU <NULL>, drc index: 1000013a
  Querying DEAD? cpu 314 (314) shows 2

At that point the worker thread assumes the unplugged CPU is in some
unknown/dead state and procedes with the cleanup, causing the race
with the XIVE cleanup code executed by the unplugged CPU.

Fix this by waiting indefinitely, but also making an effort to avoid
spurious lockup messages by allowing for rescheduling after polling
the CPU status and printing a warning if we wait for longer than 120s.

Fixes: eac1e731b59ee ("powerpc/xive: guest exploitation of the XIVE interrupt controller")
Suggested-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
Tested-by: Greg Kurz <groug@kaod.org>
Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
[mpe: Trim oopses in change log slightly for readability]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200811161544.10513-1-mdroth@linux.vnet.ibm.com
---
 arch/powerpc/platforms/pseries/hotplug-cpu.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index c6e0d8abf75ea9..7a974ed6b2400e 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -107,22 +107,28 @@ static int pseries_cpu_disable(void)
  */
 static void pseries_cpu_die(unsigned int cpu)
 {
-	int tries;
 	int cpu_status = 1;
 	unsigned int pcpu = get_hard_smp_processor_id(cpu);
+	unsigned long timeout = jiffies + msecs_to_jiffies(120000);
 
-	for (tries = 0; tries < 25; tries++) {
+	while (true) {
 		cpu_status = smp_query_cpu_stopped(pcpu);
 		if (cpu_status == QCSS_STOPPED ||
 		    cpu_status == QCSS_HARDWARE_ERROR)
 			break;
-		cpu_relax();
 
+		if (time_after(jiffies, timeout)) {
+			pr_warn("CPU %i (hwid %i) didn't die after 120 seconds\n",
+				cpu, pcpu);
+			timeout = jiffies + msecs_to_jiffies(120000);
+		}
+
+		cond_resched();
 	}
 
-	if (cpu_status != 0) {
-		printk("Querying DEAD? cpu %i (%i) shows %i\n",
-		       cpu, pcpu, cpu_status);
+	if (cpu_status == QCSS_HARDWARE_ERROR) {
+		pr_warn("CPU %i (hwid %i) reported error while dying\n",
+			cpu, pcpu);
 	}
 
 	/* Isolation and deallocation are definitely done by

From 45bc6098a3e279d8e391d22428396687562797e2 Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Tue, 7 Jul 2020 12:43:24 -0700
Subject: [PATCH 13/74] EDAC/{i7core,sb,pnd2,skx}: Fix error event severity

IA32_MCG_STATUS.RIPV indicates whether the return RIP value pushed onto
the stack as part of machine check delivery is valid or not.

Various drivers copied a code fragment that uses the RIPV bit to
determine the severity of the error as either HW_EVENT_ERR_UNCORRECTED
or HW_EVENT_ERR_FATAL, but this check is reversed (marking errors where
RIPV is set as "FATAL").

Reverse the tests so that the error is marked fatal when RIPV is not set.

Reported-by: Gabriele Paoloni <gabriele.paoloni@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: <stable@vger.kernel.org>
Link: https://lkml.kernel.org/r/20200707194324.14884-1-tony.luck@intel.com
---
 drivers/edac/i7core_edac.c | 4 ++--
 drivers/edac/pnd2_edac.c   | 2 +-
 drivers/edac/sb_edac.c     | 4 ++--
 drivers/edac/skx_common.c  | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c
index 5860ca41185cf1..2acd9f9284a260 100644
--- a/drivers/edac/i7core_edac.c
+++ b/drivers/edac/i7core_edac.c
@@ -1710,9 +1710,9 @@ static void i7core_mce_output_error(struct mem_ctl_info *mci,
 	if (uncorrected_error) {
 		core_err_cnt = 1;
 		if (ripv)
-			tp_event = HW_EVENT_ERR_FATAL;
-		else
 			tp_event = HW_EVENT_ERR_UNCORRECTED;
+		else
+			tp_event = HW_EVENT_ERR_FATAL;
 	} else {
 		tp_event = HW_EVENT_ERR_CORRECTED;
 	}
diff --git a/drivers/edac/pnd2_edac.c b/drivers/edac/pnd2_edac.c
index fd363746f5b03a..b8fc4b84fd8672 100644
--- a/drivers/edac/pnd2_edac.c
+++ b/drivers/edac/pnd2_edac.c
@@ -1155,7 +1155,7 @@ static void pnd2_mce_output_error(struct mem_ctl_info *mci, const struct mce *m,
 	u32 optypenum = GET_BITFIELD(m->status, 4, 6);
 	int rc;
 
-	tp_event = uc_err ? (ripv ? HW_EVENT_ERR_FATAL : HW_EVENT_ERR_UNCORRECTED) :
+	tp_event = uc_err ? (ripv ? HW_EVENT_ERR_UNCORRECTED : HW_EVENT_ERR_FATAL) :
 						 HW_EVENT_ERR_CORRECTED;
 
 	/*
diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c
index d414698ca32421..c5ab634cb6a49a 100644
--- a/drivers/edac/sb_edac.c
+++ b/drivers/edac/sb_edac.c
@@ -2982,9 +2982,9 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
 	if (uncorrected_error) {
 		core_err_cnt = 1;
 		if (ripv) {
-			tp_event = HW_EVENT_ERR_FATAL;
-		} else {
 			tp_event = HW_EVENT_ERR_UNCORRECTED;
+		} else {
+			tp_event = HW_EVENT_ERR_FATAL;
 		}
 	} else {
 		tp_event = HW_EVENT_ERR_CORRECTED;
diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c
index 6d8d6dc626bfee..2b4ce8e5ac2fa6 100644
--- a/drivers/edac/skx_common.c
+++ b/drivers/edac/skx_common.c
@@ -493,9 +493,9 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
 	if (uncorrected_error) {
 		core_err_cnt = 1;
 		if (ripv) {
-			tp_event = HW_EVENT_ERR_FATAL;
-		} else {
 			tp_event = HW_EVENT_ERR_UNCORRECTED;
+		} else {
+			tp_event = HW_EVENT_ERR_FATAL;
 		}
 	} else {
 		tp_event = HW_EVENT_ERR_CORRECTED;

From 3fb1a96a91120877488071a167d26d76be4be977 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andriin@fb.com>
Date: Tue, 18 Aug 2020 09:44:56 -0700
Subject: [PATCH 14/74] libbpf: Fix build on ppc64le architecture
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On ppc64le we get the following warning:

  In file included from btf_dump.c:16:0:
  btf_dump.c: In function ‘btf_dump_emit_struct_def’:
  ../include/linux/kernel.h:20:17: error: comparison of distinct pointer types lacks a cast [-Werror]
    (void) (&_max1 == &_max2);  \
                   ^
  btf_dump.c:882:11: note: in expansion of macro ‘max’
      m_sz = max(0LL, btf__resolve_size(d->btf, m->type));
             ^~~

Fix by explicitly casting to __s64, which is a return type from
btf__resolve_size().

Fixes: 702eddc77a90 ("libbpf: Handle GCC built-in types for Arm NEON")
Signed-off-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200818164456.1181661-1-andriin@fb.com
---
 tools/lib/bpf/btf_dump.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c
index fe39bd77469768..57c00fa6393290 100644
--- a/tools/lib/bpf/btf_dump.c
+++ b/tools/lib/bpf/btf_dump.c
@@ -879,7 +879,7 @@ static void btf_dump_emit_struct_def(struct btf_dump *d,
 			btf_dump_printf(d, ": %d", m_sz);
 			off = m_off + m_sz;
 		} else {
-			m_sz = max(0LL, btf__resolve_size(d->btf, m->type));
+			m_sz = max((__s64)0, btf__resolve_size(d->btf, m->type));
 			off = m_off + m_sz * 8;
 		}
 		btf_dump_printf(d, ";");

From 8b61fba503904acae24aeb2bd5569b4d6544d48f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alvin=20=C5=A0ipraga?= <alsi@bang-olufsen.dk>
Date: Tue, 18 Aug 2020 10:51:34 +0200
Subject: [PATCH 15/74] macvlan: validate setting of multiple remote source MAC
 addresses
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remote source MAC addresses can be set on a 'source mode' macvlan
interface via the IFLA_MACVLAN_MACADDR_DATA attribute. This commit
tightens the validation of these MAC addresses to match the validation
already performed when setting or adding a single MAC address via the
IFLA_MACVLAN_MACADDR attribute.

iproute2 uses IFLA_MACVLAN_MACADDR_DATA for its 'macvlan macaddr set'
command, and IFLA_MACVLAN_MACADDR for its 'macvlan macaddr add' command,
which demonstrates the inconsistent behaviour that this commit
addresses:

 # ip link add link eth0 name macvlan0 type macvlan mode source
 # ip link set link dev macvlan0 type macvlan macaddr add 01:00:00:00:00:00
 RTNETLINK answers: Cannot assign requested address
 # ip link set link dev macvlan0 type macvlan macaddr set 01:00:00:00:00:00
 # ip -d link show macvlan0
 5: macvlan0@eth0: <BROADCAST,MULTICAST,DYNAMIC,UP,LOWER_UP> mtu 1500 ...
     link/ether 2e:ac:fd:2d:69:f8 brd ff:ff:ff:ff:ff:ff promiscuity 0
     macvlan mode source remotes (1) 01:00:00:00:00:00 numtxqueues 1 ...

With this change, the 'set' command will (rightly) fail in the same way
as the 'add' command.

Signed-off-by: Alvin Šipraga <alsi@bang-olufsen.dk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/macvlan.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 4942f6112e51f8..5da04e99798943 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -1269,6 +1269,9 @@ static void macvlan_port_destroy(struct net_device *dev)
 static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[],
 			    struct netlink_ext_ack *extack)
 {
+	struct nlattr *nla, *head;
+	int rem, len;
+
 	if (tb[IFLA_ADDRESS]) {
 		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
 			return -EINVAL;
@@ -1316,6 +1319,20 @@ static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[],
 			return -EADDRNOTAVAIL;
 	}
 
+	if (data[IFLA_MACVLAN_MACADDR_DATA]) {
+		head = nla_data(data[IFLA_MACVLAN_MACADDR_DATA]);
+		len = nla_len(data[IFLA_MACVLAN_MACADDR_DATA]);
+
+		nla_for_each_attr(nla, head, len, rem) {
+			if (nla_type(nla) != IFLA_MACVLAN_MACADDR ||
+			    nla_len(nla) != ETH_ALEN)
+				return -EINVAL;
+
+			if (!is_valid_ether_addr(nla_data(nla)))
+				return -EADDRNOTAVAIL;
+		}
+	}
+
 	if (data[IFLA_MACVLAN_MACADDR_COUNT])
 		return -EINVAL;
 
@@ -1372,10 +1389,6 @@ static int macvlan_changelink_sources(struct macvlan_dev *vlan, u32 mode,
 		len = nla_len(data[IFLA_MACVLAN_MACADDR_DATA]);
 
 		nla_for_each_attr(nla, head, len, rem) {
-			if (nla_type(nla) != IFLA_MACVLAN_MACADDR ||
-			    nla_len(nla) != ETH_ALEN)
-				continue;
-
 			addr = nla_data(nla);
 			ret = macvlan_hash_add_source(vlan, addr);
 			if (ret)

From db06ea341fcd1752fbdb58454507faa140e3842f Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Tue, 18 Aug 2020 13:43:30 +0100
Subject: [PATCH 16/74] sfc: really check hash is valid before using it

Actually hook up the .rx_buf_hash_valid method in EF100's nic_type.

Fixes: 068885434ccb ("sfc: check hash is valid before using it")
Reported-by: Martin Habets <mhabets@solarflare.com>
Signed-off-by: Edward Cree <ecree@solarflare.com>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/sfc/ef100_nic.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/sfc/ef100_nic.c b/drivers/net/ethernet/sfc/ef100_nic.c
index 206d70f9d95b0a..b8a7e9ed791362 100644
--- a/drivers/net/ethernet/sfc/ef100_nic.c
+++ b/drivers/net/ethernet/sfc/ef100_nic.c
@@ -739,6 +739,7 @@ const struct efx_nic_type ef100_pf_nic_type = {
 	.rx_remove = efx_mcdi_rx_remove,
 	.rx_write = ef100_rx_write,
 	.rx_packet = __ef100_rx_packet,
+	.rx_buf_hash_valid = ef100_rx_buf_hash_valid,
 	.fini_dmaq = efx_fini_dmaq,
 	.max_rx_ip_filters = EFX_MCDI_FILTER_TBL_ROWS,
 	.filter_table_probe = ef100_filter_table_up,
@@ -820,6 +821,7 @@ const struct efx_nic_type ef100_vf_nic_type = {
 	.rx_remove = efx_mcdi_rx_remove,
 	.rx_write = ef100_rx_write,
 	.rx_packet = __ef100_rx_packet,
+	.rx_buf_hash_valid = ef100_rx_buf_hash_valid,
 	.fini_dmaq = efx_fini_dmaq,
 	.max_rx_ip_filters = EFX_MCDI_FILTER_TBL_ROWS,
 	.filter_table_probe = ef100_filter_table_up,

From 9cbbc451098ec1e9942886023203b2247dec94bd Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Tue, 18 Aug 2020 13:43:57 +0100
Subject: [PATCH 17/74] sfc: take correct lock in ef100_reset()

When downing and upping the ef100 filter table, we need to take a write
 lock on efx->filter_sem, not just a read lock, because we may kfree()
 the table pointers.
Without this, resets cause a WARN_ON from efx_rwsem_assert_write_locked().

Fixes: a9dc3d5612ce ("sfc_ef100: RX filter table management and related gubbins")
Signed-off-by: Edward Cree <ecree@solarflare.com>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/sfc/ef100_nic.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/sfc/ef100_nic.c b/drivers/net/ethernet/sfc/ef100_nic.c
index b8a7e9ed791362..19fe86b3b3169b 100644
--- a/drivers/net/ethernet/sfc/ef100_nic.c
+++ b/drivers/net/ethernet/sfc/ef100_nic.c
@@ -431,18 +431,18 @@ static int ef100_reset(struct efx_nic *efx, enum reset_type reset_type)
 		/* A RESET_TYPE_ALL will cause filters to be removed, so we remove filters
 		 * and reprobe after reset to avoid removing filters twice
 		 */
-		down_read(&efx->filter_sem);
+		down_write(&efx->filter_sem);
 		ef100_filter_table_down(efx);
-		up_read(&efx->filter_sem);
+		up_write(&efx->filter_sem);
 		rc = efx_mcdi_reset(efx, reset_type);
 		if (rc)
 			return rc;
 
 		netif_device_attach(efx->net_dev);
 
-		down_read(&efx->filter_sem);
+		down_write(&efx->filter_sem);
 		rc = ef100_filter_table_up(efx);
-		up_read(&efx->filter_sem);
+		up_write(&efx->filter_sem);
 		if (rc)
 			return rc;
 

From 788f920a0f137baa4dbc1efdd5039c4a0a01b8d7 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Tue, 18 Aug 2020 13:44:18 +0100
Subject: [PATCH 18/74] sfc: null out channel->rps_flow_id after freeing it

If an ef100_net_open() fails, ef100_net_stop() may be called without
 channel->rps_flow_id having been written; thus it may hold the address
 freed by a previous ef100_net_stop()'s call to efx_remove_filters().
 This then causes a double-free when efx_remove_filters() is called
 again, leading to a panic.
To prevent this, after freeing it, overwrite it with NULL.

Fixes: a9dc3d5612ce ("sfc_ef100: RX filter table management and related gubbins")
Signed-off-by: Edward Cree <ecree@solarflare.com>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/sfc/rx_common.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/sfc/rx_common.c b/drivers/net/ethernet/sfc/rx_common.c
index ef9bca92b0b792..5e29284c89c982 100644
--- a/drivers/net/ethernet/sfc/rx_common.c
+++ b/drivers/net/ethernet/sfc/rx_common.c
@@ -849,6 +849,7 @@ void efx_remove_filters(struct efx_nic *efx)
 	efx_for_each_channel(channel, efx) {
 		cancel_delayed_work_sync(&channel->filter_work);
 		kfree(channel->rps_flow_id);
+		channel->rps_flow_id = NULL;
 	}
 #endif
 	down_write(&efx->filter_sem);

From e6a43910d55d09dae65772ad571d4c61e459b17a Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Tue, 18 Aug 2020 13:44:50 +0100
Subject: [PATCH 19/74] sfc: don't free_irq()s if they were never requested

If efx_nic_init_interrupt fails, or was never run (e.g. due to an earlier
 failure in ef100_net_open), freeing irqs in efx_nic_fini_interrupt is not
 needed and will cause error messages and stack traces.
So instead, only do this if efx_nic_init_interrupt successfully completed,
 as indicated by the new efx->irqs_hooked flag.

Fixes: 965b549f3c20 ("sfc_ef100: implement ndo_open/close and EVQ probing")
Signed-off-by: Edward Cree <ecree@solarflare.com>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/sfc/net_driver.h | 2 ++
 drivers/net/ethernet/sfc/nic.c        | 4 ++++
 2 files changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index dcb741d8bd119c..062462a1384756 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -846,6 +846,7 @@ struct efx_async_filter_insertion {
  * @timer_quantum_ns: Interrupt timer quantum, in nanoseconds
  * @timer_max_ns: Interrupt timer maximum value, in nanoseconds
  * @irq_rx_adaptive: Adaptive IRQ moderation enabled for RX event queues
+ * @irqs_hooked: Channel interrupts are hooked
  * @irq_rx_mod_step_us: Step size for IRQ moderation for RX event queues
  * @irq_rx_moderation_us: IRQ moderation time for RX event queues
  * @msg_enable: Log message enable flags
@@ -1004,6 +1005,7 @@ struct efx_nic {
 	unsigned int timer_quantum_ns;
 	unsigned int timer_max_ns;
 	bool irq_rx_adaptive;
+	bool irqs_hooked;
 	unsigned int irq_mod_step_us;
 	unsigned int irq_rx_moderation_us;
 	u32 msg_enable;
diff --git a/drivers/net/ethernet/sfc/nic.c b/drivers/net/ethernet/sfc/nic.c
index d994d136bb0380..d1e908846f5dd9 100644
--- a/drivers/net/ethernet/sfc/nic.c
+++ b/drivers/net/ethernet/sfc/nic.c
@@ -129,6 +129,7 @@ int efx_nic_init_interrupt(struct efx_nic *efx)
 #endif
 	}
 
+	efx->irqs_hooked = true;
 	return 0;
 
  fail2:
@@ -154,6 +155,8 @@ void efx_nic_fini_interrupt(struct efx_nic *efx)
 	efx->net_dev->rx_cpu_rmap = NULL;
 #endif
 
+	if (!efx->irqs_hooked)
+		return;
 	if (EFX_INT_MODE_USE_MSI(efx)) {
 		/* Disable MSI/MSI-X interrupts */
 		efx_for_each_channel(channel, efx)
@@ -163,6 +166,7 @@ void efx_nic_fini_interrupt(struct efx_nic *efx)
 		/* Disable legacy interrupt */
 		free_irq(efx->legacy_irq, efx);
 	}
+	efx->irqs_hooked = false;
 }
 
 /* Register dump */

From 335956421c86f64fd46186d76d3961f6adcff187 Mon Sep 17 00:00:00 2001
From: Ganji Aravind <ganji.aravind@chelsio.com>
Date: Tue, 18 Aug 2020 21:10:57 +0530
Subject: [PATCH 20/74] cxgb4: Fix work request size calculation for loopback
 test

Work request used for sending loopback packet needs to add
the firmware work request only once. So, fix by using
correct structure size.

Fixes: 7235ffae3d2c ("cxgb4: add loopback ethtool self-test")
Signed-off-by: Ganji Aravind <ganji.aravind@chelsio.com>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/sge.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c
index d2b587d1670aa6..7c9fe4bc235bb7 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c
@@ -2553,8 +2553,8 @@ int cxgb4_selftest_lb_pkt(struct net_device *netdev)
 
 	pkt_len = ETH_HLEN + sizeof(CXGB4_SELFTEST_LB_STR);
 
-	flits = DIV_ROUND_UP(pkt_len + sizeof(struct cpl_tx_pkt) +
-			     sizeof(*wr), sizeof(__be64));
+	flits = DIV_ROUND_UP(pkt_len + sizeof(*cpl) + sizeof(*wr),
+			     sizeof(__be64));
 	ndesc = flits_to_desc(flits);
 
 	lb = &pi->ethtool_lb;

From c650e04898072e4b579cbf8d9dd5b86bcdbe9b00 Mon Sep 17 00:00:00 2001
From: Ganji Aravind <ganji.aravind@chelsio.com>
Date: Tue, 18 Aug 2020 21:10:58 +0530
Subject: [PATCH 21/74] cxgb4: Fix race between loopback and normal Tx path

Even after Tx queues are marked stopped, there exists a
small window where the current packet in the normal Tx
path is still being sent out and loopback selftest ends
up corrupting the same Tx ring. So, ensure selftest takes
the Tx lock to synchronize access the Tx ring.

Fixes: 7235ffae3d2c ("cxgb4: add loopback ethtool self-test")
Signed-off-by: Ganji Aravind <ganji.aravind@chelsio.com>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/sge.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c
index 7c9fe4bc235bb7..869431a1eedd48 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c
@@ -2561,11 +2561,14 @@ int cxgb4_selftest_lb_pkt(struct net_device *netdev)
 	lb->loopback = 1;
 
 	q = &adap->sge.ethtxq[pi->first_qset];
+	__netif_tx_lock(q->txq, smp_processor_id());
 
 	reclaim_completed_tx(adap, &q->q, -1, true);
 	credits = txq_avail(&q->q) - ndesc;
-	if (unlikely(credits < 0))
+	if (unlikely(credits < 0)) {
+		__netif_tx_unlock(q->txq);
 		return -ENOMEM;
+	}
 
 	wr = (void *)&q->q.desc[q->q.pidx];
 	memset(wr, 0, sizeof(struct tx_desc));
@@ -2598,6 +2601,7 @@ int cxgb4_selftest_lb_pkt(struct net_device *netdev)
 	init_completion(&lb->completion);
 	txq_advance(&q->q, ndesc);
 	cxgb4_ring_tx_db(adap, &q->q, ndesc);
+	__netif_tx_unlock(q->txq);
 
 	/* wait for the pkt to return */
 	ret = wait_for_completion_timeout(&lb->completion, 10 * HZ);

From 989e4da042ca4a56bbaca9223d1a93639ad11e17 Mon Sep 17 00:00:00 2001
From: Sumera Priyadarsini <sylphrenadin@gmail.com>
Date: Wed, 19 Aug 2020 00:22:41 +0530
Subject: [PATCH 22/74] net: gianfar: Add of_node_put() before goto statement

Every iteration of for_each_available_child_of_node() decrements
reference count of the previous node, however when control
is transferred from the middle of the loop, as in the case of
a return or break or goto, there is no decrement thus ultimately
resulting in a memory leak.

Fix a potential memory leak in gianfar.c by inserting of_node_put()
before the goto statement.

Issue found with Coccinelle.

Signed-off-by: Sumera Priyadarsini <sylphrenadin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/freescale/gianfar.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c
index b513b8c5c3b5e5..41dd3d0f345244 100644
--- a/drivers/net/ethernet/freescale/gianfar.c
+++ b/drivers/net/ethernet/freescale/gianfar.c
@@ -750,8 +750,10 @@ static int gfar_of_init(struct platform_device *ofdev, struct net_device **pdev)
 				continue;
 
 			err = gfar_parse_group(child, priv, model);
-			if (err)
+			if (err) {
+				of_node_put(child);
 				goto err_grp_init;
+			}
 		}
 	} else { /* SQ_SG_MODE */
 		err = gfar_parse_group(np, priv, model);

From eabe861881a733fc84f286f4d5a1ffaddd4f526f Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Sat, 15 Aug 2020 04:46:41 -0400
Subject: [PATCH 23/74] net: handle the return value of pskb_carve_frag_list()
 correctly

pskb_carve_frag_list() may return -ENOMEM in pskb_carve_inside_nonlinear().
we should handle this correctly or we would get wrong sk_buff.

Fixes: 6fa01ccd8830 ("skbuff: Add pskb_extract() helper function")
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 5c3b906aeef3dc..e18184ffa9c3fa 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5987,9 +5987,13 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
 	if (skb_has_frag_list(skb))
 		skb_clone_fraglist(skb);
 
-	if (k == 0) {
-		/* split line is in frag list */
-		pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask);
+	/* split line is in frag list */
+	if (k == 0 && pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask)) {
+		/* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */
+		if (skb_has_frag_list(skb))
+			kfree_skb_list(skb_shinfo(skb)->frag_list);
+		kfree(data);
+		return -ENOMEM;
 	}
 	skb_release_data(skb);
 

From 0410d07190961ac526f05085765a8d04d926545b Mon Sep 17 00:00:00 2001
From: Jiri Wiesner <jwiesner@suse.com>
Date: Sun, 16 Aug 2020 20:52:44 +0200
Subject: [PATCH 24/74] bonding: fix active-backup failover for current ARP
 slave

When the ARP monitor is used for link detection, ARP replies are
validated for all slaves (arp_validate=3) and fail_over_mac is set to
active, two slaves of an active-backup bond may get stuck in a state
where both of them are active and pass packets that they receive to
the bond. This state makes IPv6 duplicate address detection fail. The
state is reached thus:
1. The current active slave goes down because the ARP target
   is not reachable.
2. The current ARP slave is chosen and made active.
3. A new slave is enslaved. This new slave becomes the current active
   slave and can reach the ARP target.
As a result, the current ARP slave stays active after the enslave
action has finished and the log is littered with "PROBE BAD" messages:
> bond0: PROBE: c_arp ens10 && cas ens11 BAD
The workaround is to remove the slave with "going back" status from
the bond and re-enslave it. This issue was encountered when DPDK PMD
interfaces were being enslaved to an active-backup bond.

I would be possible to fix the issue in bond_enslave() or
bond_change_active_slave() but the ARP monitor was fixed instead to
keep most of the actions changing the current ARP slave in the ARP
monitor code. The current ARP slave is set as inactive and backup
during the commit phase. A new state, BOND_LINK_FAIL, has been
introduced for slaves in the context of the ARP monitor. This allows
administrators to see how slaves are rotated for sending ARP requests
and attempts are made to find a new active slave.

Fixes: b2220cad583c9 ("bonding: refactor ARP active-backup monitor")
Signed-off-by: Jiri Wiesner <jwiesner@suse.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 415a37e44caeb7..c5d3032dd1a215 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2948,6 +2948,9 @@ static int bond_ab_arp_inspect(struct bonding *bond)
 			if (bond_time_in_interval(bond, last_rx, 1)) {
 				bond_propose_link_state(slave, BOND_LINK_UP);
 				commit++;
+			} else if (slave->link == BOND_LINK_BACK) {
+				bond_propose_link_state(slave, BOND_LINK_FAIL);
+				commit++;
 			}
 			continue;
 		}
@@ -3056,6 +3059,19 @@ static void bond_ab_arp_commit(struct bonding *bond)
 
 			continue;
 
+		case BOND_LINK_FAIL:
+			bond_set_slave_link_state(slave, BOND_LINK_FAIL,
+						  BOND_SLAVE_NOTIFY_NOW);
+			bond_set_slave_inactive_flags(slave,
+						      BOND_SLAVE_NOTIFY_NOW);
+
+			/* A slave has just been enslaved and has become
+			 * the current active slave.
+			 */
+			if (rtnl_dereference(bond->curr_active_slave))
+				RCU_INIT_POINTER(bond->current_arp_slave, NULL);
+			continue;
+
 		default:
 			slave_err(bond->dev, slave->dev,
 				  "impossible: link_new_state %d on slave\n",
@@ -3106,8 +3122,6 @@ static bool bond_ab_arp_probe(struct bonding *bond)
 			return should_notify_rtnl;
 	}
 
-	bond_set_slave_inactive_flags(curr_arp_slave, BOND_SLAVE_NOTIFY_LATER);
-
 	bond_for_each_slave_rcu(bond, slave, iter) {
 		if (!found && !before && bond_slave_is_up(slave))
 			before = slave;

From 4ef1a7cb08e94da1f2f2a34ee6cefe7ae142dc98 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 17 Aug 2020 14:30:49 +0800
Subject: [PATCH 25/74] ipv6: some fixes for ipv6_dev_find()

This patch is to do 3 things for ipv6_dev_find():

  As David A. noticed,

  - rt6_lookup() is not really needed. Different from __ip_dev_find(),
    ipv6_dev_find() doesn't have a compatibility problem, so remove it.

  As Hideaki suggested,

  - "valid" (non-tentative) check for the address is also needed.
    ipv6_chk_addr() calls ipv6_chk_addr_and_flags(), which will
    traverse the address hash list, but it's heavy to be called
    inside ipv6_dev_find(). This patch is to reuse the code of
    ipv6_chk_addr_and_flags() for ipv6_dev_find().

  - dev parameter is passed into ipv6_dev_find(), as link-local
    addresses from user space has sin6_scope_id set and the dev
    lookup needs it.

Fixes: 81f6cb31222d ("ipv6: add ipv6_dev_find()")
Suggested-by: YOSHIFUJI Hideaki <hideaki.yoshifuji@miraclelinux.com>
Reported-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/addrconf.h |  3 ++-
 net/ipv6/addrconf.c    | 60 ++++++++++++++++--------------------------
 net/tipc/udp_media.c   |  8 +++---
 3 files changed, 28 insertions(+), 43 deletions(-)

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index ba3f6c15ad2b5e..18f783dcd55fa3 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -97,7 +97,8 @@ bool ipv6_chk_custom_prefix(const struct in6_addr *addr,
 
 int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev);
 
-struct net_device *ipv6_dev_find(struct net *net, const struct in6_addr *addr);
+struct net_device *ipv6_dev_find(struct net *net, const struct in6_addr *addr,
+				 struct net_device *dev);
 
 struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net,
 				     const struct in6_addr *addr,
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 8e761b8c47c690..01146b66d6669f 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1893,12 +1893,13 @@ EXPORT_SYMBOL(ipv6_chk_addr);
  *   2. does the address exist on the specific device
  *      (skip_dev_check = false)
  */
-int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
-			    const struct net_device *dev, bool skip_dev_check,
-			    int strict, u32 banned_flags)
+static struct net_device *
+__ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
+			  const struct net_device *dev, bool skip_dev_check,
+			  int strict, u32 banned_flags)
 {
 	unsigned int hash = inet6_addr_hash(net, addr);
-	const struct net_device *l3mdev;
+	struct net_device *l3mdev, *ndev;
 	struct inet6_ifaddr *ifp;
 	u32 ifp_flags;
 
@@ -1909,10 +1910,11 @@ int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
 		dev = NULL;
 
 	hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) {
-		if (!net_eq(dev_net(ifp->idev->dev), net))
+		ndev = ifp->idev->dev;
+		if (!net_eq(dev_net(ndev), net))
 			continue;
 
-		if (l3mdev_master_dev_rcu(ifp->idev->dev) != l3mdev)
+		if (l3mdev_master_dev_rcu(ndev) != l3mdev)
 			continue;
 
 		/* Decouple optimistic from tentative for evaluation here.
@@ -1923,15 +1925,23 @@ int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
 			    : ifp->flags;
 		if (ipv6_addr_equal(&ifp->addr, addr) &&
 		    !(ifp_flags&banned_flags) &&
-		    (!dev || ifp->idev->dev == dev ||
+		    (!dev || ndev == dev ||
 		     !(ifp->scope&(IFA_LINK|IFA_HOST) || strict))) {
 			rcu_read_unlock();
-			return 1;
+			return ndev;
 		}
 	}
 
 	rcu_read_unlock();
-	return 0;
+	return NULL;
+}
+
+int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
+			    const struct net_device *dev, bool skip_dev_check,
+			    int strict, u32 banned_flags)
+{
+	return __ipv6_chk_addr_and_flags(net, addr, dev, skip_dev_check,
+					 strict, banned_flags) ? 1 : 0;
 }
 EXPORT_SYMBOL(ipv6_chk_addr_and_flags);
 
@@ -1990,35 +2000,11 @@ EXPORT_SYMBOL(ipv6_chk_prefix);
  *
  * The caller should be protected by RCU, or RTNL.
  */
-struct net_device *ipv6_dev_find(struct net *net, const struct in6_addr *addr)
+struct net_device *ipv6_dev_find(struct net *net, const struct in6_addr *addr,
+				 struct net_device *dev)
 {
-	unsigned int hash = inet6_addr_hash(net, addr);
-	struct inet6_ifaddr *ifp, *result = NULL;
-	struct net_device *dev = NULL;
-
-	rcu_read_lock();
-	hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) {
-		if (net_eq(dev_net(ifp->idev->dev), net) &&
-		    ipv6_addr_equal(&ifp->addr, addr)) {
-			result = ifp;
-			break;
-		}
-	}
-
-	if (!result) {
-		struct rt6_info *rt;
-
-		rt = rt6_lookup(net, addr, NULL, 0, NULL, 0);
-		if (rt) {
-			dev = rt->dst.dev;
-			ip6_rt_put(rt);
-		}
-	} else {
-		dev = result->idev->dev;
-	}
-	rcu_read_unlock();
-
-	return dev;
+	return __ipv6_chk_addr_and_flags(net, addr, dev, !dev, 1,
+					 IFA_F_TENTATIVE);
 }
 EXPORT_SYMBOL(ipv6_dev_find);
 
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index 53f0de0676b7a5..911d13cd2e675a 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -660,6 +660,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
 	struct udp_tunnel_sock_cfg tuncfg = {NULL};
 	struct nlattr *opts[TIPC_NLA_UDP_MAX + 1];
 	u8 node_id[NODE_ID_LEN] = {0,};
+	struct net_device *dev;
 	int rmcast = 0;
 
 	ub = kzalloc(sizeof(*ub), GFP_ATOMIC);
@@ -714,8 +715,6 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
 	rcu_assign_pointer(ub->bearer, b);
 	tipc_udp_media_addr_set(&b->addr, &local);
 	if (local.proto == htons(ETH_P_IP)) {
-		struct net_device *dev;
-
 		dev = __ip_dev_find(net, local.ipv4.s_addr, false);
 		if (!dev) {
 			err = -ENODEV;
@@ -738,9 +737,8 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
 		b->mtu = b->media->mtu;
 #if IS_ENABLED(CONFIG_IPV6)
 	} else if (local.proto == htons(ETH_P_IPV6)) {
-		struct net_device *dev;
-
-		dev = ipv6_dev_find(net, &local.ipv6);
+		dev = ub->ifindex ? __dev_get_by_index(net, ub->ifindex) : NULL;
+		dev = ipv6_dev_find(net, &local.ipv6, dev);
 		if (!dev) {
 			err = -ENODEV;
 			goto err;

From 840110a4eae190dcbb9907d68216d5d1d9f25839 Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@mellanox.com>
Date: Mon, 17 Aug 2020 16:34:05 +0300
Subject: [PATCH 26/74] ethtool: Fix preserving of wanted feature bits in
 netlink interface

Currently, ethtool-netlink calculates new wanted bits as:
(req_wanted & req_mask) | (old_active & ~req_mask)

It completely discards the old wanted bits, so they are forgotten with
the next ethtool command. Sample steps to reproduce:

1. ethtool -k eth0
   tx-tcp-segmentation: on # TSO is on from the beginning
2. ethtool -K eth0 tx off
   tx-tcp-segmentation: off [not requested]
3. ethtool -k eth0
   tx-tcp-segmentation: off [requested on]
4. ethtool -K eth0 rx off # Some change unrelated to TSO
5. ethtool -k eth0
   tx-tcp-segmentation: off # "Wanted on" is forgotten

This commit fixes it by changing the formula to:
(req_wanted & req_mask) | (old_wanted & ~req_mask),
where old_active was replaced by old_wanted to account for the wanted
bits.

The shortcut condition for the case where nothing was changed now
compares wanted bitmasks, instead of wanted to active.

Fixes: 0980bfcd6954 ("ethtool: set netdev features with FEATURES_SET request")
Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com>
Reviewed-by: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ethtool/features.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/ethtool/features.c b/net/ethtool/features.c
index 4e632dc987d859..ec196f0fddc924 100644
--- a/net/ethtool/features.c
+++ b/net/ethtool/features.c
@@ -224,7 +224,9 @@ int ethnl_set_features(struct sk_buff *skb, struct genl_info *info)
 	DECLARE_BITMAP(wanted_diff_mask, NETDEV_FEATURE_COUNT);
 	DECLARE_BITMAP(active_diff_mask, NETDEV_FEATURE_COUNT);
 	DECLARE_BITMAP(old_active, NETDEV_FEATURE_COUNT);
+	DECLARE_BITMAP(old_wanted, NETDEV_FEATURE_COUNT);
 	DECLARE_BITMAP(new_active, NETDEV_FEATURE_COUNT);
+	DECLARE_BITMAP(new_wanted, NETDEV_FEATURE_COUNT);
 	DECLARE_BITMAP(req_wanted, NETDEV_FEATURE_COUNT);
 	DECLARE_BITMAP(req_mask, NETDEV_FEATURE_COUNT);
 	struct nlattr *tb[ETHTOOL_A_FEATURES_MAX + 1];
@@ -250,6 +252,7 @@ int ethnl_set_features(struct sk_buff *skb, struct genl_info *info)
 
 	rtnl_lock();
 	ethnl_features_to_bitmap(old_active, dev->features);
+	ethnl_features_to_bitmap(old_wanted, dev->wanted_features);
 	ret = ethnl_parse_bitset(req_wanted, req_mask, NETDEV_FEATURE_COUNT,
 				 tb[ETHTOOL_A_FEATURES_WANTED],
 				 netdev_features_strings, info->extack);
@@ -261,11 +264,11 @@ int ethnl_set_features(struct sk_buff *skb, struct genl_info *info)
 		goto out_rtnl;
 	}
 
-	/* set req_wanted bits not in req_mask from old_active */
+	/* set req_wanted bits not in req_mask from old_wanted */
 	bitmap_and(req_wanted, req_wanted, req_mask, NETDEV_FEATURE_COUNT);
-	bitmap_andnot(new_active, old_active, req_mask, NETDEV_FEATURE_COUNT);
-	bitmap_or(req_wanted, new_active, req_wanted, NETDEV_FEATURE_COUNT);
-	if (bitmap_equal(req_wanted, old_active, NETDEV_FEATURE_COUNT)) {
+	bitmap_andnot(new_wanted, old_wanted, req_mask, NETDEV_FEATURE_COUNT);
+	bitmap_or(req_wanted, new_wanted, req_wanted, NETDEV_FEATURE_COUNT);
+	if (bitmap_equal(req_wanted, old_wanted, NETDEV_FEATURE_COUNT)) {
 		ret = 0;
 		goto out_rtnl;
 	}

From 2847bfed888fbb8bf4c8e8067fd6127538c2c700 Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@mellanox.com>
Date: Mon, 17 Aug 2020 16:34:06 +0300
Subject: [PATCH 27/74] ethtool: Account for hw_features in netlink interface

ethtool-netlink ignores dev->hw_features and may confuse the drivers by
asking them to enable features not in the hw_features bitmask. For
example:

1. ethtool -k eth0
   tls-hw-tx-offload: off [fixed]
2. ethtool -K eth0 tls-hw-tx-offload on
   tls-hw-tx-offload: on
3. ethtool -k eth0
   tls-hw-tx-offload: on [fixed]

Fitler out dev->hw_features from req_wanted to fix it and to resemble
the legacy ethtool behavior.

Fixes: 0980bfcd6954 ("ethtool: set netdev features with FEATURES_SET request")
Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com>
Reviewed-by: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ethtool/features.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ethtool/features.c b/net/ethtool/features.c
index ec196f0fddc924..6b288bfd7678ea 100644
--- a/net/ethtool/features.c
+++ b/net/ethtool/features.c
@@ -273,7 +273,8 @@ int ethnl_set_features(struct sk_buff *skb, struct genl_info *info)
 		goto out_rtnl;
 	}
 
-	dev->wanted_features = ethnl_bitmap_to_features(req_wanted);
+	dev->wanted_features &= ~dev->hw_features;
+	dev->wanted_features |= ethnl_bitmap_to_features(req_wanted) & dev->hw_features;
 	__netdev_update_features(dev);
 	ethnl_features_to_bitmap(new_active, dev->features);
 	mod = !bitmap_equal(old_active, new_active, NETDEV_FEATURE_COUNT);

From f01204ec8be7ea5e8f0230a7d4200e338d563bde Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@mellanox.com>
Date: Mon, 17 Aug 2020 16:34:07 +0300
Subject: [PATCH 28/74] ethtool: Don't omit the netlink reply if no features
 were changed

The legacy ethtool userspace tool shows an error when no features could
be changed. It's useful to have a netlink reply to be able to show this
error when __netdev_update_features wasn't called, for example:

1. ethtool -k eth0
   large-receive-offload: off
2. ethtool -K eth0 rx-fcs on
3. ethtool -K eth0 lro on
   Could not change any device features
   rx-lro: off [requested on]
4. ethtool -K eth0 lro on
   # The output should be the same, but without this patch the kernel
   # doesn't send the reply, and ethtool is unable to detect the error.

This commit makes ethtool-netlink always return a reply when requested,
and it still avoids unnecessary calls to __netdev_update_features if the
wanted features haven't changed.

Fixes: 0980bfcd6954 ("ethtool: set netdev features with FEATURES_SET request")
Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com>
Reviewed-by: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ethtool/features.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/net/ethtool/features.c b/net/ethtool/features.c
index 6b288bfd7678ea..495635f152ba68 100644
--- a/net/ethtool/features.c
+++ b/net/ethtool/features.c
@@ -268,14 +268,11 @@ int ethnl_set_features(struct sk_buff *skb, struct genl_info *info)
 	bitmap_and(req_wanted, req_wanted, req_mask, NETDEV_FEATURE_COUNT);
 	bitmap_andnot(new_wanted, old_wanted, req_mask, NETDEV_FEATURE_COUNT);
 	bitmap_or(req_wanted, new_wanted, req_wanted, NETDEV_FEATURE_COUNT);
-	if (bitmap_equal(req_wanted, old_wanted, NETDEV_FEATURE_COUNT)) {
-		ret = 0;
-		goto out_rtnl;
+	if (!bitmap_equal(req_wanted, old_wanted, NETDEV_FEATURE_COUNT)) {
+		dev->wanted_features &= ~dev->hw_features;
+		dev->wanted_features |= ethnl_bitmap_to_features(req_wanted) & dev->hw_features;
+		__netdev_update_features(dev);
 	}
-
-	dev->wanted_features &= ~dev->hw_features;
-	dev->wanted_features |= ethnl_bitmap_to_features(req_wanted) & dev->hw_features;
-	__netdev_update_features(dev);
 	ethnl_features_to_bitmap(new_active, dev->features);
 	mod = !bitmap_equal(old_active, new_active, NETDEV_FEATURE_COUNT);
 

From 17340552ce449ab55e3ccbfc2af1bcc600b4fbb5 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Mon, 17 Aug 2020 23:40:42 +0100
Subject: [PATCH 29/74] net: mscc: ocelot: remove duplicate "the the" phrase in
 Kconfig text

The Kconfig help text contains the phrase "the the" in the help
text. Fix this.

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/ocelot/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/dsa/ocelot/Kconfig b/drivers/net/dsa/ocelot/Kconfig
index f121619d81fef4..2d23ccef7d0edb 100644
--- a/drivers/net/dsa/ocelot/Kconfig
+++ b/drivers/net/dsa/ocelot/Kconfig
@@ -9,7 +9,7 @@ config NET_DSA_MSCC_FELIX
 	select NET_DSA_TAG_OCELOT
 	select FSL_ENETC_MDIO
 	help
-	  This driver supports network switches from the the Vitesse /
+	  This driver supports network switches from the Vitesse /
 	  Microsemi / Microchip Ocelot family of switching cores that are
 	  connected to their host CPU via Ethernet.
 	  The following switches are supported:

From ad6641189c5935192a15eeb4b369dd04ebedfabb Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Mon, 17 Aug 2020 23:44:25 +0100
Subject: [PATCH 30/74] net: ipv4: remove duplicate "the the" phrase in Kconfig
 text

The Kconfig help text contains the phrase "the the" in the help
text. Fix this and reformat the block of help text.

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/Kconfig | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 60db5a6487cc5f..87983e70f03f32 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -661,13 +661,13 @@ config TCP_CONG_BBR
 
 	  BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to
 	  maximize network utilization and minimize queues. It builds an explicit
-	  model of the the bottleneck delivery rate and path round-trip
-	  propagation delay. It tolerates packet loss and delay unrelated to
-	  congestion. It can operate over LAN, WAN, cellular, wifi, or cable
-	  modem links. It can coexist with flows that use loss-based congestion
-	  control, and can operate with shallow buffers, deep buffers,
-	  bufferbloat, policers, or AQM schemes that do not provide a delay
-	  signal. It requires the fq ("Fair Queue") pacing packet scheduler.
+	  model of the bottleneck delivery rate and path round-trip propagation
+	  delay. It tolerates packet loss and delay unrelated to congestion. It
+	  can operate over LAN, WAN, cellular, wifi, or cable modem links. It can
+	  coexist with flows that use loss-based congestion control, and can
+	  operate with shallow buffers, deep buffers, bufferbloat, policers, or
+	  AQM schemes that do not provide a delay signal. It requires the fq
+	  ("Fair Queue") pacing packet scheduler.
 
 choice
 	prompt "Default TCP congestion control"

From e679654a704e5bd676ea6446fa7b764cbabf168a Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Tue, 18 Aug 2020 15:23:09 -0700
Subject: [PATCH 31/74] bpf: Fix a rcu_sched stall issue with bpf
 task/task_file iterator

In our production system, we observed rcu stalls when
'bpftool prog` is running.
  rcu: INFO: rcu_sched self-detected stall on CPU
  rcu: \x097-....: (20999 ticks this GP) idle=302/1/0x4000000000000000 softirq=1508852/1508852 fqs=4913
  \x09(t=21031 jiffies g=2534773 q=179750)
  NMI backtrace for cpu 7
  CPU: 7 PID: 184195 Comm: bpftool Kdump: loaded Tainted: G        W         5.8.0-00004-g68bfc7f8c1b4 #6
  Hardware name: Quanta Twin Lakes MP/Twin Lakes Passive MP, BIOS F09_3A17 05/03/2019
  Call Trace:
  <IRQ>
  dump_stack+0x57/0x70
  nmi_cpu_backtrace.cold+0x14/0x53
  ? lapic_can_unplug_cpu.cold+0x39/0x39
  nmi_trigger_cpumask_backtrace+0xb7/0xc7
  rcu_dump_cpu_stacks+0xa2/0xd0
  rcu_sched_clock_irq.cold+0x1ff/0x3d9
  ? tick_nohz_handler+0x100/0x100
  update_process_times+0x5b/0x90
  tick_sched_timer+0x5e/0xf0
  __hrtimer_run_queues+0x12a/0x2a0
  hrtimer_interrupt+0x10e/0x280
  __sysvec_apic_timer_interrupt+0x51/0xe0
  asm_call_on_stack+0xf/0x20
  </IRQ>
  sysvec_apic_timer_interrupt+0x6f/0x80
  asm_sysvec_apic_timer_interrupt+0x12/0x20
  RIP: 0010:task_file_seq_get_next+0x71/0x220
  Code: 00 00 8b 53 1c 49 8b 7d 00 89 d6 48 8b 47 20 44 8b 18 41 39 d3 76 75 48 8b 4f 20 8b 01 39 d0 76 61 41 89 d1 49 39 c1 48 19 c0 <48> 8b 49 08 21 d0 48 8d 04 c1 4c 8b 08 4d 85 c9 74 46 49 8b 41 38
  RSP: 0018:ffffc90006223e10 EFLAGS: 00000297
  RAX: ffffffffffffffff RBX: ffff888f0d172388 RCX: ffff888c8c07c1c0
  RDX: 00000000000f017b RSI: 00000000000f017b RDI: ffff888c254702c0
  RBP: ffffc90006223e68 R08: ffff888be2a1c140 R09: 00000000000f017b
  R10: 0000000000000002 R11: 0000000000100000 R12: ffff888f23c24118
  R13: ffffc90006223e60 R14: ffffffff828509a0 R15: 00000000ffffffff
  task_file_seq_next+0x52/0xa0
  bpf_seq_read+0xb9/0x320
  vfs_read+0x9d/0x180
  ksys_read+0x5f/0xe0
  do_syscall_64+0x38/0x60
  entry_SYSCALL_64_after_hwframe+0x44/0xa9
  RIP: 0033:0x7f8815f4f76e
  Code: c0 e9 f6 fe ff ff 55 48 8d 3d 76 70 0a 00 48 89 e5 e8 36 06 02 00 66 0f 1f 44 00 00 64 8b 04 25 18 00 00 00 85 c0 75 14 0f 05 <48> 3d 00 f0 ff ff 77 52 c3 66 0f 1f 84 00 00 00 00 00 55 48 89 e5
  RSP: 002b:00007fff8f9df578 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
  RAX: ffffffffffffffda RBX: 000000000170b9c0 RCX: 00007f8815f4f76e
  RDX: 0000000000001000 RSI: 00007fff8f9df5b0 RDI: 0000000000000007
  RBP: 00007fff8f9e05f0 R08: 0000000000000049 R09: 0000000000000010
  R10: 00007f881601fa40 R11: 0000000000000246 R12: 00007fff8f9e05a8
  R13: 00007fff8f9e05a8 R14: 0000000001917f90 R15: 000000000000e22e

Note that `bpftool prog` actually calls a task_file bpf iterator
program to establish an association between prog/map/link/btf anon
files and processes.

In the case where the above rcu stall occured, we had a process
having 1587 tasks and each task having roughly 81305 files.
This implied 129 million bpf prog invocations. Unfortunwtely none of
these files are prog/map/link/btf files so bpf iterator/prog needs
to traverse all these files and not able to return to user space
since there are no seq_file buffer overflow.

This patch fixed the issue in bpf_seq_read() to limit the number
of visited objects. If the maximum number of visited objects is
reached, no more objects will be visited in the current syscall.
If there is nothing written in the seq_file buffer, -EAGAIN will
return to the user so user can try again.

The maximum number of visited objects is set at 1 million.
In our Intel Xeon D-2191 2.3GHZ 18-core server, bpf_seq_read()
visiting 1 million files takes around 0.18 seconds.

We did not use cond_resched() since for some iterators, e.g.,
netlink iterator, where rcu read_lock critical section spans between
consecutive seq_ops->next(), which makes impossible to do cond_resched()
in the key while loop of function bpf_seq_read().

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Cc: Paul E. McKenney <paulmck@kernel.org>
Link: https://lore.kernel.org/bpf/20200818222309.2181348-1-yhs@fb.com
---
 kernel/bpf/bpf_iter.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index b6715964b68561..8faa2ce893968f 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -67,6 +67,9 @@ static void bpf_iter_done_stop(struct seq_file *seq)
 	iter_priv->done_stop = true;
 }
 
+/* maximum visited objects before bailing out */
+#define MAX_ITER_OBJECTS	1000000
+
 /* bpf_seq_read, a customized and simpler version for bpf iterator.
  * no_llseek is assumed for this file.
  * The following are differences from seq_read():
@@ -79,7 +82,7 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
 {
 	struct seq_file *seq = file->private_data;
 	size_t n, offs, copied = 0;
-	int err = 0;
+	int err = 0, num_objs = 0;
 	void *p;
 
 	mutex_lock(&seq->lock);
@@ -135,6 +138,7 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
 	while (1) {
 		loff_t pos = seq->index;
 
+		num_objs++;
 		offs = seq->count;
 		p = seq->op->next(seq, p, &seq->index);
 		if (pos == seq->index) {
@@ -153,6 +157,15 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
 		if (seq->count >= size)
 			break;
 
+		if (num_objs >= MAX_ITER_OBJECTS) {
+			if (offs == 0) {
+				err = -EAGAIN;
+				seq->op->stop(seq, p);
+				goto done;
+			}
+			break;
+		}
+
 		err = seq->op->show(seq, p);
 		if (err > 0) {
 			bpf_iter_dec_seq_num(seq);

From e60572b8d4c39572be6857d1ec91fdf979f8775f Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Tue, 18 Aug 2020 15:23:10 -0700
Subject: [PATCH 32/74] bpf: Avoid visit same object multiple times

Currently when traversing all tasks, the next tid
is always increased by one. This may result in
visiting the same task multiple times in a
pid namespace.

This patch fixed the issue by seting the next
tid as pid_nr_ns(pid, ns) + 1, similar to
funciton next_tgid().

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Cc: Rik van Riel <riel@surriel.com>
Link: https://lore.kernel.org/bpf/20200818222310.2181500-1-yhs@fb.com
---
 kernel/bpf/task_iter.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index f21b5e1e4540e1..99af4cea110253 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -29,8 +29,9 @@ static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
 
 	rcu_read_lock();
 retry:
-	pid = idr_get_next(&ns->idr, tid);
+	pid = find_ge_pid(*tid, ns);
 	if (pid) {
+		*tid = pid_nr_ns(pid, ns);
 		task = get_pid_task(pid, PIDTYPE_PID);
 		if (!task) {
 			++*tid;

From 00fa1d83a8b50351c830521d00135e823c46e7d0 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Tue, 18 Aug 2020 15:23:12 -0700
Subject: [PATCH 33/74] bpftool: Handle EAGAIN error code properly in pids
 collection

When the error code is EAGAIN, the kernel signals the user
space should retry the read() operation for bpf iterators.
Let us do it.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/20200818222312.2181675-1-yhs@fb.com
---
 tools/bpf/bpftool/pids.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/bpf/bpftool/pids.c b/tools/bpf/bpftool/pids.c
index e3b116325403be..df7d8ec7603616 100644
--- a/tools/bpf/bpftool/pids.c
+++ b/tools/bpf/bpftool/pids.c
@@ -134,6 +134,8 @@ int build_obj_refs_table(struct obj_refs_table *table, enum bpf_obj_type type)
 	while (true) {
 		ret = read(fd, buf, sizeof(buf));
 		if (ret < 0) {
+			if (errno == EAGAIN)
+				continue;
 			err = -errno;
 			p_err("failed to read PID iterator output: %d", err);
 			goto out;

From 63d4a4c145cca2e84dc6e62d2ef5cb990c9723c2 Mon Sep 17 00:00:00 2001
From: Shay Agroskin <shayagr@amazon.com>
Date: Wed, 19 Aug 2020 20:28:36 +0300
Subject: [PATCH 34/74] net: ena: Prevent reset after device destruction

The reset work is scheduled by the timer routine whenever it
detects that a device reset is required (e.g. when a keep_alive signal
is missing).
When releasing device resources in ena_destroy_device() the driver
cancels the scheduling of the timer routine without destroying the reset
work explicitly.

This creates the following bug:
    The driver is suspended and the ena_suspend() function is called
	-> This function calls ena_destroy_device() to free the net device
	   resources
	    -> The driver waits for the timer routine to finish
	    its execution and then cancels it, thus preventing from it
	    to be called again.

    If, in its final execution, the timer routine schedules a reset,
    the reset routine might be called afterwards,and a redundant call to
    ena_restore_device() would be made.

By changing the reset routine we allow it to read the device's state
accurately.
This is achieved by checking whether ENA_FLAG_TRIGGER_RESET flag is set
before resetting the device and making both the destruction function and
the flag check are under rtnl lock.
The ENA_FLAG_TRIGGER_RESET is cleared at the end of the destruction
routine. Also surround the flag check with 'likely' because
we expect that the reset routine would be called only when
ENA_FLAG_TRIGGER_RESET flag is set.

The destruction of the timer and reset services in __ena_shutoff() have to
stay, even though the timer routine is destroyed in ena_destroy_device().
This is to avoid a case in which the reset routine is scheduled after
free_netdev() in __ena_shutoff(), which would create an access to freed
memory in adapter->flags.

Fixes: 8c5c7abdeb2d ("net: ena: add power management ops to the ENA driver")
Signed-off-by: Shay Agroskin <shayagr@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amazon/ena/ena_netdev.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index 2a6c9725e09225..44aeace196f07a 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -3601,16 +3601,14 @@ static void ena_fw_reset_device(struct work_struct *work)
 {
 	struct ena_adapter *adapter =
 		container_of(work, struct ena_adapter, reset_task);
-	struct pci_dev *pdev = adapter->pdev;
 
-	if (unlikely(!test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) {
-		dev_err(&pdev->dev,
-			"device reset schedule while reset bit is off\n");
-		return;
-	}
 	rtnl_lock();
-	ena_destroy_device(adapter, false);
-	ena_restore_device(adapter);
+
+	if (likely(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) {
+		ena_destroy_device(adapter, false);
+		ena_restore_device(adapter);
+	}
+
 	rtnl_unlock();
 }
 
@@ -4389,8 +4387,11 @@ static void __ena_shutoff(struct pci_dev *pdev, bool shutdown)
 		netdev->rx_cpu_rmap = NULL;
 	}
 #endif /* CONFIG_RFS_ACCEL */
-	del_timer_sync(&adapter->timer_service);
 
+	/* Make sure timer and reset routine won't be called after
+	 * freeing device resources.
+	 */
+	del_timer_sync(&adapter->timer_service);
 	cancel_work_sync(&adapter->reset_task);
 
 	rtnl_lock(); /* lock released inside the below if-else block */

From 8b147f6f3e7de4e51113e3e9ec44aa2debc02c58 Mon Sep 17 00:00:00 2001
From: Shay Agroskin <shayagr@amazon.com>
Date: Wed, 19 Aug 2020 20:28:37 +0300
Subject: [PATCH 35/74] net: ena: Change WARN_ON expression in
 ena_del_napi_in_range()

The ena_del_napi_in_range() function unregisters the napi handler for
rings in a given range.
This function had the following WARN_ON macro:

    WARN_ON(ENA_IS_XDP_INDEX(adapter, i) &&
	    adapter->ena_napi[i].xdp_ring);

This macro prints the call stack if the expression inside of it is
true [1], but the expression inside of it is the wanted situation.
The expression checks whether the ring has an XDP queue and its index
corresponds to a XDP one.

This patch changes the expression to
    !ENA_IS_XDP_INDEX(adapter, i) && adapter->ena_napi[i].xdp_ring
which indicates an unwanted situation.

Also, change the structure of the function. The napi handler is
unregistered for all rings, and so there's no need to check whether the
index is an XDP index or not. By removing this check the code becomes
much more readable.

Fixes: 548c4940b9f1 ("net: ena: Implement XDP_TX action")
Signed-off-by: Shay Agroskin <shayagr@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amazon/ena/ena_netdev.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index 44aeace196f07a..233db15c970d6b 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -2180,13 +2180,10 @@ static void ena_del_napi_in_range(struct ena_adapter *adapter,
 	int i;
 
 	for (i = first_index; i < first_index + count; i++) {
-		/* Check if napi was initialized before */
-		if (!ENA_IS_XDP_INDEX(adapter, i) ||
-		    adapter->ena_napi[i].xdp_ring)
-			netif_napi_del(&adapter->ena_napi[i].napi);
-		else
-			WARN_ON(ENA_IS_XDP_INDEX(adapter, i) &&
-				adapter->ena_napi[i].xdp_ring);
+		netif_napi_del(&adapter->ena_napi[i].napi);
+
+		WARN_ON(!ENA_IS_XDP_INDEX(adapter, i) &&
+			adapter->ena_napi[i].xdp_ring);
 	}
 }
 

From ccd143e5150f24b9ba15145c7221b61dd9e41021 Mon Sep 17 00:00:00 2001
From: Shay Agroskin <shayagr@amazon.com>
Date: Wed, 19 Aug 2020 20:28:38 +0300
Subject: [PATCH 36/74] net: ena: Make missed_tx stat incremental

Most statistics in ena driver are incremented, meaning that a stat's
value is a sum of all increases done to it since driver/queue
initialization.

This patch makes all statistics this way, effectively making missed_tx
statistic incremental.
Also added a comment regarding rx_drops and tx_drops to make it
clearer how these counters are calculated.

Fixes: 11095fdb712b ("net: ena: add statistics for missed tx packets")
Signed-off-by: Shay Agroskin <shayagr@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amazon/ena/ena_netdev.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index 233db15c970d6b..a3a8edf9a734d0 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -3687,7 +3687,7 @@ static int check_missing_comp_in_tx_queue(struct ena_adapter *adapter,
 	}
 
 	u64_stats_update_begin(&tx_ring->syncp);
-	tx_ring->tx_stats.missed_tx = missed_tx;
+	tx_ring->tx_stats.missed_tx += missed_tx;
 	u64_stats_update_end(&tx_ring->syncp);
 
 	return rc;
@@ -4556,6 +4556,9 @@ static void ena_keep_alive_wd(void *adapter_data,
 	tx_drops = ((u64)desc->tx_drops_high << 32) | desc->tx_drops_low;
 
 	u64_stats_update_begin(&adapter->syncp);
+	/* These stats are accumulated by the device, so the counters indicate
+	 * all drops since last reset.
+	 */
 	adapter->dev_stats.rx_drops = rx_drops;
 	adapter->dev_stats.tx_drops = tx_drops;
 	u64_stats_update_end(&adapter->syncp);

From d1fb55592909ea249af70170c7a52e637009564d Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 19 Aug 2020 21:52:38 +0200
Subject: [PATCH 37/74] netlink: fix state reallocation in policy export

Evidently, when I did this previously, we didn't have more than
10 policies and didn't run into the reallocation path, because
it's missing a memset() for the unused policies. Fix that.

Fixes: d07dcf9aadd6 ("netlink: add infrastructure to expose policies to userspace")
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netlink/policy.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/netlink/policy.c b/net/netlink/policy.c
index f6491853c79715..2b3e26f7496f5f 100644
--- a/net/netlink/policy.c
+++ b/net/netlink/policy.c
@@ -51,6 +51,9 @@ static int add_policy(struct nl_policy_dump **statep,
 	if (!state)
 		return -ENOMEM;
 
+	memset(&state->policies[state->n_alloc], 0,
+	       flex_array_size(state, policies, n_alloc - state->n_alloc));
+
 	state->policies[state->n_alloc].policy = policy;
 	state->policies[state->n_alloc].maxtype = maxtype;
 	state->n_alloc = n_alloc;

From 957ff4278e0db34f56c2bc121fdd6393e4523ef2 Mon Sep 17 00:00:00 2001
From: Min Li <min.li.xe@renesas.com>
Date: Tue, 18 Aug 2020 10:41:22 -0400
Subject: [PATCH 38/74] ptp: ptp_clockmatrix: use i2c_master_send for i2c write

The old code for i2c write would break on some controllers, which fails
at handling Repeated Start Condition. So we will just use i2c_master_send
to handle write in one transanction.

Changes since v1:
- Remove indentation change

Signed-off-by: Min Li <min.li.xe@renesas.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ptp/ptp_clockmatrix.c | 56 +++++++++++++++++++++++++++--------
 drivers/ptp/ptp_clockmatrix.h |  2 ++
 2 files changed, 45 insertions(+), 13 deletions(-)

diff --git a/drivers/ptp/ptp_clockmatrix.c b/drivers/ptp/ptp_clockmatrix.c
index 73aaae5574ed56..e020faff7da531 100644
--- a/drivers/ptp/ptp_clockmatrix.c
+++ b/drivers/ptp/ptp_clockmatrix.c
@@ -142,16 +142,15 @@ static int idtcm_strverscmp(const char *ver1, const char *ver2)
 	return result;
 }
 
-static int idtcm_xfer(struct idtcm *idtcm,
-		      u8 regaddr,
-		      u8 *buf,
-		      u16 count,
-		      bool write)
+static int idtcm_xfer_read(struct idtcm *idtcm,
+			   u8 regaddr,
+			   u8 *buf,
+			   u16 count)
 {
 	struct i2c_client *client = idtcm->client;
 	struct i2c_msg msg[2];
 	int cnt;
-	char *fmt = "i2c_transfer failed at %d in %s for %s, at addr: %04X!\n";
+	char *fmt = "i2c_transfer failed at %d in %s, at addr: %04X!\n";
 
 	msg[0].addr = client->addr;
 	msg[0].flags = 0;
@@ -159,7 +158,7 @@ static int idtcm_xfer(struct idtcm *idtcm,
 	msg[0].buf = &regaddr;
 
 	msg[1].addr = client->addr;
-	msg[1].flags = write ? 0 : I2C_M_RD;
+	msg[1].flags = I2C_M_RD;
 	msg[1].len = count;
 	msg[1].buf = buf;
 
@@ -170,7 +169,6 @@ static int idtcm_xfer(struct idtcm *idtcm,
 			fmt,
 			__LINE__,
 			__func__,
-			write ? "write" : "read",
 			regaddr);
 		return cnt;
 	} else if (cnt != 2) {
@@ -182,6 +180,37 @@ static int idtcm_xfer(struct idtcm *idtcm,
 	return 0;
 }
 
+static int idtcm_xfer_write(struct idtcm *idtcm,
+			    u8 regaddr,
+			    u8 *buf,
+			    u16 count)
+{
+	struct i2c_client *client = idtcm->client;
+	/* we add 1 byte for device register */
+	u8 msg[IDTCM_MAX_WRITE_COUNT + 1];
+	int cnt;
+	char *fmt = "i2c_master_send failed at %d in %s, at addr: %04X!\n";
+
+	if (count > IDTCM_MAX_WRITE_COUNT)
+		return -EINVAL;
+
+	msg[0] = regaddr;
+	memcpy(&msg[1], buf, count);
+
+	cnt = i2c_master_send(client, msg, count + 1);
+
+	if (cnt < 0) {
+		dev_err(&client->dev,
+			fmt,
+			__LINE__,
+			__func__,
+			regaddr);
+		return cnt;
+	}
+
+	return 0;
+}
+
 static int idtcm_page_offset(struct idtcm *idtcm, u8 val)
 {
 	u8 buf[4];
@@ -195,7 +224,7 @@ static int idtcm_page_offset(struct idtcm *idtcm, u8 val)
 	buf[2] = 0x10;
 	buf[3] = 0x20;
 
-	err = idtcm_xfer(idtcm, PAGE_ADDR, buf, sizeof(buf), 1);
+	err = idtcm_xfer_write(idtcm, PAGE_ADDR, buf, sizeof(buf));
 
 	if (err) {
 		idtcm->page_offset = 0xff;
@@ -223,11 +252,12 @@ static int _idtcm_rdwr(struct idtcm *idtcm,
 	err = idtcm_page_offset(idtcm, hi);
 
 	if (err)
-		goto out;
+		return err;
 
-	err = idtcm_xfer(idtcm, lo, buf, count, write);
-out:
-	return err;
+	if (write)
+		return idtcm_xfer_write(idtcm, lo, buf, count);
+
+	return idtcm_xfer_read(idtcm, lo, buf, count);
 }
 
 static int idtcm_read(struct idtcm *idtcm,
diff --git a/drivers/ptp/ptp_clockmatrix.h b/drivers/ptp/ptp_clockmatrix.h
index ffae56c5d97fab..82840d72364adb 100644
--- a/drivers/ptp/ptp_clockmatrix.h
+++ b/drivers/ptp/ptp_clockmatrix.h
@@ -55,6 +55,8 @@
 
 #define PEROUT_ENABLE_OUTPUT_MASK		(0xdeadbeef)
 
+#define IDTCM_MAX_WRITE_COUNT			(512)
+
 /* Values of DPLL_N.DPLL_MODE.PLL_MODE */
 enum pll_mode {
 	PLL_MODE_MIN = 0,

From 9553b62c1dd27df67ab2f52ec8a3bc3501887619 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 18 Aug 2020 18:14:39 +0200
Subject: [PATCH 39/74] net: atlantic: Use readx_poll_timeout() for large
 timeout

Commit
   8dcf2ad39fdb2 ("net: atlantic: add hwmon getter for MAC temperature")

implemented a read callback with an udelay(10000U). This fails to
compile on ARM because the delay is >1ms. I doubt that it is needed to
spin for 10ms even if possible on x86.

>From looking at the code, the context appears to be preemptible so using
usleep() should work and avoid busy spinning.

Use readx_poll_timeout() in the poll loop.

Fixes: 8dcf2ad39fdb2 ("net: atlantic: add hwmon getter for MAC temperature")
Cc: Mark Starovoytov <mstarovoitov@marvell.com>
Cc: Igor Russkikh <irusskikh@marvell.com>
Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Acked-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c
index 16a944707ba908..8941ac4df9e375 100644
--- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c
+++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c
@@ -1631,8 +1631,8 @@ static int hw_atl_b0_get_mac_temp(struct aq_hw_s *self, u32 *temp)
 		hw_atl_ts_reset_set(self, 0);
 	}
 
-	err = readx_poll_timeout_atomic(hw_atl_b0_ts_ready_and_latch_high_get,
-					self, val, val == 1, 10000U, 500000U);
+	err = readx_poll_timeout(hw_atl_b0_ts_ready_and_latch_high_get, self,
+				 val, val == 1, 10000U, 500000U);
 	if (err)
 		return err;
 

From cf96d977381d4a23957bade2ddf1c420b74a26b6 Mon Sep 17 00:00:00 2001
From: Wang Hai <wanghai38@huawei.com>
Date: Wed, 19 Aug 2020 10:33:09 +0800
Subject: [PATCH 40/74] net: gemini: Fix missing free_netdev() in error path of
 gemini_ethernet_port_probe()

Replace alloc_etherdev_mq with devm_alloc_etherdev_mqs. In this way,
when probe fails, netdev can be freed automatically.

Fixes: 4d5ae32f5e1e ("net: ethernet: Add a driver for Gemini gigabit ethernet")
Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Wang Hai <wanghai38@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/cortina/gemini.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c
index 66e67b24a887c5..62e271aea4a500 100644
--- a/drivers/net/ethernet/cortina/gemini.c
+++ b/drivers/net/ethernet/cortina/gemini.c
@@ -2389,7 +2389,7 @@ static int gemini_ethernet_port_probe(struct platform_device *pdev)
 
 	dev_info(dev, "probe %s ID %d\n", dev_name(dev), id);
 
-	netdev = alloc_etherdev_mq(sizeof(*port), TX_QUEUE_NUM);
+	netdev = devm_alloc_etherdev_mqs(dev, sizeof(*port), TX_QUEUE_NUM, TX_QUEUE_NUM);
 	if (!netdev) {
 		dev_err(dev, "Can't allocate ethernet device #%d\n", id);
 		return -ENOMEM;
@@ -2521,7 +2521,6 @@ static int gemini_ethernet_port_probe(struct platform_device *pdev)
 	}
 
 	port->netdev = NULL;
-	free_netdev(netdev);
 	return ret;
 }
 
@@ -2530,7 +2529,6 @@ static int gemini_ethernet_port_remove(struct platform_device *pdev)
 	struct gemini_ethernet_port *port = platform_get_drvdata(pdev);
 
 	gemini_port_remove(port);
-	free_netdev(port->netdev);
 	return 0;
 }
 

From e17a7c0e0aebb956719ce2a8465f649859c2da7d Mon Sep 17 00:00:00 2001
From: Frederic Barrat <fbarrat@linux.ibm.com>
Date: Wed, 19 Aug 2020 15:07:41 +0200
Subject: [PATCH 41/74] powerpc/powernv/pci: Fix possible crash when releasing
 DMA resources

Fix a typo introduced during recent code cleanup, which could lead to
silently not freeing resources or an oops message (on PCI hotplug or
CAPI reset).

Only impacts ioda2, the code path for ioda1 is correct.

Fixes: 01e12629af4e ("powerpc/powernv/pci: Add explicit tracking of the DMA setup state")
Signed-off-by: Frederic Barrat <fbarrat@linux.ibm.com>
Reviewed-by: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200819130741.16769-1-fbarrat@linux.ibm.com
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index c9c25fb0783c4c..023a4f987bb22d 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2705,7 +2705,7 @@ void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
 	struct iommu_table *tbl = pe->table_group.tables[0];
 	int64_t rc;
 
-	if (pe->dma_setup_done)
+	if (!pe->dma_setup_done)
 		return;
 
 	rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);

From c8502eb2d43b6b9b1dc382299a4d37031be63876 Mon Sep 17 00:00:00 2001
From: Arvind Sankar <nivedita@alum.mit.edu>
Date: Fri, 17 Jul 2020 15:45:26 -0400
Subject: [PATCH 42/74] efi/x86: Mark kernel rodata non-executable for mixed
 mode

When remapping the kernel rodata section RO in the EFI pagetables, the
protection flags that were used for the text section are being reused,
but the rodata section should not be marked executable.

Cc: <stable@vger.kernel.org>
Signed-off-by: Arvind Sankar <nivedita@alum.mit.edu>
Link: https://lore.kernel.org/r/20200717194526.3452089-1-nivedita@alum.mit.edu
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/x86/platform/efi/efi_64.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 413583f904a695..6af4da1149bacf 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -259,6 +259,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
 	npages = (__end_rodata - __start_rodata) >> PAGE_SHIFT;
 	rodata = __pa(__start_rodata);
 	pfn = rodata >> PAGE_SHIFT;
+
+	pf = _PAGE_NX | _PAGE_ENC;
 	if (kernel_map_pages_in_pgd(pgd, pfn, rodata, npages, pf)) {
 		pr_err("Failed to map kernel rodata 1:1\n");
 		return 1;

From 98086df8b70c06234a8f4290c46064e44dafa0ed Mon Sep 17 00:00:00 2001
From: Li Heng <liheng40@huawei.com>
Date: Mon, 20 Jul 2020 15:22:18 +0800
Subject: [PATCH 43/74] efi: add missed destroy_workqueue when efisubsys_init
 fails

destroy_workqueue() should be called to destroy efi_rts_wq
when efisubsys_init() init resources fails.

Cc: <stable@vger.kernel.org>
Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Li Heng <liheng40@huawei.com>
Link: https://lore.kernel.org/r/1595229738-10087-1-git-send-email-liheng40@huawei.com
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 drivers/firmware/efi/efi.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index fdd1db025dbfda..3aa07c3b513697 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -381,6 +381,7 @@ static int __init efisubsys_init(void)
 	efi_kobj = kobject_create_and_add("efi", firmware_kobj);
 	if (!efi_kobj) {
 		pr_err("efi: Firmware registration failed.\n");
+		destroy_workqueue(efi_rts_wq);
 		return -ENOMEM;
 	}
 
@@ -424,6 +425,7 @@ static int __init efisubsys_init(void)
 		generic_ops_unregister();
 err_put:
 	kobject_put(efi_kobj);
+	destroy_workqueue(efi_rts_wq);
 	return error;
 }
 

From 1fd9717d75df68e3c3509b8e7b1138ca63472f88 Mon Sep 17 00:00:00 2001
From: Arvind Sankar <nivedita@alum.mit.edu>
Date: Sat, 25 Jul 2020 11:59:16 -0400
Subject: [PATCH 44/74] efi/libstub: Stop parsing arguments at "--"

Arguments after "--" are arguments for init, not for the kernel.

Cc: <stable@vger.kernel.org>
Signed-off-by: Arvind Sankar <nivedita@alum.mit.edu>
Link: https://lore.kernel.org/r/20200725155916.1376773-1-nivedita@alum.mit.edu
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 drivers/firmware/efi/libstub/efi-stub-helper.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c
index 6bca70bbb43d0d..37ff34e7b85e44 100644
--- a/drivers/firmware/efi/libstub/efi-stub-helper.c
+++ b/drivers/firmware/efi/libstub/efi-stub-helper.c
@@ -201,6 +201,8 @@ efi_status_t efi_parse_options(char const *cmdline)
 		char *param, *val;
 
 		str = next_arg(str, &param, &val);
+		if (!val && !strcmp(param, "--"))
+			break;
 
 		if (!strcmp(param, "nokaslr")) {
 			efi_nokaslr = true;

From a37ca6a2af9df2972372b918f09390c9303acfbd Mon Sep 17 00:00:00 2001
From: Arvind Sankar <nivedita@alum.mit.edu>
Date: Wed, 29 Jul 2020 15:33:00 -0400
Subject: [PATCH 45/74] efi/libstub: Handle NULL cmdline

Treat a NULL cmdline the same as empty. Although this is unlikely to
happen in practice, the x86 kernel entry does check for NULL cmdline and
handles it, so do it here as well.

Cc: <stable@vger.kernel.org>
Signed-off-by: Arvind Sankar <nivedita@alum.mit.edu>
Link: https://lore.kernel.org/r/20200729193300.598448-1-nivedita@alum.mit.edu
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 drivers/firmware/efi/libstub/efi-stub-helper.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c
index 37ff34e7b85e44..f53652a3a10609 100644
--- a/drivers/firmware/efi/libstub/efi-stub-helper.c
+++ b/drivers/firmware/efi/libstub/efi-stub-helper.c
@@ -187,10 +187,14 @@ int efi_printk(const char *fmt, ...)
  */
 efi_status_t efi_parse_options(char const *cmdline)
 {
-	size_t len = strlen(cmdline) + 1;
+	size_t len;
 	efi_status_t status;
 	char *str, *buf;
 
+	if (!cmdline)
+		return EFI_SUCCESS;
+
+	len = strlen(cmdline) + 1;
 	status = efi_bs_call(allocate_pool, EFI_LOADER_DATA, len, (void **)&buf);
 	if (status != EFI_SUCCESS)
 		return status;

From 8a8a3237a78cbc0557f0eb16a89f16d616323e99 Mon Sep 17 00:00:00 2001
From: Arvind Sankar <nivedita@alum.mit.edu>
Date: Thu, 13 Aug 2020 14:58:11 -0400
Subject: [PATCH 46/74] efi/libstub: Handle unterminated cmdline

Make the command line parsing more robust, by handling the case it is
not NUL-terminated.

Use strnlen instead of strlen, and make sure that the temporary copy is
NUL-terminated before parsing.

Cc: <stable@vger.kernel.org>
Signed-off-by: Arvind Sankar <nivedita@alum.mit.edu>
Link: https://lore.kernel.org/r/20200813185811.554051-4-nivedita@alum.mit.edu
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 drivers/firmware/efi/libstub/efi-stub-helper.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c
index f53652a3a10609..f735db55adc037 100644
--- a/drivers/firmware/efi/libstub/efi-stub-helper.c
+++ b/drivers/firmware/efi/libstub/efi-stub-helper.c
@@ -194,12 +194,14 @@ efi_status_t efi_parse_options(char const *cmdline)
 	if (!cmdline)
 		return EFI_SUCCESS;
 
-	len = strlen(cmdline) + 1;
+	len = strnlen(cmdline, COMMAND_LINE_SIZE - 1) + 1;
 	status = efi_bs_call(allocate_pool, EFI_LOADER_DATA, len, (void **)&buf);
 	if (status != EFI_SUCCESS)
 		return status;
 
-	str = skip_spaces(memcpy(buf, cmdline, len));
+	memcpy(buf, cmdline, len - 1);
+	buf[len - 1] = '\0';
+	str = skip_spaces(buf);
 
 	while (*str) {
 		char *param, *val;

From 39ada88f9c862c1ff8929ff67e0d1199c7af73fe Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Thu, 13 Aug 2020 19:38:17 +0200
Subject: [PATCH 47/74] efi/x86: Move 32-bit code into efi_32.c

Now that the old memmap code has been removed, some code that was left
behind in arch/x86/platform/efi/efi.c is only used for 32-bit builds,
which means it can live in efi_32.c as well. So move it over.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/x86/include/asm/efi.h     | 10 -----
 arch/x86/platform/efi/efi.c    | 69 ----------------------------------
 arch/x86/platform/efi/efi_32.c | 44 ++++++++++++++++++----
 3 files changed, 37 insertions(+), 86 deletions(-)

diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index b9c2667ac46cc6..bc9758ef292ef1 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -81,11 +81,8 @@ extern unsigned long efi_fw_vendor, efi_config_table;
 	kernel_fpu_end();						\
 })
 
-
 #define arch_efi_call_virt(p, f, args...)	p->f(args)
 
-#define efi_ioremap(addr, size, type, attr)	ioremap_cache(addr, size)
-
 #else /* !CONFIG_X86_32 */
 
 #define EFI_LOADER_SIGNATURE	"EL64"
@@ -125,9 +122,6 @@ struct efi_scratch {
 	kernel_fpu_end();						\
 })
 
-extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size,
-					u32 type, u64 attribute);
-
 #ifdef CONFIG_KASAN
 /*
  * CONFIG_KASAN may redefine memset to __memset.  __memset function is present
@@ -143,17 +137,13 @@ extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size,
 #endif /* CONFIG_X86_32 */
 
 extern struct efi_scratch efi_scratch;
-extern void __init efi_set_executable(efi_memory_desc_t *md, bool executable);
 extern int __init efi_memblock_x86_reserve_range(void);
 extern void __init efi_print_memmap(void);
-extern void __init efi_memory_uc(u64 addr, unsigned long size);
 extern void __init efi_map_region(efi_memory_desc_t *md);
 extern void __init efi_map_region_fixed(efi_memory_desc_t *md);
 extern void efi_sync_low_kernel_mappings(void);
 extern int __init efi_alloc_page_tables(void);
 extern int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages);
-extern void __init old_map_region(efi_memory_desc_t *md);
-extern void __init runtime_code_page_mkexec(void);
 extern void __init efi_runtime_update_mappings(void);
 extern void __init efi_dump_pagetable(void);
 extern void __init efi_apply_memmap_quirks(void);
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index f6ea8f1a9d57ae..d37ebe6e70d7a1 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -49,7 +49,6 @@
 #include <asm/efi.h>
 #include <asm/e820/api.h>
 #include <asm/time.h>
-#include <asm/set_memory.h>
 #include <asm/tlbflush.h>
 #include <asm/x86_init.h>
 #include <asm/uv/uv.h>
@@ -496,74 +495,6 @@ void __init efi_init(void)
 		efi_print_memmap();
 }
 
-#if defined(CONFIG_X86_32)
-
-void __init efi_set_executable(efi_memory_desc_t *md, bool executable)
-{
-	u64 addr, npages;
-
-	addr = md->virt_addr;
-	npages = md->num_pages;
-
-	memrange_efi_to_native(&addr, &npages);
-
-	if (executable)
-		set_memory_x(addr, npages);
-	else
-		set_memory_nx(addr, npages);
-}
-
-void __init runtime_code_page_mkexec(void)
-{
-	efi_memory_desc_t *md;
-
-	/* Make EFI runtime service code area executable */
-	for_each_efi_memory_desc(md) {
-		if (md->type != EFI_RUNTIME_SERVICES_CODE)
-			continue;
-
-		efi_set_executable(md, true);
-	}
-}
-
-void __init efi_memory_uc(u64 addr, unsigned long size)
-{
-	unsigned long page_shift = 1UL << EFI_PAGE_SHIFT;
-	u64 npages;
-
-	npages = round_up(size, page_shift) / page_shift;
-	memrange_efi_to_native(&addr, &npages);
-	set_memory_uc(addr, npages);
-}
-
-void __init old_map_region(efi_memory_desc_t *md)
-{
-	u64 start_pfn, end_pfn, end;
-	unsigned long size;
-	void *va;
-
-	start_pfn = PFN_DOWN(md->phys_addr);
-	size	  = md->num_pages << PAGE_SHIFT;
-	end	  = md->phys_addr + size;
-	end_pfn   = PFN_UP(end);
-
-	if (pfn_range_is_mapped(start_pfn, end_pfn)) {
-		va = __va(md->phys_addr);
-
-		if (!(md->attribute & EFI_MEMORY_WB))
-			efi_memory_uc((u64)(unsigned long)va, size);
-	} else
-		va = efi_ioremap(md->phys_addr, size,
-				 md->type, md->attribute);
-
-	md->virt_addr = (u64) (unsigned long) va;
-	if (!va)
-		pr_err("ioremap of 0x%llX failed!\n",
-		       (unsigned long long)md->phys_addr);
-}
-
-#endif
-
 /* Merge contiguous regions of the same type and attribute */
 static void __init efi_merge_regions(void)
 {
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index 826ead67753da5..e06a199423c0fe 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -29,9 +29,35 @@
 #include <asm/io.h>
 #include <asm/desc.h>
 #include <asm/page.h>
+#include <asm/set_memory.h>
 #include <asm/tlbflush.h>
 #include <asm/efi.h>
 
+void __init efi_map_region(efi_memory_desc_t *md)
+{
+	u64 start_pfn, end_pfn, end;
+	unsigned long size;
+	void *va;
+
+	start_pfn	= PFN_DOWN(md->phys_addr);
+	size		= md->num_pages << PAGE_SHIFT;
+	end		= md->phys_addr + size;
+	end_pfn 	= PFN_UP(end);
+
+	if (pfn_range_is_mapped(start_pfn, end_pfn)) {
+		va = __va(md->phys_addr);
+
+		if (!(md->attribute & EFI_MEMORY_WB))
+			set_memory_uc((unsigned long)va, md->num_pages);
+	} else {
+		va = ioremap_cache(md->phys_addr, size);
+	}
+
+	md->virt_addr = (unsigned long)va;
+	if (!va)
+		pr_err("ioremap of 0x%llX failed!\n", md->phys_addr);
+}
+
 /*
  * To make EFI call EFI runtime service in physical addressing mode we need
  * prolog/epilog before/after the invocation to claim the EFI runtime service
@@ -58,11 +84,6 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
 	return 0;
 }
 
-void __init efi_map_region(efi_memory_desc_t *md)
-{
-	old_map_region(md);
-}
-
 void __init efi_map_region_fixed(efi_memory_desc_t *md) {}
 void __init parse_efi_setup(u64 phys_addr, u32 data_len) {}
 
@@ -107,6 +128,15 @@ efi_status_t __init efi_set_virtual_address_map(unsigned long memory_map_size,
 
 void __init efi_runtime_update_mappings(void)
 {
-	if (__supported_pte_mask & _PAGE_NX)
-		runtime_code_page_mkexec();
+	if (__supported_pte_mask & _PAGE_NX) {
+		efi_memory_desc_t *md;
+
+		/* Make EFI runtime service code area executable */
+		for_each_efi_memory_desc(md) {
+			if (md->type != EFI_RUNTIME_SERVICES_CODE)
+				continue;
+
+			set_memory_x(md->virt_addr, md->num_pages);
+		}
+	}
 }

From fb1201aececc59990b75ef59fca93ae4aa1e1444 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Mon, 17 Aug 2020 12:00:17 +0200
Subject: [PATCH 48/74] Documentation: efi: remove description of efi=old_map

The old EFI runtime region mapping logic that was kept around for some
time has finally been removed entirely, along with the SGI UV1 support
code that was its last remaining user. So remove any mention of the
efi=old_map command line parameter from the docs.

Cc: Jonathan Corbet <corbet@lwn.net>
Cc: linux-doc@vger.kernel.org
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 Documentation/admin-guide/kernel-parameters.txt | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index bdc1f33fd3d10a..a1068742a6df11 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1233,8 +1233,7 @@
 	efi=		[EFI]
 			Format: { "debug", "disable_early_pci_dma",
 				  "nochunk", "noruntime", "nosoftreserve",
-				  "novamap", "no_disable_early_pci_dma",
-				  "old_map" }
+				  "novamap", "no_disable_early_pci_dma" }
 			debug: enable misc debug output.
 			disable_early_pci_dma: disable the busmaster bit on all
 			PCI bridges while in the EFI boot stub.
@@ -1251,8 +1250,6 @@
 			novamap: do not call SetVirtualAddressMap().
 			no_disable_early_pci_dma: Leave the busmaster bit set
 			on all PCI bridges while in the EFI boot stub
-			old_map [X86-64]: switch to the old ioremap-based EFI
-			runtime services mapping. [Needs CONFIG_X86_UV=y]
 
 	efi_no_storage_paranoia [EFI; X86]
 			Using this parameter you can use more than 50% of

From 17899eaf88d689529b866371344c8f269ba79b5f Mon Sep 17 00:00:00 2001
From: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Date: Thu, 6 Aug 2020 08:46:32 -0400
Subject: [PATCH 49/74] powerpc/perf: Fix soft lockups due to missed interrupt
 accounting

Performance monitor interrupt handler checks if any counter has
overflown and calls record_and_restart() in core-book3s which invokes
perf_event_overflow() to record the sample information. Apart from
creating sample, perf_event_overflow() also does the interrupt and
period checks via perf_event_account_interrupt().

Currently we record information only if the SIAR (Sampled Instruction
Address Register) valid bit is set (using siar_valid() check) and
hence the interrupt check.

But it is possible that we do sampling for some events that are not
generating valid SIAR, and hence there is no chance to disable the
event if interrupts are more than max_samples_per_tick. This leads to
soft lockup.

Fix this by adding perf_event_account_interrupt() in the invalid SIAR
code path for a sampling event. ie if SIAR is invalid, just do
interrupt check and don't record the sample information.

Reported-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Tested-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/1596717992-7321-1-git-send-email-atrajeev@linux.vnet.ibm.com
---
 arch/powerpc/perf/core-book3s.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 00038650a007fd..93d20e1ed84514 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2141,6 +2141,10 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 
 		if (perf_event_overflow(event, &data, regs))
 			power_pmu_stop(event, 0);
+	} else if (period) {
+		/* Account for interrupt in case of invalid SIAR */
+		if (perf_event_account_interrupt(event))
+			power_pmu_stop(event, 0);
 	}
 }
 

From 90a9b102eddf6a3f987d15f4454e26a2532c1c98 Mon Sep 17 00:00:00 2001
From: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
Date: Thu, 20 Aug 2020 11:48:44 +0530
Subject: [PATCH 50/74] powerpc/pseries: Do not initiate shutdown when system
 is running on UPS

As per PAPR we have to look for both EPOW sensor value and event
modifier to identify the type of event and take appropriate action.

In LoPAPR v1.1 section 10.2.2 includes table 136 "EPOW Action Codes":

  SYSTEM_SHUTDOWN 3

  The system must be shut down. An EPOW-aware OS logs the EPOW error
  log information, then schedules the system to be shut down to begin
  after an OS defined delay internal (default is 10 minutes.)

Then in section 10.3.2.2.8 there is table 146 "Platform Event Log
Format, Version 6, EPOW Section", which includes the "EPOW Event
Modifier":

  For EPOW sensor value = 3
  0x01 = Normal system shutdown with no additional delay
  0x02 = Loss of utility power, system is running on UPS/Battery
  0x03 = Loss of system critical functions, system should be shutdown
  0x04 = Ambient temperature too high
  All other values = reserved

We have a user space tool (rtas_errd) on LPAR to monitor for
EPOW_SHUTDOWN_ON_UPS. Once it gets an event it initiates shutdown
after predefined time. It also starts monitoring for any new EPOW
events. If it receives "Power restored" event before predefined time
it will cancel the shutdown. Otherwise after predefined time it will
shutdown the system.

Commit 79872e35469b ("powerpc/pseries: All events of
EPOW_SYSTEM_SHUTDOWN must initiate shutdown") changed our handling of
the "on UPS/Battery" case, to immediately shutdown the system. This
breaks existing setups that rely on the userspace tool to delay
shutdown and let the system run on the UPS.

Fixes: 79872e35469b ("powerpc/pseries: All events of EPOW_SYSTEM_SHUTDOWN must initiate shutdown")
Cc: stable@vger.kernel.org # v4.0+
Signed-off-by: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
[mpe: Massage change log and add PAPR references]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200820061844.306460-1-hegdevasant@linux.vnet.ibm.com
---
 arch/powerpc/platforms/pseries/ras.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index f3736fcd98fcb8..13c86a292c6d70 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -184,7 +184,6 @@ static void handle_system_shutdown(char event_modifier)
 	case EPOW_SHUTDOWN_ON_UPS:
 		pr_emerg("Loss of system power detected. System is running on"
 			 " UPS/battery. Check RTAS error log for details\n");
-		orderly_poweroff(true);
 		break;
 
 	case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS:

From 1e891e513e16c145cc9b45b1fdb8bf4a4f2f9557 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= <toke@redhat.com>
Date: Wed, 19 Aug 2020 13:05:34 +0200
Subject: [PATCH 51/74] libbpf: Fix map index used in error message
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The error message emitted by bpf_object__init_user_btf_maps() was using the
wrong section ID.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20200819110534.9058-1-toke@redhat.com
---
 tools/lib/bpf/libbpf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 5d20b2da442713..0ad0b0491e1f2a 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -2264,7 +2264,7 @@ static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict,
 		data = elf_getdata(scn, NULL);
 	if (!scn || !data) {
 		pr_warn("failed to get Elf_Data from map section %d (%s)\n",
-			obj->efile.maps_shndx, MAPS_ELF_SEC);
+			obj->efile.btf_maps_shndx, MAPS_ELF_SEC);
 		return -EINVAL;
 	}
 

From fb73ed5ef7e1f3b561dae517b9efdf58c153636b Mon Sep 17 00:00:00 2001
From: Kaige Li <likaige@loongson.cn>
Date: Thu, 20 Aug 2020 14:47:55 +0800
Subject: [PATCH 52/74] net: phy: mscc: Fix a couple of spelling mistakes
 "spcified" -> "specified"

There are a couple of spelling mistakes in comment text. Fix these.

Signed-off-by: Kaige Li <likaige@loongson.cn>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/mscc/mscc_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/phy/mscc/mscc_main.c b/drivers/net/phy/mscc/mscc_main.c
index a4fbf3a4fa9739..6bc7406a1ce731 100644
--- a/drivers/net/phy/mscc/mscc_main.c
+++ b/drivers/net/phy/mscc/mscc_main.c
@@ -1738,13 +1738,13 @@ static int __phy_write_mcb_s6g(struct phy_device *phydev, u32 reg, u8 mcb,
 	return 0;
 }
 
-/* Trigger a read to the spcified MCB */
+/* Trigger a read to the specified MCB */
 static int phy_update_mcb_s6g(struct phy_device *phydev, u32 reg, u8 mcb)
 {
 	return __phy_write_mcb_s6g(phydev, reg, mcb, PHY_MCB_S6G_READ);
 }
 
-/* Trigger a write to the spcified MCB */
+/* Trigger a write to the specified MCB */
 static int phy_commit_mcb_s6g(struct phy_device *phydev, u32 reg, u8 mcb)
 {
 	return __phy_write_mcb_s6g(phydev, reg, mcb, PHY_MCB_S6G_WRITE);

From 3e659a82c45076e354d20db4b0776e716c6e7fe3 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Thu, 20 Aug 2020 11:47:19 +0100
Subject: [PATCH 53/74] sfc: fix build warnings on 32-bit

Truncation of DMA_BIT_MASK to 32-bit dma_addr_t is semantically safe,
 but the compiler was warning because it was happening implicitly.
Insert explicit casts to suppress the warnings.

Reported-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Edward Cree <ecree@solarflare.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org> # build-tested
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/sfc/ef100.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/sfc/ef100.c b/drivers/net/ethernet/sfc/ef100.c
index 9729983f48409d..c54b7f8243f3ba 100644
--- a/drivers/net/ethernet/sfc/ef100.c
+++ b/drivers/net/ethernet/sfc/ef100.c
@@ -142,7 +142,7 @@ static int ef100_pci_parse_continue_entry(struct efx_nic *efx, int entry_locatio
 
 		/* Temporarily map new BAR. */
 		rc = efx_init_io(efx, bar,
-				 DMA_BIT_MASK(ESF_GZ_TX_SEND_ADDR_WIDTH),
+				 (dma_addr_t)DMA_BIT_MASK(ESF_GZ_TX_SEND_ADDR_WIDTH),
 				 pci_resource_len(efx->pci_dev, bar));
 		if (rc) {
 			netif_err(efx, probe, efx->net_dev,
@@ -160,7 +160,7 @@ static int ef100_pci_parse_continue_entry(struct efx_nic *efx, int entry_locatio
 
 		/* Put old BAR back. */
 		rc = efx_init_io(efx, previous_bar,
-				 DMA_BIT_MASK(ESF_GZ_TX_SEND_ADDR_WIDTH),
+				 (dma_addr_t)DMA_BIT_MASK(ESF_GZ_TX_SEND_ADDR_WIDTH),
 				 pci_resource_len(efx->pci_dev, previous_bar));
 		if (rc) {
 			netif_err(efx, probe, efx->net_dev,
@@ -334,7 +334,7 @@ static int ef100_pci_parse_xilinx_cap(struct efx_nic *efx, int vndr_cap,
 
 	/* Temporarily map BAR. */
 	rc = efx_init_io(efx, bar,
-			 DMA_BIT_MASK(ESF_GZ_TX_SEND_ADDR_WIDTH),
+			 (dma_addr_t)DMA_BIT_MASK(ESF_GZ_TX_SEND_ADDR_WIDTH),
 			 pci_resource_len(efx->pci_dev, bar));
 	if (rc) {
 		netif_err(efx, probe, efx->net_dev,
@@ -495,7 +495,7 @@ static int ef100_pci_probe(struct pci_dev *pci_dev,
 
 	/* Set up basic I/O (BAR mappings etc) */
 	rc = efx_init_io(efx, fcw.bar,
-			 DMA_BIT_MASK(ESF_GZ_TX_SEND_ADDR_WIDTH),
+			 (dma_addr_t)DMA_BIT_MASK(ESF_GZ_TX_SEND_ADDR_WIDTH),
 			 pci_resource_len(efx->pci_dev, fcw.bar));
 	if (rc)
 		goto fail;

From ce51f63e63c52a4e1eee4dd040fb0ba0af3b43ab Mon Sep 17 00:00:00 2001
From: Peilin Ye <yepeilin.cs@gmail.com>
Date: Thu, 20 Aug 2020 16:30:52 +0200
Subject: [PATCH 54/74] net/smc: Prevent kernel-infoleak in __smc_diag_dump()

__smc_diag_dump() is potentially copying uninitialized kernel stack memory
into socket buffers, since the compiler may leave a 4-byte hole near the
beginning of `struct smcd_diag_dmbinfo`. Fix it by initializing `dinfo`
with memset().

Fixes: 4b1b7d3b30a6 ("net/smc: add SMC-D diag support")
Suggested-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Peilin Ye <yepeilin.cs@gmail.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_diag.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
index e1f64f4ba23614..da9ba6d1679b78 100644
--- a/net/smc/smc_diag.c
+++ b/net/smc/smc_diag.c
@@ -170,13 +170,15 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
 	    (req->diag_ext & (1 << (SMC_DIAG_DMBINFO - 1))) &&
 	    !list_empty(&smc->conn.lgr->list)) {
 		struct smc_connection *conn = &smc->conn;
-		struct smcd_diag_dmbinfo dinfo = {
-			.linkid = *((u32 *)conn->lgr->id),
-			.peer_gid = conn->lgr->peer_gid,
-			.my_gid = conn->lgr->smcd->local_gid,
-			.token = conn->rmb_desc->token,
-			.peer_token = conn->peer_token
-		};
+		struct smcd_diag_dmbinfo dinfo;
+
+		memset(&dinfo, 0, sizeof(dinfo));
+
+		dinfo.linkid = *((u32 *)conn->lgr->id);
+		dinfo.peer_gid = conn->lgr->peer_gid;
+		dinfo.my_gid = conn->lgr->smcd->local_gid;
+		dinfo.token = conn->rmb_desc->token;
+		dinfo.peer_token = conn->peer_token;
 
 		if (nla_put(skb, SMC_DIAG_DMBINFO, sizeof(dinfo), &dinfo) < 0)
 			goto errout;

From 51f6463aacfbfd322bcaadc606da56acef644b05 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 19 Aug 2020 11:23:42 +0200
Subject: [PATCH 55/74] tools/resolve_btfids: Fix sections with wrong alignment

The data of compressed section should be aligned to 4
(for 32bit) or 8 (for 64 bit) bytes.

The binutils ld sets sh_addralign to 1, which makes libelf
fail with misaligned section error during the update as
reported by Jesper:

   FAILED elf_update(WRITE): invalid section alignment

While waiting for ld fix, we can fix compressed sections
sh_addralign value manually.

Adding warning in -vv mode when the fix is triggered:

  $ ./tools/bpf/resolve_btfids/resolve_btfids -vv vmlinux
  ...
  section(36) .comment, size 44, link 0, flags 30, type=1
  section(37) .debug_aranges, size 45684, link 0, flags 800, type=1
   - fixing wrong alignment sh_addralign 16, expected 8
  section(38) .debug_info, size 129104957, link 0, flags 800, type=1
   - fixing wrong alignment sh_addralign 1, expected 8
  section(39) .debug_abbrev, size 1152583, link 0, flags 800, type=1
   - fixing wrong alignment sh_addralign 1, expected 8
  section(40) .debug_line, size 7374522, link 0, flags 800, type=1
   - fixing wrong alignment sh_addralign 1, expected 8
  section(41) .debug_frame, size 702463, link 0, flags 800, type=1
  section(42) .debug_str, size 1017571, link 0, flags 830, type=1
   - fixing wrong alignment sh_addralign 1, expected 8
  section(43) .debug_loc, size 3019453, link 0, flags 800, type=1
   - fixing wrong alignment sh_addralign 1, expected 8
  section(44) .debug_ranges, size 1744583, link 0, flags 800, type=1
   - fixing wrong alignment sh_addralign 16, expected 8
  section(45) .symtab, size 2955888, link 46, flags 0, type=2
  section(46) .strtab, size 2613072, link 0, flags 0, type=3
  ...
  update ok for vmlinux

Another workaround is to disable compressed debug info data
CONFIG_DEBUG_INFO_COMPRESSED kernel option.

Fixes: fbbb68de80a4 ("bpf: Add resolve_btfids tool to resolve BTF IDs in ELF object")
Reported-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Yonghong Song <yhs@fb.com>
Cc: Mark Wielaard <mjw@redhat.com>
Cc: Nick Clifton <nickc@redhat.com>
Link: https://lore.kernel.org/bpf/20200819092342.259004-1-jolsa@kernel.org
---
 tools/bpf/resolve_btfids/main.c | 36 +++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c
index 4d9ecb97586224..0def0bb1f78327 100644
--- a/tools/bpf/resolve_btfids/main.c
+++ b/tools/bpf/resolve_btfids/main.c
@@ -233,6 +233,39 @@ static struct btf_id *add_symbol(struct rb_root *root, char *name, size_t size)
 	return btf_id__add(root, id, false);
 }
 
+/*
+ * The data of compressed section should be aligned to 4
+ * (for 32bit) or 8 (for 64 bit) bytes. The binutils ld
+ * sets sh_addralign to 1, which makes libelf fail with
+ * misaligned section error during the update:
+ *    FAILED elf_update(WRITE): invalid section alignment
+ *
+ * While waiting for ld fix, we fix the compressed sections
+ * sh_addralign value manualy.
+ */
+static int compressed_section_fix(Elf *elf, Elf_Scn *scn, GElf_Shdr *sh)
+{
+	int expected = gelf_getclass(elf) == ELFCLASS32 ? 4 : 8;
+
+	if (!(sh->sh_flags & SHF_COMPRESSED))
+		return 0;
+
+	if (sh->sh_addralign == expected)
+		return 0;
+
+	pr_debug2(" - fixing wrong alignment sh_addralign %u, expected %u\n",
+		  sh->sh_addralign, expected);
+
+	sh->sh_addralign = expected;
+
+	if (gelf_update_shdr(scn, sh) == 0) {
+		printf("FAILED cannot update section header: %s\n",
+			elf_errmsg(-1));
+		return -1;
+	}
+	return 0;
+}
+
 static int elf_collect(struct object *obj)
 {
 	Elf_Scn *scn = NULL;
@@ -309,6 +342,9 @@ static int elf_collect(struct object *obj)
 			obj->efile.idlist_shndx = idx;
 			obj->efile.idlist_addr  = sh.sh_addr;
 		}
+
+		if (compressed_section_fix(elf, scn, &sh))
+			return -1;
 	}
 
 	return 0;

From 5597432dde62befd3ab92e6ef9e073564e277ea8 Mon Sep 17 00:00:00 2001
From: Veronika Kabatova <vkabatov@redhat.com>
Date: Wed, 19 Aug 2020 18:07:10 +0200
Subject: [PATCH 56/74] selftests/bpf: Remove test_align leftovers

Calling generic selftests "make install" fails as rsync expects all
files from TEST_GEN_PROGS to be present. The binary is not generated
anymore (commit 3b09d27cc93d) so we can safely remove it from there
and also from gitignore.

Fixes: 3b09d27cc93d ("selftests/bpf: Move test_align under test_progs")
Signed-off-by: Veronika Kabatova <vkabatov@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Link: https://lore.kernel.org/bpf/20200819160710.1345956-1-vkabatov@redhat.com
---
 tools/testing/selftests/bpf/.gitignore | 1 -
 tools/testing/selftests/bpf/Makefile   | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index 1bb204cee853fc..9a0946ddb705ac 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -6,7 +6,6 @@ test_lpm_map
 test_tag
 FEATURE-DUMP.libbpf
 fixdep
-test_align
 test_dev_cgroup
 /test_progs*
 test_tcpbpf_user
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index a83b5827532f77..fc946b7ac288df 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -32,7 +32,7 @@ LDLIBS += -lcap -lelf -lz -lrt -lpthread
 
 # Order correspond to 'make run_tests' order
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
-	test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \
+	test_verifier_log test_dev_cgroup test_tcpbpf_user \
 	test_sock test_btf test_sockmap get_cgroup_id_user test_socket_cookie \
 	test_cgroup_storage \
 	test_netcnt test_tcpnotify_user test_sock_fields test_sysctl \

From c8a36f1945b2b1b3f9823b66fc2181dc069cf803 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andriin@fb.com>
Date: Wed, 19 Aug 2020 22:28:41 -0700
Subject: [PATCH 57/74] bpf: xdp: Fix XDP mode when no mode flags specified

7f0a838254bd ("bpf, xdp: Maintain info on attached XDP BPF programs in net_device")
inadvertently changed which XDP mode is assumed when no mode flags are
specified explicitly. Previously, driver mode was preferred, if driver
supported it. If not, generic SKB mode was chosen. That commit changed default
to SKB mode always. This patch fixes the issue and restores the original
logic.

Fixes: 7f0a838254bd ("bpf, xdp: Maintain info on attached XDP BPF programs in net_device")
Reported-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://lore.kernel.org/bpf/20200820052841.1559757-1-andriin@fb.com
---
 net/core/dev.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index b5d1129d83103b..d42c9ea0c3c037 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -8742,13 +8742,15 @@ struct bpf_xdp_link {
 	int flags;
 };
 
-static enum bpf_xdp_mode dev_xdp_mode(u32 flags)
+static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
 {
 	if (flags & XDP_FLAGS_HW_MODE)
 		return XDP_MODE_HW;
 	if (flags & XDP_FLAGS_DRV_MODE)
 		return XDP_MODE_DRV;
-	return XDP_MODE_SKB;
+	if (flags & XDP_FLAGS_SKB_MODE)
+		return XDP_MODE_SKB;
+	return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
 }
 
 static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
@@ -8896,7 +8898,7 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack
 		return -EINVAL;
 	}
 
-	mode = dev_xdp_mode(flags);
+	mode = dev_xdp_mode(dev, flags);
 	/* can't replace attached link */
 	if (dev_xdp_link(dev, mode)) {
 		NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
@@ -8984,7 +8986,7 @@ static int dev_xdp_detach_link(struct net_device *dev,
 
 	ASSERT_RTNL();
 
-	mode = dev_xdp_mode(link->flags);
+	mode = dev_xdp_mode(dev, link->flags);
 	if (dev_xdp_link(dev, mode) != link)
 		return -EINVAL;
 
@@ -9080,7 +9082,7 @@ static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
 		goto out_unlock;
 	}
 
-	mode = dev_xdp_mode(xdp_link->flags);
+	mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
 	bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
 	err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
 			      xdp_link->flags, new_prog);
@@ -9164,7 +9166,7 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 		      int fd, int expected_fd, u32 flags)
 {
-	enum bpf_xdp_mode mode = dev_xdp_mode(flags);
+	enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
 	struct bpf_prog *new_prog = NULL, *old_prog = NULL;
 	int err;
 

From c210773d6c6f595f5922d56b7391fe343bc7310e Mon Sep 17 00:00:00 2001
From: Yauheni Kaliuta <yauheni.kaliuta@redhat.com>
Date: Thu, 20 Aug 2020 14:58:43 +0300
Subject: [PATCH 58/74] bpf: selftests: global_funcs: Check err_str before
 strstr

The error path in libbpf.c:load_program() has calls to pr_warn()
which ends up for global_funcs tests to
test_global_funcs.c:libbpf_debug_print().

For the tests with no struct test_def::err_str initialized with a
string, it causes call of strstr() with NULL as the second argument
and it segfaults.

Fix it by calling strstr() only for non-NULL err_str.

Signed-off-by: Yauheni Kaliuta <yauheni.kaliuta@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20200820115843.39454-1-yauheni.kaliuta@redhat.com
---
 tools/testing/selftests/bpf/prog_tests/test_global_funcs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c
index 25b068591e9a40..193002b14d7f66 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c
@@ -19,7 +19,7 @@ static int libbpf_debug_print(enum libbpf_print_level level,
 	log_buf = va_arg(args, char *);
 	if (!log_buf)
 		goto out;
-	if (strstr(log_buf, err_str) == 0)
+	if (err_str && strstr(log_buf, err_str) == 0)
 		found = true;
 out:
 	printf(format, log_buf);

From 4d820543c54c47a2bd3c95ddbf52f83c89a219a0 Mon Sep 17 00:00:00 2001
From: Haiyang Zhang <haiyangz@microsoft.com>
Date: Thu, 20 Aug 2020 14:53:14 -0700
Subject: [PATCH 59/74] hv_netvsc: Remove "unlikely" from netvsc_select_queue

When using vf_ops->ndo_select_queue, the number of queues of VF is
usually bigger than the synthetic NIC. This condition may happen
often.
Remove "unlikely" from the comparison of ndev->real_num_tx_queues.

Fixes: b3bf5666a510 ("hv_netvsc: defer queue selection to VF")
Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/netvsc_drv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 787f17e2a97168..0029292cdb9f6d 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -367,7 +367,7 @@ static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb,
 	}
 	rcu_read_unlock();
 
-	while (unlikely(txq >= ndev->real_num_tx_queues))
+	while (txq >= ndev->real_num_tx_queues)
 		txq -= ndev->real_num_tx_queues;
 
 	return txq;

From c3d897e01aef8ddc43149e4d661b86f823e3aae7 Mon Sep 17 00:00:00 2001
From: Haiyang Zhang <haiyangz@microsoft.com>
Date: Thu, 20 Aug 2020 14:53:15 -0700
Subject: [PATCH 60/74] hv_netvsc: Fix the queue_mapping in netvsc_vf_xmit()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

netvsc_vf_xmit() / dev_queue_xmit() will call VF NIC’s ndo_select_queue
or netdev_pick_tx() again. They will use skb_get_rx_queue() to get the
queue number, so the “skb->queue_mapping - 1” will be used. This may
cause the last queue of VF not been used.

Use skb_record_rx_queue() here, so that the skb_get_rx_queue() called
later will get the correct queue number, and VF will be able to use
all queues.

Fixes: b3bf5666a510 ("hv_netvsc: defer queue selection to VF")
Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/netvsc_drv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 0029292cdb9f6d..64b0a74c152314 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -502,7 +502,7 @@ static int netvsc_vf_xmit(struct net_device *net, struct net_device *vf_netdev,
 	int rc;
 
 	skb->dev = vf_netdev;
-	skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;
+	skb_record_rx_queue(skb, qdisc_skb_cb(skb)->slave_dev_queue_mapping);
 
 	rc = dev_queue_xmit(skb);
 	if (likely(rc == NET_XMIT_SUCCESS || rc == NET_XMIT_CN)) {

From 272502fcb7cda01ab07fc2fcff82d1d2f73d43cc Mon Sep 17 00:00:00 2001
From: Mark Tomlinson <mark.tomlinson@alliedtelesis.co.nz>
Date: Wed, 19 Aug 2020 13:53:58 +1200
Subject: [PATCH 61/74] gre6: Fix reception with IP6_TNL_F_RCV_DSCP_COPY

When receiving an IPv4 packet inside an IPv6 GRE packet, and the
IP6_TNL_F_RCV_DSCP_COPY flag is set on the tunnel, the IPv4 header would
get corrupted. This is due to the common ip6_tnl_rcv() function assuming
that the inner header is always IPv6. This patch checks the tunnel
protocol for IPv4 inner packets, but still defaults to IPv6.

Fixes: 308edfdf1563 ("gre6: Cleanup GREv6 receive path, call common GRE functions")
Signed-off-by: Mark Tomlinson <mark.tomlinson@alliedtelesis.co.nz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_tunnel.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index f635914f42ec60..a0217e5bf3bc11 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -915,7 +915,15 @@ int ip6_tnl_rcv(struct ip6_tnl *t, struct sk_buff *skb,
 		struct metadata_dst *tun_dst,
 		bool log_ecn_err)
 {
-	return __ip6_tnl_rcv(t, skb, tpi, tun_dst, ip6ip6_dscp_ecn_decapsulate,
+	int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t,
+				    const struct ipv6hdr *ipv6h,
+				    struct sk_buff *skb);
+
+	dscp_ecn_decapsulate = ip6ip6_dscp_ecn_decapsulate;
+	if (tpi->proto == htons(ETH_P_IP))
+		dscp_ecn_decapsulate = ip4ip6_dscp_ecn_decapsulate;
+
+	return __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate,
 			     log_ecn_err);
 }
 EXPORT_SYMBOL(ip6_tnl_rcv);

From 41506bff84f1563e20e505ca9a0366a30ae2a879 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Wed, 19 Aug 2020 14:45:39 +0200
Subject: [PATCH 62/74] dt-bindings: net: renesas, ether: Improve schema
 validation

- Remove pinctrl consumer properties, as they are handled by core
    dt-schema,
  - Document missing properties,
  - Document missing PHY child node,
  - Add "additionalProperties: false".

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Rob Herring <robh@kernel.org>
Reviewed-by: Sergei Shtylyov <sergei.shtylyov@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../bindings/net/renesas,ether.yaml           | 22 +++++++++++++------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/renesas,ether.yaml b/Documentation/devicetree/bindings/net/renesas,ether.yaml
index 08678af5ed9364..8ce5ed8a58dd76 100644
--- a/Documentation/devicetree/bindings/net/renesas,ether.yaml
+++ b/Documentation/devicetree/bindings/net/renesas,ether.yaml
@@ -59,9 +59,15 @@ properties:
   clocks:
     maxItems: 1
 
-  pinctrl-0: true
+  power-domains:
+    maxItems: 1
+
+  resets:
+    maxItems: 1
 
-  pinctrl-names: true
+  phy-mode: true
+
+  phy-handle: true
 
   renesas,no-ether-link:
     type: boolean
@@ -74,6 +80,11 @@ properties:
       specify when the Ether LINK signal is active-low instead of normal
       active-high
 
+patternProperties:
+  "^ethernet-phy@[0-9a-f]$":
+    type: object
+    $ref: ethernet-phy.yaml#
+
 required:
   - compatible
   - reg
@@ -83,7 +94,8 @@ required:
   - '#address-cells'
   - '#size-cells'
   - clocks
-  - pinctrl-0
+
+additionalProperties: false
 
 examples:
   # Lager board
@@ -99,8 +111,6 @@ examples:
         clocks = <&mstp8_clks R8A7790_CLK_ETHER>;
         phy-mode = "rmii";
         phy-handle = <&phy1>;
-        pinctrl-0 = <&ether_pins>;
-        pinctrl-names = "default";
         renesas,ether-link-active-low;
         #address-cells = <1>;
         #size-cells = <0>;
@@ -109,7 +119,5 @@ examples:
             reg = <1>;
             interrupt-parent = <&irqc0>;
             interrupts = <0 IRQ_TYPE_LEVEL_LOW>;
-            pinctrl-0 = <&phy1_pins>;
-            pinctrl-names = "default";
         };
     };

From ab921f3cdbec01c68705a7ade8bec628d541fc2b Mon Sep 17 00:00:00 2001
From: David Laight <David.Laight@ACULAB.COM>
Date: Wed, 19 Aug 2020 14:40:52 +0000
Subject: [PATCH 63/74] net: sctp: Fix negotiation of the number of data
 streams.

The number of output and input streams was never being reduced, eg when
processing received INIT or INIT_ACK chunks.
The effect is that DATA chunks can be sent with invalid stream ids
and then discarded by the remote system.

Fixes: 2075e50caf5ea ("sctp: convert to genradix")
Signed-off-by: David Laight <david.laight@aculab.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/stream.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index bda2536dd740f7..6dc95dcc0ff4f0 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -88,12 +88,13 @@ static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt,
 	int ret;
 
 	if (outcnt <= stream->outcnt)
-		return 0;
+		goto out;
 
 	ret = genradix_prealloc(&stream->out, outcnt, gfp);
 	if (ret)
 		return ret;
 
+out:
 	stream->outcnt = outcnt;
 	return 0;
 }
@@ -104,12 +105,13 @@ static int sctp_stream_alloc_in(struct sctp_stream *stream, __u16 incnt,
 	int ret;
 
 	if (incnt <= stream->incnt)
-		return 0;
+		goto out;
 
 	ret = genradix_prealloc(&stream->in, incnt, gfp);
 	if (ret)
 		return ret;
 
+out:
 	stream->incnt = incnt;
 	return 0;
 }

From eda814b97dfb8d9f4808eb2f65af9bd3705c4cae Mon Sep 17 00:00:00 2001
From: Alaa Hleihel <alaa@mellanox.com>
Date: Wed, 19 Aug 2020 18:24:10 +0300
Subject: [PATCH 64/74] net/sched: act_ct: Fix skb double-free in
 tcf_ct_handle_fragments() error flow

tcf_ct_handle_fragments() shouldn't free the skb when ip_defrag() call
fails. Otherwise, we will cause a double-free bug.
In such cases, just return the error to the caller.

Fixes: b57dc7c13ea9 ("net/sched: Introduce action ct")
Signed-off-by: Alaa Hleihel <alaa@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_ct.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index e6ad42b11835f8..2c3619165680fd 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -704,7 +704,7 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
 		err = ip_defrag(net, skb, user);
 		local_bh_enable();
 		if (err && err != -EINPROGRESS)
-			goto out_free;
+			return err;
 
 		if (!err) {
 			*defrag = true;

From f6db9096416209474090d64d8284e7c16c3d8873 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Thu, 20 Aug 2020 15:34:47 +0800
Subject: [PATCH 65/74] tipc: call rcu_read_lock() in tipc_aead_encrypt_done()

b->media->send_msg() requires rcu_read_lock(), as we can see
elsewhere in tipc,  tipc_bearer_xmit, tipc_bearer_xmit_skb
and tipc_bearer_bc_xmit().

Syzbot has reported this issue as:

  net/tipc/bearer.c:466 suspicious rcu_dereference_check() usage!
  Workqueue: cryptd cryptd_queue_worker
  Call Trace:
   tipc_l2_send_msg+0x354/0x420 net/tipc/bearer.c:466
   tipc_aead_encrypt_done+0x204/0x3a0 net/tipc/crypto.c:761
   cryptd_aead_crypt+0xe8/0x1d0 crypto/cryptd.c:739
   cryptd_queue_worker+0x118/0x1b0 crypto/cryptd.c:181
   process_one_work+0x94c/0x1670 kernel/workqueue.c:2269
   worker_thread+0x64c/0x1120 kernel/workqueue.c:2415
   kthread+0x3b5/0x4a0 kernel/kthread.c:291
   ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:293

So fix it by calling rcu_read_lock() in tipc_aead_encrypt_done()
for b->media->send_msg().

Fixes: fc1b6d6de220 ("tipc: introduce TIPC encryption & authentication")
Reported-by: syzbot+47bbc6b678d317cccbe0@syzkaller.appspotmail.com
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/crypto.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c
index 001bcb0f2480dc..c38babaa4e579b 100644
--- a/net/tipc/crypto.c
+++ b/net/tipc/crypto.c
@@ -757,10 +757,12 @@ static void tipc_aead_encrypt_done(struct crypto_async_request *base, int err)
 	switch (err) {
 	case 0:
 		this_cpu_inc(tx->stats->stat[STAT_ASYNC_OK]);
+		rcu_read_lock();
 		if (likely(test_bit(0, &b->up)))
 			b->media->send_msg(net, skb, b, &tx_ctx->dst);
 		else
 			kfree_skb(skb);
+		rcu_read_unlock();
 		break;
 	case -EINPROGRESS:
 		return;

From 541cebb51f3422d4f2c6cb95c1e5cc3dcc9e5021 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Fri, 21 Aug 2020 07:15:25 +0000
Subject: [PATCH 66/74] powerpc/32s: Fix module loading failure when
 VMALLOC_END is over 0xf0000000

In is_module_segment(), when VMALLOC_END is over 0xf0000000,
ALIGN(VMALLOC_END, SZ_256M) has value 0.

In that case, addr >= ALIGN(VMALLOC_END, SZ_256M) is always
true then is_module_segment() always returns false.

Use (ALIGN(VMALLOC_END, SZ_256M) - 1) which will have
value 0xffffffff and will be suitable for the comparison.

Fixes: c49643319715 ("powerpc/32s: Only leave NX unset on segments used for modules")
Reported-by: Andreas Schwab <schwab@linux-m68k.org>
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Tested-by: Andreas Schwab <schwab@linux-m68k.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/09fc73fe9c7423c6b4cf93f93df9bb0ed8eefab5.1597994047.git.christophe.leroy@csgroup.eu
---
 arch/powerpc/mm/book3s32/mmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index 82ae9e06a77324..d426eaf76bb048 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -194,12 +194,12 @@ static bool is_module_segment(unsigned long addr)
 #ifdef MODULES_VADDR
 	if (addr < ALIGN_DOWN(MODULES_VADDR, SZ_256M))
 		return false;
-	if (addr >= ALIGN(MODULES_END, SZ_256M))
+	if (addr > ALIGN(MODULES_END, SZ_256M) - 1)
 		return false;
 #else
 	if (addr < ALIGN_DOWN(VMALLOC_START, SZ_256M))
 		return false;
-	if (addr >= ALIGN(VMALLOC_END, SZ_256M))
+	if (addr > ALIGN(VMALLOC_END, SZ_256M) - 1)
 		return false;
 #endif
 	return true;

From 64ef8f2c4791940d7f3945507b6a45c20d959260 Mon Sep 17 00:00:00 2001
From: Kajol Jain <kjain@linux.ibm.com>
Date: Fri, 21 Aug 2020 13:36:10 +0530
Subject: [PATCH 67/74] powerpc/perf/hv-24x7: Move cpumask file to top folder
 of hv-24x7 driver

Commit 792f73f747b8 ("powerpc/hv-24x7: Add sysfs files inside hv-24x7
device to show cpumask") added cpumask file as part of hv-24x7 driver
inside the interface folder. The cpumask file is supposed to be in the
top folder of the PMU driver in order to make hotplug work.

This patch fixes that issue and creates new group 'cpumask_attr_group'
to add cpumask file and make sure it added in top folder.

  command:# cat /sys/devices/hv_24x7/cpumask
  0

Fixes: 792f73f747b8 ("powerpc/hv-24x7: Add sysfs files inside hv-24x7 device to show cpumask")
Signed-off-by: Kajol Jain <kjain@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200821080610.123997-1-kjain@linux.ibm.com
---
 .../testing/sysfs-bus-event_source-devices-hv_24x7    |  2 +-
 arch/powerpc/perf/hv-24x7.c                           | 11 ++++++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_24x7 b/Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_24x7
index f7e32f218f737f..e82fc37be80225 100644
--- a/Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_24x7
+++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_24x7
@@ -43,7 +43,7 @@ Description:	read only
 		This sysfs interface exposes the number of cores per chip
 		present in the system.
 
-What:		/sys/devices/hv_24x7/interface/cpumask
+What:		/sys/devices/hv_24x7/cpumask
 Date:		July 2020
 Contact:	Linux on PowerPC Developer List <linuxppc-dev@lists.ozlabs.org>
 Description:	read only
diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c
index cdb7bfbd157e0e..6e7e820508df7c 100644
--- a/arch/powerpc/perf/hv-24x7.c
+++ b/arch/powerpc/perf/hv-24x7.c
@@ -1128,6 +1128,15 @@ static struct bin_attribute *if_bin_attrs[] = {
 	NULL,
 };
 
+static struct attribute *cpumask_attrs[] = {
+	&dev_attr_cpumask.attr,
+	NULL,
+};
+
+static struct attribute_group cpumask_attr_group = {
+	.attrs = cpumask_attrs,
+};
+
 static struct attribute *if_attrs[] = {
 	&dev_attr_catalog_len.attr,
 	&dev_attr_catalog_version.attr,
@@ -1135,7 +1144,6 @@ static struct attribute *if_attrs[] = {
 	&dev_attr_sockets.attr,
 	&dev_attr_chipspersocket.attr,
 	&dev_attr_coresperchip.attr,
-	&dev_attr_cpumask.attr,
 	NULL,
 };
 
@@ -1151,6 +1159,7 @@ static const struct attribute_group *attr_groups[] = {
 	&event_desc_group,
 	&event_long_desc_group,
 	&if_group,
+	&cpumask_attr_group,
 	NULL,
 };
 

From 6a3ea3e68b8a8a26c4aaac03432ed92269c9a14e Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Fri, 21 Aug 2020 06:52:29 -0400
Subject: [PATCH 68/74] x86/entry/64: Do not use RDPID in paranoid entry to
 accomodate KVM

KVM has an optmization to avoid expensive MRS read/writes on
VMENTER/EXIT. It caches the MSR values and restores them either when
leaving the run loop, on preemption or when going out to user space.

The affected MSRs are not required for kernel context operations. This
changed with the recently introduced mechanism to handle FSGSBASE in the
paranoid entry code which has to retrieve the kernel GSBASE value by
accessing per CPU memory. The mechanism needs to retrieve the CPU number
and uses either LSL or RDPID if the processor supports it.

Unfortunately RDPID uses MSR_TSC_AUX which is in the list of cached and
lazily restored MSRs, which means between the point where the guest value
is written and the point of restore, MSR_TSC_AUX contains a random number.

If an NMI or any other exception which uses the paranoid entry path happens
in such a context, then RDPID returns the random guest MSR_TSC_AUX value.

As a consequence this reads from the wrong memory location to retrieve the
kernel GSBASE value. Kernel GS is used to for all regular this_cpu_*()
operations. If the GSBASE in the exception handler points to the per CPU
memory of a different CPU then this has the obvious consequences of data
corruption and crashes.

As the paranoid entry path is the only place which accesses MSR_TSX_AUX
(via RDPID) and the fallback via LSL is not significantly slower, remove
the RDPID alternative from the entry path and always use LSL.

The alternative would be to write MSR_TSC_AUX on every VMENTER and VMEXIT
which would be inflicting massive overhead on that code path.

[ tglx: Rewrote changelog ]

Fixes: eaad981291ee3 ("x86/entry/64: Introduce the FIND_PERCPU_BASE macro")
Reported-by: Tom Lendacky <thomas.lendacky@amd.com>
Debugged-by: Tom Lendacky <thomas.lendacky@amd.com>
Suggested-by: Andy Lutomirski <luto@kernel.org>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20200821105229.18938-1-pbonzini@redhat.com
---
 arch/x86/entry/calling.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 98e4d8886f11c5..ae9b0d4615b325 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -374,12 +374,14 @@ For 32-bit we have the following conventions - kernel is built with
  * Fetch the per-CPU GSBASE value for this processor and put it in @reg.
  * We normally use %gs for accessing per-CPU data, but we are setting up
  * %gs here and obviously can not use %gs itself to access per-CPU data.
+ *
+ * Do not use RDPID, because KVM loads guest's TSC_AUX on vm-entry and
+ * may not restore the host's value until the CPU returns to userspace.
+ * Thus the kernel would consume a guest's TSC_AUX if an NMI arrives
+ * while running KVM's run loop.
  */
 .macro GET_PERCPU_BASE reg:req
-	ALTERNATIVE \
-		"LOAD_CPU_AND_NODE_SEG_LIMIT \reg", \
-		"RDPID	\reg", \
-		X86_FEATURE_RDPID
+	LOAD_CPU_AND_NODE_SEG_LIMIT \reg
 	andq	$VDSO_CPUNODE_MASK, \reg
 	movq	__per_cpu_offset(, \reg, 8), \reg
 .endm

From d88d59b64ca35abae208e2781fdb45e69cbed56c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 19 Aug 2020 21:44:39 +0200
Subject: [PATCH 69/74] core/entry: Respect syscall number rewrites

The transcript of the x86 entry code to the generic version failed to
reload the syscall number from ptregs after ptrace and seccomp have run,
which both can modify the syscall number in ptregs. It returns the original
syscall number instead which is obviously not the right thing to do.

Reload the syscall number to fix that.

Fixes: 142781e108b1 ("entry: Provide generic syscall entry functionality")
Reported-by: Kyle Huey <me@kylehuey.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Kyle Huey <me@kylehuey.com>
Tested-by: Kees Cook <keescook@chromium.org>
Acked-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/87blj6ifo8.fsf@nanos.tec.linutronix.de
---
 kernel/entry/common.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 9852e0d62d95cc..fcae019158cad4 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -65,7 +65,8 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall,
 
 	syscall_enter_audit(regs, syscall);
 
-	return ret ? : syscall;
+	/* The above might have changed the syscall number */
+	return ret ? : syscall_get_nr(current, regs);
 }
 
 noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)

From 774d977abfd024e6f73484544b9abe5a5cd62de7 Mon Sep 17 00:00:00 2001
From: Tom Rix <trix@redhat.com>
Date: Fri, 21 Aug 2020 06:56:00 -0700
Subject: [PATCH 70/74] net: dsa: b53: check for timeout

clang static analysis reports this problem

b53_common.c:1583:13: warning: The left expression of the compound
  assignment is an uninitialized value. The computed value will
  also be garbage
        ent.port &= ~BIT(port);
        ~~~~~~~~ ^

ent is set by a successful call to b53_arl_read().  Unsuccessful
calls are caught by an switch statement handling specific returns.
b32_arl_read() calls b53_arl_op_wait() which fails with the
unhandled -ETIMEDOUT.

So add -ETIMEDOUT to the switch statement.  Because
b53_arl_op_wait() already prints out a message, do not add another
one.

Fixes: 1da6df85c6fb ("net: dsa: b53: Implement ARL add/del/dump operations")
Signed-off-by: Tom Rix <trix@redhat.com>
Acked-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/b53/b53_common.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 6500179c2ca22e..0837ae0e0c5e21 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -1554,6 +1554,8 @@ static int b53_arl_op(struct b53_device *dev, int op, int port,
 		return ret;
 
 	switch (ret) {
+	case -ETIMEDOUT:
+		return ret;
 	case -ENOSPC:
 		dev_dbg(dev->dev, "{%pM,%.4d} no space left in ARL\n",
 			addr, vid);

From b16fc097bc283184cde40e5b30d15705e1590410 Mon Sep 17 00:00:00 2001
From: Tobias Klauser <tklauser@distanz.ch>
Date: Fri, 21 Aug 2020 15:36:42 +0200
Subject: [PATCH 71/74] bpf: Fix two typos in uapi/linux/bpf.h

Also remove trailing whitespaces in bpf_skb_get_tunnel_key example code.

Signed-off-by: Tobias Klauser <tklauser@distanz.ch>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200821133642.18870-1-tklauser@distanz.ch
---
 include/uapi/linux/bpf.h       | 10 +++++-----
 tools/include/uapi/linux/bpf.h | 10 +++++-----
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0480f893facd27..b6238b2209b71a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -767,7 +767,7 @@ union bpf_attr {
  *
  * 		Also, note that **bpf_trace_printk**\ () is slow, and should
  * 		only be used for debugging purposes. For this reason, a notice
- * 		bloc (spanning several lines) is printed to kernel logs and
+ * 		block (spanning several lines) is printed to kernel logs and
  * 		states that the helper should not be used "for production use"
  * 		the first time this helper is used (or more precisely, when
  * 		**trace_printk**\ () buffers are allocated). For passing values
@@ -1033,14 +1033,14 @@ union bpf_attr {
  *
  * 			int ret;
  * 			struct bpf_tunnel_key key = {};
- * 			
+ *
  * 			ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
  * 			if (ret < 0)
  * 				return TC_ACT_SHOT;	// drop packet
- * 			
+ *
  * 			if (key.remote_ipv4 != 0x0a000001)
  * 				return TC_ACT_SHOT;	// drop packet
- * 			
+ *
  * 			return TC_ACT_OK;		// accept packet
  *
  * 		This interface can also be used with all encapsulation devices
@@ -1147,7 +1147,7 @@ union bpf_attr {
  * 	Description
  * 		Retrieve the realm or the route, that is to say the
  * 		**tclassid** field of the destination for the *skb*. The
- * 		indentifier retrieved is a user-provided tag, similar to the
+ * 		identifier retrieved is a user-provided tag, similar to the
  * 		one used with the net_cls cgroup (see description for
  * 		**bpf_get_cgroup_classid**\ () helper), but here this tag is
  * 		held by a route (a destination entry), not by a task.
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 0480f893facd27..b6238b2209b71a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -767,7 +767,7 @@ union bpf_attr {
  *
  * 		Also, note that **bpf_trace_printk**\ () is slow, and should
  * 		only be used for debugging purposes. For this reason, a notice
- * 		bloc (spanning several lines) is printed to kernel logs and
+ * 		block (spanning several lines) is printed to kernel logs and
  * 		states that the helper should not be used "for production use"
  * 		the first time this helper is used (or more precisely, when
  * 		**trace_printk**\ () buffers are allocated). For passing values
@@ -1033,14 +1033,14 @@ union bpf_attr {
  *
  * 			int ret;
  * 			struct bpf_tunnel_key key = {};
- * 			
+ *
  * 			ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
  * 			if (ret < 0)
  * 				return TC_ACT_SHOT;	// drop packet
- * 			
+ *
  * 			if (key.remote_ipv4 != 0x0a000001)
  * 				return TC_ACT_SHOT;	// drop packet
- * 			
+ *
  * 			return TC_ACT_OK;		// accept packet
  *
  * 		This interface can also be used with all encapsulation devices
@@ -1147,7 +1147,7 @@ union bpf_attr {
  * 	Description
  * 		Retrieve the realm or the route, that is to say the
  * 		**tclassid** field of the destination for the *skb*. The
- * 		indentifier retrieved is a user-provided tag, similar to the
+ * 		identifier retrieved is a user-provided tag, similar to the
  * 		one used with the net_cls cgroup (see description for
  * 		**bpf_get_cgroup_classid**\ () helper), but here this tag is
  * 		held by a route (a destination entry), not by a task.

From eeaac3634ee0e3f35548be35275efeca888e9b23 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Sat, 22 Aug 2020 15:06:36 +0300
Subject: [PATCH 72/74] net: nexthop: don't allow empty NHA_GROUP

Currently the nexthop code will use an empty NHA_GROUP attribute, but it
requires at least 1 entry in order to function properly. Otherwise we
end up derefencing null or random pointers all over the place due to not
having any nh_grp_entry members allocated, nexthop code relies on having at
least the first member present. Empty NHA_GROUP doesn't make any sense so
just disallow it.
Also add a WARN_ON for any future users of nexthop_create_group().

 BUG: kernel NULL pointer dereference, address: 0000000000000080
 #PF: supervisor read access in kernel mode
 #PF: error_code(0x0000) - not-present page
 PGD 0 P4D 0
 Oops: 0000 [#1] SMP
 CPU: 0 PID: 558 Comm: ip Not tainted 5.9.0-rc1+ #93
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-2.fc32 04/01/2014
 RIP: 0010:fib_check_nexthop+0x4a/0xaa
 Code: 0f 84 83 00 00 00 48 c7 02 80 03 f7 81 c3 40 80 fe fe 75 12 b8 ea ff ff ff 48 85 d2 74 6b 48 c7 02 40 03 f7 81 c3 48 8b 40 10 <48> 8b 80 80 00 00 00 eb 36 80 78 1a 00 74 12 b8 ea ff ff ff 48 85
 RSP: 0018:ffff88807983ba00 EFLAGS: 00010213
 RAX: 0000000000000000 RBX: ffff88807983bc00 RCX: 0000000000000000
 RDX: ffff88807983bc00 RSI: 0000000000000000 RDI: ffff88807bdd0a80
 RBP: ffff88807983baf8 R08: 0000000000000dc0 R09: 000000000000040a
 R10: 0000000000000000 R11: ffff88807bdd0ae8 R12: 0000000000000000
 R13: 0000000000000000 R14: ffff88807bea3100 R15: 0000000000000001
 FS:  00007f10db393700(0000) GS:ffff88807dc00000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000000000000080 CR3: 000000007bd0f004 CR4: 00000000003706f0
 Call Trace:
  fib_create_info+0x64d/0xaf7
  fib_table_insert+0xf6/0x581
  ? __vma_adjust+0x3b6/0x4d4
  inet_rtm_newroute+0x56/0x70
  rtnetlink_rcv_msg+0x1e3/0x20d
  ? rtnl_calcit.isra.0+0xb8/0xb8
  netlink_rcv_skb+0x5b/0xac
  netlink_unicast+0xfa/0x17b
  netlink_sendmsg+0x334/0x353
  sock_sendmsg_nosec+0xf/0x3f
  ____sys_sendmsg+0x1a0/0x1fc
  ? copy_msghdr_from_user+0x4c/0x61
  ___sys_sendmsg+0x63/0x84
  ? handle_mm_fault+0xa39/0x11b5
  ? sockfd_lookup_light+0x72/0x9a
  __sys_sendmsg+0x50/0x6e
  do_syscall_64+0x54/0xbe
  entry_SYSCALL_64_after_hwframe+0x44/0xa9
 RIP: 0033:0x7f10dacc0bb7
 Code: d8 64 89 02 48 c7 c0 ff ff ff ff eb cd 66 0f 1f 44 00 00 8b 05 9a 4b 2b 00 85 c0 75 2e 48 63 ff 48 63 d2 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 01 c3 48 8b 15 b1 f2 2a 00 f7 d8 64 89 02 48
 RSP: 002b:00007ffcbe628bf8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
 RAX: ffffffffffffffda RBX: 00007ffcbe628f80 RCX: 00007f10dacc0bb7
 RDX: 0000000000000000 RSI: 00007ffcbe628c60 RDI: 0000000000000003
 RBP: 000000005f41099c R08: 0000000000000001 R09: 0000000000000008
 R10: 00000000000005e9 R11: 0000000000000246 R12: 0000000000000000
 R13: 0000000000000000 R14: 00007ffcbe628d70 R15: 0000563a86c6e440
 Modules linked in:
 CR2: 0000000000000080

CC: David Ahern <dsahern@gmail.com>
Fixes: 430a049190de ("nexthop: Add support for nexthop groups")
Reported-by: syzbot+a61aa19b0c14c8770bd9@syzkaller.appspotmail.com
Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/nexthop.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index cc8049b100b241..134e923822750a 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -446,7 +446,7 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[],
 	unsigned int i, j;
 	u8 nhg_fdb = 0;
 
-	if (len & (sizeof(struct nexthop_grp) - 1)) {
+	if (!len || len & (sizeof(struct nexthop_grp) - 1)) {
 		NL_SET_ERR_MSG(extack,
 			       "Invalid length for nexthop group attribute");
 		return -EINVAL;
@@ -1187,6 +1187,9 @@ static struct nexthop *nexthop_create_group(struct net *net,
 	struct nexthop *nh;
 	int i;
 
+	if (WARN_ON(!num_nh))
+		return ERR_PTR(-EINVAL);
+
 	nh = nexthop_alloc();
 	if (!nh)
 		return ERR_PTR(-ENOMEM);

From a9ed4a6560b8562b7e2e2bed9527e88001f7b682 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Wed, 19 Aug 2020 17:12:17 +0100
Subject: [PATCH 73/74] epoll: Keep a reference on files added to the check
 list

When adding a new fd to an epoll, and that this new fd is an
epoll fd itself, we recursively scan the fds attached to it
to detect cycles, and add non-epool files to a "check list"
that gets subsequently parsed.

However, this check list isn't completely safe when deletions
can happen concurrently. To sidestep the issue, make sure that
a struct file placed on the check list sees its f_count increased,
ensuring that a concurrent deletion won't result in the file
disapearing from under our feet.

Cc: stable@vger.kernel.org
Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/eventpoll.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 12eebcdea9c8a2..196003d9242c60 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1994,9 +1994,11 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
 			 * not already there, and calling reverse_path_check()
 			 * during ep_insert().
 			 */
-			if (list_empty(&epi->ffd.file->f_tfile_llink))
+			if (list_empty(&epi->ffd.file->f_tfile_llink)) {
+				get_file(epi->ffd.file);
 				list_add(&epi->ffd.file->f_tfile_llink,
 					 &tfile_check_list);
+			}
 		}
 	}
 	mutex_unlock(&ep->mtx);
@@ -2040,6 +2042,7 @@ static void clear_tfile_check_list(void)
 		file = list_first_entry(&tfile_check_list, struct file,
 					f_tfile_llink);
 		list_del_init(&file->f_tfile_llink);
+		fput(file);
 	}
 	INIT_LIST_HEAD(&tfile_check_list);
 }
@@ -2204,13 +2207,17 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
 					clear_tfile_check_list();
 					goto error_tgt_fput;
 				}
-			} else
+			} else {
+				get_file(tf.file);
 				list_add(&tf.file->f_tfile_llink,
 							&tfile_check_list);
+			}
 			error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
 			if (error) {
 out_del:
 				list_del(&tf.file->f_tfile_llink);
+				if (!is_file_epoll(tf.file))
+					fput(tf.file);
 				goto error_tgt_fput;
 			}
 			if (is_file_epoll(tf.file)) {

From 52c479697c9b73f628140dcdfcd39ea302d05482 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 22 Aug 2020 18:25:52 -0400
Subject: [PATCH 74/74] do_epoll_ctl(): clean the failure exits up a bit

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/eventpoll.c | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 196003d9242c60..e0decff22ae273 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2203,29 +2203,22 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
 			full_check = 1;
 			if (is_file_epoll(tf.file)) {
 				error = -ELOOP;
-				if (ep_loop_check(ep, tf.file) != 0) {
-					clear_tfile_check_list();
+				if (ep_loop_check(ep, tf.file) != 0)
 					goto error_tgt_fput;
-				}
 			} else {
 				get_file(tf.file);
 				list_add(&tf.file->f_tfile_llink,
 							&tfile_check_list);
 			}
 			error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
-			if (error) {
-out_del:
-				list_del(&tf.file->f_tfile_llink);
-				if (!is_file_epoll(tf.file))
-					fput(tf.file);
+			if (error)
 				goto error_tgt_fput;
-			}
 			if (is_file_epoll(tf.file)) {
 				tep = tf.file->private_data;
 				error = epoll_mutex_lock(&tep->mtx, 1, nonblock);
 				if (error) {
 					mutex_unlock(&ep->mtx);
-					goto out_del;
+					goto error_tgt_fput;
 				}
 			}
 		}
@@ -2246,8 +2239,6 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
 			error = ep_insert(ep, epds, tf.file, fd, full_check);
 		} else
 			error = -EEXIST;
-		if (full_check)
-			clear_tfile_check_list();
 		break;
 	case EPOLL_CTL_DEL:
 		if (epi)
@@ -2270,8 +2261,10 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
 	mutex_unlock(&ep->mtx);
 
 error_tgt_fput:
-	if (full_check)
+	if (full_check) {
+		clear_tfile_check_list();
 		mutex_unlock(&epmutex);
+	}
 
 	fdput(tf);
 error_fput: