From be052d137226e1cffbdb7f75e50410cee9df0d9a Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Wed, 31 Jan 2024 15:49:33 +0100 Subject: [PATCH 1/7] membarrier: riscv: Add full memory barrier in switch_mm() The membarrier system call requires a full memory barrier after storing to rq->curr, before going back to user-space. The barrier is only needed when switching between processes: the barrier is implied by mmdrop() when switching from kernel to userspace, and it's not needed when switching from userspace to kernel. Rely on the feature/mechanism ARCH_HAS_MEMBARRIER_CALLBACKS and on the primitive membarrier_arch_switch_mm(), already adopted by the PowerPC architecture, to insert the required barrier. Fixes: fab957c11efe2f ("RISC-V: Atomic and Locking Code") Signed-off-by: Andrea Parri Reviewed-by: Mathieu Desnoyers Link: https://lore.kernel.org/r/20240131144936.29190-2-parri.andrea@gmail.com Signed-off-by: Palmer Dabbelt --- MAINTAINERS | 2 +- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/membarrier.h | 31 +++++++++++++++++++++++++++++ arch/riscv/mm/context.c | 2 ++ kernel/sched/core.c | 5 +++-- 5 files changed, 38 insertions(+), 3 deletions(-) create mode 100644 arch/riscv/include/asm/membarrier.h diff --git a/MAINTAINERS b/MAINTAINERS index 4a9e0aeb523a1..2cffc87be9c7f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13732,7 +13732,7 @@ M: Mathieu Desnoyers M: "Paul E. 
McKenney" L: linux-kernel@vger.kernel.org S: Supported -F: arch/powerpc/include/asm/membarrier.h +F: arch/*/include/asm/membarrier.h F: include/uapi/linux/membarrier.h F: kernel/sched/membarrier.c diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index c785a02005738..1df4afccc3818 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -27,6 +27,7 @@ config RISCV select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_GIGANTIC_PAGE select ARCH_HAS_KCOV + select ARCH_HAS_MEMBARRIER_CALLBACKS select ARCH_HAS_MMIOWB select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PMEM_API diff --git a/arch/riscv/include/asm/membarrier.h b/arch/riscv/include/asm/membarrier.h new file mode 100644 index 0000000000000..6c016ebb5020a --- /dev/null +++ b/arch/riscv/include/asm/membarrier.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ASM_RISCV_MEMBARRIER_H +#define _ASM_RISCV_MEMBARRIER_H + +static inline void membarrier_arch_switch_mm(struct mm_struct *prev, + struct mm_struct *next, + struct task_struct *tsk) +{ + /* + * Only need the full barrier when switching between processes. + * Barrier when switching from kernel to userspace is not + * required here, given that it is implied by mmdrop(). Barrier + * when switching from userspace to kernel is not needed after + * store to rq->curr. + */ + if (IS_ENABLED(CONFIG_SMP) && + likely(!(atomic_read(&next->membarrier_state) & + (MEMBARRIER_STATE_PRIVATE_EXPEDITED | + MEMBARRIER_STATE_GLOBAL_EXPEDITED)) || !prev)) + return; + + /* + * The membarrier system call requires a full memory barrier + * after storing to rq->curr, before going back to user-space. + * Matches a full barrier in the proximity of the membarrier + * system call entry. 
+ */ + smp_mb(); +} + +#endif /* _ASM_RISCV_MEMBARRIER_H */ diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c index 217fd4de61342..ba8eb3944687c 100644 --- a/arch/riscv/mm/context.c +++ b/arch/riscv/mm/context.c @@ -323,6 +323,8 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, if (unlikely(prev == next)) return; + membarrier_arch_switch_mm(prev, next, task); + /* * Mark the current MM context as inactive, and the next as * active. This is at least used by the icache flushing diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 97571d390f184..9b406d9886541 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6679,8 +6679,9 @@ static void __sched notrace __schedule(unsigned int sched_mode) * * Here are the schemes providing that barrier on the * various architectures: - * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. - * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. + * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC, + * RISC-V. switch_mm() relies on membarrier_arch_switch_mm() + * on PowerPC and on RISC-V. * - finish_lock_switch() for weakly-ordered * architectures where spin_unlock is a full barrier, * - switch_to() for arm64 (weakly-ordered, spin_unlock From df1fec1b83380560bbc01fb4fa8e30347d4da706 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Thu, 15 Feb 2024 08:04:23 -0800 Subject: [PATCH 2/7] Merge patch series "membarrier: riscv: Core serializing command" RISC-V was lacking a membarrier implementation for the store/fetch ordering, which is a bit tricky because of the deferred icache flushing we use in RISC-V. 
* b4-shazam-merge: membarrier: riscv: Provide core serializing command locking: Introduce prepare_sync_core_cmd() membarrier: Create Documentation/scheduler/membarrier.rst membarrier: riscv: Add full memory barrier in switch_mm() Link: https://lore.kernel.org/r/20240131144936.29190-1-parri.andrea@gmail.com Signed-off-by: Palmer Dabbelt --- .../membarrier-sync-core/arch-support.txt | 18 ++++++++- Documentation/scheduler/index.rst | 1 + Documentation/scheduler/membarrier.rst | 39 +++++++++++++++++++ MAINTAINERS | 2 + arch/riscv/Kconfig | 3 ++ arch/riscv/include/asm/membarrier.h | 19 +++++++++ arch/riscv/include/asm/sync_core.h | 29 ++++++++++++++ include/linux/sync_core.h | 16 +++++++- init/Kconfig | 3 ++ kernel/sched/core.c | 11 +++++- kernel/sched/membarrier.c | 13 +++++-- 11 files changed, 147 insertions(+), 7 deletions(-) create mode 100644 Documentation/scheduler/membarrier.rst create mode 100644 arch/riscv/include/asm/sync_core.h diff --git a/Documentation/features/sched/membarrier-sync-core/arch-support.txt b/Documentation/features/sched/membarrier-sync-core/arch-support.txt index 23260ca449468..76597adfb7d5a 100644 --- a/Documentation/features/sched/membarrier-sync-core/arch-support.txt +++ b/Documentation/features/sched/membarrier-sync-core/arch-support.txt @@ -10,6 +10,22 @@ # Rely on implicit context synchronization as a result of exception return # when returning from IPI handler, and when returning to user-space. # +# * riscv +# +# riscv uses xRET as return from interrupt and to return to user-space. +# +# Given that xRET is not core serializing, we rely on FENCE.I for providing +# core serialization: +# +# - by calling sync_core_before_usermode() on return from interrupt (cf. +# ipi_sync_core()), +# +# - via switch_mm() and sync_core_before_usermode() (respectively, for +# uthread->uthread and kthread->uthread transitions) before returning +# to user-space. +# +# The serialization in switch_mm() is activated by prepare_sync_core_cmd(). 
+# # * x86 # # x86-32 uses IRET as return from interrupt, which takes care of the IPI. @@ -44,7 +60,7 @@ | openrisc: | TODO | | parisc: | TODO | | powerpc: | ok | - | riscv: | TODO | + | riscv: | ok | | s390: | ok | | sh: | TODO | | sparc: | TODO | diff --git a/Documentation/scheduler/index.rst b/Documentation/scheduler/index.rst index 3170747226f6d..43bd8a145b7a9 100644 --- a/Documentation/scheduler/index.rst +++ b/Documentation/scheduler/index.rst @@ -7,6 +7,7 @@ Scheduler completion + membarrier sched-arch sched-bwc sched-deadline diff --git a/Documentation/scheduler/membarrier.rst b/Documentation/scheduler/membarrier.rst new file mode 100644 index 0000000000000..2387804b1c633 --- /dev/null +++ b/Documentation/scheduler/membarrier.rst @@ -0,0 +1,39 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================== +membarrier() System Call +======================== + +MEMBARRIER_CMD_{PRIVATE,GLOBAL}_EXPEDITED - Architecture requirements +===================================================================== + +Memory barriers before updating rq->curr +---------------------------------------- + +The commands MEMBARRIER_CMD_PRIVATE_EXPEDITED and MEMBARRIER_CMD_GLOBAL_EXPEDITED +require each architecture to have a full memory barrier after coming from +user-space, before updating rq->curr. This barrier is implied by the sequence +rq_lock(); smp_mb__after_spinlock() in __schedule(). The barrier matches a full +barrier in the proximity of the membarrier system call exit, cf. +membarrier_{private,global}_expedited(). + +Memory barriers after updating rq->curr +--------------------------------------- + +The commands MEMBARRIER_CMD_PRIVATE_EXPEDITED and MEMBARRIER_CMD_GLOBAL_EXPEDITED +require each architecture to have a full memory barrier after updating rq->curr, +before returning to user-space. The schemes providing this barrier on the various +architectures are as follows. 
+ + - alpha, arc, arm, hexagon, mips rely on the full barrier implied by + spin_unlock() in finish_lock_switch(). + + - arm64 relies on the full barrier implied by switch_to(). + + - powerpc, riscv, s390, sparc, x86 rely on the full barrier implied by + switch_mm(), if mm is not NULL; they rely on the full barrier implied + by mmdrop(), otherwise. On powerpc and riscv, switch_mm() relies on + membarrier_arch_switch_mm(). + +The barrier matches a full barrier in the proximity of the membarrier system call +entry, cf. membarrier_{private,global}_expedited(). diff --git a/MAINTAINERS b/MAINTAINERS index 2cffc87be9c7f..09acf13a481bb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13732,7 +13732,9 @@ M: Mathieu Desnoyers M: "Paul E. McKenney" L: linux-kernel@vger.kernel.org S: Supported +F: Documentation/scheduler/membarrier.rst F: arch/*/include/asm/membarrier.h +F: arch/*/include/asm/sync_core.h F: include/uapi/linux/membarrier.h F: kernel/sched/membarrier.c diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 1df4afccc3818..fa3b41bc94510 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -28,14 +28,17 @@ config RISCV select ARCH_HAS_GIGANTIC_PAGE select ARCH_HAS_KCOV select ARCH_HAS_MEMBARRIER_CALLBACKS + select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_MMIOWB select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PMEM_API + select ARCH_HAS_PREPARE_SYNC_CORE_CMD select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SET_DIRECT_MAP if MMU select ARCH_HAS_SET_MEMORY if MMU select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL select ARCH_HAS_STRICT_MODULE_RWX if MMU && !XIP_KERNEL + select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UBSAN_SANITIZE_ALL diff --git a/arch/riscv/include/asm/membarrier.h b/arch/riscv/include/asm/membarrier.h index 6c016ebb5020a..47b240d0d596a 100644 --- a/arch/riscv/include/asm/membarrier.h +++ 
b/arch/riscv/include/asm/membarrier.h @@ -22,6 +22,25 @@ static inline void membarrier_arch_switch_mm(struct mm_struct *prev, /* * The membarrier system call requires a full memory barrier * after storing to rq->curr, before going back to user-space. + * + * This barrier is also needed for the SYNC_CORE command when + * switching between processes; in particular, on a transition + * from a thread belonging to another mm to a thread belonging + * to the mm for which a membarrier SYNC_CORE is done on CPU0: + * + * - [CPU0] sets all bits in the mm icache_stale_mask (in + * prepare_sync_core_cmd()); + * + * - [CPU1] stores to rq->curr (by the scheduler); + * + * - [CPU0] loads rq->curr within membarrier and observes + * cpu_rq(1)->curr->mm != mm, so the IPI is skipped on + * CPU1; this means membarrier relies on switch_mm() to + * issue the sync-core; + * + * - [CPU1] switch_mm() loads icache_stale_mask; if the bit + * is zero, switch_mm() may incorrectly skip the sync-core. + * * Matches a full barrier in the proximity of the membarrier * system call entry. */ diff --git a/arch/riscv/include/asm/sync_core.h b/arch/riscv/include/asm/sync_core.h new file mode 100644 index 0000000000000..9153016da8f14 --- /dev/null +++ b/arch/riscv/include/asm/sync_core.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_RISCV_SYNC_CORE_H +#define _ASM_RISCV_SYNC_CORE_H + +/* + * RISC-V implements return to user-space through an xRET instruction, + * which is not core serializing. + */ +static inline void sync_core_before_usermode(void) +{ + asm volatile ("fence.i" ::: "memory"); +} + +#ifdef CONFIG_SMP +/* + * Ensure the next switch_mm() on every CPU issues a core serializing + * instruction for the given @mm. 
+ */ +static inline void prepare_sync_core_cmd(struct mm_struct *mm) +{ + cpumask_setall(&mm->context.icache_stale_mask); +} +#else +static inline void prepare_sync_core_cmd(struct mm_struct *mm) +{ +} +#endif /* CONFIG_SMP */ + +#endif /* _ASM_RISCV_SYNC_CORE_H */ diff --git a/include/linux/sync_core.h b/include/linux/sync_core.h index 013da4b8b3272..67bb9794b8758 100644 --- a/include/linux/sync_core.h +++ b/include/linux/sync_core.h @@ -17,5 +17,19 @@ static inline void sync_core_before_usermode(void) } #endif -#endif /* _LINUX_SYNC_CORE_H */ +#ifdef CONFIG_ARCH_HAS_PREPARE_SYNC_CORE_CMD +#include <asm/sync_core.h> +#else +/* + * This is a dummy prepare_sync_core_cmd() implementation that can be used on + * all architectures which provide unconditional core serializing instructions + * in switch_mm(). + * If your architecture doesn't provide such core serializing instructions in + * switch_mm(), you may need to write your own functions. + */ +static inline void prepare_sync_core_cmd(struct mm_struct *mm) +{ +} +#endif +#endif /* _LINUX_SYNC_CORE_H */ diff --git a/init/Kconfig b/init/Kconfig index 2f044774d6d82..70abc9e294601 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1992,6 +1992,9 @@ source "kernel/Kconfig.locks" config ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE bool +config ARCH_HAS_PREPARE_SYNC_CORE_CMD + bool + config ARCH_HAS_SYNC_CORE_BEFORE_USERMODE bool diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9b406d9886541..c9c95c05bf09e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6608,7 +6608,9 @@ static void __sched notrace __schedule(unsigned int sched_mode) * if (signal_pending_state()) if (p->state & @state) * * Also, the membarrier system call requires a full memory barrier - * after coming from user-space, before storing to rq->curr. + * after coming from user-space, before storing to rq->curr; this + * barrier matches a full barrier in the proximity of the membarrier + * system call exit. 
*/ rq_lock(rq, &rf); smp_mb__after_spinlock(); @@ -6686,6 +6688,13 @@ static void __sched notrace __schedule(unsigned int sched_mode) * architectures where spin_unlock is a full barrier, * - switch_to() for arm64 (weakly-ordered, spin_unlock * is a RELEASE barrier), + * + * The barrier matches a full barrier in the proximity of + * the membarrier system call entry. + * + * On RISC-V, this barrier pairing is also needed for the + * SYNC_CORE command when switching between processes, cf. + * the inline comments in membarrier_arch_switch_mm(). */ ++*switch_count; diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 4e715b9b278e7..809194cd779f4 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -254,7 +254,7 @@ static int membarrier_global_expedited(void) return 0; /* - * Matches memory barriers around rq->curr modification in + * Matches memory barriers after rq->curr modification in * scheduler. */ smp_mb(); /* system call entry is not a mb. */ @@ -304,7 +304,7 @@ static int membarrier_global_expedited(void) /* * Memory barrier on the caller thread _after_ we finished - * waiting for the last IPI. Matches memory barriers around + * waiting for the last IPI. Matches memory barriers before * rq->curr modification in scheduler. */ smp_mb(); /* exit from system call is not a mb */ @@ -324,6 +324,7 @@ static int membarrier_private_expedited(int flags, int cpu_id) MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) return -EPERM; ipi_func = ipi_sync_core; + prepare_sync_core_cmd(mm); } else if (flags == MEMBARRIER_FLAG_RSEQ) { if (!IS_ENABLED(CONFIG_RSEQ)) return -EINVAL; @@ -343,8 +344,12 @@ static int membarrier_private_expedited(int flags, int cpu_id) return 0; /* - * Matches memory barriers around rq->curr modification in + * Matches memory barriers after rq->curr modification in * scheduler. + * + * On RISC-V, this barrier pairing is also needed for the + * SYNC_CORE command when switching between processes, cf. 
+ * the inline comments in membarrier_arch_switch_mm(). */ smp_mb(); /* system call entry is not a mb. */ @@ -420,7 +425,7 @@ static int membarrier_private_expedited(int flags, int cpu_id) /* * Memory barrier on the caller thread _after_ we finished - * waiting for the last IPI. Matches memory barriers around + * waiting for the last IPI. Matches memory barriers before * rq->curr modification in scheduler. */ smp_mb(); /* exit from system call is not a mb */ From e4a8b43527fd308e479e7d1911c1ebd8849f3f50 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Wed, 13 Dec 2023 21:29:58 +0100 Subject: [PATCH 3/7] riscv: Use WRITE_ONCE() when setting page table entries To avoid any compiler "weirdness" when accessing page table entries which are concurrently modified by the HW, let's use WRITE_ONCE() macro (commit 20a004e7b017 ("arm64: mm: Use READ_ONCE/WRITE_ONCE when accessing page tables") gives a great explanation with more details). Signed-off-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20231213203001.179237-2-alexghiti@rivosinc.com Signed-off-by: Han Gao --- arch/riscv/include/asm/pgtable-64.h | 6 +++--- arch/riscv/include/asm/pgtable.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h index 7a5097202e157..a65a352dcfbfb 100644 --- a/arch/riscv/include/asm/pgtable-64.h +++ b/arch/riscv/include/asm/pgtable-64.h @@ -198,7 +198,7 @@ static inline int pud_user(pud_t pud) static inline void set_pud(pud_t *pudp, pud_t pud) { - *pudp = pud; + WRITE_ONCE(*pudp, pud); } static inline void pud_clear(pud_t *pudp) @@ -274,7 +274,7 @@ static inline unsigned long _pmd_pfn(pmd_t pmd) static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) { if (pgtable_l4_enabled) - *p4dp = p4d; + WRITE_ONCE(*p4dp, p4d); else set_pud((pud_t *)p4dp, (pud_t){ p4d_val(p4d) }); } @@ -347,7 +347,7 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) static inline void set_pgd(pgd_t *pgdp, 
pgd_t pgd) { if (pgtable_l5_enabled) - *pgdp = pgd; + WRITE_ONCE(*pgdp, pgd); else set_p4d((p4d_t *)pgdp, (p4d_t){ pgd_val(pgd) }); } diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 719c3041ae1c2..f8e72df4113a6 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -248,7 +248,7 @@ static inline int pmd_leaf(pmd_t pmd) static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) { - *pmdp = pmd; + WRITE_ONCE(*pmdp, pmd); } static inline void pmd_clear(pmd_t *pmdp) @@ -515,7 +515,7 @@ static inline int pte_same(pte_t pte_a, pte_t pte_b) */ static inline void set_pte(pte_t *ptep, pte_t pteval) { - *ptep = pteval; + WRITE_ONCE(*ptep, pteval); } void flush_icache_pte(pte_t pte); From 4cac026cdd114bddece675ca773947f272570dcb Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Wed, 13 Dec 2023 21:29:59 +0100 Subject: [PATCH 4/7] mm: Introduce pudp/p4dp/pgdp_get() functions Instead of directly dereferencing page table entries, which can cause issues (see commit 20a004e7b017 ("arm64: mm: Use READ_ONCE/WRITE_ONCE when accessing page tables")), let's introduce new functions to get the pud/p4d/pgd entries (the pte and pmd versions already exist). Note that arm pgd_t is actually an array so pgdp_get() is defined as a macro to avoid a build error. Those new functions will be used in subsequent commits by the riscv architecture. 
Signed-off-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20231213203001.179237-3-alexghiti@rivosinc.com Signed-off-by: Han Gao --- arch/arm/include/asm/pgtable.h | 2 ++ include/linux/pgtable.h | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index 16b02f44c7d31..d657b84b6bf70 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -151,6 +151,8 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; +#define pgdp_get(pgdp) READ_ONCE(*pgdp) + #define pud_page(pud) pmd_page(__pmd(pud_val(pud))) #define pud_write(pud) pmd_write(__pmd(pud_val(pud))) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index af7639c3b0a3a..8b7daccd11bef 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -292,6 +292,27 @@ static inline pmd_t pmdp_get(pmd_t *pmdp) } #endif +#ifndef pudp_get +static inline pud_t pudp_get(pud_t *pudp) +{ + return READ_ONCE(*pudp); +} +#endif + +#ifndef p4dp_get +static inline p4d_t p4dp_get(p4d_t *p4dp) +{ + return READ_ONCE(*p4dp); +} +#endif + +#ifndef pgdp_get +static inline pgd_t pgdp_get(pgd_t *pgdp) +{ + return READ_ONCE(*pgdp); +} +#endif + #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, From da437b592c41004feb106d5c9911f8ca1008a2d3 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Wed, 13 Dec 2023 21:30:00 +0100 Subject: [PATCH 5/7] riscv: mm: Only compile pgtable.c if MMU All functions defined in there depend on MMU, so no need to compile it for !MMU configs. 
Signed-off-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20231213203001.179237-4-alexghiti@rivosinc.com Signed-off-by: Han Gao --- arch/riscv/mm/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/riscv/mm/Makefile b/arch/riscv/mm/Makefile index 3a4dfc8babcf8..2c869f8026a88 100644 --- a/arch/riscv/mm/Makefile +++ b/arch/riscv/mm/Makefile @@ -13,10 +13,9 @@ endif KCOV_INSTRUMENT_init.o := n obj-y += init.o -obj-$(CONFIG_MMU) += extable.o fault.o pageattr.o +obj-$(CONFIG_MMU) += extable.o fault.o pageattr.o pgtable.o obj-y += cacheflush.o obj-y += context.o -obj-y += pgtable.o obj-y += pmem.o ifeq ($(CONFIG_MMU),y) From de63ddb22821093de4a61b13e804c9a2927231db Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Wed, 13 Dec 2023 21:30:01 +0100 Subject: [PATCH 6/7] riscv: Use accessors to page table entries instead of direct dereference As very well explained in commit 20a004e7b017 ("arm64: mm: Use READ_ONCE/WRITE_ONCE when accessing page tables"), an architecture whose page table walker can modify the PTE in parallel must use READ_ONCE()/WRITE_ONCE() macros to avoid any compiler transformation. So apply that to riscv which is such an architecture. 
Signed-off-by: Alexandre Ghiti Acked-by: Anup Patel Link: https://lore.kernel.org/r/20231213203001.179237-5-alexghiti@rivosinc.com Signed-off-by: Han Gao --- arch/riscv/include/asm/kfence.h | 4 +-- arch/riscv/include/asm/pgtable-64.h | 16 ++------- arch/riscv/include/asm/pgtable.h | 29 ++++------------ arch/riscv/kernel/efi.c | 2 +- arch/riscv/kvm/mmu.c | 22 ++++++------- arch/riscv/mm/fault.c | 16 ++++----- arch/riscv/mm/hugetlbpage.c | 12 +++---- arch/riscv/mm/kasan_init.c | 45 +++++++++++++------------ arch/riscv/mm/pageattr.c | 44 ++++++++++++------------- arch/riscv/mm/pgtable.c | 51 ++++++++++++++++++++++++++--- 10 files changed, 128 insertions(+), 113 deletions(-) diff --git a/arch/riscv/include/asm/kfence.h b/arch/riscv/include/asm/kfence.h index 0bbffd528096d..7388edd88986f 100644 --- a/arch/riscv/include/asm/kfence.h +++ b/arch/riscv/include/asm/kfence.h @@ -18,9 +18,9 @@ static inline bool kfence_protect_page(unsigned long addr, bool protect) pte_t *pte = virt_to_kpte(addr); if (protect) - set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); + set_pte(pte, __pte(pte_val(ptep_get(pte)) & ~_PAGE_PRESENT)); else - set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); + set_pte(pte, __pte(pte_val(ptep_get(pte)) | _PAGE_PRESENT)); flush_tlb_kernel_range(addr, addr + PAGE_SIZE); diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h index a65a352dcfbfb..3272ca7a5270b 100644 --- a/arch/riscv/include/asm/pgtable-64.h +++ b/arch/riscv/include/asm/pgtable-64.h @@ -336,13 +336,7 @@ static inline struct page *p4d_page(p4d_t p4d) #define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) #define pud_offset pud_offset -static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) -{ - if (pgtable_l4_enabled) - return p4d_pgtable(*p4d) + pud_index(address); - - return (pud_t *)p4d; -} +pud_t *pud_offset(p4d_t *p4d, unsigned long address); static inline void set_pgd(pgd_t *pgdp, pgd_t pgd) { @@ -400,12 +394,6 @@ static inline 
struct page *pgd_page(pgd_t pgd) #define p4d_index(addr) (((addr) >> P4D_SHIFT) & (PTRS_PER_P4D - 1)) #define p4d_offset p4d_offset -static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) -{ - if (pgtable_l5_enabled) - return pgd_pgtable(*pgd) + p4d_index(address); - - return (p4d_t *)pgd; -} +p4d_t *p4d_offset(pgd_t *pgd, unsigned long address); #endif /* _ASM_RISCV_PGTABLE_64_H */ diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index f8e72df4113a6..37829dab4a0a4 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -549,19 +549,12 @@ static inline void pte_clear(struct mm_struct *mm, __set_pte_at(ptep, __pte(0)); } -#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS -static inline int ptep_set_access_flags(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, - pte_t entry, int dirty) -{ - if (!pte_same(*ptep, entry)) - __set_pte_at(ptep, entry); - /* - * update_mmu_cache will unconditionally execute, handling both - * the case that the PTE changed and the spurious fault case. 
- */ - return true; -} +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS /* defined in mm/pgtable.c */ +extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, + pte_t *ptep, pte_t entry, int dirty); +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG /* defined in mm/pgtable.c */ +extern int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, + pte_t *ptep); #define __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, @@ -574,16 +567,6 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, return pte; } -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, - pte_t *ptep) -{ - if (!pte_young(*ptep)) - return 0; - return test_and_clear_bit(_PAGE_ACCESSED_OFFSET, &pte_val(*ptep)); -} - #define __HAVE_ARCH_PTEP_SET_WRPROTECT static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) diff --git a/arch/riscv/kernel/efi.c b/arch/riscv/kernel/efi.c index aa6209a74c83f..b64bf1624a052 100644 --- a/arch/riscv/kernel/efi.c +++ b/arch/riscv/kernel/efi.c @@ -60,7 +60,7 @@ int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) { efi_memory_desc_t *md = data; - pte_t pte = READ_ONCE(*ptep); + pte_t pte = ptep_get(ptep); unsigned long val; if (md->attribute & EFI_MEMORY_RO) { diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 068c745938710..a9e2fd7245e1e 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -103,7 +103,7 @@ static bool gstage_get_leaf_entry(struct kvm *kvm, gpa_t addr, *ptep_level = current_level; ptep = (pte_t *)kvm->arch.pgd; ptep = &ptep[gstage_pte_index(addr, current_level)]; - while (ptep && pte_val(*ptep)) { + while (ptep && pte_val(ptep_get(ptep))) { if (gstage_pte_leaf(ptep)) { *ptep_level = current_level; *ptepp = ptep; @@ 
-113,7 +113,7 @@ static bool gstage_get_leaf_entry(struct kvm *kvm, gpa_t addr, if (current_level) { current_level--; *ptep_level = current_level; - ptep = (pte_t *)gstage_pte_page_vaddr(*ptep); + ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep)); ptep = &ptep[gstage_pte_index(addr, current_level)]; } else { ptep = NULL; @@ -149,25 +149,25 @@ static int gstage_set_pte(struct kvm *kvm, u32 level, if (gstage_pte_leaf(ptep)) return -EEXIST; - if (!pte_val(*ptep)) { + if (!pte_val(ptep_get(ptep))) { if (!pcache) return -ENOMEM; next_ptep = kvm_mmu_memory_cache_alloc(pcache); if (!next_ptep) return -ENOMEM; - *ptep = pfn_pte(PFN_DOWN(__pa(next_ptep)), - __pgprot(_PAGE_TABLE)); + set_pte(ptep, pfn_pte(PFN_DOWN(__pa(next_ptep)), + __pgprot(_PAGE_TABLE))); } else { if (gstage_pte_leaf(ptep)) return -EEXIST; - next_ptep = (pte_t *)gstage_pte_page_vaddr(*ptep); + next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep)); } current_level--; ptep = &next_ptep[gstage_pte_index(addr, current_level)]; } - *ptep = *new_pte; + set_pte(ptep, *new_pte); if (gstage_pte_leaf(ptep)) gstage_remote_tlb_flush(kvm, current_level, addr); @@ -239,11 +239,11 @@ static void gstage_op_pte(struct kvm *kvm, gpa_t addr, BUG_ON(addr & (page_size - 1)); - if (!pte_val(*ptep)) + if (!pte_val(ptep_get(ptep))) return; if (ptep_level && !gstage_pte_leaf(ptep)) { - next_ptep = (pte_t *)gstage_pte_page_vaddr(*ptep); + next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep)); next_ptep_level = ptep_level - 1; ret = gstage_level_to_page_size(next_ptep_level, &next_page_size); @@ -261,7 +261,7 @@ static void gstage_op_pte(struct kvm *kvm, gpa_t addr, if (op == GSTAGE_OP_CLEAR) set_pte(ptep, __pte(0)); else if (op == GSTAGE_OP_WP) - set_pte(ptep, __pte(pte_val(*ptep) & ~_PAGE_WRITE)); + set_pte(ptep, __pte(pte_val(ptep_get(ptep)) & ~_PAGE_WRITE)); gstage_remote_tlb_flush(kvm, ptep_level, addr); } } @@ -603,7 +603,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) &ptep, 
&ptep_level)) return false; - return pte_young(*ptep); + return pte_young(ptep_get(ptep)); } int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 655b2b1bb529f..8960f4c844976 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -137,24 +137,24 @@ static inline void vmalloc_fault(struct pt_regs *regs, int code, unsigned long a pgd = (pgd_t *)pfn_to_virt(pfn) + index; pgd_k = init_mm.pgd + index; - if (!pgd_present(*pgd_k)) { + if (!pgd_present(pgdp_get(pgd_k))) { no_context(regs, addr); return; } - set_pgd(pgd, *pgd_k); + set_pgd(pgd, pgdp_get(pgd_k)); p4d_k = p4d_offset(pgd_k, addr); - if (!p4d_present(*p4d_k)) { + if (!p4d_present(p4dp_get(p4d_k))) { no_context(regs, addr); return; } pud_k = pud_offset(p4d_k, addr); - if (!pud_present(*pud_k)) { + if (!pud_present(pudp_get(pud_k))) { no_context(regs, addr); return; } - if (pud_leaf(*pud_k)) + if (pud_leaf(pudp_get(pud_k))) goto flush_tlb; /* @@ -162,11 +162,11 @@ static inline void vmalloc_fault(struct pt_regs *regs, int code, unsigned long a * to copy individual PTEs */ pmd_k = pmd_offset(pud_k, addr); - if (!pmd_present(*pmd_k)) { + if (!pmd_present(pmdp_get(pmd_k))) { no_context(regs, addr); return; } - if (pmd_leaf(*pmd_k)) + if (pmd_leaf(pmdp_get(pmd_k))) goto flush_tlb; /* @@ -176,7 +176,7 @@ static inline void vmalloc_fault(struct pt_regs *regs, int code, unsigned long a * silently loop forever. 
*/ pte_k = pte_offset_kernel(pmd_k, addr); - if (!pte_present(*pte_k)) { + if (!pte_present(ptep_get(pte_k))) { no_context(regs, addr); return; } diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c index fbe918801667d..5ef2a6891158a 100644 --- a/arch/riscv/mm/hugetlbpage.c +++ b/arch/riscv/mm/hugetlbpage.c @@ -54,7 +54,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, } if (sz == PMD_SIZE) { - if (want_pmd_share(vma, addr) && pud_none(*pud)) + if (want_pmd_share(vma, addr) && pud_none(pudp_get(pud))) pte = huge_pmd_share(mm, vma, addr, pud); else pte = (pte_t *)pmd_alloc(mm, pud, addr); @@ -93,11 +93,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, pmd_t *pmd; pgd = pgd_offset(mm, addr); - if (!pgd_present(*pgd)) + if (!pgd_present(pgdp_get(pgd))) return NULL; p4d = p4d_offset(pgd, addr); - if (!p4d_present(*p4d)) + if (!p4d_present(p4dp_get(p4d))) return NULL; pud = pud_offset(p4d, addr); @@ -105,7 +105,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm, /* must be pud huge, non-present or none */ return (pte_t *)pud; - if (!pud_present(*pud)) + if (!pud_present(pudp_get(pud))) return NULL; pmd = pmd_offset(pud, addr); @@ -113,7 +113,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm, /* must be pmd huge, non-present or none */ return (pte_t *)pmd; - if (!pmd_present(*pmd)) + if (!pmd_present(pmdp_get(pmd))) return NULL; for_each_napot_order(order) { @@ -351,7 +351,7 @@ void huge_pte_clear(struct mm_struct *mm, pte_t *ptep, unsigned long sz) { - pte_t pte = READ_ONCE(*ptep); + pte_t pte = ptep_get(ptep); int i, pte_num; if (!pte_napot(pte)) { diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index 5e39dcf23fdbc..e962518530373 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -31,7 +31,7 @@ static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned phys_addr_t phys_addr; pte_t *ptep, *p; - if (pmd_none(*pmd)) { + if (pmd_none(pmdp_get(pmd))) { p = memblock_alloc(PTRS_PER_PTE * 
sizeof(pte_t), PAGE_SIZE); set_pmd(pmd, pfn_pmd(PFN_DOWN(__pa(p)), PAGE_TABLE)); } @@ -39,7 +39,7 @@ static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned ptep = pte_offset_kernel(pmd, vaddr); do { - if (pte_none(*ptep)) { + if (pte_none(ptep_get(ptep))) { phys_addr = memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); set_pte(ptep, pfn_pte(PFN_DOWN(phys_addr), PAGE_KERNEL)); memset(__va(phys_addr), KASAN_SHADOW_INIT, PAGE_SIZE); @@ -53,7 +53,7 @@ static void __init kasan_populate_pmd(pud_t *pud, unsigned long vaddr, unsigned pmd_t *pmdp, *p; unsigned long next; - if (pud_none(*pud)) { + if (pud_none(pudp_get(pud))) { p = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE); set_pud(pud, pfn_pud(PFN_DOWN(__pa(p)), PAGE_TABLE)); } @@ -63,7 +63,8 @@ static void __init kasan_populate_pmd(pud_t *pud, unsigned long vaddr, unsigned do { next = pmd_addr_end(vaddr, end); - if (pmd_none(*pmdp) && IS_ALIGNED(vaddr, PMD_SIZE) && (next - vaddr) >= PMD_SIZE) { + if (pmd_none(pmdp_get(pmdp)) && IS_ALIGNED(vaddr, PMD_SIZE) && + (next - vaddr) >= PMD_SIZE) { phys_addr = memblock_phys_alloc(PMD_SIZE, PMD_SIZE); if (phys_addr) { set_pmd(pmdp, pfn_pmd(PFN_DOWN(phys_addr), PAGE_KERNEL)); @@ -83,7 +84,7 @@ static void __init kasan_populate_pud(p4d_t *p4d, pud_t *pudp, *p; unsigned long next; - if (p4d_none(*p4d)) { + if (p4d_none(p4dp_get(p4d))) { p = memblock_alloc(PTRS_PER_PUD * sizeof(pud_t), PAGE_SIZE); set_p4d(p4d, pfn_p4d(PFN_DOWN(__pa(p)), PAGE_TABLE)); } @@ -93,7 +94,8 @@ static void __init kasan_populate_pud(p4d_t *p4d, do { next = pud_addr_end(vaddr, end); - if (pud_none(*pudp) && IS_ALIGNED(vaddr, PUD_SIZE) && (next - vaddr) >= PUD_SIZE) { + if (pud_none(pudp_get(pudp)) && IS_ALIGNED(vaddr, PUD_SIZE) && + (next - vaddr) >= PUD_SIZE) { phys_addr = memblock_phys_alloc(PUD_SIZE, PUD_SIZE); if (phys_addr) { set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_KERNEL)); @@ -113,7 +115,7 @@ static void __init kasan_populate_p4d(pgd_t *pgd, p4d_t *p4dp, *p; unsigned 
long next; - if (pgd_none(*pgd)) { + if (pgd_none(pgdp_get(pgd))) { p = memblock_alloc(PTRS_PER_P4D * sizeof(p4d_t), PAGE_SIZE); set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE)); } @@ -123,7 +125,8 @@ static void __init kasan_populate_p4d(pgd_t *pgd, do { next = p4d_addr_end(vaddr, end); - if (p4d_none(*p4dp) && IS_ALIGNED(vaddr, P4D_SIZE) && (next - vaddr) >= P4D_SIZE) { + if (p4d_none(p4dp_get(p4dp)) && IS_ALIGNED(vaddr, P4D_SIZE) && + (next - vaddr) >= P4D_SIZE) { phys_addr = memblock_phys_alloc(P4D_SIZE, P4D_SIZE); if (phys_addr) { set_p4d(p4dp, pfn_p4d(PFN_DOWN(phys_addr), PAGE_KERNEL)); @@ -145,7 +148,7 @@ static void __init kasan_populate_pgd(pgd_t *pgdp, do { next = pgd_addr_end(vaddr, end); - if (pgd_none(*pgdp) && IS_ALIGNED(vaddr, PGDIR_SIZE) && + if (pgd_none(pgdp_get(pgdp)) && IS_ALIGNED(vaddr, PGDIR_SIZE) && (next - vaddr) >= PGDIR_SIZE) { phys_addr = memblock_phys_alloc(PGDIR_SIZE, PGDIR_SIZE); if (phys_addr) { @@ -168,7 +171,7 @@ static void __init kasan_early_clear_pud(p4d_t *p4dp, if (!pgtable_l4_enabled) { pudp = (pud_t *)p4dp; } else { - base_pud = pt_ops.get_pud_virt(pfn_to_phys(_p4d_pfn(*p4dp))); + base_pud = pt_ops.get_pud_virt(pfn_to_phys(_p4d_pfn(p4dp_get(p4dp)))); pudp = base_pud + pud_index(vaddr); } @@ -193,7 +196,7 @@ static void __init kasan_early_clear_p4d(pgd_t *pgdp, if (!pgtable_l5_enabled) { p4dp = (p4d_t *)pgdp; } else { - base_p4d = pt_ops.get_p4d_virt(pfn_to_phys(_pgd_pfn(*pgdp))); + base_p4d = pt_ops.get_p4d_virt(pfn_to_phys(_pgd_pfn(pgdp_get(pgdp)))); p4dp = base_p4d + p4d_index(vaddr); } @@ -239,14 +242,14 @@ static void __init kasan_early_populate_pud(p4d_t *p4dp, if (!pgtable_l4_enabled) { pudp = (pud_t *)p4dp; } else { - base_pud = pt_ops.get_pud_virt(pfn_to_phys(_p4d_pfn(*p4dp))); + base_pud = pt_ops.get_pud_virt(pfn_to_phys(_p4d_pfn(p4dp_get(p4dp)))); pudp = base_pud + pud_index(vaddr); } do { next = pud_addr_end(vaddr, end); - if (pud_none(*pudp) && IS_ALIGNED(vaddr, PUD_SIZE) && + if (pud_none(pudp_get(pudp)) && 
IS_ALIGNED(vaddr, PUD_SIZE) && (next - vaddr) >= PUD_SIZE) { phys_addr = __pa((uintptr_t)kasan_early_shadow_pmd); set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_TABLE)); @@ -277,14 +280,14 @@ static void __init kasan_early_populate_p4d(pgd_t *pgdp, if (!pgtable_l5_enabled) { p4dp = (p4d_t *)pgdp; } else { - base_p4d = pt_ops.get_p4d_virt(pfn_to_phys(_pgd_pfn(*pgdp))); + base_p4d = pt_ops.get_p4d_virt(pfn_to_phys(_pgd_pfn(pgdp_get(pgdp)))); p4dp = base_p4d + p4d_index(vaddr); } do { next = p4d_addr_end(vaddr, end); - if (p4d_none(*p4dp) && IS_ALIGNED(vaddr, P4D_SIZE) && + if (p4d_none(p4dp_get(p4dp)) && IS_ALIGNED(vaddr, P4D_SIZE) && (next - vaddr) >= P4D_SIZE) { phys_addr = __pa((uintptr_t)kasan_early_shadow_pud); set_p4d(p4dp, pfn_p4d(PFN_DOWN(phys_addr), PAGE_TABLE)); @@ -305,7 +308,7 @@ static void __init kasan_early_populate_pgd(pgd_t *pgdp, do { next = pgd_addr_end(vaddr, end); - if (pgd_none(*pgdp) && IS_ALIGNED(vaddr, PGDIR_SIZE) && + if (pgd_none(pgdp_get(pgdp)) && IS_ALIGNED(vaddr, PGDIR_SIZE) && (next - vaddr) >= PGDIR_SIZE) { phys_addr = __pa((uintptr_t)kasan_early_shadow_p4d); set_pgd(pgdp, pfn_pgd(PFN_DOWN(phys_addr), PAGE_TABLE)); @@ -381,7 +384,7 @@ static void __init kasan_shallow_populate_pud(p4d_t *p4d, do { next = pud_addr_end(vaddr, end); - if (pud_none(*pud_k)) { + if (pud_none(pudp_get(pud_k))) { p = memblock_alloc(PAGE_SIZE, PAGE_SIZE); set_pud(pud_k, pfn_pud(PFN_DOWN(__pa(p)), PAGE_TABLE)); continue; @@ -401,7 +404,7 @@ static void __init kasan_shallow_populate_p4d(pgd_t *pgd, do { next = p4d_addr_end(vaddr, end); - if (p4d_none(*p4d_k)) { + if (p4d_none(p4dp_get(p4d_k))) { p = memblock_alloc(PAGE_SIZE, PAGE_SIZE); set_p4d(p4d_k, pfn_p4d(PFN_DOWN(__pa(p)), PAGE_TABLE)); continue; @@ -420,7 +423,7 @@ static void __init kasan_shallow_populate_pgd(unsigned long vaddr, unsigned long do { next = pgd_addr_end(vaddr, end); - if (pgd_none(*pgd_k)) { + if (pgd_none(pgdp_get(pgd_k))) { p = memblock_alloc(PAGE_SIZE, PAGE_SIZE); set_pgd(pgd_k, 
pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE)); continue; @@ -451,7 +454,7 @@ static void __init create_tmp_mapping(void) /* Copy the last p4d since it is shared with the kernel mapping. */ if (pgtable_l5_enabled) { - ptr = (p4d_t *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END)); + ptr = (p4d_t *)pgd_page_vaddr(pgdp_get(pgd_offset_k(KASAN_SHADOW_END))); memcpy(tmp_p4d, ptr, sizeof(p4d_t) * PTRS_PER_P4D); set_pgd(&tmp_pg_dir[pgd_index(KASAN_SHADOW_END)], pfn_pgd(PFN_DOWN(__pa(tmp_p4d)), PAGE_TABLE)); @@ -462,7 +465,7 @@ static void __init create_tmp_mapping(void) /* Copy the last pud since it is shared with the kernel mapping. */ if (pgtable_l4_enabled) { - ptr = (pud_t *)p4d_page_vaddr(*(base_p4d + p4d_index(KASAN_SHADOW_END))); + ptr = (pud_t *)p4d_page_vaddr(p4dp_get(base_p4d + p4d_index(KASAN_SHADOW_END))); memcpy(tmp_pud, ptr, sizeof(pud_t) * PTRS_PER_PUD); set_p4d(&base_p4d[p4d_index(KASAN_SHADOW_END)], pfn_p4d(PFN_DOWN(__pa(tmp_pud)), PAGE_TABLE)); diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c index f61b2f8291e35..271d01a5ba4da 100644 --- a/arch/riscv/mm/pageattr.c +++ b/arch/riscv/mm/pageattr.c @@ -29,7 +29,7 @@ static unsigned long set_pageattr_masks(unsigned long val, struct mm_walk *walk) static int pageattr_p4d_entry(p4d_t *p4d, unsigned long addr, unsigned long next, struct mm_walk *walk) { - p4d_t val = READ_ONCE(*p4d); + p4d_t val = p4dp_get(p4d); if (p4d_leaf(val)) { val = __p4d(set_pageattr_masks(p4d_val(val), walk)); @@ -42,7 +42,7 @@ static int pageattr_p4d_entry(p4d_t *p4d, unsigned long addr, static int pageattr_pud_entry(pud_t *pud, unsigned long addr, unsigned long next, struct mm_walk *walk) { - pud_t val = READ_ONCE(*pud); + pud_t val = pudp_get(pud); if (pud_leaf(val)) { val = __pud(set_pageattr_masks(pud_val(val), walk)); @@ -55,7 +55,7 @@ static int pageattr_pud_entry(pud_t *pud, unsigned long addr, static int pageattr_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { - pmd_t val = 
READ_ONCE(*pmd); + pmd_t val = pmdp_get(pmd); if (pmd_leaf(val)) { val = __pmd(set_pageattr_masks(pmd_val(val), walk)); @@ -68,7 +68,7 @@ static int pageattr_pmd_entry(pmd_t *pmd, unsigned long addr, static int pageattr_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { - pte_t val = READ_ONCE(*pte); + pte_t val = ptep_get(pte); val = __pte(set_pageattr_masks(pte_val(val), walk)); set_pte(pte, val); @@ -108,10 +108,10 @@ static int __split_linear_mapping_pmd(pud_t *pudp, vaddr <= (vaddr & PMD_MASK) && end >= next) continue; - if (pmd_leaf(*pmdp)) { + if (pmd_leaf(pmdp_get(pmdp))) { struct page *pte_page; - unsigned long pfn = _pmd_pfn(*pmdp); - pgprot_t prot = __pgprot(pmd_val(*pmdp) & ~_PAGE_PFN_MASK); + unsigned long pfn = _pmd_pfn(pmdp_get(pmdp)); + pgprot_t prot = __pgprot(pmd_val(pmdp_get(pmdp)) & ~_PAGE_PFN_MASK); pte_t *ptep_new; int i; @@ -148,10 +148,10 @@ static int __split_linear_mapping_pud(p4d_t *p4dp, vaddr <= (vaddr & PUD_MASK) && end >= next) continue; - if (pud_leaf(*pudp)) { + if (pud_leaf(pudp_get(pudp))) { struct page *pmd_page; - unsigned long pfn = _pud_pfn(*pudp); - pgprot_t prot = __pgprot(pud_val(*pudp) & ~_PAGE_PFN_MASK); + unsigned long pfn = _pud_pfn(pudp_get(pudp)); + pgprot_t prot = __pgprot(pud_val(pudp_get(pudp)) & ~_PAGE_PFN_MASK); pmd_t *pmdp_new; int i; @@ -197,10 +197,10 @@ static int __split_linear_mapping_p4d(pgd_t *pgdp, vaddr <= (vaddr & P4D_MASK) && end >= next) continue; - if (p4d_leaf(*p4dp)) { + if (p4d_leaf(p4dp_get(p4dp))) { struct page *pud_page; - unsigned long pfn = _p4d_pfn(*p4dp); - pgprot_t prot = __pgprot(p4d_val(*p4dp) & ~_PAGE_PFN_MASK); + unsigned long pfn = _p4d_pfn(p4dp_get(p4dp)); + pgprot_t prot = __pgprot(p4d_val(p4dp_get(p4dp)) & ~_PAGE_PFN_MASK); pud_t *pudp_new; int i; @@ -427,29 +427,29 @@ bool kernel_page_present(struct page *page) pte_t *pte; pgd = pgd_offset_k(addr); - if (!pgd_present(*pgd)) + if (!pgd_present(pgdp_get(pgd))) return false; - if (pgd_leaf(*pgd)) + 
if (pgd_leaf(pgdp_get(pgd))) return true; p4d = p4d_offset(pgd, addr); - if (!p4d_present(*p4d)) + if (!p4d_present(p4dp_get(p4d))) return false; - if (p4d_leaf(*p4d)) + if (p4d_leaf(p4dp_get(p4d))) return true; pud = pud_offset(p4d, addr); - if (!pud_present(*pud)) + if (!pud_present(pudp_get(pud))) return false; - if (pud_leaf(*pud)) + if (pud_leaf(pudp_get(pud))) return true; pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) + if (!pmd_present(pmdp_get(pmd))) return false; - if (pmd_leaf(*pmd)) + if (pmd_leaf(pmdp_get(pmd))) return true; pte = pte_offset_kernel(pmd, addr); - return pte_present(*pte); + return pte_present(ptep_get(pte)); } diff --git a/arch/riscv/mm/pgtable.c b/arch/riscv/mm/pgtable.c index fef4e7328e490..ef887efcb6790 100644 --- a/arch/riscv/mm/pgtable.c +++ b/arch/riscv/mm/pgtable.c @@ -5,6 +5,47 @@ #include #include +int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) +{ + if (!pte_same(ptep_get(ptep), entry)) + __set_pte_at(ptep, entry); + /* + * update_mmu_cache will unconditionally execute, handling both + * the case that the PTE changed and the spurious fault case. 
+ */ + return true; +} + +int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pte_t *ptep) +{ + if (!pte_young(ptep_get(ptep))) + return 0; + return test_and_clear_bit(_PAGE_ACCESSED_OFFSET, &pte_val(*ptep)); +} +EXPORT_SYMBOL_GPL(ptep_test_and_clear_young); + +#ifdef CONFIG_64BIT +pud_t *pud_offset(p4d_t *p4d, unsigned long address) +{ + if (pgtable_l4_enabled) + return p4d_pgtable(p4dp_get(p4d)) + pud_index(address); + + return (pud_t *)p4d; +} + +p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) +{ + if (pgtable_l5_enabled) + return pgd_pgtable(pgdp_get(pgd)) + p4d_index(address); + + return (p4d_t *)pgd; +} +#endif + #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { @@ -25,7 +66,7 @@ int pud_set_huge(pud_t *pud, phys_addr_t phys, pgprot_t prot) int pud_clear_huge(pud_t *pud) { - if (!pud_leaf(READ_ONCE(*pud))) + if (!pud_leaf(pudp_get(pud))) return 0; pud_clear(pud); return 1; @@ -33,7 +74,7 @@ int pud_clear_huge(pud_t *pud) int pud_free_pmd_page(pud_t *pud, unsigned long addr) { - pmd_t *pmd = pud_pgtable(*pud); + pmd_t *pmd = pud_pgtable(pudp_get(pud)); int i; pud_clear(pud); @@ -63,7 +104,7 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t phys, pgprot_t prot) int pmd_clear_huge(pmd_t *pmd) { - if (!pmd_leaf(READ_ONCE(*pmd))) + if (!pmd_leaf(pmdp_get(pmd))) return 0; pmd_clear(pmd); return 1; @@ -71,7 +112,7 @@ int pmd_clear_huge(pmd_t *pmd) int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { - pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); + pte_t *pte = (pte_t *)pmd_page_vaddr(pmdp_get(pmd)); pmd_clear(pmd); @@ -88,7 +129,7 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON(pmd_trans_huge(*pmdp)); + VM_BUG_ON(pmd_trans_huge(pmdp_get(pmdp))); /* * When leaf PTE entries (regular pages) are collapsed into a leaf * PMD entry (huge page), a valid non-leaf PTE is 
converted into a From 723b1dec1d2608f6d6055dc6f9a99ca94ac70b0c Mon Sep 17 00:00:00 2001 From: Han Gao Date: Wed, 7 Aug 2024 18:18:57 +0800 Subject: [PATCH 7/7] symbol: gpl: export pud_offset/p4d_offset symbol pud_offset() and p4d_offset() are defined out of line in arch/riscv/mm/pgtable.c. Export both symbols with EXPORT_SYMBOL_GPL() so that GPL-licensed modules can call them. Signed-off-by: Han Gao Signed-off-by: Han Gao --- arch/riscv/mm/pgtable.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/riscv/mm/pgtable.c b/arch/riscv/mm/pgtable.c index ef887efcb6790..21ea6ed76470b 100644 --- a/arch/riscv/mm/pgtable.c +++ b/arch/riscv/mm/pgtable.c @@ -36,6 +36,7 @@ pud_t *pud_offset(p4d_t *p4d, unsigned long address) return (pud_t *)p4d; } +EXPORT_SYMBOL_GPL(pud_offset); p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) { @@ -44,6 +45,7 @@ p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) return (p4d_t *)pgd; } +EXPORT_SYMBOL_GPL(p4d_offset); #endif #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP