From 252e75d247658737e31bec39616c07202dad0108 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Fri, 9 Dec 2022 16:30:37 +0000 Subject: [PATCH 1/7] arm64/mm: Separate out page size used by kernel vs hardware Until now, PAGE_SHIFT, PAGE_SIZE and PAGE_MASK have defined the page size (either 4KB, 16KB or 64KB) that is used both by the kernel (for memory allocation, fault handling, etc) and by the hardware translation tables. This change separates these quantities so that PAGE_SHIFT, PAGE_SIZE and PAGE_MASK are all used for the kernel's purposes, and new macros, SUBPAGE_SHIFT, SUBPAGE_SIZE and SUBPAGE_MASK are all used for the hardware tanslation tables. Although they have the same value at the moment, the intention is that a future commit will allow these values to be different. This will allow (for example) a kernel to use 16KB granules but for those 16KB granules to be represented in the hardware as 4 contiguous 4KB ptes. This is achieved by reducing the number of PTRS_PER_PTE that the kernel sees and a single kernel pte becomes multiple contiguous hw ptes ("sub-ptes"). Why is this useful? In the short term, it allows us to separate and measure the performance impacts due to TLB pressure and kernel bookkeeping structures. In the longer term, it provides the possibility for a number of memory and performance improvements. This initial commit introduces the new macros and updates the arch code to use them appropriately. Both sets of macros have the same values for now, so no behavioural changes are intended. Signed-off-by: Ryan Roberts --- arch/arm64/include/asm/kvm_arm.h | 30 +++++++++++------------ arch/arm64/include/asm/kvm_pgtable.h | 4 ++-- arch/arm64/include/asm/kvm_pkvm.h | 8 +++---- arch/arm64/include/asm/page-def.h | 4 ++++ arch/arm64/include/asm/pgtable-hwdef.h | 33 +++++++++++++------------- arch/arm64/include/asm/pgtable.h | 2 +- arch/arm64/kernel/head.S | 24 +++++++++---------- arch/arm64/kernel/smp.c | 2 +- arch/arm64/kvm/hyp/pgtable.c | 8 +++---- arch/arm64/kvm/reset.c | 12 +++++----- arch/arm64/mm/fault.c | 3 ++- arch/arm64/mm/hugetlbpage.c | 14 +++++------ arch/arm64/mm/proc.S | 6 ++--- 13 files changed, 78 insertions(+), 72 deletions(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 8aa8492dafc0f..f741eedf45d6c 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -164,7 +164,7 @@ * * The table roughly translates to : * - * SL0(PAGE_SIZE, Entry_level) = TGRAN_SL0_BASE - Entry_Level + * SL0(SUBPAGE_SIZE, Entry_level) = TGRAN_SL0_BASE - Entry_Level * * Where TGRAN_SL0_BASE is a magic number depending on the page size: * TGRAN_SL0_BASE(4K) = 2 @@ -206,21 +206,21 @@ * descriptors in section D4.2.8 in ARM DDI 0487C.a. * * The algorithm defines the expectations on the translation table - * addresses for each level, based on PAGE_SIZE, entry level + * addresses for each level, based on SUBPAGE_SIZE, entry level * and the translation table size (T0SZ). The variable "x" in the * algorithm determines the alignment of a table base address at a given * level and thus determines the alignment of VTTBR:BADDR for stage2 * page table entry level. * Since the number of bits resolved at the entry level could vary * depending on the T0SZ, the value of "x" is defined based on a - * Magic constant for a given PAGE_SIZE and Entry Level. The - * intermediate levels must be always aligned to the PAGE_SIZE (i.e, - * x = PAGE_SHIFT). + * Magic constant for a given SUBPAGE_SIZE and Entry Level. 
The + * intermediate levels must be always aligned to the SUBPAGE_SIZE (i.e, + * x = SUBPAGE_SHIFT). * * The value of "x" for entry level is calculated as : * x = Magic_N - T0SZ * - * where Magic_N is an integer depending on the page size and the entry + * where Magic_N is an integer depending on the (sub) page size and the entry * level of the page table as below: * * -------------------------------------------- @@ -237,34 +237,34 @@ * * We have a magic formula for the Magic_N below: * - * Magic_N(PAGE_SIZE, Level) = 64 - ((PAGE_SHIFT - 3) * Number_of_levels) + * Magic_N(SUBPAGE_SIZE, Level) = 64 - ((SUBPAGE_SHIFT - 3) * Number_of_levels) * * where Number_of_levels = (4 - Level). We are only interested in the * value for Entry_Level for the stage2 page table. * * So, given that T0SZ = (64 - IPA_SHIFT), we can compute 'x' as follows: * - * x = (64 - ((PAGE_SHIFT - 3) * Number_of_levels)) - (64 - IPA_SHIFT) - * = IPA_SHIFT - ((PAGE_SHIFT - 3) * Number of levels) + * x = (64 - ((SUBPAGE_SHIFT - 3) * Number_of_levels)) - (64 - IPA_SHIFT) + * = IPA_SHIFT - ((SUBPAGE_SHIFT - 3) * Number of levels) * * Here is one way to explain the Magic Formula: * * x = log2(Size_of_Entry_Level_Table) * - * Since, we can resolve (PAGE_SHIFT - 3) bits at each level, and another - * PAGE_SHIFT bits in the PTE, we have : + * Since, we can resolve (SUBPAGE_SHIFT - 3) bits at each level, and another + * SUBPAGE_SHIFT bits in the PTE, we have : * - * Bits_Entry_level = IPA_SHIFT - ((PAGE_SHIFT - 3) * (n - 1) + PAGE_SHIFT) - * = IPA_SHIFT - (PAGE_SHIFT - 3) * n - 3 + * Bits_Entry_level = IPA_SHIFT - ((SUBPAGE_SHIFT - 3) * (n - 1) + SUBPAGE_SHIFT) + * = IPA_SHIFT - (SUBPAGE_SHIFT - 3) * n - 3 * where n = number of levels, and since each pointer is 8bytes, we have: * * x = Bits_Entry_Level + 3 - * = IPA_SHIFT - (PAGE_SHIFT - 3) * n + * = IPA_SHIFT - (SUBPAGE_SHIFT - 3) * n * * The only constraint here is that, we have to find the number of page table * levels for a given IPA size (which we do, see stage2_pt_levels()) */ -#define ARM64_VTTBR_X(ipa, levels) ((ipa) - ((levels) * (PAGE_SHIFT - 3))) +#define ARM64_VTTBR_X(ipa, levels) ((ipa) - ((levels) * (SUBPAGE_SHIFT - 3))) #define VTTBR_CNP_BIT (UL(1)) #define VTTBR_VMID_SHIFT (UL(48)) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 3252eb50ecfe5..899626bcc559d 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -39,7 +39,7 @@ typedef u64 kvm_pte_t; #define KVM_PTE_VALID BIT(0) -#define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT) +#define KVM_PTE_ADDR_MASK GENMASK(47, SUBPAGE_SHIFT) #define KVM_PTE_ADDR_51_48 GENMASK(15, 12) static inline bool kvm_pte_valid(kvm_pte_t pte) @@ -51,7 +51,7 @@ static inline u64 kvm_pte_to_phys(kvm_pte_t pte) { u64 pa = pte & KVM_PTE_ADDR_MASK; - if (PAGE_SHIFT == 16) + if (SUBPAGE_SHIFT == 16) pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48; return pa; diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index 9f4ad2a8df59c..64b4992efd286 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -20,7 +20,7 @@ static inline unsigned long __hyp_pgtable_max_pages(unsigned long nr_pages) /* Provision the worst case scenario */ for (i = 0; i < KVM_PGTABLE_MAX_LEVELS; i++) { - nr_pages = DIV_ROUND_UP(nr_pages, PTRS_PER_PTE); + nr_pages = DIV_ROUND_UP(nr_pages, SUBPTES_PER_PTE); total += nr_pages; } @@ -34,7 +34,7 @@ static inline unsigned long __hyp_pgtable_total_pages(void) /* Cover all of memory 
with page-granularity */ for (i = 0; i < kvm_nvhe_sym(hyp_memblock_nr); i++) { struct memblock_region *reg = &kvm_nvhe_sym(hyp_memory)[i]; - res += __hyp_pgtable_max_pages(reg->size >> PAGE_SHIFT); + res += __hyp_pgtable_max_pages(reg->size >> SUBPAGE_SHIFT); } return res; @@ -47,7 +47,7 @@ static inline unsigned long hyp_s1_pgtable_pages(void) res = __hyp_pgtable_total_pages(); /* Allow 1 GiB for private mappings */ - res += __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT); + res += __hyp_pgtable_max_pages(SZ_1G >> SUBPAGE_SHIFT); return res; } @@ -63,7 +63,7 @@ static inline unsigned long host_s2_pgtable_pages(void) res = __hyp_pgtable_total_pages() + 16; /* Allow 1 GiB for MMIO mappings */ - res += __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT); + res += __hyp_pgtable_max_pages(SZ_1G >> SUBPAGE_SHIFT); return res; } diff --git a/arch/arm64/include/asm/page-def.h b/arch/arm64/include/asm/page-def.h index 2403f7b4cdbfb..da102f16c49d1 100644 --- a/arch/arm64/include/asm/page-def.h +++ b/arch/arm64/include/asm/page-def.h @@ -15,4 +15,8 @@ #define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) +#define SUBPAGE_SHIFT CONFIG_ARM64_PAGE_SHIFT +#define SUBPAGE_SIZE (_AC(1, UL) << SUBPAGE_SHIFT) +#define SUBPAGE_MASK (~(SUBPAGE_SIZE-1)) + #endif /* __ASM_PAGE_DEF_H */ diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 5ab8d163198fd..975f7a1b9ebb4 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -9,38 +9,39 @@ /* * Number of page-table levels required to address 'va_bits' wide - * address, without section mapping. We resolve the top (va_bits - PAGE_SHIFT) - * bits with (PAGE_SHIFT - 3) bits at each page table level. Hence: + * address, without section mapping. We resolve the top (va_bits - SUBPAGE_SHIFT) + * bits with (SUBPAGE_SHIFT - 3) bits at each page table level. Hence: * - * levels = DIV_ROUND_UP((va_bits - PAGE_SHIFT), (PAGE_SHIFT - 3)) + * levels = DIV_ROUND_UP((va_bits - SUBPAGE_SHIFT), (SUBPAGE_SHIFT - 3)) * * where DIV_ROUND_UP(n, d) => (((n) + (d) - 1) / (d)) * * We cannot include linux/kernel.h which defines DIV_ROUND_UP here * due to build issues. So we open code DIV_ROUND_UP here: * - * ((((va_bits) - PAGE_SHIFT) + (PAGE_SHIFT - 3) - 1) / (PAGE_SHIFT - 3)) + * ((((va_bits) - SUBPAGE_SHIFT) + (SUBPAGE_SHIFT - 3) - 1) / (SUBPAGE_SHIFT - 3)) * * which gets simplified as : */ -#define ARM64_HW_PGTABLE_LEVELS(va_bits) (((va_bits) - 4) / (PAGE_SHIFT - 3)) +#define ARM64_HW_PGTABLE_LEVELS(va_bits) (((va_bits) - 4) / (SUBPAGE_SHIFT - 3)) /* * Size mapped by an entry at level n ( 0 <= n <= 3) - * We map (PAGE_SHIFT - 3) at all translation levels and PAGE_SHIFT bits + * We map (SUBPAGE_SHIFT - 3) at all translation levels and SUBPAGE_SHIFT bits * in the final page. The maximum number of translation levels supported by * the architecture is 4. Hence, starting at level n, we have further * ((4 - n) - 1) levels of translation excluding the offset within the page. 
* So, the total number of bits mapped by an entry at level n is : * - * ((4 - n) - 1) * (PAGE_SHIFT - 3) + PAGE_SHIFT + * ((4 - n) - 1) * (SUBPAGE_SHIFT - 3) + SUBPAGE_SHIFT * * Rearranging it a bit we get : - * (4 - n) * (PAGE_SHIFT - 3) + 3 + * (4 - n) * (SUBPAGE_SHIFT - 3) + 3 */ -#define ARM64_HW_PGTABLE_LEVEL_SHIFT(n) ((PAGE_SHIFT - 3) * (4 - (n)) + 3) +#define ARM64_HW_PGTABLE_LEVEL_SHIFT(n) ((SUBPAGE_SHIFT - 3) * (4 - (n)) + 3) -#define PTRS_PER_PTE (1 << (PAGE_SHIFT - 3)) +#define SUBPTES_PER_PTE (1 << (SUBPAGE_SHIFT - 3)) +#define PTRS_PER_PTE (SUBPTES_PER_PTE >> (PAGE_SHIFT - SUBPAGE_SHIFT)) /* * PMD_SHIFT determines the size a level 2 page table entry can map. @@ -49,7 +50,7 @@ #define PMD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(2) #define PMD_SIZE (_AC(1, UL) << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) -#define PTRS_PER_PMD (1 << (PAGE_SHIFT - 3)) +#define PTRS_PER_PMD (1 << (SUBPAGE_SHIFT - 3)) #endif /* @@ -59,7 +60,7 @@ #define PUD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(1) #define PUD_SIZE (_AC(1, UL) << PUD_SHIFT) #define PUD_MASK (~(PUD_SIZE-1)) -#define PTRS_PER_PUD (1 << (PAGE_SHIFT - 3)) +#define PTRS_PER_PUD (1 << (SUBPAGE_SHIFT - 3)) #endif /* @@ -74,9 +75,9 @@ /* * Contiguous page definitions. */ -#define CONT_PTE_SHIFT (CONFIG_ARM64_CONT_PTE_SHIFT + PAGE_SHIFT) -#define CONT_PTES (1 << (CONT_PTE_SHIFT - PAGE_SHIFT)) -#define CONT_PTE_SIZE (CONT_PTES * PAGE_SIZE) +#define CONT_PTE_SHIFT (CONFIG_ARM64_CONT_PTE_SHIFT + SUBPAGE_SHIFT) +#define CONT_PTES (1 << (CONT_PTE_SHIFT - SUBPAGE_SHIFT)) +#define CONT_PTE_SIZE (CONT_PTES * SUBPAGE_SIZE) #define CONT_PTE_MASK (~(CONT_PTE_SIZE - 1)) #define CONT_PMD_SHIFT (CONFIG_ARM64_CONT_PMD_SHIFT + PMD_SHIFT) @@ -155,7 +156,7 @@ #define PTE_PXN (_AT(pteval_t, 1) << 53) /* Privileged XN */ #define PTE_UXN (_AT(pteval_t, 1) << 54) /* User XN */ -#define PTE_ADDR_LOW (((_AT(pteval_t, 1) << (48 - PAGE_SHIFT)) - 1) << PAGE_SHIFT) +#define PTE_ADDR_LOW (((_AT(pteval_t, 1) << (48 - SUBPAGE_SHIFT)) - 1) << SUBPAGE_SHIFT) #ifdef CONFIG_ARM64_PA_BITS_52 #define PTE_ADDR_HIGH (_AT(pteval_t, 0xf) << 12) #define PTE_ADDR_MASK (PTE_ADDR_LOW | PTE_ADDR_HIGH) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index edf6625ce9654..05be6139646ca 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1092,7 +1092,7 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, static inline bool pud_sect_supported(void) { - return PAGE_SIZE == SZ_4K; + return SUBPAGE_SIZE == SZ_4K; } diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 2196aad7b55bc..d63a915f17295 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -194,7 +194,7 @@ SYM_FUNC_END(clear_page_tables) * formed from n pages. 
* * tbl: location of page table - * rtbl: address to be used for first level page table entry (typically tbl + PAGE_SIZE) + * rtbl: address to be used for first level page table entry (typically tbl + SUBPAGE_SIZE) * vstart: virtual address of start of range * vend: virtual address of end of range - we map [vstart, vend - 1] * flags: flags to use to map last level entries @@ -210,38 +210,38 @@ SYM_FUNC_END(clear_page_tables) */ .macro map_memory, tbl, rtbl, vstart, vend, flags, phys, order, istart, iend, tmp, count, sv, extra_shift sub \vend, \vend, #1 - add \rtbl, \tbl, #PAGE_SIZE + add \rtbl, \tbl, #SUBPAGE_SIZE mov \count, #0 .ifnb \extra_shift tst \vend, #~((1 << (\extra_shift)) - 1) b.eq .L_\@ - compute_indices \vstart, \vend, #\extra_shift, #(PAGE_SHIFT - 3), \istart, \iend, \count + compute_indices \vstart, \vend, #\extra_shift, #(SUBPAGE_SHIFT - 3), \istart, \iend, \count mov \sv, \rtbl - populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp + populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #SUBPAGE_SIZE, \tmp mov \tbl, \sv .endif .L_\@: compute_indices \vstart, \vend, #PGDIR_SHIFT, #\order, \istart, \iend, \count mov \sv, \rtbl - populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp + populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #SUBPAGE_SIZE, \tmp mov \tbl, \sv #if SWAPPER_PGTABLE_LEVELS > 3 - compute_indices \vstart, \vend, #PUD_SHIFT, #(PAGE_SHIFT - 3), \istart, \iend, \count + compute_indices \vstart, \vend, #PUD_SHIFT, #(SUBPAGE_SHIFT - 3), \istart, \iend, \count mov \sv, \rtbl - populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp + populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #SUBPAGE_SIZE, \tmp mov \tbl, \sv #endif #if SWAPPER_PGTABLE_LEVELS > 2 - compute_indices \vstart, \vend, #SWAPPER_TABLE_SHIFT, #(PAGE_SHIFT - 3), \istart, \iend, \count + compute_indices \vstart, \vend, #SWAPPER_TABLE_SHIFT, #(SUBPAGE_SHIFT - 3), \istart, \iend, \count mov \sv, \rtbl - populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp + populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #SUBPAGE_SIZE, \tmp mov \tbl, \sv #endif - compute_indices \vstart, \vend, #SWAPPER_BLOCK_SHIFT, #(PAGE_SHIFT - 3), \istart, \iend, \count + compute_indices \vstart, \vend, #SWAPPER_BLOCK_SHIFT, #(SUBPAGE_SHIFT - 3), \istart, \iend, \count bic \rtbl, \phys, #SWAPPER_BLOCK_SIZE - 1 populate_entries \tbl, \rtbl, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp .endm @@ -264,7 +264,7 @@ SYM_FUNC_START_LOCAL(remap_region) // Get the index offset for the start of the last level table lsr x1, x1, x6 - bfi x1, xzr, #0, #PAGE_SHIFT - 3 + bfi x1, xzr, #0, #SUBPAGE_SHIFT - 3 // Derive the start and end indexes into the last level table // associated with the provided region @@ -308,7 +308,7 @@ SYM_FUNC_START_LOCAL(create_idmap) */ #if (VA_BITS < 48) #define IDMAP_PGD_ORDER (VA_BITS - PGDIR_SHIFT) -#define EXTRA_SHIFT (PGDIR_SHIFT + PAGE_SHIFT - 3) +#define EXTRA_SHIFT (PGDIR_SHIFT + SUBPAGE_SHIFT - 3) /* * If VA_BITS < 48, we have to configure an additional table level. 
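To illustrate how the new macros are intended to relate once the two shifts diverge, here is a standalone userspace sketch (not kernel code). The PAGE_SHIFT=14 / SUBPAGE_SHIFT=12 values are an assumed future configuration; in this patch both shifts are still equal, so PTRS_PER_PTE == SUBPTES_PER_PTE. SUBPTES_PER_PTR is only defined by a later patch in this series.

/*
 * Standalone sketch of the macro relationships; values are assumptions
 * for illustration only (16KB kernel pages over a 4KB hw granule).
 */
#include <stdio.h>

#define PAGE_SHIFT	14			/* assumed kernel page: 16KB */
#define SUBPAGE_SHIFT	12			/* assumed hw granule:   4KB */
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define SUBPAGE_SIZE	(1UL << SUBPAGE_SHIFT)

/* hw ptes per hw table, and kernel-visible ptes per table */
#define SUBPTES_PER_PTE	(1 << (SUBPAGE_SHIFT - 3))
#define PTRS_PER_PTE	(SUBPTES_PER_PTE >> (PAGE_SHIFT - SUBPAGE_SHIFT))

/* contiguous hw sub-ptes backing one kernel pte (defined in a later patch) */
#define SUBPTES_PER_PTR	(1 << (PAGE_SHIFT - SUBPAGE_SHIFT))

int main(void)
{
	printf("PAGE_SIZE=%lu SUBPAGE_SIZE=%lu\n", PAGE_SIZE, SUBPAGE_SIZE);
	printf("SUBPTES_PER_PTE=%d PTRS_PER_PTE=%d SUBPTES_PER_PTR=%d\n",
	       SUBPTES_PER_PTE, PTRS_PER_PTE, SUBPTES_PER_PTR);
	return 0;
}

With these assumed values, one 16KB kernel pte is backed by 4 contiguous 4KB hw sub-ptes, matching the example in the commit message above.
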
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index ffc5d76cf6955..1006bbbf2b73b 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -163,7 +163,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) pr_crit("CPU%u: does not support 52-bit VAs\n", cpu); if (status & CPU_STUCK_REASON_NO_GRAN) { pr_crit("CPU%u: does not support %luK granule\n", - cpu, PAGE_SIZE / SZ_1K); + cpu, SUBPAGE_SIZE / SZ_1K); } cpus_stuck_in_kernel++; break; diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index cdf8e76b0be14..d21ca75d05027 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -83,7 +83,7 @@ static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level) static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level) { u64 shift = kvm_granule_shift(level); - u64 mask = BIT(PAGE_SHIFT - 3) - 1; + u64 mask = BIT(SUBPAGE_SHIFT - 3) - 1; return (data->addr >> shift) & mask; } @@ -126,7 +126,7 @@ static kvm_pte_t kvm_phys_to_pte(u64 pa) { kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK; - if (PAGE_SHIFT == 16) + if (SUBPAGE_SHIFT == 16) pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48); return pte; @@ -236,7 +236,7 @@ static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, if (WARN_ON_ONCE(level >= KVM_PGTABLE_MAX_LEVELS)) return -EINVAL; - for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) { + for (idx = kvm_pgtable_idx(data, level); idx < SUBPTES_PER_PTE; ++idx) { kvm_pte_t *ptep = &pgtable[idx]; if (data->addr >= data->end) @@ -264,7 +264,7 @@ static int _kvm_pgtable_walk(struct kvm_pgtable_walk_data *data) return -EINVAL; for (idx = kvm_pgd_page_idx(data); data->addr < data->end; ++idx) { - kvm_pte_t *ptep = &pgt->pgd[idx * PTRS_PER_PTE]; + kvm_pte_t *ptep = &pgt->pgd[idx * SUBPTES_PER_PTE]; ret = __kvm_pgtable_walk(data, ptep, pgt->start_level); if (ret) diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 5ae18472205a9..3a5afede12e95 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -362,26 +362,26 @@ int kvm_set_ipa_limit(void) ID_AA64MMFR0_EL1_PARANGE_SHIFT); /* * IPA size beyond 48 bits could not be supported - * on either 4K or 16K page size. Hence let's cap + * on either 4K or 16K subpage size. Hence let's cap * it to 48 bits, in case it's reported as larger * on the system. */ - if (PAGE_SIZE != SZ_64K) + if (SUBPAGE_SIZE != SZ_64K) parange = min(parange, (unsigned int)ID_AA64MMFR0_EL1_PARANGE_48); /* - * Check with ARMv8.5-GTG that our PAGE_SIZE is supported at + * Check with ARMv8.5-GTG that our SUBPAGE_SIZE is supported at * Stage-2. If not, things will stop very quickly. */ switch (cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_TGRAN_2_SHIFT)) { case ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_NONE: - kvm_err("PAGE_SIZE not supported at Stage-2, giving up\n"); + kvm_err("SUBPAGE_SIZE not supported at Stage-2, giving up\n"); return -EINVAL; case ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_DEFAULT: - kvm_debug("PAGE_SIZE supported at Stage-2 (default)\n"); + kvm_debug("SUBPAGE_SIZE supported at Stage-2 (default)\n"); break; case ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_MIN ... 
ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_MAX: - kvm_debug("PAGE_SIZE supported at Stage-2 (advertised)\n"); + kvm_debug("SUBPAGE_SIZE supported at Stage-2 (advertised)\n"); break; default: kvm_err("Unsupported value for TGRAN_2, giving up\n"); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 5b391490e045b..4cd6e5c4c9a68 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -141,8 +141,9 @@ static void show_pte(unsigned long addr) return; } - pr_alert("%s pgtable: %luk pages, %llu-bit VAs, pgdp=%016lx\n", + pr_alert("%s pgtable: %luk pages, %luk subpages, %llu-bit VAs, pgdp=%016lx\n", mm == &init_mm ? "swapper" : "user", PAGE_SIZE / SZ_1K, + SUBPAGE_SIZE / SZ_1K, vabits_actual, mm_to_pgd_phys(mm)); pgdp = pgd_offset(mm, addr); pgd = READ_ONCE(*pgdp); diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 35e9a468d13e6..032f3fbefdd77 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -21,13 +21,13 @@ /* * HugeTLB Support Matrix * - * --------------------------------------------------- - * | Page Size | CONT PTE | PMD | CONT PMD | PUD | - * --------------------------------------------------- - * | 4K | 64K | 2M | 32M | 1G | - * | 16K | 2M | 32M | 1G | | - * | 64K | 2M | 512M | 16G | | - * --------------------------------------------------- + * ------------------------------------------------------ + * | SubPage Size | CONT PTE | PMD | CONT PMD | PUD | + * ------------------------------------------------------ + * | 4K | 64K | 2M | 32M | 1G | + * | 16K | 2M | 32M | 1G | | + * | 64K | 2M | 512M | 16G | | + * ------------------------------------------------------ */ /* diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index b9ecbbae1e1ab..93c23f629a841 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -233,14 +233,14 @@ SYM_FUNC_END(idmap_cpu_replace_ttbr1) .macro kpti_map_pgtbl, type, level str xzr, [temp_pte, #8 * (\level + 1)] // break before make dsb nshst - add pte, temp_pte, #PAGE_SIZE * (\level + 1) + add pte, temp_pte, #SUBPAGE_SIZE * (\level + 1) lsr pte, pte, #12 tlbi vaae1, pte dsb nsh isb phys_to_pte pte, cur_\type\()p - add cur_\type\()p, temp_pte, #PAGE_SIZE * (\level + 1) + add cur_\type\()p, temp_pte, #SUBPAGE_SIZE * (\level + 1) orr pte, pte, pte_flags str pte, [temp_pte, #8 * (\level + 1)] dsb nshst @@ -346,7 +346,7 @@ SYM_TYPED_FUNC_START(idmap_kpti_install_ng_mappings) /* PTE */ pte_to_phys cur_ptep, pmd kpti_map_pgtbl pte, 3 - kpti_mk_tbl_ng pte, PTRS_PER_PTE + kpti_mk_tbl_ng pte, SUBPTES_PER_PTE b .Lnext_pmd .unreq cpu -- GitLab From 726a0223890eeafaa22a87bddcd21f361b82de10 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 7 Dec 2022 16:47:22 +0000 Subject: [PATCH 2/7] arm64/mm: Define and use arch-specific ptep_get() The arch code currently uses READ_ONCE(*ptep) all over the place, which is going to become a problem once pte_t is larger than the HW can read atomically. pte_t will become larger when we indpendently configure PAGE_SHIFT and SUBPAGE_SHIFT in the near future. So define our own arch-specific ptep_get(), which for now does READ_ONCE(*ptep) (which is exactly what the generic version does), and update all the call sites to use it instead. This means we will have one place to manage a large pte_t read in future. No behavioural changes intended. 
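For reference, the accessor added below is, for now, just a trivial wrapper identical to the generic ptep_get(); a later patch in this series expands it to read each hw sub-pte individually and fold the AF/dirty bits back into the kernel-level pte:

#define __HAVE_ARCH_PTEP_GET
static inline pte_t ptep_get(pte_t *ptep)
{
	return READ_ONCE(*ptep);
}
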
Signed-off-by: Ryan Roberts --- arch/arm64/include/asm/pgtable.h | 16 ++++++++++------ arch/arm64/kernel/efi.c | 2 +- arch/arm64/mm/fault.c | 4 ++-- arch/arm64/mm/hugetlbpage.c | 4 ++-- arch/arm64/mm/kasan_init.c | 2 +- arch/arm64/mm/mmu.c | 10 +++++----- arch/arm64/mm/pageattr.c | 4 ++-- arch/arm64/mm/trans_pgd.c | 2 +- 8 files changed, 24 insertions(+), 20 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 05be6139646ca..df8df8ee3cf4d 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -274,6 +274,12 @@ static inline void set_pte(pte_t *ptep, pte_t pte) } } +#define __HAVE_ARCH_PTEP_GET +static inline pte_t ptep_get(pte_t *ptep) +{ + return READ_ONCE(*ptep); +} + extern void __sync_icache_dcache(pte_t pteval); /* @@ -300,7 +306,7 @@ static inline void __check_racy_pte_update(struct mm_struct *mm, pte_t *ptep, if (!IS_ENABLED(CONFIG_DEBUG_VM)) return; - old_pte = READ_ONCE(*ptep); + old_pte = ptep_get(ptep); if (!pte_valid(old_pte) || !pte_valid(pte)) return; @@ -334,7 +340,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, */ if (system_supports_mte() && pte_access_permitted(pte, false) && !pte_special(pte)) { - pte_t old_pte = READ_ONCE(*ptep); + pte_t old_pte = ptep_get(ptep); /* * We only need to synchronise if the new PTE has tags enabled * or if swapping in (in which case another mapping may have @@ -879,8 +885,7 @@ static inline bool pud_user_accessible_page(pud_t pud) static inline int __ptep_test_and_clear_young(pte_t *ptep) { pte_t old_pte, pte; - - pte = READ_ONCE(*ptep); + pte = ptep_get(ptep); do { old_pte = pte; pte = pte_mkold(pte); @@ -961,8 +966,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) { pte_t old_pte, pte; - - pte = READ_ONCE(*ptep); + pte = ptep_get(ptep); do { old_pte = pte; pte = pte_wrprotect(pte); diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index a908a37f03678..d9ea8c53e951a 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -99,7 +99,7 @@ int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) { efi_memory_desc_t *md = data; - pte_t pte = READ_ONCE(*ptep); + pte_t pte = ptep_get(ptep); if (md->attribute & EFI_MEMORY_RO) pte = set_pte_bit(pte, __pgprot(PTE_RDONLY)); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 4cd6e5c4c9a68..619821cada6db 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -177,7 +177,7 @@ static void show_pte(unsigned long addr) break; ptep = pte_offset_map(pmdp, addr); - pte = READ_ONCE(*ptep); + pte = ptep_get(ptep); pr_cont(", pte=%016llx", pte_val(pte)); pte_unmap(ptep); } while(0); @@ -200,7 +200,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, pte_t entry, int dirty) { pteval_t old_pteval, pteval; - pte_t pte = READ_ONCE(*ptep); + pte_t pte = ptep_get(ptep); if (pte_same(pte, entry)) return 0; diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 032f3fbefdd77..b68cfff83a44b 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -510,7 +510,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, size_t pgsize; pte_t pte; - if (!pte_cont(READ_ONCE(*ptep))) { + if (!pte_cont(ptep_get(ptep))) { ptep_set_wrprotect(mm, addr, ptep); return; } @@ -535,7 +535,7 @@ pte_t 
huge_ptep_clear_flush(struct vm_area_struct *vma, size_t pgsize; int ncontig; - if (!pte_cont(READ_ONCE(*ptep))) + if (!pte_cont(ptep_get(ptep))) return ptep_clear_flush(vma, addr, ptep); ncontig = find_num_contig(mm, addr, ptep, &pgsize); diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index e969e68de005f..d7d480ba0f4dd 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -113,7 +113,7 @@ static void __init kasan_pte_populate(pmd_t *pmdp, unsigned long addr, memset(__va(page_phys), KASAN_SHADOW_INIT, PAGE_SIZE); next = addr + PAGE_SIZE; set_pte(ptep, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL)); - } while (ptep++, addr = next, addr != end && pte_none(READ_ONCE(*ptep))); + } while (ptep++, addr = next, addr != end && pte_none(ptep_get(ptep))); } static void __init kasan_pmd_populate(pud_t *pudp, unsigned long addr, diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 9a7c389651540..25ebed8021d7b 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -174,7 +174,7 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, ptep = pte_set_fixmap_offset(pmdp, addr); do { - pte_t old_pte = READ_ONCE(*ptep); + pte_t old_pte = ptep_get(ptep); set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot)); @@ -854,7 +854,7 @@ int kern_addr_valid(unsigned long addr) return pfn_valid(pmd_pfn(pmd)); ptep = pte_offset_kernel(pmdp, addr); - pte = READ_ONCE(*ptep); + pte = ptep_get(ptep); if (pte_none(pte)) return 0; @@ -905,7 +905,7 @@ static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr, do { ptep = pte_offset_kernel(pmdp, addr); - pte = READ_ONCE(*ptep); + pte = ptep_get(ptep); if (pte_none(pte)) continue; @@ -1038,7 +1038,7 @@ static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr, do { ptep = pte_offset_kernel(pmdp, addr); - pte = READ_ONCE(*ptep); + pte = ptep_get(ptep); /* * This is just a sanity check here which verifies that @@ -1057,7 +1057,7 @@ static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr, */ ptep = pte_offset_kernel(pmdp, 0UL); for (i = 0; i < PTRS_PER_PTE; i++) { - if (!pte_none(READ_ONCE(ptep[i]))) + if (!pte_none(ptep_get(ptep++))) return; } diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 5922178d7a064..3c5555184281c 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -33,7 +33,7 @@ bool can_set_direct_map(void) static int change_page_range(pte_t *ptep, unsigned long addr, void *data) { struct page_change_data *cdata = data; - pte_t pte = READ_ONCE(*ptep); + pte_t pte = ptep_get(ptep); pte = clear_pte_bit(pte, cdata->clear_mask); pte = set_pte_bit(pte, cdata->set_mask); @@ -244,5 +244,5 @@ bool kernel_page_present(struct page *page) return true; ptep = pte_offset_kernel(pmdp, addr); - return pte_valid(READ_ONCE(*ptep)); + return pte_valid(ptep_get(ptep)); } diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index 4ea2eefbc053f..5584dbacf049f 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -32,7 +32,7 @@ static void *trans_alloc(struct trans_pgd_info *info) static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) { - pte_t pte = READ_ONCE(*src_ptep); + pte_t pte = ptep_get(src_ptep); if (pte_valid(pte)) { /* -- GitLab From f125b96ce78d8779965dd1d3aff5b433770de80d Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 8 Dec 2022 15:47:05 +0000 Subject: [PATCH 3/7] arm64/mm: Un-deduplicate pmd/pud setters from pte setters Previously, set_pmd_at() and set_pud_at() were implemented 
in terms of set_pte_at(), pmdp_set_access_flags() in terms of ptep_set_access_flags(), pmdp_test_and_clear_young() in terms of pmdp_test_and_clear_young() and pmdp_set_wrprotect() in terms of pmdp_set_wrprotect(). This all works because the underlying types are the same size and the relavent bits are all in the same place. However, pte_t is about to grow in order to enable separation of kernel and hw granule sizes. So we need to separate the implementation of pmd/pud functions from the pte functions. No behavioural changes intended. Signed-off-by: Ryan Roberts --- arch/arm64/include/asm/pgtable.h | 132 +++++++++++++++++++------------ arch/arm64/mm/fault.c | 24 +++--- 2 files changed, 90 insertions(+), 66 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index df8df8ee3cf4d..8d01a86fbdbff 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -298,17 +298,13 @@ extern void __sync_icache_dcache(pte_t pteval); * PTE_DIRTY || (PTE_WRITE && !PTE_RDONLY) */ -static inline void __check_racy_pte_update(struct mm_struct *mm, pte_t *ptep, - pte_t pte) +static inline void __check_racy_pte_update(struct mm_struct *mm, pte_t old_pte, + pte_t new_pte) { - pte_t old_pte; - if (!IS_ENABLED(CONFIG_DEBUG_VM)) return; - old_pte = ptep_get(ptep); - - if (!pte_valid(old_pte) || !pte_valid(pte)) + if (!pte_valid(old_pte) || !pte_valid(new_pte)) return; if (mm != current->active_mm && atomic_read(&mm->mm_users) <= 1) return; @@ -318,19 +314,19 @@ static inline void __check_racy_pte_update(struct mm_struct *mm, pte_t *ptep, * (ptep_set_access_flags safely changes valid ptes without going * through an invalid entry). */ - VM_WARN_ONCE(!pte_young(pte), + VM_WARN_ONCE(!pte_young(new_pte), "%s: racy access flag clearing: 0x%016llx -> 0x%016llx", - __func__, pte_val(old_pte), pte_val(pte)); - VM_WARN_ONCE(pte_write(old_pte) && !pte_dirty(pte), + __func__, pte_val(old_pte), pte_val(new_pte)); + VM_WARN_ONCE(pte_write(old_pte) && !pte_dirty(new_pte), "%s: racy dirty state clearing: 0x%016llx -> 0x%016llx", - __func__, pte_val(old_pte), pte_val(pte)); + __func__, pte_val(old_pte), pte_val(new_pte)); } -static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) +static inline void __prep_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t old_pte, pte_t new_pte) { - if (pte_present(pte) && pte_user_exec(pte) && !pte_special(pte)) - __sync_icache_dcache(pte); + if (pte_present(new_pte) && pte_user_exec(new_pte) && !pte_special(new_pte)) + __sync_icache_dcache(new_pte); /* * If the PTE would provide user space access to the tags associated @@ -338,9 +334,8 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, * pte_access_permitted() returns false for exec only mappings, they * don't expose tags (instruction fetches don't check tags). 
*/ - if (system_supports_mte() && pte_access_permitted(pte, false) && - !pte_special(pte)) { - pte_t old_pte = ptep_get(ptep); + if (system_supports_mte() && pte_access_permitted(new_pte, false) && + !pte_special(new_pte)) { /* * We only need to synchronise if the new PTE has tags enabled * or if swapping in (in which case another mapping may have @@ -348,20 +343,21 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, * (!pte_none() && !pte_present()) is an open coded version of * is_swap_pte() */ - if (pte_tagged(pte) || (!pte_none(old_pte) && !pte_present(old_pte))) - mte_sync_tags(old_pte, pte); + if (pte_tagged(new_pte) || (!pte_none(old_pte) && !pte_present(old_pte))) + mte_sync_tags(old_pte, new_pte); } - __check_racy_pte_update(mm, ptep, pte); - - set_pte(ptep, pte); + __check_racy_pte_update(mm, old_pte, new_pte); } static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { + pte_t old_pte = ptep_get(ptep); + page_table_check_pte_set(mm, addr, ptep, pte); - return __set_pte_at(mm, addr, ptep, pte); + __prep_set_pte_at(mm, addr, old_pte, pte); + set_pte(ptep, pte); } /* @@ -534,20 +530,6 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) #define pud_pfn(pud) ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT) #define pfn_pud(pfn,prot) __pud(__phys_to_pud_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) -static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, pmd_t pmd) -{ - page_table_check_pmd_set(mm, addr, pmdp, pmd); - return __set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)); -} - -static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, - pud_t *pudp, pud_t pud) -{ - page_table_check_pud_set(mm, addr, pudp, pud); - return __set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud)); -} - #define __p4d_to_phys(p4d) __pte_to_phys(p4d_pte(p4d)) #define __phys_to_p4d_val(phys) __phys_to_pte_val(phys) @@ -644,6 +626,16 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) } } +static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ + pmd_t old_pmd = READ_ONCE(*pmdp); + + page_table_check_pmd_set(mm, addr, pmdp, pmd); + __prep_set_pte_at(mm, addr, pmd_pte(old_pmd), pmd_pte(pmd)); + set_pmd(pmdp, pmd); +} + static inline void pmd_clear(pmd_t *pmdp) { set_pmd(pmdp, __pmd(0)); @@ -707,6 +699,16 @@ static inline void set_pud(pud_t *pudp, pud_t pud) } } +static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud) +{ + pud_t old_pud = READ_ONCE(*pudp); + + page_table_check_pud_set(mm, addr, pudp, pud); + __prep_set_pte_at(mm, addr, pud_pte(old_pud), pud_pte(pud)); + set_pud(pudp, pud); +} + static inline void pud_clear(pud_t *pudp) { set_pud(pudp, __pud(0)); @@ -837,9 +839,20 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) } #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS -extern int ptep_set_access_flags(struct vm_area_struct *vma, +extern int __entry_set_access_flags(pteval_t *ptevalp, pteval_t entry); + +static inline int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, - pte_t entry, int dirty); + pte_t entry, int dirty) +{ + int changed = __entry_set_access_flags(&pte_val(*ptep), pte_val(entry)); + + /* Invalidate a stale read-only entry */ + if (changed && dirty) + flush_tlb_page(vma, address); + + return changed; +} #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS @@ -847,7 +860,13 @@ static inline int 
pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { - return ptep_set_access_flags(vma, address, (pte_t *)pmdp, pmd_pte(entry), dirty); + int changed = __entry_set_access_flags(&pmd_val(*pmdp), pmd_val(entry)); + + /* Invalidate a stale read-only entry */ + if (changed && dirty) + flush_tlb_page(vma, address); + + return changed; } static inline int pud_devmap(pud_t pud) @@ -882,7 +901,9 @@ static inline bool pud_user_accessible_page(pud_t pud) * Atomic pte/pmd modifications. */ #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -static inline int __ptep_test_and_clear_young(pte_t *ptep) +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pte_t *ptep) { pte_t old_pte, pte; pte = ptep_get(ptep); @@ -896,13 +917,6 @@ static inline int __ptep_test_and_clear_young(pte_t *ptep) return pte_young(pte); } -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, - pte_t *ptep) -{ - return __ptep_test_and_clear_young(ptep); -} - #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH static inline int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) @@ -930,7 +944,16 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { - return ptep_test_and_clear_young(vma, address, (pte_t *)pmdp); + pmd_t old_pmd, pmd; + pmd = READ_ONCE(*pmdp); + do { + old_pmd = pmd; + pmd = pmd_mkold(pmd); + pmd_val(pmd) = cmpxchg_relaxed(&pmd_val(*pmdp), + pmd_val(old_pmd), pmd_val(pmd)); + } while (pmd_val(pmd) != pmd_val(old_pmd)); + + return pmd_young(pmd); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -980,7 +1003,14 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { - ptep_set_wrprotect(mm, address, (pte_t *)pmdp); + pmd_t old_pmd, pmd; + pmd = READ_ONCE(*pmdp); + do { + old_pmd = pmd; + pmd = pmd_wrprotect(pmd); + pmd_val(pmd) = cmpxchg_relaxed(&pmd_val(*pmdp), + pmd_val(old_pmd), pmd_val(pmd)); + } while (pmd_val(pmd) != pmd_val(old_pmd)); } #define pmdp_establish pmdp_establish diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 619821cada6db..7bb0a613c327e 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -195,38 +195,32 @@ static void show_pte(unsigned long addr) * * Returns whether or not the PTE actually changed. */ -int ptep_set_access_flags(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, - pte_t entry, int dirty) +int __entry_set_access_flags(pteval_t *ptevalp, pteval_t entry) { - pteval_t old_pteval, pteval; - pte_t pte = ptep_get(ptep); + pteval_t old_pteval; + pteval_t pteval = READ_ONCE(*ptevalp); - if (pte_same(pte, entry)) + if (pteval == entry) return 0; /* only preserve the access flags and write permission */ - pte_val(entry) &= PTE_RDONLY | PTE_AF | PTE_WRITE | PTE_DIRTY; + entry &= PTE_RDONLY | PTE_AF | PTE_WRITE | PTE_DIRTY; /* * Setting the flags must be done atomically to avoid racing with the * hardware update of the access/dirty state. The PTE_RDONLY bit must - * be set to the most permissive (lowest value) of *ptep and entry + * be set to the most permissive (lowest value) of *ptevalp and entry * (calculated as: a & b == ~(~a | ~b)). 
*/ - pte_val(entry) ^= PTE_RDONLY; - pteval = pte_val(pte); + entry ^= PTE_RDONLY; do { old_pteval = pteval; pteval ^= PTE_RDONLY; - pteval |= pte_val(entry); + pteval |= entry; pteval ^= PTE_RDONLY; - pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval); + pteval = cmpxchg_relaxed(ptevalp, old_pteval, pteval); } while (pteval != old_pteval); - /* Invalidate a stale read-only entry */ - if (dirty) - flush_tlb_page(vma, address); return 1; } -- GitLab From 5c54d9fbdc1f2805517b406777eb85c0021a53c3 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Fri, 9 Dec 2022 16:15:01 +0000 Subject: [PATCH 4/7] arm64/mm: Fixup pte accessors to work for multiple sub-ptes per ptr With the distinction between kernel PAGE_SIZE and hardware SUBPAGE_SIZE in place, modify the pte accessors so that they can work on all the sub-ptes in a ptr block (when SUBPAGE_SIZE != PAGE_SIZE). The implementation assumes that the CPU is only ever racing against the table walker (which will only write the AF and DBM bits), and never races against another CPU. So we continue to ensure that changes to each HW "sub-pte" are atomic, but we don't attempt to make changes to the full kernel pte (which may contain multiple sub-ptes) atomic. The pte_t type is expanded to fill the size of the set of sub-ptes, although all but the first entry are just treated as padding. The tricky part of this patch is the need to fold HW access and dirty information into the kernel pte from all the hw sub-ptes. For the case where PAGE_SIZE == SUBPAGE_SIZE (as is always the case currently), no behavioural changes intended. Signed-off-by: Ryan Roberts --- arch/arm64/include/asm/pgtable-types.h | 9 +- arch/arm64/include/asm/pgtable.h | 167 +++++++++++++++++++++---- 2 files changed, 151 insertions(+), 25 deletions(-) diff --git a/arch/arm64/include/asm/pgtable-types.h b/arch/arm64/include/asm/pgtable-types.h index b8f158ae25273..4ff7accdbedc8 100644 --- a/arch/arm64/include/asm/pgtable-types.h +++ b/arch/arm64/include/asm/pgtable-types.h @@ -10,6 +10,7 @@ #define __ASM_PGTABLE_TYPES_H #include +#include typedef u64 pteval_t; typedef u64 pmdval_t; @@ -20,9 +21,11 @@ typedef u64 pgdval_t; /* * These are used to make use of C type-checking.. 
*/ -typedef struct { pteval_t pte; } pte_t; -#define pte_val(x) ((x).pte) -#define __pte(x) ((pte_t) { (x) } ) +#define SUBPTES_PER_PTR (1 << (PAGE_SHIFT - SUBPAGE_SHIFT)) + +typedef struct { pteval_t pte[SUBPTES_PER_PTR]; } pte_t; +#define pte_val(x) ((x).pte[0]) +#define __pte(x) ((pte_t) { { (x) } } ) #if CONFIG_PGTABLE_LEVELS > 2 typedef struct { pmdval_t pmd; } pmd_t; diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 8d01a86fbdbff..2f51e745403b7 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -260,9 +260,37 @@ static inline pte_t pte_mkdevmap(pte_t pte) return set_pte_bit(pte, __pgprot(PTE_DEVMAP | PTE_SPECIAL)); } +static inline pte_t next_subpte(pte_t pte) +{ + phys_addr_t phys = __pte_to_phys(pte); + pteval_t pteaddr = __phys_to_pte_val(phys + SUBPAGE_SIZE); + pteval_t pteattr = (pte_val(pte) & ~PTE_ADDR_MASK); + + return __pte(pteaddr | pteattr); +} + +static inline pte_t clear_async_pte_bits(pte_t pte) +{ + if (pte_present(pte)) { + pte = clear_pte_bit(pte, __pgprot(PTE_AF)); + pte = clear_pte_bit(pte, __pgprot(PTE_DIRTY)); + if (pte_val(pte) & PTE_DBM) + pte = clear_pte_bit(pte, __pgprot(PTE_RDONLY)); + } + + return pte; +} + static inline void set_pte(pte_t *ptep, pte_t pte) { - WRITE_ONCE(*ptep, pte); + int i; + pte_t subpte = pte; + + for (i = 0; i < SUBPTES_PER_PTR; i++) { + WRITE_ONCE(ptep->pte[i], pte_val(subpte)); + if (pte_present(pte)) + subpte = next_subpte(subpte); + } /* * Only if the new pte is valid and kernel, otherwise TLB maintenance @@ -277,7 +305,44 @@ static inline void set_pte(pte_t *ptep, pte_t pte) #define __HAVE_ARCH_PTEP_GET static inline pte_t ptep_get(pte_t *ptep) { - return READ_ONCE(*ptep); + int i; + pte_t pte; + pte_t ptebase; + pte_t subpte; + + pte = __pte(READ_ONCE(ptep->pte[0])); + + /* + * Knock out any don't care bits to form ptebase for consistency checks + * against the sub-ptes. (e.g. AF, DIRTY and RDONLY if DBM enabled). + */ + ptebase = clear_async_pte_bits(pte); + + for (i = 1; i < SUBPTES_PER_PTR; i++) { + subpte = __pte(READ_ONCE(ptep->pte[i])); + + if (pte_present(pte)) { + /* Gather HW AF bits from sub-ptes. */ + if (pte_young(subpte)) + pte = pte_mkyoung(pte); + + /* Gather HW DMB from sub-ptes. */ + if (pte_hw_dirty(subpte)) + pte = clear_pte_bit(pte, __pgprot(PTE_RDONLY)); + + /* Gather SW dirty; it's per-sub-pte due to ptep_set_wrprotect() */ + if (pte_sw_dirty(subpte)) + pte = set_pte_bit(pte, __pgprot(PTE_DIRTY)); + + /* Check consistency amongst sub-ptes. */ + subpte = clear_async_pte_bits(subpte); + ptebase = next_subpte(ptebase); + } + + WARN_ON_ONCE(pte_val(ptebase) != pte_val(subpte)); + } + + return pte; } extern void __sync_icache_dcache(pte_t pteval); @@ -845,7 +910,13 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) { - int changed = __entry_set_access_flags(&pte_val(*ptep), pte_val(entry)); + int changed = 0; + int i; + + for (i = 0; i < SUBPTES_PER_PTR; i++) { + changed |= __entry_set_access_flags(&ptep->pte[i], pte_val(entry)); + entry = next_subpte(entry); + } /* Invalidate a stale read-only entry */ if (changed && dirty) @@ -901,20 +972,30 @@ static inline bool pud_user_accessible_page(pud_t pud) * Atomic pte/pmd modifications. 
*/ #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +static inline int __ptep_test_and_clear_young(pteval_t *ptevalp) +{ + pteval_t old_pteval, pteval; + pteval = READ_ONCE(*ptevalp); + do { + old_pteval = pteval; + pteval = pte_val(pte_mkold(__pte(pteval))); + pteval = cmpxchg_relaxed(ptevalp, old_pteval, pteval); + } while (pteval != old_pteval); + + return pte_young(__pte(pteval)); +} + static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { - pte_t old_pte, pte; - pte = ptep_get(ptep); - do { - old_pte = pte; - pte = pte_mkold(pte); - pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep), - pte_val(old_pte), pte_val(pte)); - } while (pte_val(pte) != pte_val(old_pte)); + int i; + int young = 0; - return pte_young(pte); + for (i = 0; i < SUBPTES_PER_PTR; i++) + young |= __ptep_test_and_clear_young(&ptep->pte[i]); + + return young; } #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH @@ -961,7 +1042,42 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { - pte_t pte = __pte(xchg_relaxed(&pte_val(*ptep), 0)); + int i; + pte_t pte; + pte_t ptebase; + pte_t subpte; + + pte = __pte(xchg_relaxed(&ptep->pte[0], 0)); + + /* + * Knock out any don't care bits to form ptebase for consistency checks + * against the sub-ptes. (e.g. AF, DIRTY and RDONLY if DBM enabled). + */ + ptebase = clear_async_pte_bits(pte); + + for (i = 1; i < SUBPTES_PER_PTR; i++) { + subpte = __pte(xchg_relaxed(&ptep->pte[i], 0)); + + if (pte_present(pte)) { + /* Gather HW AF bits from sub-ptes. */ + if (pte_young(subpte)) + pte = pte_mkyoung(pte); + + /* Gather HW DMB from sub-ptes. */ + if (pte_hw_dirty(subpte)) + pte = clear_pte_bit(pte, __pgprot(PTE_RDONLY)); + + /* Gather SW dirty; it's per-sub-pte due to ptep_set_wrprotect() */ + if (pte_sw_dirty(subpte)) + pte = set_pte_bit(pte, __pgprot(PTE_DIRTY)); + + /* Check consistency amongst sub-ptes. */ + subpte = clear_async_pte_bits(subpte); + ptebase = next_subpte(ptebase); + } + + WARN_ON_ONCE(pte_val(ptebase) != pte_val(subpte)); + } page_table_check_pte_clear(mm, address, pte); @@ -986,16 +1102,23 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit. 
*/ #define __HAVE_ARCH_PTEP_SET_WRPROTECT -static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) +static inline void __ptep_set_wrprotect(pteval_t *ptevalp) { - pte_t old_pte, pte; - pte = ptep_get(ptep); + pteval_t old_pteval, pteval; + pteval = READ_ONCE(*ptevalp); do { - old_pte = pte; - pte = pte_wrprotect(pte); - pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep), - pte_val(old_pte), pte_val(pte)); - } while (pte_val(pte) != pte_val(old_pte)); + old_pteval = pteval; + pteval = pte_val(pte_wrprotect(__pte(pteval))); + pteval = cmpxchg_relaxed(ptevalp, old_pteval, pteval); + } while (pteval != old_pteval); +} + +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) +{ + int i; + + for (i = 0; i < SUBPTES_PER_PTR; i++) + __ptep_set_wrprotect(&ptep->pte[i]); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -1042,7 +1165,7 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, #define __swp_entry(type,offset) ((swp_entry_t) { ((type) << __SWP_TYPE_SHIFT) | ((offset) << __SWP_OFFSET_SHIFT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) -#define __swp_entry_to_pte(swp) ((pte_t) { (swp).val }) +#define __swp_entry_to_pte(swp) __pte((swp).val) #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION #define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val(pmd) }) -- GitLab From 9b5db4b099fb2d249e4e9e9f9ea3a9dbd508f743 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Fri, 9 Dec 2022 16:31:20 +0000 Subject: [PATCH 5/7] arm64/mm: Fixup tlbi routines to work with SUBPAGE_SIZE The tlbi routines' interfaces work in units of PAGE_SIZE, so when SUBPAGE_SIZE is less than PAGE_SIZE, the implementation must now transparently do the conversion to match hardware expectations. No behavioural changes intended for the case where SUBPAGE_SIZE == PAGE_SIZE. Signed-off-by: Ryan Roberts --- arch/arm64/include/asm/tlbflush.h | 49 +++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 412a3b9a3c25d..4230fe28f3518 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -64,7 +64,7 @@ /* * Get translation granule of the system, which is decided by - * PAGE_SIZE. Used by TTL. + * SUBPAGE_SIZE. Used by TTL. 
* - 4KB : 1 * - 16KB : 2 * - 64KB : 3 @@ -75,7 +75,7 @@ static inline unsigned long get_trans_granule(void) { - switch (PAGE_SIZE) { + switch (SUBPAGE_SIZE) { case SZ_4K: return TLBI_TTL_TG_4K; case SZ_16K: @@ -135,7 +135,7 @@ static inline unsigned long get_trans_granule(void) */ #define __TLBI_VADDR_RANGE(addr, asid, scale, num, ttl) \ ({ \ - unsigned long __ta = (addr) >> PAGE_SHIFT; \ + unsigned long __ta = (addr) >> SUBPAGE_SHIFT; \ __ta &= GENMASK_ULL(36, 0); \ __ta |= (unsigned long)(ttl) << 37; \ __ta |= (unsigned long)(num) << 39; \ @@ -258,11 +258,16 @@ static inline void flush_tlb_page_nosync(struct vm_area_struct *vma, unsigned long uaddr) { unsigned long addr; + int i; dsb(ishst); - addr = __TLBI_VADDR(uaddr, ASID(vma->vm_mm)); - __tlbi(vale1is, addr); - __tlbi_user(vale1is, addr); + + for (i = 0; i < SUBPTES_PER_PTR; i++) { + addr = __TLBI_VADDR(uaddr, ASID(vma->vm_mm)); + __tlbi(vale1is, addr); + __tlbi_user(vale1is, addr); + uaddr += SUBPAGE_SIZE; + } } static inline void flush_tlb_page(struct vm_area_struct *vma, @@ -276,7 +281,7 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, * This is meant to avoid soft lock-ups on large TLB flushing ranges and not * necessarily a performance improvement. */ -#define MAX_TLBI_OPS PTRS_PER_PTE +#define MAX_TLBI_OPS SUBPTES_PER_PTE static inline void __flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, @@ -287,9 +292,16 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, int scale = 0; unsigned long asid, addr, pages; + /* + * Adjust stride to account for kernel PAGE_SIZE vs hw SUBPAGE_SIZE + * difference. + */ + if (stride == PAGE_SIZE) + stride = SUBPAGE_SIZE; + start = round_down(start, stride); end = round_up(end, stride); - pages = (end - start) >> PAGE_SHIFT; + pages = (end - start) >> SUBPAGE_SHIFT; /* * When not uses TLB range ops, we can handle up to @@ -337,7 +349,7 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, __tlbi_user_level(vae1is, addr, tlb_level); } start += stride; - pages -= stride >> PAGE_SHIFT; + pages -= stride >> SUBPAGE_SHIFT; continue; } @@ -352,7 +364,7 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, __tlbi(rvae1is, addr); __tlbi_user(rvae1is, addr); } - start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT; + start += __TLBI_RANGE_PAGES(num, scale) << SUBPAGE_SHIFT; pages -= __TLBI_RANGE_PAGES(num, scale); } scale++; @@ -368,14 +380,14 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, * table entries as part of collapsing hugepages or moving page tables. * Set the tlb_level to 0 because we can not get enough information here. 
*/ - __flush_tlb_range(vma, start, end, PAGE_SIZE, false, 0); + __flush_tlb_range(vma, start, end, SUBPAGE_SIZE, false, 0); } static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end) { unsigned long addr; - if ((end - start) > (MAX_TLBI_OPS * PAGE_SIZE)) { + if ((end - start) > (MAX_TLBI_OPS * SUBPAGE_SIZE)) { flush_tlb_all(); return; } @@ -384,7 +396,7 @@ static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end end = __TLBI_VADDR(end, 0); dsb(ishst); - for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12)) + for (addr = start; addr < end; addr += 1 << (SUBPAGE_SHIFT - 12)) __tlbi(vaale1is, addr); dsb(ish); isb(); @@ -396,10 +408,17 @@ static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end */ static inline void __flush_tlb_kernel_pgtable(unsigned long kaddr) { - unsigned long addr = __TLBI_VADDR(kaddr, 0); + unsigned long addr; + int i; dsb(ishst); - __tlbi(vaae1is, addr); + + for (i = 0; i < SUBPTES_PER_PTR; i++) { + addr = __TLBI_VADDR(kaddr, 0); + __tlbi(vaae1is, addr); + kaddr += SUBPAGE_SIZE; + } + dsb(ish); isb(); } -- GitLab From 2060fe7b5fc48467d20f7aaaff6b00d0d7cd4fb8 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Fri, 9 Dec 2022 18:22:59 +0000 Subject: [PATCH 6/7] arm64/mm: Introduce KERNEL_PAGE_SHIFT Kconfig choice Now that the kernel page size and hardware page size are handled separately in the kernel, introduce a Kconfig choice to allow the user to set a kernel page size that is bigger than the hardware page size. By default the kernel page size is the same as the selected hardware page size, but it is now possible to select power-of-2 multiples of the hardware page size for the kernel, upto 64KB. Signed-off-by: Ryan Roberts --- arch/arm64/Kconfig | 40 ++++++++++++++++++++++++++++++- arch/arm64/include/asm/page-def.h | 2 +- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 505c8a1ccbe0c..c871db4f8ffbc 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -256,6 +256,14 @@ config ARM64_PAGE_SHIFT default 14 if ARM64_16K_PAGES default 12 +config KERNEL_PAGE_SHIFT + int + default 12 if KERNEL_4K_PAGES + default 13 if KERNEL_8K_PAGES + default 14 if KERNEL_16K_PAGES + default 15 if KERNEL_32K_PAGES + default 16 if KERNEL_64K_PAGES + config ARM64_CONT_PTE_SHIFT int default 5 if ARM64_64K_PAGES @@ -1136,7 +1144,7 @@ config SOCIONEXT_SYNQUACER_PREITS endmenu # "ARM errata workarounds via the alternatives framework" choice - prompt "Page size" + prompt "Hardware Page Size" default ARM64_4K_PAGES help Page size (translation granule) configuration. @@ -1163,6 +1171,36 @@ config ARM64_64K_PAGES endchoice +choice + prompt "Kernel Page Size" + default KERNEL_4K_PAGES if ARM64_4K_PAGES + default KERNEL_16K_PAGES if ARM64_16K_PAGES + default KERNEL_64K_PAGES if ARM64_64K_PAGES + help + Allows choosing a kernel page size that is an integer multiple of the + hardware page size. By default, kernel and hardware page sizes match. 
+ +config KERNEL_4K_PAGES + bool "4KB" + depends on ARM64_4K_PAGES + +config KERNEL_8K_PAGES + bool "8KB" + depends on ARM64_4K_PAGES + +config KERNEL_16K_PAGES + bool "16KB" + depends on ARM64_4K_PAGES || ARM64_16K_PAGES + +config KERNEL_32K_PAGES + bool "32KB" + depends on ARM64_4K_PAGES || ARM64_16K_PAGES + +config KERNEL_64K_PAGES + bool "64KB" + +endchoice + choice prompt "Virtual address space size" default ARM64_VA_BITS_39 if ARM64_4K_PAGES diff --git a/arch/arm64/include/asm/page-def.h b/arch/arm64/include/asm/page-def.h index da102f16c49d1..2b95cbdf57667 100644 --- a/arch/arm64/include/asm/page-def.h +++ b/arch/arm64/include/asm/page-def.h @@ -11,7 +11,7 @@ #include /* PAGE_SHIFT determines the page size */ -#define PAGE_SHIFT CONFIG_ARM64_PAGE_SHIFT +#define PAGE_SHIFT CONFIG_KERNEL_PAGE_SHIFT #define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) -- GitLab From 396a53b403616c6a03028df3f16344114bfc661d Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 12 Dec 2022 18:45:45 +0000 Subject: [PATCH 7/7] arm64/mm: Use SUBPAGE_SIZE page tables where easy. Page tables are now SUBPAGE_SIZE but we are still allocating PAGE_SIZE. This means that the difference is wasted memory. Fix early page tables allocations to use SUBPAGE_SIZE, therefore preventing the waste. It is much more difficult to solve the problem for user space page tables, because these use a per-page-table lock that is burried in the 'struct page'. If we now allocate multiple page tables per page, then we have an issue... Solve that later. Signed-off-by: Ryan Roberts --- arch/arm64/include/asm/kernel-pgtable.h | 6 +++--- arch/arm64/include/asm/memory.h | 4 ++-- arch/arm64/kernel/vmlinux.lds.S | 8 ++++---- arch/arm64/mm/mmu.c | 7 ++++--- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 32d14f481f0c3..92cdbc6704432 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -85,13 +85,13 @@ + EARLY_PGDS((vstart), (vend), add) /* each PGDIR needs a next level page table */ \ + EARLY_PUDS((vstart), (vend), add) /* each PUD needs a next level page table */ \ + EARLY_PMDS((vstart), (vend), add)) /* each PMD needs a next level page table */ -#define INIT_DIR_SIZE (PAGE_SIZE * EARLY_PAGES(KIMAGE_VADDR, _end, EARLY_KASLR)) +#define INIT_DIR_SIZE (SUBPAGE_SIZE * EARLY_PAGES(KIMAGE_VADDR, _end, EARLY_KASLR)) /* the initial ID map may need two extra pages if it needs to be extended */ #if VA_BITS < 48 -#define INIT_IDMAP_DIR_SIZE ((INIT_IDMAP_DIR_PAGES + 2) * PAGE_SIZE) +#define INIT_IDMAP_DIR_SIZE ((INIT_IDMAP_DIR_PAGES + 2) * SUBPAGE_SIZE) #else -#define INIT_IDMAP_DIR_SIZE (INIT_IDMAP_DIR_PAGES * PAGE_SIZE) +#define INIT_IDMAP_DIR_SIZE (INIT_IDMAP_DIR_PAGES * SUBPAGE_SIZE) #endif #define INIT_IDMAP_DIR_PAGES EARLY_PAGES(KIMAGE_VADDR, _end + MAX_FDT_SIZE + SWAPPER_BLOCK_SIZE, 1) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 9dd08cd339c3f..d7dd5a9eb567a 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -166,13 +166,13 @@ * Open-coded (swapper_pg_dir - reserved_pg_dir) as this cannot be calculated * until link time. */ -#define RESERVED_SWAPPER_OFFSET (PAGE_SIZE) +#define RESERVED_SWAPPER_OFFSET (SUBPAGE_SIZE) /* * Open-coded (swapper_pg_dir - tramp_pg_dir) as this cannot be calculated * until link time. 
*/ -#define TRAMP_SWAPPER_OFFSET (2 * PAGE_SIZE) +#define TRAMP_SWAPPER_OFFSET (2 * SUBPAGE_SIZE) #ifndef __ASSEMBLY__ diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 45131e354e27f..35d38ebc938d6 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -199,18 +199,18 @@ SECTIONS } idmap_pg_dir = .; - . += PAGE_SIZE; + . += SUBPAGE_SIZE; #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 tramp_pg_dir = .; - . += PAGE_SIZE; + . += SUBPAGE_SIZE; #endif reserved_pg_dir = .; - . += PAGE_SIZE; + . += SUBPAGE_SIZE; swapper_pg_dir = .; - . += PAGE_SIZE; + . += SUBPAGE_SIZE; . = ALIGN(SEGMENT_ALIGN); __init_begin = .; diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 25ebed8021d7b..94a6555a2bd53 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -110,7 +110,7 @@ static phys_addr_t __init early_pgtable_alloc(int shift) phys_addr_t phys; void *ptr; - phys = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0, + phys = memblock_phys_alloc_range(SUBPAGE_SIZE, SUBPAGE_SIZE, 0, MEMBLOCK_ALLOC_NOLEAKTRACE); if (!phys) panic("Failed to allocate page table page\n"); @@ -120,9 +120,10 @@ static phys_addr_t __init early_pgtable_alloc(int shift) * slot will be free, so we can (ab)use the FIX_PTE slot to initialise * any level of table. */ - ptr = pte_set_fixmap(phys); + ptr = pte_set_fixmap(PAGE_ALIGN_DOWN(phys)); + ptr += offset_in_page(phys); - memset(ptr, 0, PAGE_SIZE); + memset(ptr, 0, SUBPAGE_SIZE); /* * Implicit barriers also ensure the zeroed page is visible to the page -- GitLab
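
To illustrate the address arithmetic used by early_pgtable_alloc() above, here is a standalone sketch (not kernel code): the fixmap maps whole kernel pages, so the page containing the SUBPAGE_SIZE table is mapped and the pointer is then advanced to the table itself. The shifts, the example physical address and the fixmap VA are assumptions for illustration only.

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	14			/* assumed kernel page: 16KB */
#define SUBPAGE_SHIFT	12			/* assumed hw granule:   4KB */
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))

static uint64_t page_align_down(uint64_t phys) { return phys & PAGE_MASK; }
static uint64_t offset_in_page(uint64_t phys)  { return phys & ~PAGE_MASK; }

int main(void)
{
	uint64_t phys = 0x80003000;			/* example SUBPAGE_SIZE-aligned table */
	uint64_t fixmap_va = 0xfffffe0000000000ULL;	/* pretend fixmap slot VA */

	/* map the kernel page that contains the sub-page-sized table... */
	uint64_t mapped_page = page_align_down(phys);		/* 0x80000000 */
	/* ...then point at the table within that mapping */
	uint64_t ptr = fixmap_va + offset_in_page(phys);	/* fixmap_va + 0x3000 */

	printf("page=%#llx off=%#llx ptr=%#llx\n",
	       (unsigned long long)mapped_page,
	       (unsigned long long)offset_in_page(phys),
	       (unsigned long long)ptr);
	return 0;
}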